博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
基于用户的协同过滤算法
阅读量:5085 次
发布时间:2019-06-13

本文共 6775 字,大约阅读时间需要 22 分钟。

基于用户的协同过滤算法-参考《推荐系统实践》一书,作者:项亮

1 import random  2 import math  3 class UserBasedCF:  4     def __init__(self,datafile = None):  5         self.datafile = datafile  6         self.readData()  7         self.splitData(3,47)  8     def readData(self,datafile = None):  9         """ 10         read the data from the data file which is a data set 11         """ 12         self.datafile = datafile or self.datafile 13         self.data = [] 14         for line in open(self.datafile): 15             userid,itemid,record,_ = line.split() 16             self.data.append((userid,itemid,int(record))) 17     def splitData(self,k,seed,data=None,M = 8): 18         """ 19         split the data set 20         testdata is a test data set 21         traindata is a train set  22         test data set / train data set is 1:M-1 23         """ 24         self.testdata = {} 25         self.traindata = {} 26         data = data or self.data 27         random.seed(seed) 28         for user,item, record in self.data: 29             if random.randint(0,M) == k: 30                 self.testdata.setdefault(user,{}) 31                 self.testdata[user][item] = record  32             else: 33                 self.traindata.setdefault(user,{}) 34                 self.traindata[user][item] = record 35     def userSimilarity(self,train = None): 36         """ 37         One method of getting user similarity matrix 38         """ 39         train = train or self.traindata 40         self.userSim = dict() 41         for u in train.keys(): 42             for v in train.keys(): 43                 if u == v: 44                     continue 45                 self.userSim.setdefault(u,{}) 46                 self.userSim[u][v] = len(set(train[u].keys()) & set(train[v].keys())) 47                 self.userSim[u][v] /=math.sqrt(len(train[u]) * len(train[v]) *1.0) 48     def userSimilarityBest(self,train = None): 49         """ 50         the other method of getting user similarity which is better than above 51         you can get the method on page 46 52         In this experiment,we use this method 53         """ 54         train = train or self.traindata 55         self.userSimBest = dict() 56         item_users = dict() 57         for u,item in train.items(): 58             for i in item.keys(): 59                 item_users.setdefault(i,set()) 60                 item_users[i].add(u) 61         user_item_count = dict() 62         count = dict() 63         for item,users in item_users.items(): 64             for u in users: 65                 user_item_count.setdefault(u,0) 66                 user_item_count[u] += 1 67                 for v in users: 68                     if u == v:continue 69                     count.setdefault(u,{}) 70                     count[u].setdefault(v,0) 71                     count[u][v] += 1 72         for u ,related_users in count.items(): 73             self.userSimBest.setdefault(u,dict()) 74             for v, cuv in related_users.items(): 75                 self.userSimBest[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0) 76   77     def recommend(self,user,train = None,k = 8,nitem = 40): 78         train = train or self.traindata 79         rank = dict() 80         interacted_items = train.get(user,{}) 81         for v ,wuv in sorted(self.userSimBest[user].items(),key = lambda x : x[1],reverse = True)[0:k]: 82             for i , rvi in train[v].items(): 83                 if i in interacted_items: 84                     continue 85                 rank.setdefault(i,0) 86                 rank[i] += wuv 87         return dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem]) 88     def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10): 89         """ 90         Get the recall and precision, the method you want to know is listed  91         in the page 43 92         """ 93         train  = train or self.traindata 94         test = test or self.testdata 95         hit = 0 96         recall = 0 97         precision = 0 98         for user in train.keys(): 99             tu = test.get(user,{})100             rank = self.recommend(user, train = train,k = k,nitem = nitem) 101             for item,_ in rank.items():102                 if item in tu:103                     hit += 1104             recall += len(tu)105             precision += nitem106         return (hit / (recall * 1.0),hit / (precision * 1.0))107     def coverage(self,train = None,test = None,k = 8,nitem = 10):108         train = train or self.traindata109         test = test or self.testdata110         recommend_items = set()111         all_items  = set()112         for user in train.keys():113             for item in train[user].keys():114                 all_items.add(item)115             rank = self.recommend(user, train, k = k, nitem = nitem)116             for item,_ in rank.items():117                 recommend_items.add(item)118         return len(recommend_items) / (len(all_items) * 1.0)119     def popularity(self,train = None,test = None,k = 8,nitem = 10):120         """121         Get the popularity122         the algorithm on page 44123         """124         train = train or self.traindata125         test = test or self.testdata126         item_popularity = dict()127         for user ,items in train.items():128             for item in items.keys():129                 item_popularity.setdefault(item,0)130                 item_popularity[item] += 1131         ret = 0132         n = 0133         for user in train.keys():134             rank = self.recommend(user, train, k = k, nitem = nitem)135             for item ,_ in rank.items():136                 ret += math.log(1+item_popularity[item])137                 n += 1138         return ret / (n * 1.0)139      140 def testRecommend():141     ubcf = UserBasedCF('u.data')142     ubcf.readData()143     ubcf.splitData(4,100)144     ubcf.userSimilarity()145     user = "345"146     rank = ubcf.recommend(user,k = 3)147     for i,rvi in rank.items():148          149         items = ubcf.testdata.get(user,{})150         record = items.get(i,0)151         print "%5s: %.4f--%.4f" %(i,rvi,record)152 def testUserBasedCF():153     cf  =  UserBasedCF('u.data')154     cf.userSimilarityBest()155     print "%3s%20s%20s%20s%20s" % ('K',"recall",'precision','coverage','popularity')156     for k in [5,10,20,40,80,160]:157         recall,precision = cf.recallAndPrecision( k = k)158         coverage = cf.coverage(k = k)159         popularity = cf.popularity(k = k)160         print "%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,recall * 100,precision * 100,coverage * 100,popularity)161          162 if __name__ == "__main__":163     testUserBasedCF()

转载于:https://www.cnblogs.com/nn-xiaoliuzi/p/4157046.html

你可能感兴趣的文章
深入理解JVM读书笔记--字节码执行引擎
查看>>
vue-搜索功能-实时监听搜索框的输入,N毫秒请求一次数据
查看>>
批处理 windows 服务的安装与卸载
查看>>
React文档翻译 (快速入门)
查看>>
nodejs fs路径
查看>>
动态规划算法之最大子段和
查看>>
linux c:关联变量的双for循环
查看>>
深入浅出理解zend framework(三)
查看>>
python语句----->if语句,while语句,for循环
查看>>
javascript之数组操作
查看>>
LinkedList源码分析
查看>>
TF-IDF原理
查看>>
用JS制作博客页面背景随滚动渐变的效果
查看>>
JavaScript的迭代函数与迭代函数的实现
查看>>
一步步教你学会browserify
查看>>
Jmeter入门实例
查看>>
亲近用户—回归本质
查看>>
中文脏话识别的解决方案
查看>>
CSS之不常用但重要的样式总结
查看>>
Python编译错误总结
查看>>