All ratings are contained in the file "ratings.dat" and are in the following format:
UserID::MovieID::Rating::Timestamp
- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings
# Split the dataset
def SplitData(data, M, k, seed):
    """Split (user, item) pairs into a training set and a test set.

    Every pair draws one uniform integer in ``[0, M - 1]``; pairs whose draw
    equals ``k`` go to the test set, all others go to the training set.
    Calling this with the same ``seed`` and ``k = 0 .. M - 1`` therefore
    yields ``M`` disjoint test folds that together cover ``data``.

    Note: this reseeds the module-level ``random`` generator with ``seed``.

    Args:
        data: Iterable of ``(user, item)`` pairs.
        M: Number of folds; must be at least 1.
        k: Index of the fold used as the test set (``0 <= k < M``).
        seed: Seed passed to ``random.seed`` so the split is reproducible.

    Returns:
        A ``(train, test)`` tuple of ``defaultdict(set)`` objects mapping
        each user to the set of items assigned to that partition.

    Raises:
        ValueError: If ``M`` is smaller than 1.
    """
    if M < 1:
        raise ValueError("M must be at least 1")

    random.seed(seed)
    train = defaultdict(set)
    test = defaultdict(set)
    for user, item in data:
        # random.randint includes BOTH endpoints, so the upper bound must be
        # M - 1 to get exactly M equally likely folds.
        target = test if random.randint(0, M - 1) == k else train
        target[user].add(item)
    return train, test
# Evaluation metrics: recall and precision
def Metric(train, test, N, all_recommend_list):
    """Compute recall and precision of top-N recommendations.

    For every user in ``train`` the first ``N`` recommendations are compared
    against that user's test items. A recommendation that appears in the
    user's test items counts as a hit.

    Args:
        train: Mapping of user -> collection of training items; only its
            keys are used, to decide which users are evaluated.
        test: Mapping of user -> collection of held-out items. Users that
            are missing are treated as having no test items; the mapping is
            never modified.
        N: Number of recommendations considered per user.
        all_recommend_list: Indexable by user; each entry is a sequence of
            ``(item, score)`` pairs, of which the first ``N`` are used.

    Returns:
        A ``(recall, precision)`` tuple where recall is
        ``hits / total number of test items`` and precision is
        ``hits / (N * number of users)``. A ratio whose denominator is zero
        is reported as ``0.0``.
    """
    hit = 0
    recall_all = 0  # denominator of recall
    precision_all = 0  # denominator of precision
    for user in train:
        # .get avoids inserting empty entries into a defaultdict argument
        # and avoids a KeyError for users without any test items.
        test_items = test.get(user, ())
        rank = all_recommend_list[user][0:N]
        hit += sum(1 for item, _score in rank if item in test_items)
        recall_all += len(test_items)
        precision_all += N

    recall = hit / float(recall_all) if recall_all else 0.0
    precision = hit / float(precision_all) if precision_all else 0.0
    return recall, precision
# Evaluation metric: coverage
def Coverage(train, test, N, all_recommend_list):
    """Compute the catalogue coverage of top-N recommendations.

    Coverage is the number of distinct items appearing in any evaluated
    user's first ``N`` recommendations divided by the number of distinct
    items appearing in ``train``.

    Args:
        train: Mapping of user -> collection of training items.
        test: Unused; kept so the signature matches the other metrics.
        N: Number of recommendations considered per user.
        all_recommend_list: Indexable by user; each entry is a sequence of
            ``(item, score)`` pairs, of which the first ``N`` are used.

    Returns:
        The coverage ratio as a float, or ``0.0`` when ``train`` contains
        no items.
    """
    recommend_items = set()
    all_items = set()
    for user, items in train.items():
        all_items.update(items)
        # The per-user top-N list must be fetched here; without it the loop
        # below would read an undefined (or stale) ``rank``.
        rank = all_recommend_list[user][0:N]
        recommend_items.update(item for item, _score in rank)

    if not all_items:
        return 0.0
    return len(recommend_items) / float(len(all_items))
# Evaluation metric: novelty (reported as mean log-popularity)
def Popularity(train, test, N, recommend_res):
    """Compute the average log-popularity of top-N recommendations.

    An item's popularity is the number of users in ``train`` whose item
    collection contains it. The result is the mean of
    ``log(1 + popularity)`` over every recommended ``(user, item)`` slot.

    Args:
        train: Mapping of user -> collection of training items.
        test: Unused; kept so the signature matches the other metrics.
        N: Number of recommendations considered per user.
        recommend_res: Indexable by user; each entry is a sequence of
            ``(item, score)`` pairs, of which the first ``N`` are used.

    Returns:
        The mean log-popularity as a float, or ``0.0`` when no
        recommendation was evaluated.
    """
    item_popularity = {}
    for items in train.values():
        for item in items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    popularity = 0.0
    n = 0
    for user in train:
        rank = recommend_res[user][0:N]
        for item, _score in rank:
            # Items never seen in train count as popularity 0 (log(1) == 0)
            # instead of raising KeyError.
            popularity += math.log(1 + item_popularity.get(item, 0))
            n += 1

    # Guard against an empty evaluation set instead of dividing by zero.
    return popularity / n if n else 0.0