2020 Big Data Analysis - Tec10: A MovieLens-Based Recommender System

1. Overview

  1. Recommendations are generated from the MovieLens dataset.
  2. Both content-based recommendation and collaborative filtering are used; a toy sketch of the two ideas follows this list.
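
A minimal, self-contained sketch of the two building blocks on toy data (the genres and ratings below are invented for illustration, not taken from MovieLens): genre similarity via TF-IDF, and user similarity via mean-centered cosine similarity, which is what init_similarity in the full code builds on the real dataset.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Content-based part: genre strings -> TF-IDF vectors -> cosine similarity.
genres = ["Action|Adventure", "Action|Sci-Fi", "Romance|Comedy"]
tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(genres)
# TF-IDF rows are L2-normalized, so the linear kernel equals cosine similarity.
genre_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(genre_sim[0])  # movie 0 is closest to movie 1 (shared "Action" genre)

# Collaborative part: a tiny user x movie rating matrix (0.0 = unrated).
ratings = np.array([
    [5.0, 4.0, 0.0],
    [4.0, 5.0, 1.0],
    [1.0, 0.0, 5.0],
])
# Subtract each user's mean over the zero-filled vector, as init_similarity does.
centered = ratings - ratings.mean(axis=1, keepdims=True)
user_sim = cosine_similarity(centered)
print(user_sim)  # users 0 and 1 come out similar, user 2 dissimilar to both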

2. Complete Code

# -*- coding: utf-8 -*-
import csv
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

class MyRecommend:
    def __init__(self, movie_data_path, rating_data_path, tags_data_path):
        # Load the MovieLens CSV files.
        self.movie_data = pd.read_csv(movie_data_path)
        self.movie_data["genres"] = self.movie_data["genres"].fillna("")
        self.rating_data = pd.read_csv(rating_data_path)
        self.tags_data = pd.read_csv(tags_data_path)
        # movieId -> row position in the rating matrix built by init_similarity().
        self.movie_origin_source = {}
        # Per-user and per-movie mean ratings (over the zero-filled vectors).
        self.user_average = {}
        self.movie_average = {}

    def init_similarity(self):
        """Build the user-user and movie-movie cosine-similarity matrices."""
        user_ids = self.rating_data["userId"].drop_duplicates().sort_values()
        movie_ids = self.rating_data["movieId"].drop_duplicates().sort_values()
        # Map each rated movieId to a column position in the user rating matrix.
        for i, movie_id in enumerate(movie_ids):
            self.movie_origin_source[movie_id] = i

        # One mean-centered rating vector (over all rated movies) per user.
        all_user_matrix = []
        zeros = [0.0] * len(movie_ids)
        for user_id in tqdm(user_ids):
            tmp = zeros.copy()
            seen_movies = self.rating_data[self.rating_data["userId"] == user_id]
            seen_movie_ids = seen_movies["movieId"]
            for seen_movie_id in seen_movie_ids:
                position = self.movie_origin_source[seen_movie_id]
                tmp[position] = seen_movies[seen_movies["movieId"] == seen_movie_id]["rating"].values[0]
            average = np.mean(tmp)
            self.user_average[user_id] = average
            for i in range(len(tmp)):
                tmp[i] -= average
            all_user_matrix.append(tmp)

        all_user_df = pd.DataFrame(all_user_matrix)
        all_user_matrix.clear()
        self.all_user_similarity = cosine_similarity(all_user_df.values)

        # One mean-centered rating vector (over all users) per movie.
        # User IDs in ml-latest-small are contiguous from 1, so userId - 1 is
        # used as the column position.
        all_movie_matrix = []
        zeros = [0.0] * len(user_ids)
        for movie_id in tqdm(movie_ids):
            tmp = zeros.copy()
            seen_users = self.rating_data[self.rating_data["movieId"] == movie_id]
            seen_user_ids = seen_users["userId"]
            for seen_user_id in seen_user_ids:
                tmp[seen_user_id - 1] = seen_users[seen_users["userId"] == seen_user_id]["rating"].values[0]
            average = np.mean(tmp)
            self.movie_average[movie_id] = average
            for i in range(len(tmp)):
                tmp[i] -= average
            all_movie_matrix.append(tmp)

        all_movie_df = pd.DataFrame(all_movie_matrix)
        all_movie_matrix.clear()
        self.all_movie_similarity = cosine_similarity(all_movie_df.values)

    def user_based_one(self, user_id, movie_id, is_seen=True):
        """
        User-user collaborative filtering.
        Predict the rating that user_id would give movie_id.
        :param user_id:
        :param movie_id:
        :param is_seen: if True, return 0.0 for movies the user has already rated
        :return: predicted rating
        """
        # All rating records of user user_id.
        user_ratings = self.rating_data[self.rating_data["userId"] == user_id]

        # Skip movies the user has already rated.
        if len(user_ratings[user_ratings["movieId"] == movie_id].values) and is_seen:
            return 0.0

        # IDs of all users who have rated movie_id.
        seen_user_ids = self.rating_data[self.rating_data["movieId"] == movie_id]["userId"]
        # Similarities between user_id and those users.
        similarity_users = []
        weight_similarity = 0.0
        total_similarity = 0.0

        for seen_user_id in seen_user_ids:
            # All ratings of a user who has rated movie_id.
            seen_user_ratings = self.rating_data[self.rating_data["userId"] == seen_user_id]

            # User IDs are contiguous from 1, so userId - 1 indexes the similarity matrix.
            similarity = self.all_user_similarity[seen_user_id - 1][user_id - 1]
            similarity_users.append([seen_user_id, similarity])

            weight_similarity += similarity * \
                seen_user_ratings[seen_user_ratings["movieId"] == movie_id]["rating"].values[0]
            total_similarity += similarity
        if total_similarity == 0.0:
            return 0.0
        return weight_similarity / total_similarity + self.user_average[user_id]

    def movie_based_one(self, user_id, movie_id, is_seen=True):
        """
        Item-item collaborative filtering.
        Predict the rating that user_id would give movie_id.
        :param user_id:
        :param movie_id:
        :param is_seen: if True, return 0.0 for movies the user has already rated
        :return: predicted rating
        """
        # All rating records of movie movie_id.
        movie_ratings = self.rating_data[self.rating_data["movieId"] == movie_id]

        # Skip movies the user has already rated.
        if len(movie_ratings[movie_ratings["userId"] == user_id].values) and is_seen:
            return 0.0

        # IDs of all movies user_id has rated.
        seen_movie_ids = self.rating_data[self.rating_data["userId"] == user_id]["movieId"]

        # Similarities between movie_id and those movies.
        similarity_movies = []
        weight_similarity = 0.0
        total_similarity = 0.0

        for seen_movie_id in seen_movie_ids:
            seen_movie_ratings = self.rating_data[self.rating_data["movieId"] == seen_movie_id]

            try:
                similarity = self.all_movie_similarity[self.movie_origin_source[seen_movie_id]][
                    self.movie_origin_source[movie_id]]
            except KeyError:
                # movie_id has no ratings at all, so it is missing from the similarity matrix.
                similarity = 0.0
            similarity_movies.append([seen_movie_id, similarity])

            weight_similarity += similarity * \
                seen_movie_ratings[seen_movie_ratings["userId"] == user_id]["rating"].values[0]
            total_similarity += similarity
        # Avoid a division by zero when no similar rated movie was found.
        if total_similarity == 0.0:
            return 0.0
        result = weight_similarity / total_similarity
        if movie_id not in self.movie_average:
            return result
        return result + self.movie_average[movie_id]

    def content_based_predict(self, movie_ids, seen_movie_ids, n=10):
        """
        Find movies similar (by genre) to the given movies.
        :param movie_ids: seed movies the user likes
        :param seen_movie_ids: movies the user has already seen (excluded from the result)
        :param n: number of recommendations
        :return: a Series of recommended movieIds
        """
        tfidf = TfidfVectorizer(stop_words="english")
        tfidf_matrix = tfidf.fit_transform(self.movie_data["genres"])
        # TF-IDF rows are L2-normalized, so the linear kernel equals cosine similarity.
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

        # Map movieId -> row position in movie_data for cross-lookup.
        indices = pd.Series(self.movie_data.index, index=self.movie_data["movieId"]).drop_duplicates()

        seen_movie_ids = set(seen_movie_ids)
        similarities = {}
        for movie_id in movie_ids:
            idx = indices[movie_id]
            similarity = list(enumerate(cosine_sim[idx]))
            similarity = sorted(similarity, key=lambda x: x[1], reverse=True)
            # Drop the top hit (the seed movie itself, in the typical case) and
            # keep the next n / 2 candidates.
            similarity = similarity[1: 1 + int(n / 2)]
            for s in similarity:
                # s[0] is a row position in movie_data; compare by movieId when
                # filtering out movies the user has already seen.
                candidate_id = self.movie_data["movieId"].iloc[s[0]]
                if s[0] in similarities or candidate_id in seen_movie_ids:
                    continue
                similarities[s[0]] = s[1]
        # Keep the n most similar unseen candidates.
        recommend_ids = sorted(similarities.keys(), key=lambda x: similarities[x], reverse=True)
        movie_indices = recommend_ids[:n]
        return self.movie_data["movieId"].iloc[movie_indices]

    def predict(self, user_ids, n=10, least_rating=3.0):
        """
        Hybrid recommendation: content-based filtering first narrows the candidates,
        then collaborative filtering ranks them.
        :param user_ids: users to generate recommendations for
        :param n: number of candidates kept by the content-based step
        :param least_rating: minimum acceptable predicted rating
        :return:
        """
        users_result = {}
        for user_id in tqdm(user_ids):
            # Candidate movies returned by the content-based step.
            result = []
            seen_movie_ids = self.rating_data[self.rating_data["userId"] == user_id]["movieId"].drop_duplicates()
            # The user's n highest-rated movies serve as seeds.
            love_movie_ids = self.rating_data[self.rating_data["userId"] == user_id] \
                .sort_values(by=["rating"], ascending=False).head(n)["movieId"].drop_duplicates()
            recommend_ids = self.content_based_predict(love_movie_ids, seen_movie_ids, n)
            for recommend_id in recommend_ids:
                result.append(recommend_id)
            # Apply both user-based and item-based collaborative filtering; candidates
            # whose predicted ratings both fall below least_rating are dropped.
            recommend_scores = []
            for recommend_id in recommend_ids:
                user_based_score = self.user_based_one(user_id, recommend_id)
                movie_based_score = self.movie_based_one(user_id, recommend_id)
                if user_based_score < least_rating and movie_based_score < least_rating:
                    continue
                recommend_scores.append([recommend_id, user_based_score, movie_based_score])
            # Rank by the sum of the two predicted scores, highest first.
            recommend_scores.sort(key=lambda x: x[1] + x[2], reverse=True)
            # Recommend the top 2 candidates (or all of them if fewer remain).
            users_result[user_id] = [score[0] for score in recommend_scores[:2]]

        # Write the recommendations to a timestamped CSV file.
        os.makedirs("result", exist_ok=True)
        with open("result/movie" + str(time.time()) + ".csv", "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["userId", "movieId"])
            for user_id in users_result.keys():
                for recommend_id in users_result[user_id]:
                    writer.writerow([user_id, recommend_id])

    def evaluate(self, user_ids, edges):
        """
        Evaluate the collaborative filtering on a few sample users: a prediction
        counts as correct if it falls within a given tolerance of the real rating.
        :param user_ids: users to evaluate on
        :param edges: tolerance thresholds
        :return:
        """
        valid = {}
        for edge in edges:
            valid[edge] = 0
        total = 0
        for user_id in user_ids:
            seen_movies = self.rating_data[self.rating_data["userId"] == user_id]
            seen_movie_ids = seen_movies["movieId"]
            total += len(seen_movie_ids)
            for seen_movie_id in tqdm(seen_movie_ids):
                real_score = seen_movies[seen_movies["movieId"] == seen_movie_id]["rating"].values[0]
                user_based_score = self.user_based_one(user_id, seen_movie_id, False)
                movie_based_score = self.movie_based_one(user_id, seen_movie_id, False)
                for edge in edges:
                    if (abs(real_score - user_based_score) <= edge or
                            abs(real_score - movie_based_score) <= edge):
                        valid[edge] += 1
        for edge in edges:
            print("Accuracy with a tolerance of ±" + str(edge) + ": " + str(valid[edge] / total * 100) + "%")


if __name__ == '__main__':
    myRecommend = MyRecommend("./ml-latest-small/movies.csv",
                              "./ml-latest-small/ratings.csv",
                              "./ml-latest-small/tags.csv")
    myRecommend.init_similarity()
    print(myRecommend.user_based_one(1, 1, False))
    # ml-latest-small has 610 users with contiguous IDs 1..610.
    myRecommend.predict(range(1, 611))
    myRecommend.evaluate(range(1, 5), [0.25, 0.5, 0.75, 1])
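
The code assumes the ml-latest-small release of the MovieLens dataset, with the three CSV files placed under ./ml-latest-small/. A quick sanity check of the expected layout (column names as shipped by GroupLens):

import pandas as pd

movies = pd.read_csv("./ml-latest-small/movies.csv")    # movieId, title, genres
ratings = pd.read_csv("./ml-latest-small/ratings.csv")  # userId, movieId, rating, timestamp
tags = pd.read_csv("./ml-latest-small/tags.csv")        # userId, movieId, tag, timestamp

print(movies.columns.tolist(), ratings.columns.tolist(), tags.columns.tolist())
print(ratings["userId"].nunique(), "users,", ratings["movieId"].nunique(), "rated movies")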

