# Residue from the file-hosting page (not part of the program):
#   kltn21110's picture
#   Upload 239 files
#   325b400 verified
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import random
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
import response.ResponseRecommender as res_rcm
from function.recommender import get_data
class HybridRecommender:
    """Recommend products by blending three per-user signals.

    * collaborative: ratings reconstructed from a truncated SVD of the
      user x product rating matrix (see ``train_svd``);
    * content: similarity of catalogue items to the products the user
      favourited, ordered or put in the cart (see ``get_content_score``);
    * history: the ``time_weight`` column of the user's past orders.

    Args:
        cosine_sim: Square item-item similarity matrix. Row/column ``i`` is
            looked up through the ``i``-th distinct ``product_id`` of
            ``products``.
        products: DataFrame with at least a ``product_id`` column.
        reviews: DataFrame with ``user_id``, ``product_id`` and ``rating``.
        favourite_items: DataFrame with ``user_id`` and ``product_id``.
        order_history: DataFrame with ``user_id`` and ``product_id``; an
            optional ``time_weight`` column feeds the history signal.
        new_item_cart: DataFrame with ``user_id`` and ``product_id``.
        num_factors: Requested number of singular values for the SVD.
        regularization: Stored on the instance; not used by any method here.

    The three interaction frames may be empty frames without columns; they are
    then treated as "no interactions".
    """

    # Blend used by ``hybrid_recommend`` when the caller passes no weights.
    # Kept at class level (and never mutated) instead of a mutable default
    # argument.
    DEFAULT_WEIGHTS = {'collab': 0.2, 'content': 0.1, 'history': 0.7}

    def __init__(self, cosine_sim, products, reviews, favourite_items, order_history, new_item_cart,
                 num_factors=3, regularization=0.02):
        self.cosine_sim = cosine_sim
        self.products = products
        self.reviews = reviews
        self.favourite_items = favourite_items
        self.order_history = order_history
        self.new_item_cart = new_item_cart
        self.num_factors = num_factors
        self.regularization = regularization

        # Users are indexed from the reviews, products from the catalogue.
        self.user_ids = reviews["user_id"].unique()
        self.product_ids = products["product_id"].unique()
        self.user_to_index = {uid: i for i, uid in enumerate(self.user_ids)}
        self.product_to_index = {pid: i for i, pid in enumerate(self.product_ids)}
        self.index_to_product = {i: pid for pid, i in self.product_to_index.items()}

        self.ratings_matrix = self._create_ratings_matrix()
        self.global_mean = self.reviews['rating'].mean()
        # Filled by ``train_svd``; ``hybrid_recommend`` trains lazily if needed.
        self.predicted_ratings = None

    @staticmethod
    def _user_rows(frame, user_id):
        """Return the rows of *frame* that belong to *user_id*.

        A frame without ``user_id``/``product_id`` columns (for example
        ``pd.DataFrame([])``) yields an empty frame instead of a ``KeyError``.
        """
        columns = getattr(frame, 'columns', ())
        if 'user_id' not in columns or 'product_id' not in columns:
            return pd.DataFrame({'user_id': [], 'product_id': []})
        return frame[frame['user_id'] == user_id]

    def _popular_products(self, excluded, count):
        """Return up to *count* ``(product_id, mean_rating)`` pairs, best first.

        Only catalogue products are eligible, and everything in *excluded* is
        skipped, so the fallback never returns an unknown or unwanted id.
        """
        if count <= 0:
            return []
        mean_ratings = (
            self.reviews.groupby('product_id')['rating'].mean().sort_values(ascending=False)
        )
        in_catalogue = mean_ratings.index.isin(self.product_ids)
        is_excluded = mean_ratings.index.isin(list(excluded))
        eligible = mean_ratings[in_catalogue & ~is_excluded]
        return list(eligible.head(count).items())

    def _create_ratings_matrix(self):
        """Build the dense user x product matrix; 0 marks "no rating".

        Reviews whose user or product has no index (e.g. a review for a
        product that is not in the catalogue) are skipped rather than raising.
        When a (user, product) pair occurs more than once, a later review
        overwrites an earlier one.
        """
        matrix = np.zeros((len(self.user_ids), len(self.product_ids)))
        user_idx = self.reviews["user_id"].map(self.user_to_index)
        product_idx = self.reviews["product_id"].map(self.product_to_index)
        known = user_idx.notna() & product_idx.notna()
        rows = user_idx[known].astype(int).to_numpy()
        cols = product_idx[known].astype(int).to_numpy()
        matrix[rows, cols] = self.reviews.loc[known, "rating"].to_numpy()
        return matrix

    def train_svd(self):
        """Compute ``self.predicted_ratings`` from a truncated SVD.

        Missing ratings (zeros) are replaced by the global mean rating, the
        matrix is approximated with at most ``num_factors`` singular values,
        and the result is min-max scaled into the range 1..5 over all cells.
        """
        if self.ratings_matrix.size == 0:
            # No users or no products: nothing to factorise or scale.
            self.predicted_ratings = self.ratings_matrix.copy()
            return

        ratings_filled = np.where(self.ratings_matrix == 0, self.global_mean, self.ratings_matrix)

        # svds (default solver) needs 1 <= k < min(shape); clamp the request
        # so small data sets do not raise.
        rank = min(self.num_factors, min(ratings_filled.shape) - 1)
        if rank >= 1:
            U, sigma, Vt = svds(ratings_filled, k=rank)
            approximation = np.dot(np.dot(U, np.diag(sigma)), Vt)
        else:
            # Too small to factorise: fall back to the mean-filled ratings.
            approximation = ratings_filled

        scaler = MinMaxScaler(feature_range=(1, 5))
        # Scale over the whole matrix at once (one column), then restore shape.
        self.predicted_ratings = scaler.fit_transform(
            approximation.reshape(-1, 1)
        ).reshape(self.ratings_matrix.shape)

    def get_content_score(self, user_id, top_n=3):
        """Score products by similarity to the user's interacted items.

        For every product the user favourited, ordered or added to the cart,
        the ``top_n`` most similar catalogue items (the item itself included,
        because it is present in its own similarity row) receive that
        similarity; scores accumulate across seed items.

        Returns:
            dict mapping ``product_id`` to the accumulated similarity.
        """
        relevant_items = set()
        for frame in (self.favourite_items, self.order_history, self.new_item_cart):
            relevant_items.update(self._user_rows(frame, user_id)['product_id'])

        content_scores = {}
        for item in relevant_items:
            idx = self.product_to_index.get(item)
            if idx is None:
                # Interaction with a product that is not in the catalogue.
                continue
            ranked = sorted(enumerate(self.cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
            for sim_idx, score in ranked[:top_n]:
                prod_id = self.index_to_product[sim_idx]
                content_scores[prod_id] = content_scores.get(prod_id, 0) + score
        return content_scores

    def hybrid_recommend(self, user_id, top_n=10, weights=None, randomness=0.1):
        """Return up to ``top_n`` ``(product_id, score)`` recommendations.

        Args:
            user_id: User to recommend for.
            top_n: Maximum number of recommendations.
            weights: Mapping with ``'collab'``, ``'content'`` and ``'history'``
                factors; ``None`` selects ``DEFAULT_WEIGHTS``.
            randomness: Each positive blended score is perturbed by a uniform
                relative noise in ``[-randomness, +randomness]``.

        Users without any review take a cold-start path (content scores, then
        popular products). In both paths products the user already ordered are
        excluded, and popularity fill-ins are appended after the personalised
        candidates; their score is a mean rating, not a blended score.
        """
        if weights is None:
            weights = self.DEFAULT_WEIGHTS

        user_history = self._user_rows(self.order_history, user_id)
        purchased = set(user_history['product_id'])

        if user_id not in self.user_to_index:
            print(f"User {user_id} chưa có đánh giá nào. Kiểm tra dữ liệu khác...")
            content_scores = self.get_content_score(user_id)
            recommendations = sorted(
                ((pid, score) for pid, score in content_scores.items() if pid not in purchased),
                key=lambda pair: pair[1],
                reverse=True,
            )
            # If there are not enough products yet, top up from cold-start
            # (popular products).
            recommendations.extend(
                self._popular_products(set(content_scores) | purchased, top_n - len(recommendations))
            )
            return recommendations[:top_n]

        if self.predicted_ratings is None:
            # Callers that skipped train_svd() still get collaborative scores.
            self.train_svd()

        user_idx = self.user_to_index[user_id]
        collab_scores = {
            self.index_to_product[i]: score
            for i, score in enumerate(self.predicted_ratings[user_idx])
        }
        content_scores = self.get_content_score(user_id)

        # NOTE(review): history scores only attach to products the user already
        # ordered, and those are removed from the result below, so this signal
        # currently never changes the returned list - confirm the intent.
        if 'time_weight' in user_history.columns:
            history_scores = dict(zip(user_history['product_id'], user_history['time_weight']))
        else:
            history_scores = {}

        final_scores = {}
        for prod_id in self.product_ids:
            final_score = 0
            if prod_id in collab_scores:
                final_score += weights['collab'] * collab_scores[prod_id]
            if prod_id in content_scores:
                final_score += weights['content'] * content_scores[prod_id]
            if prod_id in history_scores:
                final_score += weights['history'] * history_scores[prod_id]
            if final_score > 0:
                noise = random.uniform(-randomness, randomness) * final_score
                final_scores[prod_id] = final_score + noise

        final_scores = {k: v for k, v in final_scores.items() if k not in purchased}
        top_candidates = sorted(final_scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]
        top_candidates.extend(
            self._popular_products(set(final_scores) | purchased, top_n - len(top_candidates))
        )
        return top_candidates
async def recommend(user_id: int, number: int):
    """Fetch the recommendation data and return ``number`` items for ``user_id``.

    The payload from ``get_data.get_data_recommend()`` is read positionally:
    0 = favourites, 1 = reviews, 2 = products, 3 = order history, 4 = cart.

    NOTE(review): a second ``recommend`` is defined later in this module and
    rebinds the name, so this version is not the one callers get. Unlike the
    later definition, this one does not add a ``time_weight`` column to the
    order history before calling ``hybrid_recommend``.
    """
    data = await get_data.get_data_recommend()

    def frame_at(position):
        # Same access pattern as before: the payload is re-listed per lookup.
        return pd.DataFrame(list(data)[position])

    products = frame_at(2)
    reviews = frame_at(1)
    favourite_items = frame_at(0)
    order_history = frame_at(3)
    new_item_cart = frame_at(4)

    # Text used for content similarity: lower-cased "name category".
    combined_text = products['name'] + ' ' + products['category']
    products['description'] = combined_text.str.lower()

    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
    document_vectors = vectorizer.fit_transform(products['description'].fillna(''))
    similarity = cosine_similarity(document_vectors, document_vectors)

    engine = HybridRecommender(
        similarity, products, reviews, favourite_items, order_history, new_item_cart
    )
    engine.train_svd()
    ranked = engine.hybrid_recommend(user_id, top_n=number)

    items = []
    for pid, _score in ranked:
        # First catalogue row with this id supplies the display name.
        matching_names = products.loc[products['product_id'] == pid, 'name']
        items.append(res_rcm.ItemRecommend(pro_id=pid, product_name=matching_names.iloc[0]))

    return res_rcm.ListItemRecommend(user_id=user_id, total=len(items), list_item=items)
async def recommend(user_id: int, number: int):
    """Fetch the recommendation data and return ``number`` items for ``user_id``.

    The payload from ``get_data.get_data_recommend()`` is read positionally:
    0 = favourites, 1 = reviews, 2 = products, 3 = order history, 4 = cart.

    Returns:
        ``res_rcm.ListItemRecommend`` whose ``list_item`` holds one
        ``res_rcm.ItemRecommend`` per recommended product that exists in the
        product table; ``total`` is the length of that list.
    """
    data = await get_data.get_data_recommend()
    # Materialise once: repeated list(data) calls would fail on a one-shot
    # iterable and redo the copy five times otherwise.
    tables = list(data)
    products = pd.DataFrame(tables[2])
    reviews = pd.DataFrame(tables[1])
    favourite_items = pd.DataFrame(tables[0])
    order_history = pd.DataFrame(tables[3])
    new_item_cart = pd.DataFrame(tables[4])

    # Fill missing values before concatenating: NaN + str is NaN, which would
    # otherwise wipe the product name from the description as well.
    name_text = products['name'].fillna('').astype(str)
    category_text = products['category'].fillna('').astype(str)
    products['description'] = (name_text + ' ' + category_text).str.lower()

    # Add a time weight to the purchase history.
    # NOTE(review): the timestamps are synthetic (one per row, daily from
    # 2024-10-01), not taken from the order data; the weight drops below 0 for
    # rows older than a year - confirm whether a real order date should be used.
    order_history['timestamp'] = pd.date_range(start='2024-10-01', periods=len(order_history), freq='D')
    order_history['time_weight'] = 1 - (pd.Timestamp.now() - order_history['timestamp']).dt.days / 365

    # Improved content-based filtering.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
    tfidf_matrix = tfidf.fit_transform(products['description'].fillna(''))
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    recommender = HybridRecommender(
        cosine_sim, products, reviews, favourite_items, order_history, new_item_cart
    )
    recommender.train_svd()
    recommendations = recommender.hybrid_recommend(user_id, top_n=number, randomness=0.1)

    # One lookup table instead of a full scan per recommendation; the first
    # row wins for a duplicated product_id, matching the previous .iloc[0].
    name_by_id = (
        products.drop_duplicates(subset='product_id')
        .set_index('product_id')['name']
        .to_dict()
    )
    items = [
        res_rcm.ItemRecommend(pro_id=product_id, product_name=name_by_id[product_id])
        for product_id, _ in recommendations
        # Skip ids that are not in the product table instead of raising.
        if product_id in name_by_id
    ]
    return res_rcm.ListItemRecommend(user_id=user_id, total=len(items), list_item=items)
if __name__ == "__main__":
    # Manual smoke run: print 10 recommendations for user 4.
    import asyncio

    result = asyncio.run(recommend(4, 10))
    print(result)