"""Hybrid product recommender.

Combines three signals into a single ranking per user:

* collaborative filtering -- truncated SVD of the user x product rating matrix,
* content similarity -- TF-IDF cosine similarity between product descriptions,
* purchase history -- a per-order ``time_weight`` column.

``recommend`` is the async entry point; it loads the data, builds the model
and wraps the result in the project's response objects.
"""

import os
import random
import sys

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# The project-local imports below are resolved relative to the directory two
# levels above this file, so it has to be on sys.path before they run.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))

import response.ResponseRecommender as res_rcm
from function.recommender import get_data


class HybridRecommender:
    """Rank products for a user from ratings, content similarity and history.

    Args:
        cosine_sim: Square product x product similarity matrix. Row/column
            ``i`` is assumed to correspond to the ``i``-th distinct
            ``product_id`` of ``products`` (verify against the caller if
            ``products`` can contain duplicate ids).
        products: DataFrame with at least a ``product_id`` column.
        reviews: DataFrame with ``user_id``, ``product_id`` and ``rating``.
        favourite_items: DataFrame with ``user_id`` and ``product_id``.
        order_history: DataFrame with ``user_id`` and ``product_id``; a
            ``time_weight`` column is used when present.
        new_item_cart: DataFrame with ``user_id`` and ``product_id``.
        num_factors: Requested number of latent factors for the SVD.
        regularization: Stored on the instance; not used by the code in
            this module.
    """

    # Default blend of the three signals used by ``hybrid_recommend``.
    DEFAULT_WEIGHTS = {'collab': 0.2, 'content': 0.1, 'history': 0.7}

    def __init__(self, cosine_sim, products, reviews, favourite_items,
                 order_history, new_item_cart, num_factors=3,
                 regularization=0.02):
        self.cosine_sim = cosine_sim
        self.products = products
        self.reviews = reviews
        self.favourite_items = favourite_items
        self.order_history = order_history
        self.new_item_cart = new_item_cart
        self.num_factors = num_factors
        self.regularization = regularization

        self.user_ids = reviews["user_id"].unique()
        self.product_ids = products["product_id"].unique()
        self.user_to_index = {uid: i for i, uid in enumerate(self.user_ids)}
        self.product_to_index = {pid: i for i, pid in enumerate(self.product_ids)}
        self.index_to_product = {i: pid for pid, i in self.product_to_index.items()}

        self.ratings_matrix = self._create_ratings_matrix()
        self.global_mean = self.reviews['rating'].mean()
        # Filled in by train_svd(); hybrid_recommend() trains lazily if needed.
        self.predicted_ratings = None

    def _create_ratings_matrix(self):
        """Return the dense user x product rating matrix (0 = no rating).

        Reviews that reference a product missing from the catalogue are
        skipped instead of raising ``KeyError``. When a user rated the same
        product more than once, the last review wins.
        """
        matrix = np.zeros((len(self.user_ids), len(self.product_ids)))
        rows = zip(
            self.reviews["user_id"],
            self.reviews["product_id"],
            self.reviews["rating"],
        )
        for user_id, product_id, rating in rows:
            u_idx = self.user_to_index.get(user_id)
            p_idx = self.product_to_index.get(product_id)
            if u_idx is None or p_idx is None:
                continue
            matrix[u_idx, p_idx] = rating
        return matrix

    def train_svd(self):
        """Compute ``self.predicted_ratings`` from a truncated SVD.

        Missing ratings (zeros) are replaced by the global mean rating, the
        matrix is approximated with at most ``num_factors`` singular values
        and the result is rescaled globally to the range 1..5.
        """
        filled = np.where(
            self.ratings_matrix == 0, self.global_mean, self.ratings_matrix
        )
        if filled.size == 0:
            # Nothing to factorize or rescale.
            self.predicted_ratings = filled
            return

        # svds only accepts 1 <= k < min(shape); clamp so that small data
        # sets do not raise.
        k = min(self.num_factors, min(filled.shape) - 1)
        if k >= 1:
            U, sigma, Vt = svds(filled, k=k)
            approx = U @ np.diag(sigma) @ Vt
        else:
            # A single user or a single product: no factorization possible,
            # fall back to the mean-filled ratings themselves.
            approx = filled

        # Flatten first so one scale is applied to the whole matrix rather
        # than one per column.
        scaler = MinMaxScaler(feature_range=(1, 5))
        self.predicted_ratings = scaler.fit_transform(
            approx.reshape(-1, 1)
        ).reshape(self.ratings_matrix.shape)

    def get_content_score(self, user_id, top_n=3):
        """Score products by similarity to items the user interacted with.

        For every product in the user's favourites, past orders or cart, the
        ``top_n`` most similar products (by ``cosine_sim``) each receive
        their similarity score; scores accumulate across seed items.

        Returns:
            dict mapping ``product_id`` to accumulated similarity.
        """
        def products_of(frame):
            return set(frame[frame['user_id'] == user_id]['product_id'])

        seed_items = (
            products_of(self.favourite_items)
            | products_of(self.order_history)
            | products_of(self.new_item_cart)
        )

        content_scores = {}
        for item in seed_items:
            idx = self.product_to_index.get(item)
            if idx is None:
                continue
            neighbours = sorted(
                enumerate(self.cosine_sim[idx]),
                key=lambda pair: pair[1],
                reverse=True,
            )[:top_n]
            for sim_idx, score in neighbours:
                prod_id = self.index_to_product[sim_idx]
                content_scores[prod_id] = content_scores.get(prod_id, 0) + score
        return content_scores

    def _popular_fallback(self, exclude, count):
        """Return up to ``count`` ``(product_id, mean_rating)`` pairs.

        Products are ordered by mean review rating, best first. Products in
        ``exclude`` and products that are not part of the catalogue are
        left out.
        """
        if count <= 0:
            return []
        popular = (
            self.reviews.groupby('product_id')['rating']
            .mean()
            .sort_values(ascending=False)
        )
        keep = (
            popular.index.isin(self.product_ids)
            & ~popular.index.isin(list(exclude))
        )
        return list(popular.loc[keep].head(count).items())

    def hybrid_recommend(self, user_id, top_n=10, weights=None, randomness=0.1):
        """Return up to ``top_n`` ``(product_id, score)`` recommendations.

        Args:
            user_id: User to recommend for.
            top_n: Maximum number of recommendations.
            weights: Optional dict with any of the keys ``collab``,
                ``content`` and ``history``; missing keys fall back to
                ``DEFAULT_WEIGHTS``.
            randomness: Each positive score is perturbed by a uniformly
                drawn relative noise in ``[-randomness, randomness]``.

        Users without any review get content-based recommendations only,
        topped up with the best-rated products. For known users, products
        they already ordered are never recommended.
        """
        weights = {**self.DEFAULT_WEIGHTS, **(weights or {})}

        if user_id not in self.user_to_index:
            print(f"User {user_id} chưa có đánh giá nào. Kiểm tra dữ liệu khác...")
            content_scores = self.get_content_score(user_id)
            recommendations = sorted(
                content_scores.items(), key=lambda pair: pair[1], reverse=True
            )
            # Not enough content-based hits: top up with popular products.
            if len(recommendations) < top_n:
                recommendations.extend(
                    self._popular_fallback(
                        content_scores, top_n - len(recommendations)
                    )
                )
            return recommendations[:top_n]

        if self.predicted_ratings is None:
            self.train_svd()

        user_idx = self.user_to_index[user_id]
        collab_scores = {
            self.index_to_product[i]: score
            for i, score in enumerate(self.predicted_ratings[user_idx])
        }
        content_scores = self.get_content_score(user_id)

        user_history = self.order_history[self.order_history['user_id'] == user_id]
        purchased = set(user_history['product_id'])
        if 'time_weight' in user_history.columns:
            # Later rows overwrite earlier ones for the same product.
            history_scores = dict(
                zip(user_history['product_id'], user_history['time_weight'])
            )
        else:
            history_scores = {}

        final_scores = {}
        for prod_id in self.product_ids:
            final_score = (
                weights['collab'] * collab_scores.get(prod_id, 0)
                + weights['content'] * content_scores.get(prod_id, 0)
                + weights['history'] * history_scores.get(prod_id, 0)
            )
            if final_score > 0:
                noise = random.uniform(-randomness, randomness) * final_score
                final_scores[prod_id] = final_score + noise

        # NOTE(review): history scores only exist for purchased products,
        # which are removed right here, so the 'history' weight currently
        # cannot influence the final ranking -- confirm this is intended.
        final_scores = {
            pid: score for pid, score in final_scores.items()
            if pid not in purchased
        }

        top_candidates = sorted(
            final_scores.items(), key=lambda pair: pair[1], reverse=True
        )
        if len(top_candidates) < top_n:
            # Exclude purchased products too, otherwise the fallback would
            # re-introduce exactly what was filtered out above.
            top_candidates.extend(
                self._popular_fallback(
                    set(final_scores) | purchased, top_n - len(top_candidates)
                )
            )

        return sorted(
            top_candidates[:top_n], key=lambda pair: pair[1], reverse=True
        )


async def recommend(user_id: int, number: int):
    """Build the recommender from fresh data and recommend for one user.

    Args:
        user_id: User to recommend for.
        number: Maximum number of recommended products.

    Returns:
        ``res_rcm.ListItemRecommend`` holding one ``res_rcm.ItemRecommend``
        (product id and name) per recommendation.
    """
    # The first five parts are read positionally as favourites, reviews,
    # products, order history and cart items.
    data = list(await get_data.get_data_recommend())
    favourite_items, reviews, products, order_history, new_item_cart = (
        pd.DataFrame(part) for part in data[:5]
    )

    products['description'] = (
        products['name'] + ' ' + products['category']
    ).str.lower()

    # Add a time weight to the purchase history.
    # NOTE(review): the timestamps are synthesized (one per day starting
    # 2024-10-01, in row order) rather than read from the data, and the
    # weight becomes negative for rows older than a year -- confirm.
    order_history['timestamp'] = pd.date_range(
        start='2024-10-01', periods=len(order_history), freq='D'
    )
    order_history['time_weight'] = (
        1 - (pd.Timestamp.now() - order_history['timestamp']).dt.days / 365
    )

    # Content-based filtering on the product descriptions.
    tfidf = TfidfVectorizer(
        stop_words='english', ngram_range=(1, 2), max_features=1000
    )
    tfidf_matrix = tfidf.fit_transform(products['description'].fillna(''))
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    recommender = HybridRecommender(
        cosine_sim, products, reviews, favourite_items, order_history,
        new_item_cart,
    )
    recommender.train_svd()
    recommendations = recommender.hybrid_recommend(
        user_id, top_n=number, randomness=0.1
    )

    # First name per product id, matching the previous ``.iloc[0]`` lookup.
    name_by_id = (
        products.drop_duplicates(subset='product_id')
        .set_index('product_id')['name']
    )
    items = [
        res_rcm.ItemRecommend(
            pro_id=product_id,
            product_name=name_by_id.loc[product_id],
        )
        for product_id, _ in recommendations
    ]

    return res_rcm.ListItemRecommend(
        user_id=user_id, total=len(items), list_item=items
    )


if __name__ == "__main__":
    import asyncio

    print(asyncio.run(recommend(4, 10)))