Spaces:

kltn21110
/

chatbot_server

Sleeping

App Files Files Community

chatbot_server / function /recommender /recommend.py

kltn21110

Upload 239 files

325b400 verified 6 months ago

raw

history blame contribute delete

9.1 kB

	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from scipy.sparse.linalg import svds
	from sklearn.preprocessing import MinMaxScaler
	import random
	import sys
	import os
	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
	import response.ResponseRecommender as res_rcm
	from function.recommender import get_data


	class HybridRecommender:
	    """Recommend products by blending SVD-based and content-based scores.

	    Collaborative scores come from a truncated SVD of the user x product
	    rating matrix (``train_svd``). Content scores come from a precomputed
	    product-to-product similarity matrix, seeded with the products a user
	    has favourited, ordered or put in the cart (``get_content_score``).
	    ``hybrid_recommend`` combines both and falls back to the best-rated
	    products when there are not enough candidates.
	    """

	    def __init__(self, cosine_sim, products, reviews, favourite_items, order_history, new_item_cart,
	                 num_factors=3, regularization=0.02):
	        """Store the input tables and build the id/index maps and rating matrix.

	        Args:
	            cosine_sim: square product-to-product similarity matrix. Rows are
	                addressed through ``product_to_index``, so this presumably
	                needs one row per unique ``product_id`` of ``products`` in
	                first-appearance order -- TODO confirm against callers.
	            products: DataFrame with a ``product_id`` column.
	            reviews: DataFrame with ``user_id``, ``product_id`` and ``rating``
	                columns. A frame lacking these columns is treated as
	                "no reviews".
	            favourite_items: DataFrame with ``user_id``/``product_id`` columns
	                (may be empty).
	            order_history: DataFrame with ``user_id``/``product_id`` columns
	                (may be empty).
	            new_item_cart: DataFrame with ``user_id``/``product_id`` columns
	                (may be empty).
	            num_factors: upper bound on the number of singular values kept by
	                ``train_svd``.
	            regularization: stored on the instance; no method of this class
	                reads it.
	        """
	        self.cosine_sim = cosine_sim
	        self.products = products
	        self.reviews = reviews
	        self.favourite_items = favourite_items
	        self.order_history = order_history
	        self.new_item_cart = new_item_cart
	        self.num_factors = num_factors
	        self.regularization = regularization

	        has_reviews = self._has_columns(reviews, "user_id", "product_id", "rating")
	        self.user_ids = reviews["user_id"].unique() if has_reviews else np.array([])
	        self.product_ids = products["product_id"].unique()
	        self.user_to_index = {uid: i for i, uid in enumerate(self.user_ids)}
	        self.product_to_index = {pid: i for i, pid in enumerate(self.product_ids)}
	        self.index_to_product = {i: pid for pid, i in self.product_to_index.items()}

	        self.ratings_matrix = self._create_ratings_matrix()
	        self.global_mean = reviews["rating"].mean() if has_reviews else float("nan")

	    @staticmethod
	    def _has_columns(frame, *columns):
	        """Return True when ``frame`` exposes every name in ``columns``."""
	        available = getattr(frame, "columns", ())
	        return all(column in available for column in columns)

	    @classmethod
	    def _user_product_ids(cls, frame, user_id):
	        """Return the set of product ids that ``frame`` lists for ``user_id``.

	        A frame without ``user_id``/``product_id`` columns (for example
	        ``pd.DataFrame([])`` built from an empty result set) yields an empty
	        set instead of raising ``KeyError``.
	        """
	        if not cls._has_columns(frame, "user_id", "product_id"):
	            return set()
	        return set(frame.loc[frame["user_id"] == user_id, "product_id"])

	    def _create_ratings_matrix(self):
	        """Build the dense user x product matrix of ratings (0 = no rating).

	        Reviews whose user or product is not in the index maps are skipped
	        rather than raising ``KeyError``. When the same (user, product) pair
	        is reviewed more than once, the last row wins.
	        """
	        matrix = np.zeros((len(self.user_ids), len(self.product_ids)))
	        if not self._has_columns(self.reviews, "user_id", "product_id", "rating"):
	            return matrix
	        triples = zip(self.reviews["user_id"], self.reviews["product_id"], self.reviews["rating"])
	        for user_id, product_id, rating in triples:
	            u_idx = self.user_to_index.get(user_id)
	            p_idx = self.product_to_index.get(product_id)
	            if u_idx is None or p_idx is None:
	                continue
	            matrix[u_idx, p_idx] = rating
	        return matrix

	    def train_svd(self):
	        """Fill ``self.predicted_ratings`` with a rank-limited reconstruction.

	        Cells equal to 0 are treated as "not rated" and replaced by the global
	        mean rating before factorisation. The reconstruction is then min-max
	        rescaled over the whole matrix into the range [1, 5].
	        """
	        n_users, n_products = self.ratings_matrix.shape
	        if n_users == 0 or n_products == 0:
	            # Nothing to factorise; keep the attribute defined for callers.
	            self.predicted_ratings = np.zeros((n_users, n_products))
	            return

	        ratings_filled = np.where(self.ratings_matrix == 0, self.global_mean, self.ratings_matrix)

	        # svds only accepts 1 <= k <= min(shape) - 1, so clamp the requested rank.
	        rank = min(self.num_factors, min(n_users, n_products) - 1)
	        if rank >= 1:
	            U, sigma, Vt = svds(ratings_filled, k=rank)
	            approximation = (U * sigma) @ Vt
	        else:
	            # A single row or column cannot be truncated; use it unchanged.
	            approximation = ratings_filled

	        lowest = approximation.min()
	        span = approximation.max() - lowest
	        if span > 0:
	            self.predicted_ratings = 1.0 + 4.0 * (approximation - lowest) / span
	        else:
	            # Constant matrix: every cell maps to the lower bound of the range.
	            self.predicted_ratings = np.ones_like(approximation, dtype=float)

	    def get_content_score(self, user_id, top_n=3):
	        """Score products by similarity to the user's favourites, orders and cart.

	        For every seed product of the user, the ``top_n`` most similar products
	        (by ``cosine_sim`` row, ties kept in index order) receive their
	        similarity as score; scores from several seeds are summed.

	        Returns:
	            dict mapping product id to accumulated similarity score.
	        """
	        relevant_items = (
	            self._user_product_ids(self.favourite_items, user_id)
	            | self._user_product_ids(self.order_history, user_id)
	            | self._user_product_ids(self.new_item_cart, user_id)
	        )
	        content_scores = {}
	        for item in relevant_items:
	            idx = self.product_to_index.get(item)
	            if idx is None:
	                continue
	            similarities = np.asarray(self.cosine_sim[idx])
	            # Stable sort on the negated row == descending order, ties by index.
	            nearest = np.argsort(-similarities, kind="stable")[:top_n]
	            for sim_idx in nearest:
	                prod_id = self.index_to_product[int(sim_idx)]
	                content_scores[prod_id] = content_scores.get(prod_id, 0) + float(similarities[sim_idx])
	        return content_scores

	    def _popular_items(self, count, exclude):
	        """Return up to ``count`` (product_id, mean_rating) pairs, best first.

	        Only products present in ``product_to_index`` and not in ``exclude``
	        are returned.
	        """
	        if count <= 0 or not self._has_columns(self.reviews, "product_id", "rating"):
	            return []
	        mean_ratings = self.reviews.groupby("product_id")["rating"].mean().sort_values(ascending=False)
	        known = mean_ratings.index.isin(list(self.product_to_index))
	        excluded = mean_ratings.index.isin(list(exclude))
	        return list(mean_ratings[known & ~excluded].head(count).items())

	    def hybrid_recommend(self, user_id, top_n=10, weights=None, randomness=0.1):
	        """Return up to ``top_n`` ``(product_id, score)`` pairs for ``user_id``.

	        Args:
	            user_id: user to recommend for.
	            top_n: maximum number of recommendations.
	            weights: dict with ``'collab'`` and ``'content'`` factors; ``None``
	                selects ``{'collab': 0.2, 'content': 0.1, 'history': 0.7}``.
	            randomness: each score is perturbed by a uniform relative noise in
	                ``[-randomness, randomness]``.

	        Returns:
	            List of ``(product_id, score)`` tuples. Hybrid-scored products come
	            first in descending score order; popularity fallbacks (scored by
	            mean rating, a different scale) are appended after them.
	        """
	        if weights is None:
	            weights = {'collab': 0.2, 'content': 0.1, 'history': 0.7}

	        if user_id not in self.user_to_index:
	            # Cold start: the user has no reviews, so only content scores apply.
	            print(f"User {user_id} chưa có đánh giá nào. Kiểm tra dữ liệu khác...")
	            content_scores = self.get_content_score(user_id)
	            recommendations = sorted(content_scores.items(), key=lambda x: x[1], reverse=True)

	            # Top up with the best-rated products when there are too few.
	            if len(recommendations) < top_n:
	                recommendations.extend(
	                    self._popular_items(top_n - len(recommendations), exclude=content_scores)
	                )
	            return recommendations[:top_n]

	        if not hasattr(self, "predicted_ratings"):
	            # Train lazily so calling this before train_svd() does not fail.
	            self.train_svd()

	        user_idx = self.user_to_index[user_id]
	        collab_scores = {
	            self.index_to_product[i]: score
	            for i, score in enumerate(self.predicted_ratings[user_idx])
	        }
	        content_scores = self.get_content_score(user_id)
	        purchased = self._user_product_ids(self.order_history, user_id)

	        # NOTE(review): the previous implementation added
	        # weights['history'] * time_weight for purchased products and then
	        # dropped every purchased product, so that term never reached the
	        # output. Purchased products are skipped up front instead; decide
	        # whether order history should influence *other* products.
	        final_scores = {}
	        for prod_id in self.product_ids:
	            if prod_id in purchased:
	                continue
	            final_score = 0
	            if prod_id in collab_scores:
	                final_score += weights['collab'] * collab_scores[prod_id]
	            if prod_id in content_scores:
	                final_score += weights['content'] * content_scores[prod_id]
	            if final_score > 0:
	                noise = random.uniform(-randomness, randomness) * final_score
	                final_scores[prod_id] = final_score + noise

	        top_candidates = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)

	        if len(top_candidates) < top_n:
	            # Never offer something the user already bought as a fallback.
	            top_candidates.extend(
	                self._popular_items(top_n - len(top_candidates), exclude=set(final_scores) | purchased)
	            )

	        return top_candidates[:top_n]

	async def recommend(user_id: int, number: int):
	    """Fetch the recommendation tables and return ``number`` items for ``user_id``.

	    NOTE(review): a second ``recommend`` is defined later in this module and
	    rebinds the name, so this definition is shadowed once the module has
	    finished loading.

	    Args:
	        user_id: user to recommend for.
	        number: maximum number of products to return.

	    Returns:
	        ``res_rcm.ListItemRecommend`` holding one ``res_rcm.ItemRecommend``
	        per recommended product that exists in the product table.
	    """
	    # Materialise the payload once; positions follow the order used below.
	    data = list(await get_data.get_data_recommend())
	    favourite_items = pd.DataFrame(data[0])
	    reviews = pd.DataFrame(data[1])
	    products = pd.DataFrame(data[2])
	    order_history = pd.DataFrame(data[3])
	    new_item_cart = pd.DataFrame(data[4])

	    # Fill gaps per column first so a missing category does not blank the name.
	    name_text = products['name'].fillna('').astype(str)
	    category_text = products['category'].fillna('').astype(str)
	    products['description'] = (name_text + ' ' + category_text).str.lower()

	    # hybrid_recommend reads order_history['time_weight']; derive it the same
	    # way the later definition of recommend does.
	    # NOTE(review): the timestamps are synthetic (one per row, daily from
	    # 2024-10-01), not real order dates.
	    order_history['timestamp'] = pd.date_range(start='2024-10-01', periods=len(order_history), freq='D')
	    order_history['time_weight'] = 1 - (pd.Timestamp.now() - order_history['timestamp']).dt.days / 365

	    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
	    tfidf_matrix = tfidf.fit_transform(products['description'])
	    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

	    # NOTE(review): model fitting below runs synchronously inside this coroutine.
	    recommender = HybridRecommender(cosine_sim, products, reviews, favourite_items, order_history, new_item_cart)
	    recommender.train_svd()
	    recommendations = recommender.hybrid_recommend(user_id, top_n=number)

	    # One id -> name table (first row wins on duplicate ids) instead of
	    # scanning the product frame once per recommendation.
	    name_by_id = products.drop_duplicates('product_id').set_index('product_id')['name'].to_dict()
	    items = [
	        res_rcm.ItemRecommend(pro_id=pid, product_name=name_by_id[pid])
	        for pid, _ in recommendations
	        if pid in name_by_id  # skip ids missing from the product table
	    ]

	    return res_rcm.ListItemRecommend(user_id=user_id, total=len(items), list_item=items)






	async def recommend(user_id: int, number: int):
	    """Fetch the recommendation tables and return ``number`` items for ``user_id``.

	    The data returned by ``get_data.get_data_recommend()`` is read by
	    position: favourites, reviews, products, order history, cart.

	    Args:
	        user_id: user to recommend for.
	        number: maximum number of products to return.

	    Returns:
	        ``res_rcm.ListItemRecommend`` holding one ``res_rcm.ItemRecommend``
	        per recommended product that exists in the product table.
	    """
	    # Materialise the payload once instead of rebuilding the list per table.
	    data = list(await get_data.get_data_recommend())
	    favourite_items = pd.DataFrame(data[0])
	    reviews = pd.DataFrame(data[1])
	    products = pd.DataFrame(data[2])
	    order_history = pd.DataFrame(data[3])
	    new_item_cart = pd.DataFrame(data[4])

	    # Fill gaps per column first so a missing category does not blank the name.
	    name_text = products['name'].fillna('').astype(str)
	    category_text = products['category'].fillna('').astype(str)
	    products['description'] = (name_text + ' ' + category_text).str.lower()

	    # Time weight for the purchase history: 1 today, decreasing by 1/365 per day.
	    # NOTE(review): the timestamps are synthetic (one per row, daily from
	    # 2024-10-01) and overwrite any existing 'timestamp' column -- confirm
	    # whether real order dates should be used instead.
	    order_history['timestamp'] = pd.date_range(start='2024-10-01', periods=len(order_history), freq='D')
	    order_history['time_weight'] = 1 - (pd.Timestamp.now() - order_history['timestamp']).dt.days / 365

	    # Content-based filtering: TF-IDF over "name category", then cosine similarity.
	    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
	    tfidf_matrix = tfidf.fit_transform(products['description'])
	    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

	    # NOTE(review): model fitting below runs synchronously inside this coroutine.
	    recommender = HybridRecommender(cosine_sim, products, reviews, favourite_items, order_history, new_item_cart)
	    recommender.train_svd()
	    recommendations = recommender.hybrid_recommend(user_id, top_n=number, randomness=0.1)

	    # One id -> name table (first row wins on duplicate ids) instead of
	    # scanning the product frame once per recommendation.
	    name_by_id = products.drop_duplicates('product_id').set_index('product_id')['name'].to_dict()
	    items = [
	        res_rcm.ItemRecommend(pro_id=product_id, product_name=name_by_id[product_id])
	        for product_id, _ in recommendations
	        if product_id in name_by_id  # skip ids missing from the product table
	    ]

	    return res_rcm.ListItemRecommend(user_id=user_id, total=len(items), list_item=items)
	if __name__ == "__main__":
	    # Manual smoke run: print the recommendations for user 4, at most 10 items.
	    import asyncio

	    result = asyncio.run(recommend(4, 10))
	    print(result)