{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " user_id pro_id\n", "0 3 2\n", "1 3 1\n", "2 3 13\n", "3 3 13\n", "4 3 13\n", "5 3 13\n", "6 3 13\n", "7 3 7\n", "8 3 12\n", "9 3 4\n", "10 7 5\n", "11 7 12\n", "12 8 7\n", "13 9 2\n", "14 9 1\n", "15 9 4\n", "16 9 14\n", "17 10 5\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vonhu\\AppData\\Local\\Temp\\ipykernel_5672\\2900762178.py:20: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df_favourite_items = pd.read_sql(query, conn)\n" ] } ], "source": [ "import pymysql\n", "import pandas as pd\n", "\n", "# Kết nối đến MySQL\n", "conn = pymysql.connect(\n", " host=\"localhost\",\n", " user=\"root\",\n", " password=\"Yahana0509@\",\n", " database=\"demohmdrinks\",\n", " port=4000,\n", " charset=\"utf8mb4\"\n", ")\n", "\n", "# Truy vấn dữ liệu\n", "query = \"\"\"\n", "SELECT f.user_id, fi.pro_id\n", "FROM favourite_item fi\n", "JOIN favourite f ON fi.fav_id = f.fav_id;\n", "\"\"\"\n", "df_favourite_items = pd.read_sql(query, conn)\n", "\n", "# Đóng kết nối\n", "conn.close()\n", "\n", "# Xem trước dữ liệu\n", "print(df_favourite_items)\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{3: [2, 1, 13, 13, 13, 13, 13, 7, 12, 4], 7: [5, 12], 8: [7], 9: [2, 1, 4, 14], 10: [5]}\n" ] } ], "source": [ "# Chuyển DataFrame thành dictionary dạng mong muốn\n", "favourite_items_data = df_favourite_items.groupby(\"user_id\")[\"pro_id\"].apply(list).to_dict()\n", "\n", "print(favourite_items_data)\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'user_id': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 8, 9, 9, 9, 9, 10], 'product_id': [2, 1, 13, 13, 13, 13, 13, 7, 12, 4, 5, 12, 7, 2, 1, 4, 14, 5]}\n" ] } ], "source": [ "favourite_items_data = {\n", " \"user_id\": df_favourite_items[\"user_id\"].tolist(),\n", " \"product_id\": df_favourite_items[\"pro_id\"].tolist()\n", "}\n", "\n", "print(favourite_items_data)\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'user_id': [2, 3, 3, 3, 5, 2, 4, 4, 1, 1, 1, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], 'product_id': [3, 3, 3, 3, 2, 3, 4, 3, 2, 3, 1, 1, 1, 1, 4, 4, 13, 12, 12, 12, 2, 2, 2, 3, 3, 3, 3, 1, 4, 1, 3, 4], 'rating': [5, 5, 5, 5, 3, 3, 5, 2, 2, 1, 1, 3, 3, 5, 5, 5, 4, 5, 5, 5, 4, 1, 4, 4, 4, 3, 4, 4, 2, 2, 2, 2]}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vonhu\\AppData\\Local\\Temp\\ipykernel_5672\\2058965346.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql(query, conn)\n" ] } ], "source": [ "# Truy vấn SQL\n", "conn = pymysql.connect(\n", " host=\"localhost\",\n", " user=\"root\",\n", " password=\"Yahana0509@\",\n", " database=\"demohmdrinks\",\n", " port=4000,\n", " charset=\"utf8mb4\"\n", ")\n", "\n", "query = \"SELECT user_id, pro_id, rating_star FROM review;\"\n", "df = pd.read_sql(query, conn)\n", "\n", "# Đóng kết nối\n", "conn.close()\n", "\n", "# Chuyển DataFrame thành dictionary\n", "reviews_data = {\n", " \"user_id\": df[\"user_id\"].tolist(),\n", " \"product_id\": df[\"pro_id\"].tolist(),\n", " \"rating\": df[\"rating_star\"].tolist()\n", "}\n", "\n", "# Kiểm tra kết quả\n", "print(reviews_data)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'product_id': [1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37], 'name': ['string', 'nhuy_test2', 'Trà1', 'string111111', 'Nuoc ep hoa qua ', 'SP2', 'NCep2', 'Như ý', 'Nước tết 1', 'Nước tết 2', 'Test vui vẻ', 'SPTest22', 'demo1', 'test1', 'test11', 'test11', 'bubu1', 'bubu1', 'bubu123', 'bubu thúi1', 'bubu thúi1', 'Dâu nước', 'Trà lá sen', 'Dâu nước1', 'trà dâu', 'Trà bưởi lá hồng', 'Bò sữa Gia Lai 1', 'Bò sữa Gia Lai 2', 'Bò sữa Gia Lai 3', 'Bò sữa Gia Lai 3', 'Nước trà gừng chua cay'], 'category': ['nhuytest2', 'nhuytest2', 'nhuytest2', 'string', 'nuoc ep', 'string1fggg', 'string', 'nhuytest2', 'Nước tt1', 'string1fggg', 'SP2', 'Test5', 'demo4', 'Test_Category', 'Test_Category1', 'Test_Category1', 'Test_Bubu', 'Test_Bubu', 'Test_Bubu1', 'nhuytest2', 'nhuytest2', 'nhuytest2', 'string', 'nhuytest2', 'string', 'nhuytest2', 'Sữa bò Phú Yên', 'Sữa bò Phú Yên', 'Sữa bò Phú Yên', 'nuoc ep', 'string']}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vonhu\\AppData\\Local\\Temp\\ipykernel_5672\\101450748.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql(query, conn)\n" ] } ], "source": [ "conn = pymysql.connect(\n", " host=\"localhost\",\n", " user=\"root\",\n", " password=\"Yahana0509@\",\n", " database=\"demohmdrinks\",\n", " port=4000,\n", " charset=\"utf8mb4\"\n", ")\n", "query = \"\"\"\n", " SELECT p.pro_id, p.pro_name, c.cate_name AS category\n", " FROM product p\n", " JOIN category c ON p.category_id = c.cate_id;\n", "\"\"\"\n", "df = pd.read_sql(query, conn)\n", "\n", "# Đóng kết nối\n", "conn.close()\n", "\n", "# Chuyển DataFrame thành dictionary\n", "products_data = {\n", " \"product_id\": df[\"pro_id\"].tolist(),\n", " \"name\": df[\"pro_name\"].tolist(),\n", " \"category\": df[\"category\"].tolist()\n", "}\n", "\n", "# Kiểm tra kết quả\n", "print(products_data)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " user_id product_id\n", "0 3 1\n", "1 3 1\n", "2 3 1\n", "3 3 1\n", "4 3 1\n", "5 3 1\n", "6 3 1\n", "7 3 1\n", "8 3 1\n", "9 3 1\n", "10 3 2\n", "11 3 2\n", "12 3 2\n", "13 3 2\n", "14 3 2\n", "15 3 2\n", "16 3 2\n", "17 3 5\n", "18 3 5\n", "19 3 7\n", "20 3 7\n", "21 3 7\n", "22 3 7\n", "23 3 7\n", "24 3 14\n", "25 3 14\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vonhu\\AppData\\Local\\Temp\\ipykernel_5672\\940316926.py:32: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql(query, conn)\n" ] } ], "source": [ "import pymysql\n", "conn = pymysql.connect(\n", " host=\"localhost\",\n", " user=\"root\",\n", " password=\"Yahana0509@\",\n", " database=\"demohmdrinks\",\n", " port=4000,\n", " charset=\"utf8mb4\"\n", ")\n", "\n", "\n", "\n", "query = \"\"\"\n", "SELECT o.user_id, ci.pro_id AS product_id\n", "FROM `orders` o\n", "JOIN order_item oi ON o.order_id = oi.order_id\n", "JOIN cart c ON oi.cart_id = c.cart_id\n", "JOIN cart_item ci ON c.cart_id = ci.cart_id\n", "JOIN payments p ON o.order_id = p.order_id\n", "WHERE o.status = \"CONFIRM\"\n", " AND p.status = 'SUCCESS'\n", "\n", "UNION ALL\n", "\n", "SELECT c.user_id, ci.pro_id AS product_id\n", "FROM cart c\n", "JOIN cart_item ci ON c.cart_id = ci.cart_id\n", "ORDER BY user_id, product_id;\n", "\"\"\"\n", "\n", "# Đọc dữ liệu vào DataFrame\n", "df = pd.read_sql(query, conn)\n", "\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'user_id': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], 'product_id': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 5, 5, 7, 7, 7, 7, 7, 14, 14]}\n" ] } ], "source": [ "order_history_data = {\n", " \"user_id\": df[\"user_id\"].tolist(),\n", " \"product_id\": df[\"product_id\"].tolist()\n", "}\n", "\n", "# In kết quả\n", "print(order_history_data)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from scipy.sparse.linalg import svds\n", "from sklearn.preprocessing import MinMaxScaler" ] }, { "cell_type": "code", "execution_count": 253, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append(r\"D:\\HmDrinks_Chat\\chatbot\\function\\recommender\")\n", "import get_data\n" ] }, { "cell_type": "code", "execution_count": 256, "metadata": {}, "outputs": [], "source": [ "data = await get_data.get_data_recommend()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Tạo DataFrame\n", "products = pd.DataFrame(products_data)\n", "reviews = pd.DataFrame(reviews_data)\n", "favourite_items = pd.DataFrame(favourite_items_data)\n", "order_history = pd.DataFrame(order_history_data)" ] }, { "cell_type": "code", "execution_count": 257, "metadata": {}, "outputs": [], "source": [ "# Tạo DataFrame\n", "products = pd.DataFrame(list(data)[2])\n", "reviews = pd.DataFrame(list(data)[1])\n", "favourite_items = pd.DataFrame(list(data)[0])\n", "order_history = pd.DataFrame(list(data)[3])" ] }, { "cell_type": "code", "execution_count": 258, "metadata": {}, "outputs": [], "source": [ "# Tiền xử lý dữ liệu nâng cao\n", "products['description'] = products['name'] + ' ' + products['category']\n", "products['description'] = products['description'].str.lower()\n", "\n", "# Thêm trọng số thời gian cho lịch sử mua hàng\n", "order_history['timestamp'] = pd.date_range(start='2024-01-01', periods=len(order_history), freq='D')\n", "order_history['time_weight'] = 1 - (pd.Timestamp.now() - order_history['timestamp']).dt.days / 365\n", "\n", "# Content-Based Filtering cải tiến\n", "tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)\n", "tfidf_matrix = tfidf.fit_transform(products['description'].fillna(''))\n", "cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)\n", "\n", "# Collaborative Filtering với SVD\n", "import random" ] }, { "cell_type": "code", "execution_count": 259, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "class HybridRecommender:\n", " def __init__(self, products, reviews, favourite_items, order_history, \n", " num_factors=2, regularization=0.02):\n", " self.products = products\n", " self.reviews = reviews\n", " self.favourite_items = favourite_items\n", " self.order_history = order_history\n", " self.num_factors = num_factors\n", " self.regularization = regularization\n", " \n", " # Chuẩn bị dữ liệu\n", " self.user_ids = reviews[\"user_id\"].unique()\n", " self.product_ids = products[\"product_id\"].unique()\n", " self.user_to_index = {uid: i for i, uid in enumerate(self.user_ids)}\n", " self.product_to_index = {pid: i for i, pid in enumerate(self.product_ids)}\n", " self.index_to_product = {i: pid for pid, i in self.product_to_index.items()}\n", " \n", " # Tạo ma trận rating\n", " self.ratings_matrix = self._create_ratings_matrix()\n", " self.global_mean = self.reviews['rating'].mean()\n", " \n", " def _create_ratings_matrix(self):\n", " matrix = np.zeros((len(self.user_ids), len(self.product_ids)))\n", " for _, row in self.reviews.iterrows():\n", " u_idx = self.user_to_index[row[\"user_id\"]]\n", " p_idx = self.product_to_index[row[\"product_id\"]]\n", " matrix[u_idx, p_idx] = row[\"rating\"]\n", " return matrix\n", " \n", " def train_svd(self):\n", " # Chuẩn hóa ma trận rating\n", " ratings_filled = np.where(self.ratings_matrix == 0, \n", " self.global_mean, \n", " self.ratings_matrix)\n", " \n", " # Phân tích SVD\n", " U, sigma, Vt = svds(ratings_filled, k=self.num_factors)\n", " sigma = np.diag(sigma)\n", " self.predicted_ratings = np.dot(np.dot(U, sigma), Vt)\n", " \n", " # Chuẩn hóa kết quả về khoảng [1, 5]\n", " scaler = MinMaxScaler(feature_range=(1, 5))\n", " self.predicted_ratings = scaler.fit_transform(\n", " self.predicted_ratings.reshape(-1, 1)\n", " ).reshape(self.ratings_matrix.shape)\n", " \n", " def get_content_score(self, user_id, top_n=3):\n", " # Lấy các sản phẩm liên quan từ yêu thích và lịch sử\n", " fav_items = self.favourite_items[self.favourite_items['user_id'] == user_id]['product_id']\n", " past_orders = self.order_history[self.order_history['user_id'] == user_id]\n", " \n", " content_scores = {}\n", " for item in set(fav_items).union(set(past_orders['product_id'])):\n", " if item in self.product_to_index:\n", " idx = self.product_to_index[item]\n", " sim_scores = enumerate(cosine_sim[idx])\n", " top_similar = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]\n", " for sim_idx, score in top_similar:\n", " prod_id = self.index_to_product[sim_idx]\n", " content_scores[prod_id] = content_scores.get(prod_id, 0) + score\n", " \n", " return content_scores\n", " \n", " def hybrid_recommend(self, user_id, top_n=10, weights={'collab': 0.6, 'content': 0.3, 'history': 0.1}, randomness=0.1):\n", " # Kiểm tra nếu user_id không có trong hệ thống\n", " if user_id not in self.user_to_index:\n", " print(f\"User {user_id} chưa có đánh giá nào. Kiểm tra sản phẩm yêu thích...\")\n", "\n", " # Lấy danh sách sản phẩm yêu thích của người dùng\n", " fav_items = self.favourite_items[self.favourite_items['user_id'] == user_id]['product_id']\n", "\n", " if not fav_items.empty:\n", " print(f\"Người dùng có sản phẩm yêu thích. Gợi ý dựa trên nội dung...\")\n", " content_scores = self.get_content_score(user_id)\n", " return sorted(content_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]\n", " \n", " print(f\"Người dùng không có sản phẩm yêu thích. Áp dụng cold-start recommendations...\")\n", "\n", " # Nếu không có sản phẩm yêu thích, sử dụng cold-start (dựa vào sản phẩm phổ biến)\n", " popular_items = self.reviews.groupby('product_id')['rating'].mean()\n", " popular_items = popular_items.sort_values(ascending=False).head(top_n)\n", "\n", " # Chuyển sang dict để dễ xử lý\n", " cold_start_scores = {pid: score for pid, score in popular_items.items()}\n", "\n", " return sorted(cold_start_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]\n", "\n", " \n", " user_idx = self.user_to_index[user_id]\n", " \n", " # Collaborative filtering score\n", " collab_scores = dict(enumerate(self.predicted_ratings[user_idx]))\n", " collab_scores = {self.index_to_product[i]: s for i, s in collab_scores.items()}\n", " \n", " # Content-based score\n", " content_scores = self.get_content_score(user_id)\n", " \n", " # History score\n", " history_scores = {}\n", " user_history = self.order_history[self.order_history['user_id'] == user_id]\n", " for _, row in user_history.iterrows():\n", " history_scores[row['product_id']] = row['time_weight']\n", " \n", " # Kết hợp scores với trọng số\n", " final_scores = {}\n", " for prod_id in self.product_ids:\n", " final_score = 0\n", " if prod_id in collab_scores:\n", " final_score += weights['collab'] * collab_scores[prod_id]\n", " if prod_id in content_scores:\n", " final_score += weights['content'] * content_scores[prod_id]\n", " if prod_id in history_scores:\n", " final_score += weights['history'] * history_scores[prod_id]\n", " if final_score > 0:\n", " # Thêm yếu tố ngẫu nhiên có kiểm soát\n", " noise = random.uniform(-randomness, randomness) * final_score\n", " final_scores[prod_id] = final_score + noise\n", " \n", " # Lọc sản phẩm đã mua/yêu thích để gợi ý sản phẩm mới\n", " purchased = set(self.order_history[self.order_history['user_id'] == user_id]['product_id'])\n", " liked = set(self.favourite_items[self.favourite_items['user_id'] == user_id]['product_id'])\n", " final_scores = {k: v for k, v in final_scores.items() if k not in purchased}\n", "\n", " # final_scores = {k: v for k, v in final_scores.items() if k not in purchased.union(liked)}\n", " \n", " # Giới hạn top N sản phẩm\n", " top_candidates = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)\n", " print(top_candidates)\n", " \n", " # Chọn ngẫu nhiên từ top 2 * top_n để tăng tính ngẫu nhiên, nhưng vẫn ưu tiên sản phẩm liên quan\n", " if len(top_candidates) > top_n:\n", " top_candidates = random.sample(top_candidates[:2 * top_n], top_n)\n", " \n", " # Cập nhật lịch sử mua hàng giả lập sau khi gợi ý\n", " # for product_id, _ in top_candidates:\n", " # self.order_history = pd.concat([\n", " # self.order_history,\n", " # pd.DataFrame({'user_id': [user_id], 'product_id': [product_id], \n", " # 'timestamp': [pd.Timestamp.now()], 'time_weight': [1.0]})\n", " # ], ignore_index=True)\n", "\n", " return sorted(top_candidates, key=lambda x: x[1], reverse=True)\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 260, "metadata": {}, "outputs": [], "source": [ "# Sử dụng hệ thống\n", "recommender = HybridRecommender(products, reviews, favourite_items, order_history)\n", "recommender.train_svd()" ] }, { "cell_type": "code", "execution_count": 263, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User 6 chưa có đánh giá nào. Kiểm tra sản phẩm yêu thích...\n", "Người dùng không có sản phẩm yêu thích. Áp dụng cold-start recommendations...\n", "Gợi ý cho user 6:\n", "- Nước tết 2 (score: 5.000)\n", "- Test vui vẻ (score: 4.000)\n", "- string111111 (score: 3.800)\n", "- Trà1 (score: 3.583)\n", "- string (score: 3.000)\n" ] } ], "source": [ "user_id = 6\n", "recommendations = recommender.hybrid_recommend(user_id, top_n= 5,randomness=0.1)\n", "print(f\"Gợi ý cho user {user_id}:\")\n", "for product_id, score in recommendations:\n", " product_name = products[products['product_id'] == product_id]['name'].iloc[0]\n", " print(f\"- {product_name} (score: {score:.3f})\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 2 }