Commit
·
67f0e42
1
Parent(s):
8ffebd2
Add some comment
Browse files
.ipynb_checkpoints/plot_based_recommender_supabase-checkpoint.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b14b564f7355b1b79d3144edeadc4eea82260f1941d1501065d29ad93489c66a
|
| 3 |
+
size 45020
|
Dockerfile
CHANGED
|
@@ -8,8 +8,7 @@ COPY . .
|
|
| 8 |
|
| 9 |
RUN pip install -r requirements.txt
|
| 10 |
|
| 11 |
-
EXPOSE
|
| 12 |
-
EXPOSE 5001
|
| 13 |
|
| 14 |
# Command to run your application (replace with your actual command)
|
| 15 |
CMD ["python", "plot_based_recommender_supabase.py"]
|
|
|
|
| 8 |
|
| 9 |
RUN pip install -r requirements.txt
|
| 10 |
|
| 11 |
+
EXPOSE 8080
|
|
|
|
| 12 |
|
| 13 |
# Command that starts the application
|
| 14 |
CMD ["python", "plot_based_recommender_supabase.py"]
|
plot_based_recommender_supabase.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b14b564f7355b1b79d3144edeadc4eea82260f1941d1501065d29ad93489c66a
|
| 3 |
+
size 45020
|
plot_based_recommender_supabase.py
CHANGED
|
@@ -1,47 +1,15 @@
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
# coding: utf-8
|
| 3 |
|
| 4 |
-
# In[1]:
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
# get_ipython().system('pip install supabase')
|
| 8 |
-
# get_ipython().system('pip install flask')
|
| 9 |
-
# get_ipython().system('pip install flask-ngrok')
|
| 10 |
-
# get_ipython().system('pip install waitress')
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# In[2]:
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
# pip install --upgrade supabase
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
# In[3]:
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
# pip list
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
# In[4]:
|
| 26 |
-
|
| 27 |
-
|
| 28 |
import pandas as pd
|
| 29 |
import numpy as np
|
| 30 |
from supabase import create_client, Client
|
| 31 |
|
| 32 |
-
|
| 33 |
-
# In[5]:
|
| 34 |
-
|
| 35 |
-
|
| 36 |
# Your Supabase project details
|
| 37 |
URL = "https://oflclzbsbgkadqiagxqk.supabase.co" # Supabase project URL
|
| 38 |
KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im9mbGNsemJzYmdrYWRxaWFneHFrIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDY0OTY3OTIsImV4cCI6MjAyMjA3Mjc5Mn0.2IGuSFqHbNp75vs-LskGjK0fw3ypqbiHJ9MKAAaYE8s" # Supabase API key
|
| 39 |
supabase: Client = create_client(URL, KEY)
|
| 40 |
|
| 41 |
-
|
| 42 |
-
# In[6]:
|
| 43 |
-
|
| 44 |
-
|
| 45 |
def convert_table_to_pandas_dataframe(supabase, table_name):
|
| 46 |
# Retrieve data from Supabase
|
| 47 |
data = supabase.table(table_name).select("*").execute()
|
|
@@ -53,17 +21,28 @@ def convert_table_to_pandas_dataframe(supabase, table_name):
|
|
| 53 |
|
| 54 |
books_df = convert_table_to_pandas_dataframe(supabase, "books")
|
| 55 |
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
books_df['description']
|
| 61 |
|
| 62 |
|
| 63 |
-
#
|
| 64 |
-
|
| 65 |
-
# In[8]:
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
#Import TfIdfVectorizer from scikit-learn
|
| 69 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
@@ -71,54 +50,48 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
| 71 |
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
|
| 72 |
tfidf = TfidfVectorizer(stop_words='english')
|
| 73 |
|
| 74 |
-
#Replace NaN with an empty string
|
| 75 |
-
books_df['descripion'] = books_df['description'].fillna('')
|
| 76 |
-
|
| 77 |
#Construct the required TF-IDF matrix by fitting and transforming the data
|
| 78 |
-
tfidf_matrix = tfidf.fit_transform(books_df['
|
|
|
|
|
|
|
| 79 |
|
| 80 |
#Output the shape of tfidf_matrix
|
| 81 |
tfidf_matrix.shape
|
| 82 |
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
tfidf
|
| 88 |
|
|
|
|
| 89 |
|
| 90 |
-
#
|
|
|
|
| 91 |
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
|
|
|
|
| 94 |
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
|
| 97 |
|
|
|
|
| 98 |
|
|
|
|
| 99 |
# Import linear_kernel
|
| 100 |
from sklearn.metrics.pairwise import linear_kernel
|
| 101 |
|
| 102 |
# Compute the cosine similarity matrix
|
| 103 |
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
|
| 104 |
|
| 105 |
-
|
| 106 |
-
# In[12]:
|
| 107 |
-
|
| 108 |
-
|
| 109 |
indices = pd.Series(books_df.index, index=books_df['title']).drop_duplicates()
|
| 110 |
|
| 111 |
-
|
| 112 |
-
# In[13]:
|
| 113 |
-
|
| 114 |
-
|
| 115 |
def get_original_book_id(title):
|
| 116 |
return books_df.loc[books_df['title'] == title, 'id'].values[0]
|
| 117 |
|
| 118 |
-
|
| 119 |
-
# In[14]:
|
| 120 |
-
|
| 121 |
-
|
| 122 |
# Function that takes in movie title as input and outputs most similar movies
|
| 123 |
def get_top_five_recommendations(title, cosine_sim=cosine_sim):
|
| 124 |
# Get the index of the movie that matches the title
|
|
@@ -131,7 +104,7 @@ def get_top_five_recommendations(title, cosine_sim=cosine_sim):
|
|
| 131 |
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
|
| 132 |
|
| 133 |
# Get the scores of the 10 most similar movies
|
| 134 |
-
sim_scores = sim_scores[:
|
| 135 |
|
| 136 |
# Get the movie indices
|
| 137 |
book_indices = [i[0] for i in sim_scores]
|
|
@@ -142,74 +115,21 @@ def get_top_five_recommendations(title, cosine_sim=cosine_sim):
|
|
| 142 |
ids = []
|
| 143 |
for title in books_df['title'].iloc[book_indices]:
|
| 144 |
ids.append(get_original_book_id(title))
|
|
|
|
| 145 |
return ids
|
| 146 |
|
| 147 |
-
|
| 148 |
-
# In[15]:
|
| 149 |
-
|
| 150 |
-
|
| 151 |
get_top_five_recommendations('Walls of Ash')
|
| 152 |
|
| 153 |
-
|
| 154 |
-
# In[16]:
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
pd.set_option('display.max_colwidth', None)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
# In[17]:
|
| 161 |
-
|
| 162 |
-
|
| 163 |
books_df[books_df['id'].isin(get_top_five_recommendations('Walls of Ash'))]['url']
|
| 164 |
|
| 165 |
|
| 166 |
-
# In[18]:
|
| 167 |
-
|
| 168 |
-
|
| 169 |
from flask import Flask, jsonify, request
|
| 170 |
from flask_ngrok import run_with_ngrok
|
| 171 |
|
| 172 |
-
|
| 173 |
-
# In[19]:
|
| 174 |
-
|
| 175 |
-
|
| 176 |
app = Flask(__name__)
|
| 177 |
run_with_ngrok(app) # Start ngrok when app is run
|
| 178 |
|
| 179 |
-
|
| 180 |
-
# In[20]:
|
| 181 |
-
|
| 182 |
-
|
| 183 |
import json
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
# In[21]:
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
from waitress import serve
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
# In[23]:
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
# get_ipython().system('pip freeze > requirements.txt')
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
# In[24]:
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
# pip install pipdeptree
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
# In[25]:
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
# pipdeptree --output requirements.txt --graph >> requirements.txt
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
# In[65]:
|
| 211 |
-
|
| 212 |
-
|
| 213 |
@app.route('/predict/<int:id>', methods=['GET'])
|
| 214 |
def predict(id):
|
| 215 |
title = books_df[books_df['id'] == id]['title'].values[0]
|
|
@@ -217,10 +137,8 @@ def predict(id):
|
|
| 217 |
prediction_result = [int(x) for x in get_top_five_recommendations(title)]
|
| 218 |
return json.dumps(prediction_result)
|
| 219 |
|
| 220 |
-
|
| 221 |
-
# In[66]:
|
| 222 |
-
|
| 223 |
|
| 224 |
if __name__ == '__main__':
|
| 225 |
-
serve(app, host="0.0.0.0", port=
|
| 226 |
|
|
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
# coding: utf-8
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import numpy as np
|
| 6 |
from supabase import create_client, Client
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Your Supabase project details
|
| 9 |
URL = "https://oflclzbsbgkadqiagxqk.supabase.co" # Supabase project URL
|
| 10 |
KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im9mbGNsemJzYmdrYWRxaWFneHFrIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDY0OTY3OTIsImV4cCI6MjAyMjA3Mjc5Mn0.2IGuSFqHbNp75vs-LskGjK0fw3ypqbiHJ9MKAAaYE8s" # Supabase API key
|
| 11 |
supabase: Client = create_client(URL, KEY)
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def convert_table_to_pandas_dataframe(supabase, table_name):
|
| 14 |
# Retrieve data from Supabase
|
| 15 |
data = supabase.table(table_name).select("*").execute()
|
|
|
|
| 21 |
|
| 22 |
books_df = convert_table_to_pandas_dataframe(supabase, "books")
|
| 23 |
|
| 24 |
+
pd.set_option('display.max_colwidth', 50)
|
| 25 |
+
pd.set_option('display.max_columns', None)
|
| 26 |
|
| 27 |
+
books_df.head(5)
|
|
|
|
| 28 |
|
| 29 |
+
# Build the text used for TF-IDF from description, title and author name.
# Missing values are replaced with '' first: a null in any one column would
# otherwise turn the whole concatenated string into a null, and the vectorizer
# cannot process null documents.
books_df['combined'] = (
    books_df['description'].fillna('') + ' '
    + books_df['title'].fillna('') + ' '
    + books_df['author_name'].fillna('')
)
|
| 30 |
|
| 31 |
|
| 32 |
+
# Content-based recommender
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
# In text mining, TF-IDF (term frequency - inverse document frequency) is the most widely known statistical method for determining how important a word is to a document within a collection of many documents.
|
| 35 |
+
#
|
| 36 |
+
# TF (Term Frequency): how frequently a word appears in a document.
|
| 37 |
+
# TF(t) = f(t,d)/T
|
| 38 |
+
# (t is a word, f(t,d) is the number of times t appears in document d, T is the total number of words in document d)
|
| 39 |
+
#
|
| 40 |
+
# IDF (Inverse Document Frequency): measures how important a word is. When computing TF, every word is treated as equally important, but some English words such as "is", "of", "that", ... appear very often yet carry little importance. We therefore need a way to down-weight words that appear frequently and up-weight words that appear rarely but are especially meaningful for certain documents; computing IDF does this.
|
| 41 |
+
#
|
| 42 |
+
# IDF(t) = log(N / |{d ∈ D : t ∈ d}|)
|
| 43 |
+
# (N is the total number of documents in the corpus D; the denominator is the number of documents that contain t)
|
| 44 |
+
#
|
| 45 |
+
# TF-IDF(t) = TF(t) * IDF(t)
|
| 46 |
|
| 47 |
#Import TfIdfVectorizer from scikit-learn
|
| 48 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
| 50 |
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
|
| 51 |
tfidf = TfidfVectorizer(stop_words='english')
|
| 52 |
|
|
|
|
|
|
|
|
|
|
| 53 |
#Construct the required TF-IDF matrix by fitting and transforming the data
|
| 54 |
+
tfidf_matrix = tfidf.fit_transform(books_df['combined'])
|
| 55 |
+
|
| 56 |
+
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases that
# do not provide it.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
|
| 57 |
|
| 58 |
#Output the shape of tfidf_matrix
|
| 59 |
tfidf_matrix.shape
|
| 60 |
|
| 61 |
|
| 62 |
+
# 20183 -> total number of documents in the corpus
|
| 63 |
+
# 141956 -> total distinct terms in the corpus
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
feature_names[2000:2500]
|
| 66 |
|
| 67 |
+
# Assuming 'tfidf_matrix' is your TF-IDF matrix
|
| 68 |
+
# Assuming 'document_index' is the index of the document you want to calculate the total terms for
|
| 69 |
|
| 70 |
+
# Get the TF-IDF vector for the specified document
|
| 71 |
+
document_tfidf_vector = tfidf_matrix[10]
|
| 72 |
|
| 73 |
+
# Sum up the TF-IDF weights for all terms in the document
|
| 74 |
+
total_terms_in_document = document_tfidf_vector.sum()
|
| 75 |
|
| 76 |
+
print("Document vector: ", tfidf_matrix[10])
|
| 77 |
+
print("Total terms in document {}: {}".format(10, total_terms_in_document))
|
| 78 |
|
| 79 |
+
tfidf
|
| 80 |
|
| 81 |
+
print(tfidf_matrix[0].shape)
|
| 82 |
|
| 83 |
+
# Cosine similarity function for comparing every two documents
|
| 84 |
# Import linear_kernel
|
| 85 |
from sklearn.metrics.pairwise import linear_kernel
|
| 86 |
|
| 87 |
# Compute the cosine similarity matrix
|
| 88 |
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
indices = pd.Series(books_df.index, index=books_df['title']).drop_duplicates()
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
def get_original_book_id(title):
|
| 93 |
return books_df.loc[books_df['title'] == title, 'id'].values[0]
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# Function that takes a book title as input and returns the ids of the most similar books
|
| 96 |
def get_top_five_recommendations(title, cosine_sim=cosine_sim):
|
| 97 |
# Get the index of the book that matches the title
|
|
|
|
| 104 |
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
|
| 105 |
|
| 106 |
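# Keep the 11 highest scores: the 10 most similar books plus the top entry (presumably the queried book itself), which is dropped from the result below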
|
| 107 |
+
sim_scores = sim_scores[:11]
|
| 108 |
|
| 109 |
# Get the book indices
|
| 110 |
book_indices = [i[0] for i in sim_scores]
|
|
|
|
| 115 |
ids = []
|
| 116 |
for title in books_df['title'].iloc[book_indices]:
|
| 117 |
ids.append(get_original_book_id(title))
|
| 118 |
+
ids.pop(0)
|
| 119 |
return ids
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
get_top_five_recommendations('Walls of Ash')
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
books_df[books_df['id'].isin(get_top_five_recommendations('Walls of Ash'))]['url']
|
| 124 |
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
from flask import Flask, jsonify, request
|
| 127 |
from flask_ngrok import run_with_ngrok
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
app = Flask(__name__)
|
| 130 |
run_with_ngrok(app) # Start ngrok when app is run
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
@app.route('/predict/<int:id>', methods=['GET'])
|
| 134 |
def predict(id):
|
| 135 |
title = books_df[books_df['id'] == id]['title'].values[0]
|
|
|
|
| 137 |
prediction_result = [int(x) for x in get_top_five_recommendations(title)]
|
| 138 |
return json.dumps(prediction_result)
|
| 139 |
|
| 140 |
+
from waitress import serve
|
|
|
|
|
|
|
| 141 |
|
| 142 |
if __name__ == '__main__':
|
| 143 |
+
serve(app, host="0.0.0.0", port=8080)
|
| 144 |
|