SurgeousJP committed on
Commit
67f0e42
·
1 Parent(s): 8ffebd2

Add some comment

Browse files
.ipynb_checkpoints/plot_based_recommender_supabase-checkpoint.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afa1c81342bd382d45590fe490dcbfd659ec912efc57238035dec01a0d4f319b
3
- size 36012
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b14b564f7355b1b79d3144edeadc4eea82260f1941d1501065d29ad93489c66a
3
+ size 45020
Dockerfile CHANGED
@@ -8,8 +8,7 @@ COPY . .
8
 
9
  RUN pip install -r requirements.txt
10
 
11
- EXPOSE 5000
12
- EXPOSE 5001
13
 
14
  # Command to run your application (replace with your actual command)
15
  CMD ["python", "plot_based_recommender_supabase.py"]
 
8
 
9
  RUN pip install -r requirements.txt
10
 
11
+ EXPOSE 8080
 
12
 
13
  # Command to run your application (replace with your actual command)
14
  CMD ["python", "plot_based_recommender_supabase.py"]
plot_based_recommender_supabase.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba3df418463084db0c0a2e43de93b5286852706b6774ace11e8dcdfdd0aaf21a
3
- size 41180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b14b564f7355b1b79d3144edeadc4eea82260f1941d1501065d29ad93489c66a
3
+ size 45020
plot_based_recommender_supabase.py CHANGED
@@ -1,47 +1,15 @@
1
  #!/usr/bin/env python
2
  # coding: utf-8
3
 
4
- # In[1]:
5
-
6
-
7
- # get_ipython().system('pip install supabase')
8
- # get_ipython().system('pip install flask')
9
- # get_ipython().system('pip install flask-ngrok')
10
- # get_ipython().system('pip install waitress')
11
-
12
-
13
- # In[2]:
14
-
15
-
16
- # pip install --upgrade supabase
17
-
18
-
19
- # In[3]:
20
-
21
-
22
- # pip list
23
-
24
-
25
- # In[4]:
26
-
27
-
28
  import pandas as pd
29
  import numpy as np
30
  from supabase import create_client, Client
31
 
32
-
33
- # In[5]:
34
-
35
-
36
  # Your Supabase project details
37
  URL = "https://oflclzbsbgkadqiagxqk.supabase.co" # Supabase project URL
38
  KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im9mbGNsemJzYmdrYWRxaWFneHFrIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDY0OTY3OTIsImV4cCI6MjAyMjA3Mjc5Mn0.2IGuSFqHbNp75vs-LskGjK0fw3ypqbiHJ9MKAAaYE8s" # Supabase API key
39
  supabase: Client = create_client(URL, KEY)
40
 
41
-
42
- # In[6]:
43
-
44
-
45
  def convert_table_to_pandas_dataframe(supabase, table_name):
46
  # Retrieve data from Supabase
47
  data = supabase.table(table_name).select("*").execute()
@@ -53,17 +21,28 @@ def convert_table_to_pandas_dataframe(supabase, table_name):
53
 
54
  books_df = convert_table_to_pandas_dataframe(supabase, "books")
55
 
 
 
56
 
57
- # In[7]:
58
-
59
 
60
- books_df['description'].head(5)
61
 
62
 
63
- # ## Plot-based recommender
64
-
65
- # In[8]:
66
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  #Import TfIdfVectorizer from scikit-learn
69
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -71,54 +50,48 @@ from sklearn.feature_extraction.text import TfidfVectorizer
71
  #Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
72
  tfidf = TfidfVectorizer(stop_words='english')
73
 
74
- #Replace NaN with an empty string
75
- books_df['descripion'] = books_df['description'].fillna('')
76
-
77
  #Construct the required TF-IDF matrix by fitting and transforming the data
78
- tfidf_matrix = tfidf.fit_transform(books_df['description'])
 
 
79
 
80
  #Output the shape of tfidf_matrix
81
  tfidf_matrix.shape
82
 
83
 
84
- # In[9]:
85
-
86
-
87
- tfidf
88
 
 
89
 
90
- # In[10]:
 
91
 
 
 
92
 
93
- print(tfidf_matrix[0].shape)
 
94
 
 
 
95
 
96
- # In[11]:
97
 
 
98
 
 
99
  # Import linear_kernel
100
  from sklearn.metrics.pairwise import linear_kernel
101
 
102
  # Compute the cosine similarity matrix
103
  cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
104
 
105
-
106
- # In[12]:
107
-
108
-
109
  indices = pd.Series(books_df.index, index=books_df['title']).drop_duplicates()
110
 
111
-
112
- # In[13]:
113
-
114
-
115
  def get_original_book_id(title):
116
  return books_df.loc[books_df['title'] == title, 'id'].values[0]
117
 
118
-
119
- # In[14]:
120
-
121
-
122
  # Function that takes a book title as input and outputs the most similar books
123
  def get_top_five_recommendations(title, cosine_sim=cosine_sim):
124
  # Get the index of the movie that matches the title
@@ -131,7 +104,7 @@ def get_top_five_recommendations(title, cosine_sim=cosine_sim):
131
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
132
 
133
  # Get the scores of the 10 most similar movies
134
- sim_scores = sim_scores[:10]
135
 
136
  # Get the movie indices
137
  book_indices = [i[0] for i in sim_scores]
@@ -142,74 +115,21 @@ def get_top_five_recommendations(title, cosine_sim=cosine_sim):
142
  ids = []
143
  for title in books_df['title'].iloc[book_indices]:
144
  ids.append(get_original_book_id(title))
 
145
  return ids
146
 
147
-
148
- # In[15]:
149
-
150
-
151
  get_top_five_recommendations('Walls of Ash')
152
 
153
-
154
- # In[16]:
155
-
156
-
157
- pd.set_option('display.max_colwidth', None)
158
-
159
-
160
- # In[17]:
161
-
162
-
163
  books_df[books_df['id'].isin(get_top_five_recommendations('Walls of Ash'))]['url']
164
 
165
 
166
- # In[18]:
167
-
168
-
169
  from flask import Flask, jsonify, request
170
  from flask_ngrok import run_with_ngrok
171
 
172
-
173
- # In[19]:
174
-
175
-
176
  app = Flask(__name__)
177
  run_with_ngrok(app) # Start ngrok when app is run
178
 
179
-
180
- # In[20]:
181
-
182
-
183
  import json
184
-
185
-
186
- # In[21]:
187
-
188
-
189
- from waitress import serve
190
-
191
-
192
- # In[23]:
193
-
194
-
195
- # get_ipython().system('pip freeze > requirements.txt')
196
-
197
-
198
- # In[24]:
199
-
200
-
201
- # pip install pipdeptree
202
-
203
-
204
- # In[25]:
205
-
206
-
207
- # pipdeptree --output requirements.txt --graph >> requirements.txt
208
-
209
-
210
- # In[65]:
211
-
212
-
213
  @app.route('/predict/<int:id>', methods=['GET'])
214
  def predict(id):
215
  title = books_df[books_df['id'] == id]['title'].values[0]
@@ -217,10 +137,8 @@ def predict(id):
217
  prediction_result = [int(x) for x in get_top_five_recommendations(title)]
218
  return json.dumps(prediction_result)
219
 
220
-
221
- # In[66]:
222
-
223
 
224
  if __name__ == '__main__':
225
- serve(app, host="0.0.0.0", port=5000)
226
 
 
1
  #!/usr/bin/env python
2
  # coding: utf-8
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import pandas as pd
5
  import numpy as np
6
  from supabase import create_client, Client
7
 
 
 
 
 
8
  # Your Supabase project details
9
  URL = "https://oflclzbsbgkadqiagxqk.supabase.co" # Supabase project URL
10
  KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im9mbGNsemJzYmdrYWRxaWFneHFrIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDY0OTY3OTIsImV4cCI6MjAyMjA3Mjc5Mn0.2IGuSFqHbNp75vs-LskGjK0fw3ypqbiHJ9MKAAaYE8s" # Supabase API key
11
  supabase: Client = create_client(URL, KEY)
12
 
 
 
 
 
13
  def convert_table_to_pandas_dataframe(supabase, table_name):
14
  # Retrieve data from Supabase
15
  data = supabase.table(table_name).select("*").execute()
 
21
 
22
  books_df = convert_table_to_pandas_dataframe(supabase, "books")
23
 
24
+ pd.set_option('display.max_colwidth', 50)
25
+ pd.set_option('display.max_columns', None)
26
 
27
+ books_df.head(5)
 
28
 
29
+ books_df['combined'] = books_df['description'] + ' ' + books_df['title'] + ' ' + books_df['author_name']
30
 
31
 
32
+ # Content-based recommender
 
 
33
 
34
+ # In text mining, TF-IDF (term frequency - inverse document frequency) is the most widely known statistical method for measuring how important a word is to one document within a collection of documents.
35
+ #
36
+ # TF (Term Frequency): how often a word occurs in a document.
37
+ # TF(t, d) = f(t, d) / T
38
+ # (t is the term, f(t, d) is the number of occurrences of t in document d, and T is the total number of terms in d)
39
+ #
40
+ # IDF (Inverse Document Frequency): measures how informative a word is. TF alone treats every word as equally important, but some English words such as "is", "of", "that", ... occur very often while carrying little meaning. IDF compensates by down-weighting words that occur in many documents and up-weighting rare words that are especially meaningful for particular documents.
41
+ #
42
+ # IDF(t) = log(N / |{d ∈ D : t ∈ d}|)
43
+ # (N is the total number of documents in the corpus D; the denominator is the number of documents that contain t)
44
+ #
45
+ # TF-IDF(t, d) = TF(t, d) * IDF(t)
46
 
47
  #Import TfIdfVectorizer from scikit-learn
48
  from sklearn.feature_extraction.text import TfidfVectorizer
 
50
  #Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
51
  tfidf = TfidfVectorizer(stop_words='english')
52
 
 
 
 
53
  #Construct the required TF-IDF matrix by fitting and transforming the data
54
+ tfidf_matrix = tfidf.fit_transform(books_df['combined'])
55
+
56
+ feature_names = tfidf.get_feature_names()
57
 
58
  #Output the shape of tfidf_matrix
59
  tfidf_matrix.shape
60
 
61
 
62
+ # 20183 -> total documents in the corpus
63
+ # 141956 -> total distinct terms in the corpus
 
 
64
 
65
+ feature_names[2000:2500]
66
 
67
+ # Assuming 'tfidf_matrix' is your TF-IDF matrix
68
+ # Assuming 'document_index' is the index of the document you want to calculate the total terms for
69
 
70
+ # Get the TF-IDF vector for the specified document
71
+ document_tfidf_vector = tfidf_matrix[10]
72
 
73
+ # Sum up the TF-IDF weights for all terms in the document
74
+ total_terms_in_document = document_tfidf_vector.sum()
75
 
76
+ print("Document vector: ", tfidf_matrix[10])
77
+ print("Total terms in document {}: {}".format(10, total_terms_in_document))
78
 
79
+ tfidf
80
 
81
+ print(tfidf_matrix[0].shape)
82
 
83
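+ # Cosine similarity between every pair of documents (TfidfVectorizer L2-normalises each row by default, so the dot product from linear_kernel equals cosine similarity)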
84
  # Import linear_kernel
85
  from sklearn.metrics.pairwise import linear_kernel
86
 
87
  # Compute the cosine similarity matrix
88
  cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
89
 
 
 
 
 
90
  indices = pd.Series(books_df.index, index=books_df['title']).drop_duplicates()
91
 
 
 
 
 
92
  def get_original_book_id(title):
93
  return books_df.loc[books_df['title'] == title, 'id'].values[0]
94
 
 
 
 
 
95
  # Function that takes a book title as input and outputs the most similar books
96
  def get_top_five_recommendations(title, cosine_sim=cosine_sim):
97
  # Get the index of the movie that matches the title
 
104
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
105
 
106
  # Get the scores of the 10 most similar movies
107
+ sim_scores = sim_scores[:11]
108
 
109
  # Get the movie indices
110
  book_indices = [i[0] for i in sim_scores]
 
115
  ids = []
116
  for title in books_df['title'].iloc[book_indices]:
117
  ids.append(get_original_book_id(title))
118
+ ids.pop(0)
119
  return ids
120
 
 
 
 
 
121
  get_top_five_recommendations('Walls of Ash')
122
 
 
 
 
 
 
 
 
 
 
 
123
  books_df[books_df['id'].isin(get_top_five_recommendations('Walls of Ash'))]['url']
124
 
125
 
 
 
 
126
  from flask import Flask, jsonify, request
127
  from flask_ngrok import run_with_ngrok
128
 
 
 
 
 
129
  app = Flask(__name__)
130
  run_with_ngrok(app) # Start ngrok when app is run
131
 
 
 
 
 
132
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  @app.route('/predict/<int:id>', methods=['GET'])
134
  def predict(id):
135
  title = books_df[books_df['id'] == id]['title'].values[0]
 
137
  prediction_result = [int(x) for x in get_top_five_recommendations(title)]
138
  return json.dumps(prediction_result)
139
 
140
+ from waitress import serve
 
 
141
 
142
  if __name__ == '__main__':
143
+ serve(app, host="0.0.0.0", port=8080)
144