Javier Real committed on
Commit
73969b9
·
1 Parent(s): aae5cfe
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -3,6 +3,9 @@ from course_project.application_tools import get_recommendations_text, get_data
3
 
4
  # Load app with: streamlit run course_project/app.py
5
 
 
 
 
6
  # Cargar los datos de películas para mostrar en la interfaz
7
  movies_data = get_data()
8
  movies_titles = movies_data['title'].tolist()
@@ -34,6 +37,6 @@ n_results = st.number_input('Number of recommendations', min_value=1, max_value=
34
 
35
  if st.button('Get Recommendations'):
36
  # st.write("Debugging Suggestions:", suggestions)
37
- recommendations = get_recommendations_text(suggestions, n_results=n_results)
38
  # st.write("Debugging Recommendations:\n", recommendations)
39
  st.markdown(recommendations.replace("\n", "\n\n"))
 
3
 
4
  # Load app with: streamlit run course_project/app.py
5
 
6
+ COSINE_SIMILARITY = 0
7
+ EUCLIDEAN_SIMILARITY = 1
8
+
9
  # Cargar los datos de películas para mostrar en la interfaz
10
  movies_data = get_data()
11
  movies_titles = movies_data['title'].tolist()
 
37
 
38
  if st.button('Get Recommendations'):
39
  # st.write("Debugging Suggestions:", suggestions)
40
+ recommendations = get_recommendations_text(suggestions, n_results=n_results, similarity=EUCLIDEAN_SIMILARITY)
41
  # st.write("Debugging Recommendations:\n", recommendations)
42
  st.markdown(recommendations.replace("\n", "\n\n"))
course_project/application_tools.py CHANGED
@@ -6,8 +6,11 @@ import pandas as pd
6
  DATA_FOLDER_LOCATION = "course_project/data/"
7
  OLLAMA_EMBEDDING_BK_LOCATION = DATA_FOLDER_LOCATION + "ollama_embedding_bk_location.pkl"
8
 
 
 
 
9
  def get_data(overwrite_db=False):
10
- movies_data = data_persistence.get_db()
11
  if not isinstance(movies_data, pd.DataFrame) or overwrite_db:
12
  movies_data = data_import.read_or_generate_movies_data()
13
  movies_data["vector"] = \
@@ -20,7 +23,7 @@ def get_data(overwrite_db=False):
20
  movies_data["vector"] = embedding.encode_text_series_ollama_method(movies_data["vector"], file_backup_location=OLLAMA_EMBEDDING_BK_LOCATION, batch_size=500)
21
  print(movies_data["vector"])
22
 
23
- data_persistence.save_db(movies_data)
24
  return movies_data
25
 
26
  def replace_ids_with_titles(suggestions, movies_data):
@@ -29,9 +32,9 @@ def replace_ids_with_titles(suggestions, movies_data):
29
  suggestions[user] = movie_title
30
  return suggestions
31
 
32
- def get_recommendations_df(suggestions, n_results):
33
  movies_data = get_data()
34
- recommendations_raw = embedding.find_recommendations(suggestions, movies_data["vector"], n_results=n_results)
35
  recommendations_indexes = [recommendation["movie_index"] for recommendation in recommendations_raw]
36
  columns_of_interest = ["title", "director", "genres", "cast", "overview", "rating", "poster_path", "imdb_id"]
37
  recommendations = (movies_data.loc[recommendations_indexes])[columns_of_interest].copy()
@@ -50,13 +53,15 @@ def recommendations_to_text(df):
50
  return result_string
51
 
52
 
53
- def get_recommendations_text(suggestions, n_results=10):
54
- recommendations = get_recommendations_df(suggestions, n_results)
55
  return recommendations_to_text(recommendations)
56
 
57
  if __name__ == "__main__":
58
  suggestions = {"Pepe": [0, 2960], "Juan": [1945, 6174]} # Pepe: Toy Story 1 y 2. Juan: Peter Pan y Buscando a Nemo
59
- get_recommendations_text(suggestions)
 
 
60
 
61
 
62
 
 
6
  DATA_FOLDER_LOCATION = "course_project/data/"
7
  OLLAMA_EMBEDDING_BK_LOCATION = DATA_FOLDER_LOCATION + "ollama_embedding_bk_location.pkl"
8
 
9
+ COSINE_SIMILARITY = 0
10
+ EUCLIDEAN_SIMILARITY = 1
11
+
12
  def get_data(overwrite_db=False):
13
+ movies_data = data_persistence.get_transformed_db()
14
  if not isinstance(movies_data, pd.DataFrame) or overwrite_db:
15
  movies_data = data_import.read_or_generate_movies_data()
16
  movies_data["vector"] = \
 
23
  movies_data["vector"] = embedding.encode_text_series_ollama_method(movies_data["vector"], file_backup_location=OLLAMA_EMBEDDING_BK_LOCATION, batch_size=500)
24
  print(movies_data["vector"])
25
 
26
+ data_persistence.save_transformed_db(movies_data)
27
  return movies_data
28
 
29
  def replace_ids_with_titles(suggestions, movies_data):
 
32
  suggestions[user] = movie_title
33
  return suggestions
34
 
35
+ def get_recommendations_df(suggestions, n_results, similarity=COSINE_SIMILARITY):
36
  movies_data = get_data()
37
+ recommendations_raw = embedding.find_recommendations(suggestions, movies_data["vector"], n_results=n_results, similarity=similarity)
38
  recommendations_indexes = [recommendation["movie_index"] for recommendation in recommendations_raw]
39
  columns_of_interest = ["title", "director", "genres", "cast", "overview", "rating", "poster_path", "imdb_id"]
40
  recommendations = (movies_data.loc[recommendations_indexes])[columns_of_interest].copy()
 
53
  return result_string
54
 
55
 
56
def get_recommendations_text(suggestions, n_results=10, similarity=COSINE_SIMILARITY):
    """Compute recommendations for the given suggestions and render them as text.

    Thin convenience wrapper: builds the recommendations DataFrame and converts
    it to the display string used by the Streamlit app.
    """
    return recommendations_to_text(
        get_recommendations_df(suggestions, n_results, similarity=similarity))
59
 
60
  if __name__ == "__main__":
61
  suggestions = {"Pepe": [0, 2960], "Juan": [1945, 6174]} # Pepe: Toy Story 1 y 2. Juan: Peter Pan y Buscando a Nemo
62
+ result = get_recommendations_text(suggestions, similarity=EUCLIDEAN_SIMILARITY)
63
+ print("\n\n--------\n\n")
64
+ print(result)
65
 
66
 
67
 
course_project/data_persistence.py CHANGED
@@ -3,6 +3,7 @@ import os
3
 
4
  DATABASE_LOCATION = "course_project/database/"
5
  MOVIES_BK_FILE_LOCATION = DATABASE_LOCATION + "provisional_database_simulation.pkl"
 
6
 
7
  def save_pickle(df, filename):
8
  with open(filename, 'wb') as f:
@@ -21,3 +22,12 @@ def get_db():
21
  return read_pickle(MOVIES_BK_FILE_LOCATION)
22
  else:
23
  return None
 
 
 
 
 
 
 
 
 
 
3
 
4
  DATABASE_LOCATION = "course_project/database/"
5
  MOVIES_BK_FILE_LOCATION = DATABASE_LOCATION + "provisional_database_simulation.pkl"
6
+ MOVIES_BK_FILE_TRANSFORMED_LOCATION = DATABASE_LOCATION + "provisional_database_transformed.pkl"
7
 
8
  def save_pickle(df, filename):
9
  with open(filename, 'wb') as f:
 
22
  return read_pickle(MOVIES_BK_FILE_LOCATION)
23
  else:
24
  return None
25
+
26
def save_transformed_db(series_to_store):
    """Persist the transformed movie data to its dedicated pickle file."""
    save_pickle(series_to_store, MOVIES_BK_FILE_TRANSFORMED_LOCATION)
28
+
29
def get_transformed_db():
    """Load the transformed movie database pickle, or None if it has never been saved."""
    if not os.path.exists(MOVIES_BK_FILE_TRANSFORMED_LOCATION):
        return None
    return read_pickle(MOVIES_BK_FILE_TRANSFORMED_LOCATION)
course_project/embedding.py CHANGED
@@ -9,10 +9,13 @@ import ollama
9
  from course_project import data_persistence
10
 
11
  DATA_FOLDER_LOCATION = "data/"
12
- QUESTIONS_FILE_LOCATION = DATA_FOLDER_LOCATION + "train.csv"
13
- QUESTIONS_WITH_EMBEDDINGS_LOCATION = DATA_FOLDER_LOCATION + "train_with_embeddings.pkl"
14
  WORD2VEC_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove.6B.100d.txt"
15
  WORD2VEC_MODEL_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove_model"
 
 
 
 
 
16
 
17
  # ollama pull gemma:7b
18
 
@@ -93,34 +96,65 @@ def encode_text_series_ollama_method(original_series, file_backup_location=None,
93
 
94
  def find_most_similar_coincidences_indexes(n_results, objective_vector, movies_embeddings):
95
  cosine_sim_scores = cosine_similarity([objective_vector.tolist()], movies_embeddings.tolist())[0].tolist()
96
- return [cosine_sim_scores.index(winner_score) for winner_score in sorted(cosine_sim_scores, reverse=True)[:n_results]]
97
-
98
- def get_winning_choice_per_person(movie_tested, suggestions):
99
- cosine_sim_scores = cosine_similarity([movie_tested.tolist()], suggestions.tolist())[0].tolist()
100
- cosine_sim_scores = [score if score<=0.99999 else 0 for score in cosine_sim_scores]
101
- winning_score = sorted(cosine_sim_scores, reverse=True)[0]
102
- winning_suggestion_index = cosine_sim_scores.index(winning_score)
103
- return winning_suggestion_index, winning_score
104
-
105
- def find_recommendations(suggestions_dict, movies_embeddings, n_results=10, rating_weight=0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  scores = []
107
  suggestions_considered = []
108
  for movie_tested in movies_embeddings:
109
  movie_tested_score = 0
110
  movie_tested_suggestions_considered = {}
111
  for person in suggestions_dict:
112
- winning_suggestion_index, winning_score = get_winning_choice_per_person(movie_tested, movies_embeddings.loc[suggestions_dict[person]])
113
  movie_tested_score += winning_score
114
  movie_tested_suggestions_considered[person] = suggestions_dict[person][winning_suggestion_index]
115
  movie_tested_score /= len(suggestions_dict)
116
  movie_tested_score += rating_weight
117
  scores.append(movie_tested_score)
118
  suggestions_considered.append(movie_tested_suggestions_considered)
119
-
120
- recommended_movies_indexes = [scores.index(winner_score) for winner_score in sorted(scores, reverse=True)[:n_results]]
121
- recommendations = []
122
- for recommended_movie_index in recommended_movies_indexes:
123
- recommendation = {"movie_index": recommended_movie_index, "suggestions_considered": suggestions_considered[recommended_movie_index]}
124
- recommendations.append(recommendation)
125
  return recommendations
126
-
 
9
  from course_project import data_persistence
10
 
11
  DATA_FOLDER_LOCATION = "data/"
 
 
12
  WORD2VEC_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove.6B.100d.txt"
13
  WORD2VEC_MODEL_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove_model"
14
+ MOVIES_BK_FILE_LOCATION = "course_project/database/provisional_database_simulation.pkl"
15
+ MOVIES_BK_FILE_TRANSFORMED_LOCATION = "course_project/database/provisional_database_transformed.pkl"
16
+
17
+ COSINE_SIMILARITY = 0
18
+ EUCLIDEAN_SIMILARITY = 1
19
 
20
  # ollama pull gemma:7b
21
 
 
96
 
97
def find_most_similar_coincidences_indexes(n_results, objective_vector, movies_embeddings):
    """Return the indexes of the n_results embeddings most similar to objective_vector.

    Similarity is cosine similarity; the returned list is ordered from most to
    least similar.
    """
    similarity_row = cosine_similarity([objective_vector.tolist()], movies_embeddings.tolist())[0].tolist()
    ranked_ascending = np.argsort(similarity_row)
    top_descending = ranked_ascending[-n_results:][::-1]
    return top_descending.tolist()
101
+
102
def get_winning_choice_per_person(movie_tested, suggestions, similarity=COSINE_SIMILARITY):
    """Pick, among one person's suggestions, the one most similar to movie_tested.

    Args:
        movie_tested: embedding vector of the candidate movie.
        suggestions: embeddings (pandas Series of vectors) of the person's suggested movies.
        similarity: COSINE_SIMILARITY or EUCLIDEAN_SIMILARITY.

    Returns:
        (winning_suggestion_index, winning_score): positional index of the best
        suggestion and its similarity score.

    Raises:
        ValueError: if similarity is not one of the two supported modes.
    """
    # Only the score computation differs per mode; winner selection is shared below.
    if similarity == COSINE_SIMILARITY:
        scores = np.array(cosine_similarity([movie_tested.tolist()], suggestions.tolist())[0].tolist())
    elif similarity == EUCLIDEAN_SIMILARITY:
        # Map Euclidean distance to a (0, 1] similarity; 0.005 controls decay rate.
        distances = np.linalg.norm(np.array(suggestions.tolist()) - movie_tested, axis=1)
        scores = np.exp(-0.005 * distances)
    else:
        raise ValueError('Wrong similarity value')
    # A score of ~1.0 means the candidate is (almost) identical to a suggestion —
    # presumably to avoid recommending a movie that was itself suggested; zero it out.
    scores[scores >= 0.99999] = 0
    winning_suggestion_index = np.argmax(scores)
    winning_score = scores[winning_suggestion_index]
    return winning_suggestion_index, winning_score
118
+
119
def normalize_to_minus_one_and_one(arr):
    """Linearly rescale arr so its minimum maps to -1 and its maximum to +1.

    A constant array (min == max) cannot be rescaled without dividing by
    zero, so it is mapped to an all-zeros array of the same shape instead.
    """
    lo = np.min(arr)
    hi = np.max(arr)
    if lo == hi:
        # All values identical: avoid division by zero.
        return np.zeros_like(arr)
    return 2 * (arr - lo) / (hi - lo) - 1
127
+
128
def linearly_transform_embeddings():
    """Normalize every stored movie embedding to [-1, 1] and persist the result.

    Reads the raw movie database pickle, rescales each embedding vector
    independently with normalize_to_minus_one_and_one, and saves the
    transformed DataFrame to MOVIES_BK_FILE_TRANSFORMED_LOCATION.
    """
    movie_data = data_persistence.read_pickle(MOVIES_BK_FILE_LOCATION)
    # BUG FIX: the previous loop rebound its loop variable
    # (`embedding = normalize_to_minus_one_and_one(...)`), discarding every
    # normalized vector, so the "transformed" pickle contained raw embeddings.
    normalized_vectors = [
        normalize_to_minus_one_and_one(np.array(vector))
        for vector in movie_data["vector"]
    ]
    # Keep the original index so the column assignment aligns row-for-row.
    movie_data["vector"] = pd.Series(normalized_vectors, index=movie_data.index)
    data_persistence.save_pickle(movie_data, MOVIES_BK_FILE_TRANSFORMED_LOCATION)
139
+
140
def find_recommendations(suggestions_dict, movies_embeddings, n_results=10, rating_weight=0, similarity=COSINE_SIMILARITY):
    """Score every movie against all users' suggestions and return the top n_results.

    For each candidate movie, every person contributes the similarity score of
    their best-matching suggestion; the candidate's score is the average of
    those contributions plus rating_weight. Returns a list of dicts with the
    candidate's positional index and, per person, the suggestion that matched.
    """
    all_scores = []
    all_matches = []
    for candidate_vector in movies_embeddings:
        score_sum = 0
        matches = {}
        for person, suggested_ids in suggestions_dict.items():
            best_index, best_score = get_winning_choice_per_person(
                candidate_vector, movies_embeddings.loc[suggested_ids], similarity=similarity)
            score_sum += best_score
            matches[person] = suggested_ids[best_index]
        # Average over people, then apply the (currently constant) rating bias.
        all_scores.append(score_sum / len(suggestions_dict) + rating_weight)
        all_matches.append(matches)

    top_indexes = np.argsort(all_scores)[-n_results:][::-1]
    return [
        {"movie_index": idx, "suggestions_considered": all_matches[idx]}
        for idx in top_indexes
    ]