Spaces:
Sleeping
Sleeping
Javier Real
committed on
Commit
·
73969b9
1
Parent(s):
aae5cfe
Progress
Browse files- .DS_Store +0 -0
- app.py +4 -1
- course_project/application_tools.py +12 -7
- course_project/data_persistence.py +10 -0
- course_project/embedding.py +54 -20
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
app.py
CHANGED
|
@@ -3,6 +3,9 @@ from course_project.application_tools import get_recommendations_text, get_data
|
|
| 3 |
|
| 4 |
# Load app with: streamlit run course_project/app.py
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
# Cargar los datos de películas para mostrar en la interfaz
|
| 7 |
movies_data = get_data()
|
| 8 |
movies_titles = movies_data['title'].tolist()
|
|
@@ -34,6 +37,6 @@ n_results = st.number_input('Number of recommendations', min_value=1, max_value=
|
|
| 34 |
|
| 35 |
if st.button('Get Recommendations'):
|
| 36 |
# st.write("Debugging Suggestions:", suggestions)
|
| 37 |
-
recommendations = get_recommendations_text(suggestions, n_results=n_results)
|
| 38 |
# st.write("Debugging Recommendations:\n", recommendations)
|
| 39 |
st.markdown(recommendations.replace("\n", "\n\n"))
|
|
|
|
| 3 |
|
| 4 |
# Load app with: streamlit run course_project/app.py
|
| 5 |
|
| 6 |
+
COSINE_SIMILARITY = 0
|
| 7 |
+
EUCLIDEAN_SIMILARITY = 1
|
| 8 |
+
|
| 9 |
# Cargar los datos de películas para mostrar en la interfaz
|
| 10 |
movies_data = get_data()
|
| 11 |
movies_titles = movies_data['title'].tolist()
|
|
|
|
| 37 |
|
| 38 |
if st.button('Get Recommendations'):
|
| 39 |
# st.write("Debugging Suggestions:", suggestions)
|
| 40 |
+
recommendations = get_recommendations_text(suggestions, n_results=n_results, similarity=EUCLIDEAN_SIMILARITY)
|
| 41 |
# st.write("Debugging Recommendations:\n", recommendations)
|
| 42 |
st.markdown(recommendations.replace("\n", "\n\n"))
|
course_project/application_tools.py
CHANGED
|
@@ -6,8 +6,11 @@ import pandas as pd
|
|
| 6 |
DATA_FOLDER_LOCATION = "course_project/data/"
|
| 7 |
OLLAMA_EMBEDDING_BK_LOCATION = DATA_FOLDER_LOCATION + "ollama_embedding_bk_location.pkl"
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
def get_data(overwrite_db=False):
|
| 10 |
-
movies_data = data_persistence.
|
| 11 |
if not isinstance(movies_data, pd.DataFrame) or overwrite_db:
|
| 12 |
movies_data = data_import.read_or_generate_movies_data()
|
| 13 |
movies_data["vector"] = \
|
|
@@ -20,7 +23,7 @@ def get_data(overwrite_db=False):
|
|
| 20 |
movies_data["vector"] = embedding.encode_text_series_ollama_method(movies_data["vector"], file_backup_location=OLLAMA_EMBEDDING_BK_LOCATION, batch_size=500)
|
| 21 |
print(movies_data["vector"])
|
| 22 |
|
| 23 |
-
data_persistence.
|
| 24 |
return movies_data
|
| 25 |
|
| 26 |
def replace_ids_with_titles(suggestions, movies_data):
|
|
@@ -29,9 +32,9 @@ def replace_ids_with_titles(suggestions, movies_data):
|
|
| 29 |
suggestions[user] = movie_title
|
| 30 |
return suggestions
|
| 31 |
|
| 32 |
-
def get_recommendations_df(suggestions, n_results):
|
| 33 |
movies_data = get_data()
|
| 34 |
-
recommendations_raw = embedding.find_recommendations(suggestions, movies_data["vector"], n_results=n_results)
|
| 35 |
recommendations_indexes = [recommendation["movie_index"] for recommendation in recommendations_raw]
|
| 36 |
columns_of_interest = ["title", "director", "genres", "cast", "overview", "rating", "poster_path", "imdb_id"]
|
| 37 |
recommendations = (movies_data.loc[recommendations_indexes])[columns_of_interest].copy()
|
|
@@ -50,13 +53,15 @@ def recommendations_to_text(df):
|
|
| 50 |
return result_string
|
| 51 |
|
| 52 |
|
| 53 |
-
def get_recommendations_text(suggestions, n_results=10):
|
| 54 |
-
recommendations = get_recommendations_df(suggestions, n_results)
|
| 55 |
return recommendations_to_text(recommendations)
|
| 56 |
|
| 57 |
if __name__ == "__main__":
|
| 58 |
suggestions = {"Pepe": [0, 2960], "Juan": [1945, 6174]} # Pepe: Toy Story 1 y 2. Juan: Peter Pan y Buscando a Nemo
|
| 59 |
-
get_recommendations_text(suggestions)
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
|
|
|
|
| 6 |
DATA_FOLDER_LOCATION = "course_project/data/"
|
| 7 |
OLLAMA_EMBEDDING_BK_LOCATION = DATA_FOLDER_LOCATION + "ollama_embedding_bk_location.pkl"
|
| 8 |
|
| 9 |
+
COSINE_SIMILARITY = 0
|
| 10 |
+
EUCLIDEAN_SIMILARITY = 1
|
| 11 |
+
|
| 12 |
def get_data(overwrite_db=False):
|
| 13 |
+
movies_data = data_persistence.get_transformed_db()
|
| 14 |
if not isinstance(movies_data, pd.DataFrame) or overwrite_db:
|
| 15 |
movies_data = data_import.read_or_generate_movies_data()
|
| 16 |
movies_data["vector"] = \
|
|
|
|
| 23 |
movies_data["vector"] = embedding.encode_text_series_ollama_method(movies_data["vector"], file_backup_location=OLLAMA_EMBEDDING_BK_LOCATION, batch_size=500)
|
| 24 |
print(movies_data["vector"])
|
| 25 |
|
| 26 |
+
data_persistence.save_transformed_db(movies_data)
|
| 27 |
return movies_data
|
| 28 |
|
| 29 |
def replace_ids_with_titles(suggestions, movies_data):
|
|
|
|
| 32 |
suggestions[user] = movie_title
|
| 33 |
return suggestions
|
| 34 |
|
| 35 |
+
def get_recommendations_df(suggestions, n_results, similarity=COSINE_SIMILARITY):
|
| 36 |
movies_data = get_data()
|
| 37 |
+
recommendations_raw = embedding.find_recommendations(suggestions, movies_data["vector"], n_results=n_results, similarity=similarity)
|
| 38 |
recommendations_indexes = [recommendation["movie_index"] for recommendation in recommendations_raw]
|
| 39 |
columns_of_interest = ["title", "director", "genres", "cast", "overview", "rating", "poster_path", "imdb_id"]
|
| 40 |
recommendations = (movies_data.loc[recommendations_indexes])[columns_of_interest].copy()
|
|
|
|
| 53 |
return result_string
|
| 54 |
|
| 55 |
|
| 56 |
+
def get_recommendations_text(suggestions, n_results=10, similarity=COSINE_SIMILARITY):
    """Build recommendations for the given suggestions and render them as text.

    Thin wrapper: delegates the scoring to get_recommendations_df and the
    formatting to recommendations_to_text.
    """
    recommendations_df = get_recommendations_df(suggestions, n_results, similarity=similarity)
    return recommendations_to_text(recommendations_df)
|
| 59 |
|
| 60 |
if __name__ == "__main__":
|
| 61 |
suggestions = {"Pepe": [0, 2960], "Juan": [1945, 6174]} # Pepe: Toy Story 1 y 2. Juan: Peter Pan y Buscando a Nemo
|
| 62 |
+
result = get_recommendations_text(suggestions, similarity=EUCLIDEAN_SIMILARITY)
|
| 63 |
+
print("\n\n--------\n\n")
|
| 64 |
+
print(result)
|
| 65 |
|
| 66 |
|
| 67 |
|
course_project/data_persistence.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
| 3 |
|
| 4 |
DATABASE_LOCATION = "course_project/database/"
|
| 5 |
MOVIES_BK_FILE_LOCATION = DATABASE_LOCATION + "provisional_database_simulation.pkl"
|
|
|
|
| 6 |
|
| 7 |
def save_pickle(df, filename):
|
| 8 |
with open(filename, 'wb') as f:
|
|
@@ -21,3 +22,12 @@ def get_db():
|
|
| 21 |
return read_pickle(MOVIES_BK_FILE_LOCATION)
|
| 22 |
else:
|
| 23 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
DATABASE_LOCATION = "course_project/database/"
|
| 5 |
MOVIES_BK_FILE_LOCATION = DATABASE_LOCATION + "provisional_database_simulation.pkl"
|
| 6 |
+
MOVIES_BK_FILE_TRANSFORMED_LOCATION = DATABASE_LOCATION + "provisional_database_transformed.pkl"
|
| 7 |
|
| 8 |
def save_pickle(df, filename):
|
| 9 |
with open(filename, 'wb') as f:
|
|
|
|
| 22 |
return read_pickle(MOVIES_BK_FILE_LOCATION)
|
| 23 |
else:
|
| 24 |
return None
|
| 25 |
+
|
| 26 |
+
def save_transformed_db(series_to_store):
    """Persist the transformed movies data to its dedicated pickle backup."""
    target_path = MOVIES_BK_FILE_TRANSFORMED_LOCATION
    save_pickle(series_to_store, target_path)
|
| 28 |
+
|
| 29 |
+
def get_transformed_db():
    """Load the transformed movies data from disk, or return None when no backup exists."""
    # Guard clause: nothing stored yet.
    if not os.path.exists(MOVIES_BK_FILE_TRANSFORMED_LOCATION):
        return None
    return read_pickle(MOVIES_BK_FILE_TRANSFORMED_LOCATION)
|
course_project/embedding.py
CHANGED
|
@@ -9,10 +9,13 @@ import ollama
|
|
| 9 |
from course_project import data_persistence
|
| 10 |
|
| 11 |
DATA_FOLDER_LOCATION = "data/"
|
| 12 |
-
QUESTIONS_FILE_LOCATION = DATA_FOLDER_LOCATION + "train.csv"
|
| 13 |
-
QUESTIONS_WITH_EMBEDDINGS_LOCATION = DATA_FOLDER_LOCATION + "train_with_embeddings.pkl"
|
| 14 |
WORD2VEC_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove.6B.100d.txt"
|
| 15 |
WORD2VEC_MODEL_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove_model"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# ollama pull gemma:7b
|
| 18 |
|
|
@@ -93,34 +96,65 @@ def encode_text_series_ollama_method(original_series, file_backup_location=None,
|
|
| 93 |
|
| 94 |
def find_most_similar_coincidences_indexes(n_results, objective_vector, movies_embeddings):
|
| 95 |
cosine_sim_scores = cosine_similarity([objective_vector.tolist()], movies_embeddings.tolist())[0].tolist()
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
scores = []
|
| 107 |
suggestions_considered = []
|
| 108 |
for movie_tested in movies_embeddings:
|
| 109 |
movie_tested_score = 0
|
| 110 |
movie_tested_suggestions_considered = {}
|
| 111 |
for person in suggestions_dict:
|
| 112 |
-
winning_suggestion_index, winning_score = get_winning_choice_per_person(movie_tested, movies_embeddings.loc[suggestions_dict[person]])
|
| 113 |
movie_tested_score += winning_score
|
| 114 |
movie_tested_suggestions_considered[person] = suggestions_dict[person][winning_suggestion_index]
|
| 115 |
movie_tested_score /= len(suggestions_dict)
|
| 116 |
movie_tested_score += rating_weight
|
| 117 |
scores.append(movie_tested_score)
|
| 118 |
suggestions_considered.append(movie_tested_suggestions_considered)
|
| 119 |
-
|
| 120 |
-
recommended_movies_indexes =
|
| 121 |
-
recommendations = [
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
return recommendations
|
| 126 |
-
|
|
|
|
| 9 |
from course_project import data_persistence
|
| 10 |
|
| 11 |
DATA_FOLDER_LOCATION = "data/"
|
|
|
|
|
|
|
| 12 |
WORD2VEC_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove.6B.100d.txt"
|
| 13 |
WORD2VEC_MODEL_FILE_LOCATION = DATA_FOLDER_LOCATION + "glove_model"
|
| 14 |
+
MOVIES_BK_FILE_LOCATION = "course_project/database/provisional_database_simulation.pkl"
|
| 15 |
+
MOVIES_BK_FILE_TRANSFORMED_LOCATION = "course_project/database/provisional_database_transformed.pkl"
|
| 16 |
+
|
| 17 |
+
COSINE_SIMILARITY = 0
|
| 18 |
+
EUCLIDEAN_SIMILARITY = 1
|
| 19 |
|
| 20 |
# ollama pull gemma:7b
|
| 21 |
|
|
|
|
| 96 |
|
| 97 |
def find_most_similar_coincidences_indexes(n_results, objective_vector, movies_embeddings):
    """Return the indexes of the n_results movies most cosine-similar to objective_vector, best first."""
    similarity_row = cosine_similarity([objective_vector.tolist()], movies_embeddings.tolist())[0].tolist()
    # argsort is ascending, so take the last n_results and reverse for best-first order.
    top_indexes = np.argsort(similarity_row)[-n_results:][::-1]
    return top_indexes.tolist()
|
| 101 |
+
|
| 102 |
+
def get_winning_choice_per_person(movie_tested, suggestions, similarity=COSINE_SIMILARITY):
    """Pick the suggestion that best matches movie_tested for one person.

    Scores every suggested movie against movie_tested with the chosen metric
    (cosine similarity, or euclidean distance mapped to (0, 1] via exp(-0.005*d)),
    then returns (index_of_best_suggestion, its_score).

    Raises:
        ValueError: if similarity is neither COSINE_SIMILARITY nor EUCLIDEAN_SIMILARITY.
    """
    # Compute the per-suggestion score vector; only this part differs by metric.
    # (The original duplicated the whole winner-selection tail in both branches.)
    if similarity == COSINE_SIMILARITY:
        scores = np.array(cosine_similarity([movie_tested.tolist()], suggestions.tolist())[0].tolist())
    elif similarity == EUCLIDEAN_SIMILARITY:
        distances = np.linalg.norm(np.array(suggestions.tolist()) - movie_tested, axis=1)
        scores = np.exp(-0.005 * distances)
    else:
        raise ValueError('Wrong similarity value')
    # A score of ~1.0 means movie_tested IS one of the suggestions; zero it so an
    # already-suggested title cannot win against itself.
    scores[scores >= 0.99999] = 0
    winning_suggestion_index = np.argmax(scores)
    winning_score = scores[winning_suggestion_index]
    return winning_suggestion_index, winning_score
|
| 118 |
+
|
| 119 |
+
def normalize_to_minus_one_and_one(arr):
    """Linearly rescale arr so its values span [-1, 1].

    A constant array (min == max) maps to all zeros to avoid division by zero.
    """
    lo, hi = np.min(arr), np.max(arr)
    # No spread to rescale: return zeros instead of dividing by zero.
    if lo == hi:
        return np.zeros_like(arr)
    return 2 * (arr - lo) / (hi - lo) - 1
|
| 127 |
+
|
| 128 |
+
def linearly_transform_embeddings():
    """Normalize every stored movie embedding to [-1, 1] and save the result.

    Reads the raw movies pickle from MOVIES_BK_FILE_LOCATION, rescales each
    embedding vector with normalize_to_minus_one_and_one, and writes the
    transformed dataframe to MOVIES_BK_FILE_TRANSFORMED_LOCATION.
    """
    movie_data = data_persistence.read_pickle(MOVIES_BK_FILE_LOCATION)
    # BUG FIX: the original loop rebound its loop variable
    # ("embedding = normalize_to_minus_one_and_one(...)"), which discarded every
    # normalized vector and saved the data unchanged. Build a new collection instead.
    normalized_vectors = [
        normalize_to_minus_one_and_one(np.array(embedding))
        for embedding in movie_data["vector"]
    ]
    # Keep the original index so assignment aligns row-for-row.
    movie_data["vector"] = pd.Series(normalized_vectors, index=movie_data.index)
    data_persistence.save_pickle(movie_data, MOVIES_BK_FILE_TRANSFORMED_LOCATION)
|
| 139 |
+
|
| 140 |
+
def find_recommendations(suggestions_dict, movies_embeddings, n_results=10, rating_weight=0, similarity=COSINE_SIMILARITY):
    """Score every movie against each person's suggestions and return the top n_results.

    For each candidate movie, every person contributes the score of their
    best-matching suggestion (via get_winning_choice_per_person); the per-person
    scores are averaged and offset by rating_weight. Returns a list of dicts
    with "movie_index" and the per-person "suggestions_considered", best first.
    """
    all_scores = []
    all_considered = []
    for candidate in movies_embeddings:
        candidate_score = 0
        considered_for_candidate = {}
        for person, suggested_ids in suggestions_dict.items():
            best_index, best_score = get_winning_choice_per_person(
                candidate, movies_embeddings.loc[suggested_ids], similarity=similarity
            )
            candidate_score += best_score
            considered_for_candidate[person] = suggested_ids[best_index]
        # Average over people, then apply the optional rating offset.
        candidate_score = candidate_score / len(suggestions_dict) + rating_weight
        all_scores.append(candidate_score)
        all_considered.append(considered_for_candidate)

    # argsort is ascending: take the last n_results, reversed for best-first order.
    top_indexes = np.argsort(all_scores)[-n_results:][::-1]
    return [
        {"movie_index": idx, "suggestions_considered": all_considered[idx]}
        for idx in top_indexes
    ]
|
|
|