yigitcanozdemir
committed on
Commit
·
a7dcca0
1
Parent(s):
af6b243
Refactor recommendation engine and similarity calculator: remove unnecessary print statements and update status messages for clarity.
Browse files
- components/similarity.py +2 -12
- models/recommendation_engine.py +18 -47
components/similarity.py
CHANGED
|
@@ -21,7 +21,7 @@ class SimilarityCalculator:
|
|
| 21 |
) -> Dict[str, Any]:
|
| 22 |
if filtered_data.empty:
|
| 23 |
return {
|
| 24 |
-
"status": "
|
| 25 |
"results": [],
|
| 26 |
"search_time": 0,
|
| 27 |
"total_candidates": 0,
|
|
@@ -30,7 +30,6 @@ class SimilarityCalculator:
|
|
| 30 |
start_time = time.time()
|
| 31 |
positive_themes = features.positive_themes
|
| 32 |
negative_themes = features.negative_themes
|
| 33 |
-
print(f"๐ Calculating similarity for query: {positive_themes}")
|
| 34 |
|
| 35 |
positive_query_embeddings_np = self.model.encode(
|
| 36 |
positive_themes, convert_to_numpy=True
|
|
@@ -78,18 +77,9 @@ class SimilarityCalculator:
|
|
| 78 |
else:
|
| 79 |
combined_embedding = avg_positive
|
| 80 |
|
| 81 |
-
print("Positive query embedding", avg_positive)
|
| 82 |
-
|
| 83 |
similarities = self.model.similarity(combined_embedding, document_embeddings)
|
| 84 |
similarities = similarities[0]
|
| 85 |
|
| 86 |
-
print("Magnitude of avg_positive:", torch.norm(avg_positive))
|
| 87 |
-
if negative_themes is not None and len(negative_themes) > 0:
|
| 88 |
-
print("Magnitude of avg_negative:", torch.norm(avg_negative))
|
| 89 |
-
print("Magnitude of combined_embedding:", torch.norm(combined_embedding))
|
| 90 |
-
print("Mean:", similarities.mean())
|
| 91 |
-
print("Max:", similarities.max())
|
| 92 |
-
print("Std:", similarities.std())
|
| 93 |
quality_config = QUALITY_LEVELS.get(features.quality_level, {})
|
| 94 |
rating_weight = quality_config.get("rating_weight")
|
| 95 |
hybrid_scores = self._calculate_hybrid_score(
|
|
@@ -133,7 +123,7 @@ class SimilarityCalculator:
|
|
| 133 |
search_time = end_time - start_time
|
| 134 |
|
| 135 |
return {
|
| 136 |
-
"status": "
|
| 137 |
"results": results,
|
| 138 |
"search_time": search_time,
|
| 139 |
"total_candidates": len(filtered_data),
|
|
|
|
| 21 |
) -> Dict[str, Any]:
|
| 22 |
if filtered_data.empty:
|
| 23 |
return {
|
| 24 |
+
"status": "No results found with current filters.",
|
| 25 |
"results": [],
|
| 26 |
"search_time": 0,
|
| 27 |
"total_candidates": 0,
|
|
|
|
| 30 |
start_time = time.time()
|
| 31 |
positive_themes = features.positive_themes
|
| 32 |
negative_themes = features.negative_themes
|
|
|
|
| 33 |
|
| 34 |
positive_query_embeddings_np = self.model.encode(
|
| 35 |
positive_themes, convert_to_numpy=True
|
|
|
|
| 77 |
else:
|
| 78 |
combined_embedding = avg_positive
|
| 79 |
|
|
|
|
|
|
|
| 80 |
similarities = self.model.similarity(combined_embedding, document_embeddings)
|
| 81 |
similarities = similarities[0]
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
quality_config = QUALITY_LEVELS.get(features.quality_level, {})
|
| 84 |
rating_weight = quality_config.get("rating_weight")
|
| 85 |
hybrid_scores = self._calculate_hybrid_score(
|
|
|
|
| 123 |
search_time = end_time - start_time
|
| 124 |
|
| 125 |
return {
|
| 126 |
+
"status": "Search completed successfully.",
|
| 127 |
"results": results,
|
| 128 |
"search_time": search_time,
|
| 129 |
"total_candidates": len(filtered_data),
|
models/recommendation_engine.py
CHANGED
|
@@ -22,85 +22,59 @@ class RecommendationEngine:
|
|
| 22 |
self.similarity_calc = SimilarityCalculator(self.model)
|
| 23 |
self.filter = MovieFilter()
|
| 24 |
|
| 25 |
-
print(f"โ
Recommendation engine initialized with {len(self.data)} items.")
|
| 26 |
-
|
| 27 |
def get_recommendations(self, user_query: str, top_k: int = 40):
|
| 28 |
-
print(f"
|
| 29 |
if not user_query.strip():
|
| 30 |
-
return "
|
| 31 |
|
| 32 |
try:
|
| 33 |
-
print("๐ Parsing user query...")
|
| 34 |
start_time = time.time()
|
| 35 |
features = self._parse_user_query(user_query)
|
| 36 |
-
parse_time = time.time() - start_time
|
| 37 |
-
print(f"✅ Query parsed in {parse_time:.4f} seconds")
|
| 38 |
-
|
| 39 |
-
print("๐ Applying filters...")
|
| 40 |
-
start_time = time.time()
|
| 41 |
filtered_data = self.filter.apply_filters(self.data, features)
|
| 42 |
-
|
| 43 |
-
print(f"✅ Filters applied in {filter_time:.4f} seconds")
|
| 44 |
-
print(f"๐ Filtered data contains {len(filtered_data)} items.")
|
| 45 |
-
print("๐ง Preparing query input...")
|
| 46 |
-
print(
|
| 47 |
-
f"๐ Query text for embedding: Positive ['{features.positive_themes}'], Negative [{features.negative_themes}]"
|
| 48 |
-
)
|
| 49 |
-
print("๐งฎ Starting similarity calculation...")
|
| 50 |
-
start_time = time.time()
|
| 51 |
try:
|
| 52 |
search_results = self.similarity_calc.calculate_similarity(
|
| 53 |
features, filtered_data, top_k
|
| 54 |
)
|
| 55 |
-
similarity_time = time.time() - start_time
|
| 56 |
-
print(
|
| 57 |
-
f"✅ Similarity calculation completed in {similarity_time:.4f} seconds"
|
| 58 |
-
)
|
| 59 |
-
|
| 60 |
except Exception as similarity_error:
|
| 61 |
-
print(f"
|
| 62 |
-
print(f"
|
| 63 |
|
| 64 |
-
print("
|
| 65 |
if len(filtered_data) > 1000:
|
| 66 |
smaller_data = filtered_data.sample(n=1000, random_state=42)
|
| 67 |
search_results = self.similarity_calc.calculate_similarity(
|
| 68 |
features, smaller_data, top_k
|
| 69 |
)
|
| 70 |
-
print("
|
| 71 |
else:
|
| 72 |
raise similarity_error
|
| 73 |
|
| 74 |
-
print(f"
|
| 75 |
|
| 76 |
-
print("๐ Creating results dataframe...")
|
| 77 |
-
start_time = time.time()
|
| 78 |
results_df = self._create_results_dataframe(search_results)
|
| 79 |
-
|
| 80 |
-
print(f"
|
| 81 |
-
|
| 82 |
-
print("๐ Recommendation process completed successfully!")
|
| 83 |
return features.prompt_title, results_df
|
| 84 |
|
| 85 |
except Exception as e:
|
| 86 |
-
print(f"
|
| 87 |
-
print(f"
|
| 88 |
-
print(f"
|
| 89 |
|
| 90 |
try:
|
| 91 |
import psutil
|
| 92 |
|
| 93 |
process = psutil.Process()
|
| 94 |
memory_usage = process.memory_info().rss / 1024 / 1024
|
| 95 |
-
print(f"
|
| 96 |
except:
|
| 97 |
pass
|
| 98 |
|
| 99 |
-
return f"
|
| 100 |
|
| 101 |
def _parse_user_query(self, query: str) -> Features:
|
| 102 |
try:
|
| 103 |
-
print(f"๐ค Sending query to OpenAI: '{query}'")
|
| 104 |
response = self.client.beta.chat.completions.parse(
|
| 105 |
model="gpt-4o",
|
| 106 |
messages=[
|
|
@@ -315,19 +289,16 @@ class RecommendationEngine:
|
|
| 315 |
)
|
| 316 |
|
| 317 |
response_model = response.choices[0].message.parsed
|
| 318 |
-
print(f"
|
| 319 |
-
print(f"๐ Response type: {type(response_model)}")
|
| 320 |
-
print(f"๐ Response content: {response_model.model_dump_json(indent=2)}")
|
| 321 |
return response_model
|
| 322 |
except Exception as e:
|
| 323 |
-
print(f"
|
| 324 |
-
print(f"๐ Parse error traceback: {traceback.format_exc()}")
|
| 325 |
return Features(
|
| 326 |
movie_or_series="both",
|
| 327 |
genres=[],
|
| 328 |
quality_level="any",
|
| 329 |
themes=[query],
|
| 330 |
-
date_range=[
|
| 331 |
negative_keywords=[],
|
| 332 |
production_region=[],
|
| 333 |
)
|
|
|
|
| 22 |
self.similarity_calc = SimilarityCalculator(self.model)
|
| 23 |
self.filter = MovieFilter()
|
| 24 |
|
|
|
|
|
|
|
| 25 |
def get_recommendations(self, user_query: str, top_k: int = 40):
|
| 26 |
+
print(f"Starting recommendation process for query: '{user_query}'")
|
| 27 |
if not user_query.strip():
|
| 28 |
+
return "Please enter some text.", None
|
| 29 |
|
| 30 |
try:
|
|
|
|
| 31 |
start_time = time.time()
|
| 32 |
features = self._parse_user_query(user_query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
filtered_data = self.filter.apply_filters(self.data, features)
|
| 34 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
try:
|
| 36 |
search_results = self.similarity_calc.calculate_similarity(
|
| 37 |
features, filtered_data, top_k
|
| 38 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
except Exception as similarity_error:
|
| 40 |
+
print(f"Error in similarity calculation: {str(similarity_error)}")
|
| 41 |
+
print(f"Traceback: {traceback.format_exc()}")
|
| 42 |
|
| 43 |
+
print("Attempting recovery with smaller dataset...")
|
| 44 |
if len(filtered_data) > 1000:
|
| 45 |
smaller_data = filtered_data.sample(n=1000, random_state=42)
|
| 46 |
search_results = self.similarity_calc.calculate_similarity(
|
| 47 |
features, smaller_data, top_k
|
| 48 |
)
|
| 49 |
+
print("Recovery successful with smaller dataset")
|
| 50 |
else:
|
| 51 |
raise similarity_error
|
| 52 |
|
| 53 |
+
print(f"Found {len(search_results['results'])} results.")
|
| 54 |
|
|
|
|
|
|
|
| 55 |
results_df = self._create_results_dataframe(search_results)
|
| 56 |
+
total_time = time.time() - start_time
|
| 57 |
+
print(f"Recommendation finished in {total_time:.4f} seconds")
|
|
|
|
|
|
|
| 58 |
return features.prompt_title, results_df
|
| 59 |
|
| 60 |
except Exception as e:
|
| 61 |
+
print(f"Critical error in recommendation process: {str(e)}")
|
| 62 |
+
print(f"Full traceback: {traceback.format_exc()}")
|
| 63 |
+
print(f"Exception type: {type(e).__name__}")
|
| 64 |
|
| 65 |
try:
|
| 66 |
import psutil
|
| 67 |
|
| 68 |
process = psutil.Process()
|
| 69 |
memory_usage = process.memory_info().rss / 1024 / 1024
|
| 70 |
+
print(f"Current memory usage: {memory_usage:.2f} MB")
|
| 71 |
except:
|
| 72 |
pass
|
| 73 |
|
| 74 |
+
return f"Error: {str(e)}", None
|
| 75 |
|
| 76 |
def _parse_user_query(self, query: str) -> Features:
|
| 77 |
try:
|
|
|
|
| 78 |
response = self.client.beta.chat.completions.parse(
|
| 79 |
model="gpt-4o",
|
| 80 |
messages=[
|
|
|
|
| 289 |
)
|
| 290 |
|
| 291 |
response_model = response.choices[0].message.parsed
|
| 292 |
+
print(f"Response content: {response_model.model_dump_json(indent=2)}")
|
|
|
|
|
|
|
| 293 |
return response_model
|
| 294 |
except Exception as e:
|
| 295 |
+
print(f"Parse error traceback: {traceback.format_exc()}")
|
|
|
|
| 296 |
return Features(
|
| 297 |
movie_or_series="both",
|
| 298 |
genres=[],
|
| 299 |
quality_level="any",
|
| 300 |
themes=[query],
|
| 301 |
+
date_range=[1900, 2025],
|
| 302 |
negative_keywords=[],
|
| 303 |
production_region=[],
|
| 304 |
)
|