Update app.py
Browse files
app.py
CHANGED
|
@@ -253,6 +253,15 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 253 |
if file_path not in best_scores or score > best_scores[file_path]:
|
| 254 |
best_scores[file_path] = score
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
if not best_scores:
|
| 257 |
return []
|
| 258 |
|
|
@@ -279,10 +288,14 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 279 |
max_docs = 5
|
| 280 |
|
| 281 |
# Create a lookup dictionary for efficient metadata retrieval
|
|
|
|
| 282 |
metadata_lookup = {}
|
| 283 |
for match in res["matches"]:
|
| 284 |
file_path_key = match["metadata"].get("file_path", "")
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
| 286 |
metadata_lookup[file_path_key] = match
|
| 287 |
|
| 288 |
results = []
|
|
@@ -334,6 +347,12 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 334 |
results.append(result)
|
| 335 |
doc_count += 1
|
| 336 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
return results
|
| 338 |
except Exception as e:
|
| 339 |
logger.error(f"Search error: {e}")
|
|
|
|
| 253 |
if file_path not in best_scores or score > best_scores[file_path]:
|
| 254 |
best_scores[file_path] = score
|
| 255 |
|
| 256 |
+
# Debug logging for duplicate investigation
|
| 257 |
+
if query.lower() == "quality industrial zone":
|
| 258 |
+
logger.info(f"Debug - Query: {query}")
|
| 259 |
+
logger.info(f"Debug - Total matches from Pinecone: {len(res['matches'])}")
|
| 260 |
+
logger.info(f"Debug - Unique documents after deduplication: {len(best_scores)}")
|
| 261 |
+
logger.info(f"Debug - Document scores: {list(best_scores.items())[:5]}")
|
| 262 |
+
for file_path, score in list(best_scores.items())[:3]:
|
| 263 |
+
logger.info(f"Debug - Document: {file_path}, Score: {score}")
|
| 264 |
+
|
| 265 |
if not best_scores:
|
| 266 |
return []
|
| 267 |
|
|
|
|
| 288 |
max_docs = 5
|
| 289 |
|
| 290 |
# Create a lookup dictionary for efficient metadata retrieval
|
| 291 |
+
# Store the match with the highest score for each file_path
|
| 292 |
metadata_lookup = {}
|
| 293 |
for match in res["matches"]:
|
| 294 |
file_path_key = match["metadata"].get("file_path", "")
|
| 295 |
+
score = match["score"]
|
| 296 |
+
|
| 297 |
+
# Only store if this is the first match for this file_path or if it has a higher score
|
| 298 |
+
if file_path_key not in metadata_lookup or score > metadata_lookup[file_path_key]["score"]:
|
| 299 |
metadata_lookup[file_path_key] = match
|
| 300 |
|
| 301 |
results = []
|
|
|
|
| 347 |
results.append(result)
|
| 348 |
doc_count += 1
|
| 349 |
|
| 350 |
+
# Debug logging for final results
|
| 351 |
+
if query.lower() == "quality industrial zone":
|
| 352 |
+
logger.info(f"Debug - Final results count: {len(results)}")
|
| 353 |
+
for i, result in enumerate(results):
|
| 354 |
+
logger.info(f"Debug - Result {i+1}: {result.get('title', 'No title')} - {result.get('file_path', 'No path')}")
|
| 355 |
+
|
| 356 |
return results
|
| 357 |
except Exception as e:
|
| 358 |
logger.error(f"Search error: {e}")
|