danulr05 committed on
Commit
0442da0
·
verified ·
1 Parent(s): 463ec7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -1
app.py CHANGED
@@ -253,6 +253,15 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
253
  if file_path not in best_scores or score > best_scores[file_path]:
254
  best_scores[file_path] = score
255
 
 
 
 
 
 
 
 
 
 
256
  if not best_scores:
257
  return []
258
 
@@ -279,10 +288,14 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
279
  max_docs = 5
280
 
281
  # Create a lookup dictionary for efficient metadata retrieval
 
282
  metadata_lookup = {}
283
  for match in res["matches"]:
284
  file_path_key = match["metadata"].get("file_path", "")
285
- if file_path_key not in metadata_lookup:
 
 
 
286
  metadata_lookup[file_path_key] = match
287
 
288
  results = []
@@ -334,6 +347,12 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
334
  results.append(result)
335
  doc_count += 1
336
 
 
 
 
 
 
 
337
  return results
338
  except Exception as e:
339
  logger.error(f"Search error: {e}")
 
253
  if file_path not in best_scores or score > best_scores[file_path]:
254
  best_scores[file_path] = score
255
 
256
+ # Debug logging for duplicate investigation
257
+ if query.lower() == "quality industrial zone":
258
+ logger.info(f"Debug - Query: {query}")
259
+ logger.info(f"Debug - Total matches from Pinecone: {len(res['matches'])}")
260
+ logger.info(f"Debug - Unique documents after deduplication: {len(best_scores)}")
261
+ logger.info(f"Debug - Document scores: {list(best_scores.items())[:5]}")
262
+ for file_path, score in list(best_scores.items())[:3]:
263
+ logger.info(f"Debug - Document: {file_path}, Score: {score}")
264
+
265
  if not best_scores:
266
  return []
267
 
 
288
  max_docs = 5
289
 
290
  # Create a lookup dictionary for efficient metadata retrieval
291
+ # Store the match with the highest score for each file_path
292
  metadata_lookup = {}
293
  for match in res["matches"]:
294
  file_path_key = match["metadata"].get("file_path", "")
295
+ score = match["score"]
296
+
297
+ # Only store if this is the first match for this file_path or if it has a higher score
298
+ if file_path_key not in metadata_lookup or score > metadata_lookup[file_path_key]["score"]:
299
  metadata_lookup[file_path_key] = match
300
 
301
  results = []
 
347
  results.append(result)
348
  doc_count += 1
349
 
350
+ # Debug logging for final results
351
+ if query.lower() == "quality industrial zone":
352
+ logger.info(f"Debug - Final results count: {len(results)}")
353
+ for i, result in enumerate(results):
354
+ logger.info(f"Debug - Result {i+1}: {result.get('title', 'No title')} - {result.get('file_path', 'No path')}")
355
+
356
  return results
357
  except Exception as e:
358
  logger.error(f"Search error: {e}")