danulr05 committed on
Commit
a5e18c5
·
verified ·
1 Parent(s): 8ad1d33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -40
app.py CHANGED
@@ -306,18 +306,8 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
306
 
307
  # Only include documents that have meaningful content in the requested language
308
  # Skip documents where title and summary are empty or "Unknown"/"No summary available"
309
- # For non-English languages, be more lenient with the filtering
310
- has_valid_title = title and title.strip() and title not in ["Unknown", "Unknown Title", ""]
311
- has_valid_summary = summary and summary.strip() and summary not in ["No summary available", ""]
312
-
313
- # For English, require both title and summary to be valid
314
- # For other languages, only require title to be valid (summary can be empty)
315
- if language == 'en':
316
- is_valid = has_valid_title and has_valid_summary
317
- else:
318
- is_valid = has_valid_title
319
-
320
- if is_valid:
321
 
322
  result = {
323
  "title": title,
@@ -359,33 +349,40 @@ def get_all_proposals(category_filter=None, language='en'):
359
  if category_filter and category_filter != "All categories":
360
  filter_dict["category"] = category_filter
361
 
 
362
  # Query with a dummy vector to get all documents
363
  # Use language-specific vector dimensions
364
  if language == 'en':
365
  dummy_vector = [0.1] * 384 # 384 is the dimension of all-MiniLM-L6-v2
366
  else: # si, ta, or any other language
367
  dummy_vector = [0.1] * 768 # 768 is the dimension of EmbeddingGemma-300m
368
- res = pc_index.query(
369
- vector=dummy_vector,
370
- top_k=100, # Get all proposals
371
- include_metadata=True,
372
- filter=filter_dict
373
- )
374
 
375
- logger.info(f"Query returned {len(res['matches'])} matches")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
  results = []
378
- seen_files = set() # Track unique files to avoid duplicates
379
 
380
- for match in res["matches"]:
381
  metadata = match["metadata"]
382
- file_path = metadata.get("file_path", "")
383
-
384
- # Skip if we've already included this file (avoid duplicates from chunks)
385
- if file_path in seen_files:
386
- continue
387
-
388
- seen_files.add(file_path)
389
 
390
  # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
391
  proposal_data = DYNAMIC_METADATA.get(file_path, {
@@ -404,18 +401,8 @@ def get_all_proposals(category_filter=None, language='en'):
404
 
405
  # Only include documents that have meaningful content in the requested language
406
  # Skip documents where title and summary are empty or "Unknown"/"No summary available"
407
- # For non-English languages, be more lenient with the filtering
408
- has_valid_title = title and title.strip() and title not in ["Unknown", "Unknown Title", ""]
409
- has_valid_summary = summary and summary.strip() and summary not in ["No summary available", ""]
410
-
411
- # For English, require both title and summary to be valid
412
- # For other languages, only require title to be valid (summary can be empty)
413
- if language == 'en':
414
- is_valid = has_valid_title and has_valid_summary
415
- else:
416
- is_valid = has_valid_title
417
-
418
- if is_valid:
419
 
420
  result = {
421
  "title": title,
@@ -432,6 +419,7 @@ def get_all_proposals(category_filter=None, language='en'):
432
 
433
  results.append(result)
434
 
 
435
  return results
436
 
437
  except Exception as e:
 
306
 
307
  # Only include documents that have meaningful content in the requested language
308
  # Skip documents where title and summary are empty or "Unknown"/"No summary available"
309
+ if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
310
+ summary and summary.strip() and summary not in ["No summary available", ""]):
 
 
 
 
 
 
 
 
 
 
311
 
312
  result = {
313
  "title": title,
 
349
  if category_filter and category_filter != "All categories":
350
  filter_dict["category"] = category_filter
351
 
352
+ # Use multiple dummy vectors to ensure we get all documents
353
  # Query with a dummy vector to get all documents
354
  # Use language-specific vector dimensions
355
  if language == 'en':
356
  dummy_vector = [0.1] * 384 # 384 is the dimension of all-MiniLM-L6-v2
357
  else: # si, ta, or any other language
358
  dummy_vector = [0.1] * 768 # 768 is the dimension of EmbeddingGemma-300m
 
 
 
 
 
 
359
 
360
+ # Try multiple queries with different dummy vectors to get all documents
361
+ all_matches = []
362
+ for i in range(5): # Try 5 different dummy vectors
363
+ # Create slightly different dummy vectors
364
+ dummy_vector_variant = [0.1 + (i * 0.01)] * len(dummy_vector)
365
+ res = pc_index.query(
366
+ vector=dummy_vector_variant,
367
+ top_k=100, # Get all proposals
368
+ include_metadata=True,
369
+ filter=filter_dict
370
+ )
371
+ all_matches.extend(res["matches"])
372
+
373
+ # Remove duplicates based on file_path
374
+ unique_matches = {}
375
+ for match in all_matches:
376
+ file_path = match["metadata"].get("file_path", "")
377
+ if file_path and file_path not in unique_matches:
378
+ unique_matches[file_path] = match
379
+
380
+ logger.info(f"Found {len(unique_matches)} unique documents")
381
 
382
  results = []
 
383
 
384
+ for file_path, match in unique_matches.items():
385
  metadata = match["metadata"]
 
 
 
 
 
 
 
386
 
387
  # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
388
  proposal_data = DYNAMIC_METADATA.get(file_path, {
 
401
 
402
  # Only include documents that have meaningful content in the requested language
403
  # Skip documents where title and summary are empty or "Unknown"/"No summary available"
404
+ if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
405
+ summary and summary.strip() and summary not in ["No summary available", ""]):
 
 
 
 
 
 
 
 
 
 
406
 
407
  result = {
408
  "title": title,
 
419
 
420
  results.append(result)
421
 
422
+ logger.info(f"Returning {len(results)} proposals for language {language}")
423
  return results
424
 
425
  except Exception as e: