Update app.py
Browse files
app.py
CHANGED
|
@@ -306,18 +306,8 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 306 |
|
| 307 |
# Only include documents that have meaningful content in the requested language
|
| 308 |
# Skip documents where title and summary are empty or "Unknown"/"No summary available"
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
has_valid_summary = summary and summary.strip() and summary not in ["No summary available", ""]
|
| 312 |
-
|
| 313 |
-
# For English, require both title and summary to be valid
|
| 314 |
-
# For other languages, only require title to be valid (summary can be empty)
|
| 315 |
-
if language == 'en':
|
| 316 |
-
is_valid = has_valid_title and has_valid_summary
|
| 317 |
-
else:
|
| 318 |
-
is_valid = has_valid_title
|
| 319 |
-
|
| 320 |
-
if is_valid:
|
| 321 |
|
| 322 |
result = {
|
| 323 |
"title": title,
|
|
@@ -359,33 +349,40 @@ def get_all_proposals(category_filter=None, language='en'):
|
|
| 359 |
if category_filter and category_filter != "All categories":
|
| 360 |
filter_dict["category"] = category_filter
|
| 361 |
|
|
|
|
| 362 |
# Query with a dummy vector to get all documents
|
| 363 |
# Use language-specific vector dimensions
|
| 364 |
if language == 'en':
|
| 365 |
dummy_vector = [0.1] * 384 # 384 is the dimension of all-MiniLM-L6-v2
|
| 366 |
else: # si, ta, or any other language
|
| 367 |
dummy_vector = [0.1] * 768 # 768 is the dimension of EmbeddingGemma-300m
|
| 368 |
-
res = pc_index.query(
|
| 369 |
-
vector=dummy_vector,
|
| 370 |
-
top_k=100, # Get all proposals
|
| 371 |
-
include_metadata=True,
|
| 372 |
-
filter=filter_dict
|
| 373 |
-
)
|
| 374 |
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
results = []
|
| 378 |
-
seen_files = set() # Track unique files to avoid duplicates
|
| 379 |
|
| 380 |
-
for match in
|
| 381 |
metadata = match["metadata"]
|
| 382 |
-
file_path = metadata.get("file_path", "")
|
| 383 |
-
|
| 384 |
-
# Skip if we've already included this file (avoid duplicates from chunks)
|
| 385 |
-
if file_path in seen_files:
|
| 386 |
-
continue
|
| 387 |
-
|
| 388 |
-
seen_files.add(file_path)
|
| 389 |
|
| 390 |
# Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
|
| 391 |
proposal_data = DYNAMIC_METADATA.get(file_path, {
|
|
@@ -404,18 +401,8 @@ def get_all_proposals(category_filter=None, language='en'):
|
|
| 404 |
|
| 405 |
# Only include documents that have meaningful content in the requested language
|
| 406 |
# Skip documents where title and summary are empty or "Unknown"/"No summary available"
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
has_valid_summary = summary and summary.strip() and summary not in ["No summary available", ""]
|
| 410 |
-
|
| 411 |
-
# For English, require both title and summary to be valid
|
| 412 |
-
# For other languages, only require title to be valid (summary can be empty)
|
| 413 |
-
if language == 'en':
|
| 414 |
-
is_valid = has_valid_title and has_valid_summary
|
| 415 |
-
else:
|
| 416 |
-
is_valid = has_valid_title
|
| 417 |
-
|
| 418 |
-
if is_valid:
|
| 419 |
|
| 420 |
result = {
|
| 421 |
"title": title,
|
|
@@ -432,6 +419,7 @@ def get_all_proposals(category_filter=None, language='en'):
|
|
| 432 |
|
| 433 |
results.append(result)
|
| 434 |
|
|
|
|
| 435 |
return results
|
| 436 |
|
| 437 |
except Exception as e:
|
|
|
|
| 306 |
|
| 307 |
# Only include documents that have meaningful content in the requested language
|
| 308 |
# Skip documents whose title or summary is empty or a placeholder ("Unknown"/"Unknown Title"/"No summary available")
|
| 309 |
+
if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
|
| 310 |
+
summary and summary.strip() and summary not in ["No summary available", ""]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
result = {
|
| 313 |
"title": title,
|
|
|
|
| 349 |
if category_filter and category_filter != "All categories":
|
| 350 |
filter_dict["category"] = category_filter
|
| 351 |
|
| 352 |
+
# Use multiple dummy vectors to ensure we get all documents
|
| 353 |
# Build a base dummy vector; only its length (the embedding dimension) is used by the queries below
|
| 354 |
# Use language-specific vector dimensions
|
| 355 |
if language == 'en':
|
| 356 |
dummy_vector = [0.1] * 384 # 384 is the dimension of all-MiniLM-L6-v2
|
| 357 |
else: # si, ta, or any other language
|
| 358 |
dummy_vector = [0.1] * 768 # 768 is the dimension of EmbeddingGemma-300m
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
+
# Try multiple queries with different dummy vectors to get all documents
|
| 361 |
+
all_matches = []
|
| 362 |
+
for i in range(5): # Try 5 different dummy vectors
|
| 363 |
+
# Create slightly different dummy vectors
|
| 364 |
+
dummy_vector_variant = [0.1 + (i * 0.01)] * len(dummy_vector)
|
| 365 |
+
res = pc_index.query(
|
| 366 |
+
vector=dummy_vector_variant,
|
| 367 |
+
top_k=100, # Get all proposals
|
| 368 |
+
include_metadata=True,
|
| 369 |
+
filter=filter_dict
|
| 370 |
+
)
|
| 371 |
+
all_matches.extend(res["matches"])
|
| 372 |
+
|
| 373 |
+
# Remove duplicates based on file_path
|
| 374 |
+
unique_matches = {}
|
| 375 |
+
for match in all_matches:
|
| 376 |
+
file_path = match["metadata"].get("file_path", "")
|
| 377 |
+
if file_path and file_path not in unique_matches:
|
| 378 |
+
unique_matches[file_path] = match
|
| 379 |
+
|
| 380 |
+
logger.info(f"Found {len(unique_matches)} unique documents")
|
| 381 |
|
| 382 |
results = []
|
|
|
|
| 383 |
|
| 384 |
+
for file_path, match in unique_matches.items():
|
| 385 |
metadata = match["metadata"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
|
| 387 |
# Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
|
| 388 |
proposal_data = DYNAMIC_METADATA.get(file_path, {
|
|
|
|
| 401 |
|
| 402 |
# Only include documents that have meaningful content in the requested language
|
| 403 |
# Skip documents whose title or summary is empty or a placeholder ("Unknown"/"Unknown Title"/"No summary available")
|
| 404 |
+
if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
|
| 405 |
+
summary and summary.strip() and summary not in ["No summary available", ""]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
result = {
|
| 408 |
"title": title,
|
|
|
|
| 419 |
|
| 420 |
results.append(result)
|
| 421 |
|
| 422 |
+
logger.info(f"Returning {len(results)} proposals for language {language}")
|
| 423 |
return results
|
| 424 |
|
| 425 |
except Exception as e:
|