Update app.py
Browse files
app.py
CHANGED
|
@@ -278,6 +278,13 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 278 |
threshold = max_score * 0.5 # Show documents within 50% of best score
|
| 279 |
max_docs = 5
|
| 280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
results = []
|
| 282 |
doc_count = 0
|
| 283 |
|
|
@@ -285,47 +292,47 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 285 |
if doc_count >= max_docs or score < threshold:
|
| 286 |
break
|
| 287 |
|
| 288 |
-
# Get the metadata for this document
|
| 289 |
-
|
|
|
|
| 290 |
metadata = match["metadata"]
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
|
| 310 |
-
summary and summary.strip() and summary not in ["No summary available", ""]):
|
| 311 |
-
|
| 312 |
-
result = {
|
| 313 |
-
"title": title,
|
| 314 |
-
"summary": summary,
|
| 315 |
-
"costLKR": costLKR,
|
| 316 |
-
"category": category,
|
| 317 |
-
"pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
|
| 318 |
-
"thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
|
| 319 |
-
"score": score,
|
| 320 |
-
"relevance_percentage": int(score * 100),
|
| 321 |
-
"file_path": file_path,
|
| 322 |
-
"id": match["id"],
|
| 323 |
-
"content": metadata.get("content", "") # Add the actual content
|
| 324 |
-
}
|
| 325 |
-
|
| 326 |
-
results.append(result)
|
| 327 |
-
doc_count += 1
|
| 328 |
-
break
|
| 329 |
|
| 330 |
return results
|
| 331 |
except Exception as e:
|
|
|
|
| 278 |
threshold = max_score * 0.5 # Show documents within 50% of best score
|
| 279 |
max_docs = 5
|
| 280 |
|
| 281 |
+
# Create a lookup dictionary for efficient metadata retrieval
|
| 282 |
+
metadata_lookup = {}
|
| 283 |
+
for match in res["matches"]:
|
| 284 |
+
file_path_key = match["metadata"].get("file_path", "")
|
| 285 |
+
if file_path_key not in metadata_lookup:
|
| 286 |
+
metadata_lookup[file_path_key] = match
|
| 287 |
+
|
| 288 |
results = []
|
| 289 |
doc_count = 0
|
| 290 |
|
|
|
|
| 292 |
if doc_count >= max_docs or score < threshold:
|
| 293 |
break
|
| 294 |
|
| 295 |
+
# Get the metadata for this document using the lookup
|
| 296 |
+
if file_path in metadata_lookup:
|
| 297 |
+
match = metadata_lookup[file_path]
|
| 298 |
metadata = match["metadata"]
|
| 299 |
+
|
| 300 |
+
# Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
|
| 301 |
+
proposal_data = DYNAMIC_METADATA.get(file_path, {
|
| 302 |
+
"title": metadata.get("title", "Unknown Title"),
|
| 303 |
+
"summary": metadata.get("summary", ""),
|
| 304 |
+
"category": metadata.get("category", "Budget Proposal"),
|
| 305 |
+
"costLKR": metadata.get("costLKR", "No Costing Available")
|
| 306 |
+
})
|
| 307 |
+
|
| 308 |
+
# Get language-specific data
|
| 309 |
+
title = get_language_specific_data(proposal_data, "title", language)
|
| 310 |
+
summary = get_language_specific_data(proposal_data, "summary", language)
|
| 311 |
+
costLKR = get_language_specific_data(proposal_data, "costLKR", language)
|
| 312 |
+
category = get_language_specific_data(proposal_data, "category", language)
|
| 313 |
+
thumb_url = metadata.get("thumbUrl", "")
|
| 314 |
+
|
| 315 |
+
# Only include documents that have meaningful content in the requested language
|
| 316 |
+
# Skip documents where title and summary are empty or "Unknown"/"No summary available"
|
| 317 |
+
if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
|
| 318 |
+
summary and summary.strip() and summary not in ["No summary available", ""]):
|
| 319 |
|
| 320 |
+
result = {
|
| 321 |
+
"title": title,
|
| 322 |
+
"summary": summary,
|
| 323 |
+
"costLKR": costLKR,
|
| 324 |
+
"category": category,
|
| 325 |
+
"pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
|
| 326 |
+
"thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
|
| 327 |
+
"score": score,
|
| 328 |
+
"relevance_percentage": int(score * 100),
|
| 329 |
+
"file_path": file_path,
|
| 330 |
+
"id": match["id"],
|
| 331 |
+
"content": metadata.get("content", "") # Add the actual content
|
| 332 |
+
}
|
| 333 |
|
| 334 |
+
results.append(result)
|
| 335 |
+
doc_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
return results
|
| 338 |
except Exception as e:
|