Spaces:

danulr05
/

budget-proposals-search-api

Sleeping

App Files Files Community

danulr05 committed on Sep 16

Commit

a5e18c5

verified ·

1 Parent(s): 8ad1d33

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -40

app.py CHANGED Viewed

@@ -306,18 +306,8 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
                     # Only include documents that have meaningful content in the requested language
                     # Skip documents where title and summary are empty or "Unknown"/"No summary available"
-                    # For non-English languages, be more lenient with the filtering
-                    has_valid_title = title and title.strip() and title not in ["Unknown", "Unknown Title", ""]
-                    has_valid_summary = summary and summary.strip() and summary not in ["No summary available", ""]
-                    # For English, require both title and summary to be valid
-                    # For other languages, only require title to be valid (summary can be empty)
-                    if language == 'en':
-                        is_valid = has_valid_title and has_valid_summary
-                    else:
-                        is_valid = has_valid_title
-                    if is_valid:
                         result = {
                             "title": title,
@@ -359,33 +349,40 @@ def get_all_proposals(category_filter=None, language='en'):
         if category_filter and category_filter != "All categories":
             filter_dict["category"] = category_filter
         # Query with a dummy vector to get all documents
         # Use language-specific vector dimensions
         if language == 'en':
             dummy_vector = [0.1] * 384  # 384 is the dimension of all-MiniLM-L6-v2
         else:  # si, ta, or any other language
             dummy_vector = [0.1] * 768  # 768 is the dimension of EmbeddingGemma-300m
-        res = pc_index.query(
-            vector=dummy_vector,
-            top_k=100,  # Get all proposals
-            include_metadata=True,
-            filter=filter_dict
-        )
-        logger.info(f"Query returned {len(res['matches'])} matches")
         results = []
-        seen_files = set()  # Track unique files to avoid duplicates
-        for match in res["matches"]:
             metadata = match["metadata"]
-            file_path = metadata.get("file_path", "")
-            # Skip if we've already included this file (avoid duplicates from chunks)
-            if file_path in seen_files:
-                continue
-            seen_files.add(file_path)
             # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
             proposal_data = DYNAMIC_METADATA.get(file_path, {
@@ -404,18 +401,8 @@ def get_all_proposals(category_filter=None, language='en'):
             # Only include documents that have meaningful content in the requested language
             # Skip documents where title and summary are empty or "Unknown"/"No summary available"
-            # For non-English languages, be more lenient with the filtering
-            has_valid_title = title and title.strip() and title not in ["Unknown", "Unknown Title", ""]
-            has_valid_summary = summary and summary.strip() and summary not in ["No summary available", ""]
-            # For English, require both title and summary to be valid
-            # For other languages, only require title to be valid (summary can be empty)
-            if language == 'en':
-                is_valid = has_valid_title and has_valid_summary
-            else:
-                is_valid = has_valid_title
-            if is_valid:
                 result = {
                     "title": title,
@@ -432,6 +419,7 @@ def get_all_proposals(category_filter=None, language='en'):
                 results.append(result)
         return results
     except Exception as e:

                     # Only include documents that have meaningful content in the requested language
                     # Skip documents where title and summary are empty or "Unknown"/"No summary available"
+                    if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
+                        summary and summary.strip() and summary not in ["No summary available", ""]):
                         result = {
                             "title": title,
         if category_filter and category_filter != "All categories":
             filter_dict["category"] = category_filter
+        # Use multiple dummy vectors to ensure we get all documents
         # Query with a dummy vector to get all documents
         # Use language-specific vector dimensions
         if language == 'en':
             dummy_vector = [0.1] * 384  # 384 is the dimension of all-MiniLM-L6-v2
         else:  # si, ta, or any other language
             dummy_vector = [0.1] * 768  # 768 is the dimension of EmbeddingGemma-300m
+        # Try multiple queries with different dummy vectors to get all documents
+        all_matches = []
+        for i in range(5):  # Try 5 different dummy vectors
+            # Create slightly different dummy vectors
+            dummy_vector_variant = [0.1 + (i * 0.01)] * len(dummy_vector)
+            res = pc_index.query(
+                vector=dummy_vector_variant,
+                top_k=100,  # Get all proposals
+                include_metadata=True,
+                filter=filter_dict
+            )
+            all_matches.extend(res["matches"])
+        # Remove duplicates based on file_path
+        unique_matches = {}
+        for match in all_matches:
+            file_path = match["metadata"].get("file_path", "")
+            if file_path and file_path not in unique_matches:
+                unique_matches[file_path] = match
+        logger.info(f"Found {len(unique_matches)} unique documents")
         results = []
+        for file_path, match in unique_matches.items():
             metadata = match["metadata"]
             # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
             proposal_data = DYNAMIC_METADATA.get(file_path, {
             # Only include documents that have meaningful content in the requested language
             # Skip documents where title and summary are empty or "Unknown"/"No summary available"
+            if (title and title.strip() and title not in ["Unknown", "Unknown Title", ""] and
+                summary and summary.strip() and summary not in ["No summary available", ""]):
                 result = {
                     "title": title,
                 results.append(result)
+        logger.info(f"Returning {len(results)} proposals for language {language}")
         return results
     except Exception as e: