Spaces:

courtneyf2
/

DIVERSIFAIR

Sleeping

App Files Community

Courtney Ford committed on Jan 21

Commit

dcf70e8

1 Parent(s): de20ac0

updates based on feedback

Browse files

Files changed (6) hide show

.DS_Store +0 -0
app_new.py +2 -2
rag_query.py +89 -27
vectorstore/.DS_Store +0 -0
vectorstore/index.faiss +2 -2
vectorstore/index.pkl +2 -2

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app_new.py CHANGED Viewed

@@ -37,10 +37,10 @@ class EnhancedRAGSystem:
                 self.demo_mode = True
                 return
-            # Load embedding model
             print("Loading embedding model...")
             self.embedding_model = HuggingFaceEmbeddings(
-                model_name="sentence-transformers/all-MiniLM-L6-v2",
                 model_kwargs={"device": "cpu"},
                 encode_kwargs={"normalize_embeddings": True},
             )

                 self.demo_mode = True
                 return
+            # Load embedding model - UPDATED TO MATCH NEW MODEL
             print("Loading embedding model...")
             self.embedding_model = HuggingFaceEmbeddings(
+                model_name="sentence-transformers/all-mpnet-base-v2",  # CHANGED
                 model_kwargs={"device": "cpu"},
                 encode_kwargs={"normalize_embeddings": True},
             )

rag_query.py CHANGED Viewed

@@ -11,14 +11,28 @@ def format_context_with_citations(results: List[Tuple[Document, float]]) -> str:
     for i, (doc, score) in enumerate(results, 1):
         citation = doc.metadata.get("citation", "Unknown Source")
         entity = doc.metadata.get("entity", "Unknown")
         text = doc.page_content
-        context_parts.append(
-            f"[Source {i}]\n"
-            f"Citation: {citation}\n"
-            f"Jurisdiction: {entity}\n"
-            f"Content: {text}\n"
-        )
     return "\n---\n".join(context_parts)
@@ -93,40 +107,53 @@ def rerank_by_document_priority(
     results: List[Tuple[Document, float]], boost_factor: float = 0.3
 ) -> List[Tuple[Document, float]]:
     """
-    Rerank results to prioritise passed legislation over white papers.
-    Priority order:
-    1. Passed Legislation (highest priority)
-    2. Draft Legislation (medium priority)
-    3. White Papers/Reports (baseline)
-    Args:
-        results: List of (Document, score) tuples from vectorstore
-        boost_factor: How much to boost priority documents (0.3 = 30% score reduction)
-    Returns:
-        Reranked list of (Document, score) tuples
     """
     reranked = []
     for doc, score in results:
         status_raw = doc.metadata.get("status", "")
         status = str(status_raw).lower()
-        doc_type = str(doc.metadata.get("document_type", ""))
-        if "passed" in status or "enacted" in status:
-            boosted_score = score * (1 - boost_factor * 2)
-        elif "draft" in status or "bill" in status:
-            boosted_score = score * (1 - boost_factor)
-        elif doc_type in ["Article_style", "US_Congress", "Special_cases"]:
-            boosted_score = score * (1 - boost_factor * 0.5)
         else:
             boosted_score = score
         reranked.append((doc, boosted_score))
     reranked.sort(key=lambda x: x[1])
     return [
         (doc, original_score)
         for (doc, _), (_, original_score) in zip(reranked, results)
@@ -220,6 +247,33 @@ def extract_document_references(
     return list(set(matching_files)), suggested_entity
 def ask_question_with_llm(
     vectorstore,
     question: str,
@@ -244,6 +298,11 @@ def ask_question_with_llm(
     Returns:
         Dictionary with answer, sources, and metadata
     """
     # Check if question references specific documents
     referenced_docs = []
     detected_entity = None
@@ -341,6 +400,9 @@ def ask_question_with_llm(
             results = boosted_results + other_results
         results = results[:k]
     if not results:

     for i, (doc, score) in enumerate(results, 1):
         citation = doc.metadata.get("citation", "Unknown Source")
         entity = doc.metadata.get("entity", "Unknown")
+        language = doc.metadata.get("language", "")
+        status = doc.metadata.get("status", "")
         text = doc.page_content
+        # Build the source block
+        source_block = [
+            f"[Source {i}]",
+            f"Citation: {citation}",
+            f"Jurisdiction: {entity}",
+        ]
+        if status and status.lower() not in ["published", ""]:
+            source_block.append(f"Status: {status}")
+        if language and language.lower() not in ["english", ""]:
+            source_block.append(
+                f"Language: {language} translation - interpret with caution"
+            )
+        source_block.append(f"Content: {text}")
+        context_parts.append("\n".join(source_block))
     return "\n---\n".join(context_parts)
     results: List[Tuple[Document, float]], boost_factor: float = 0.3
 ) -> List[Tuple[Document, float]]:
     """
+    Rerank results to prioritize:
+    1. Primary legislation (highest priority)
+    2. Draft legislation (medium priority)
+    3. Articles over preambles
+    4. White Papers/Reports (lowest priority)
     """
     reranked = []
     for doc, score in results:
         status_raw = doc.metadata.get("status", "")
         status = str(status_raw).lower()
+        doc_type = doc.metadata.get("document_type", "")
+        filename = doc.metadata.get("filename", "")
+        # Highest priority: Passed/enacted legislation in Article/Section format
+        if ("passed" in status or "enacted" in status) and doc_type in [
+            "Article_style",
+            "US_Congress",
+            "Special_cases",
+        ]:
+            boosted_score = score * (1 - boost_factor * 3)  # Strong boost
+        # Deprioritize preambles
+        elif "preamble" in filename.lower():
+            boosted_score = score * (1 + boost_factor * 2)  # Penalty
+        # Medium priority: Draft legislation or other structured docs
+        elif "draft" in status or doc_type in [
+            "Article_style",
+            "US_Congress",
+            "Special_cases",
+        ]:
+            boosted_score = score * (1 - boost_factor * 1.5)
+        # Low priority: White papers and reports (Paragraph_style)
+        elif doc_type == "Paragraph_style":
+            boosted_score = score * (1 + boost_factor)  # Slight penalty
         else:
             boosted_score = score
         reranked.append((doc, boosted_score))
+    # Sort by boosted score (lower is better in FAISS)
     reranked.sort(key=lambda x: x[1])
+    # Return with ORIGINAL scores for transparency
     return [
         (doc, original_score)
         for (doc, _), (_, original_score) in zip(reranked, results)
     return list(set(matching_files)), suggested_entity
+def is_comparison_question(question: str) -> bool:
+    """Detect if question is comparing multiple jurisdictions"""
+    question_lower = question.lower()
+    comparison_patterns = [
+        "differ from",
+        "compared to",
+        "versus",
+        "vs",
+        "vs.",
+        "difference between",
+        "differences between",
+        "compare",
+        "comparison",
+        "contrast",
+        "how does",
+        "what does",
+        "unlike",
+        "similar to",
+        "different from",
+        "in contrast to",
+        "as opposed to",
+    ]
+    return any(pattern in question_lower for pattern in comparison_patterns)
 def ask_question_with_llm(
     vectorstore,
     question: str,
     Returns:
         Dictionary with answer, sources, and metadata
     """
+    # If it's a comparison question, disable auto entity detection
+    if is_comparison_question(question):
+        auto_detect_entity = False
+        print("Comparison question detected - retrieving from all jurisdictions")
     # Check if question references specific documents
     referenced_docs = []
     detected_entity = None
             results = boosted_results + other_results
+        # RERANK BY DOCUMENT PRIORITY - prioritize primary legislation
+        results = rerank_by_document_priority(results, boost_factor=0.3)
         results = results[:k]
     if not results:

vectorstore/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

vectorstore/index.faiss CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:efb63120469d0a2b5af3345fdf5decfb9ab55ad7fb32b8b90fcf0df3fab8c652
-size 59520045

 version https://git-lfs.github.com/spec/v1
+oid sha256:bb883f59d3927a716c42023d82c9b39ba5edde321942c09a8909e06c3b2ea52d
+size 119040045

vectorstore/index.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ffd9eb630f2f73dda4e0581806087bc483cb471c37101ebb8ede6f3c0c3506f2
-size 43207761

 version https://git-lfs.github.com/spec/v1
+oid sha256:6002066d961d87767b0dbad5ebbc2c5bfcd59f0b088f1250874bb789a7de9c45
+size 43207710