Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- src/retrieval.py +27 -20
- src/verify.py +63 -35
src/retrieval.py
CHANGED
|
@@ -22,25 +22,28 @@ METADATA_PATH = os.getenv("METADATA_PATH", "models/faiss_index/chunk_metadata.js
|
|
| 22 |
PARENT_PATH = os.getenv("PARENT_PATH", "data/parent_judgments.jsonl")
|
| 23 |
TOP_K = 5
|
| 24 |
|
| 25 |
-
# Similarity threshold
|
| 26 |
-
#
|
| 27 |
-
#
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def _load_resources():
|
| 31 |
"""Load index, metadata and parent store. Called once at module import."""
|
| 32 |
-
|
| 33 |
print("Loading FAISS index...")
|
| 34 |
index = faiss.read_index(INDEX_PATH)
|
| 35 |
print(f"Index loaded: {index.ntotal} vectors")
|
| 36 |
-
|
| 37 |
print("Loading chunk metadata...")
|
| 38 |
metadata = []
|
| 39 |
with open(METADATA_PATH, "r", encoding="utf-8") as f:
|
| 40 |
for line in f:
|
| 41 |
metadata.append(json.loads(line))
|
| 42 |
print(f"Metadata loaded: {len(metadata)} chunks")
|
| 43 |
-
|
| 44 |
print("Loading parent judgments...")
|
| 45 |
parent_store = {}
|
| 46 |
with open(PARENT_PATH, "r", encoding="utf-8") as f:
|
|
@@ -48,7 +51,7 @@ def _load_resources():
|
|
| 48 |
parent = json.loads(line)
|
| 49 |
parent_store[parent["judgment_id"]] = parent["text"]
|
| 50 |
print(f"Parent store loaded: {len(parent_store)} judgments")
|
| 51 |
-
|
| 52 |
return index, metadata, parent_store
|
| 53 |
|
| 54 |
_index, _metadata, _parent_store = _load_resources()
|
|
@@ -57,28 +60,32 @@ _index, _metadata, _parent_store = _load_resources()
|
|
| 57 |
def retrieve(query_embedding: np.ndarray, top_k: int = TOP_K) -> List[Dict]:
|
| 58 |
"""
|
| 59 |
Find top-k chunks most similar to the query embedding.
|
| 60 |
-
Returns empty list if best score is
|
| 61 |
-
(meaning the query is likely out of domain).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
"""
|
| 63 |
query_vec = query_embedding.reshape(1, -1).astype(np.float32)
|
| 64 |
scores, indices = _index.search(query_vec, top_k)
|
| 65 |
-
|
| 66 |
-
#
|
| 67 |
best_score = float(scores[0][0])
|
| 68 |
-
if best_score
|
| 69 |
return [] # Out of domain — agent will handle this
|
| 70 |
-
|
| 71 |
results = []
|
| 72 |
for score, idx in zip(scores[0], indices[0]):
|
| 73 |
if idx == -1:
|
| 74 |
continue
|
| 75 |
-
|
| 76 |
chunk = _metadata[idx]
|
| 77 |
expanded = _get_expanded_context(
|
| 78 |
chunk["judgment_id"],
|
| 79 |
chunk["text"]
|
| 80 |
)
|
| 81 |
-
|
| 82 |
results.append({
|
| 83 |
"chunk_id": chunk["chunk_id"],
|
| 84 |
"judgment_id": chunk["judgment_id"],
|
|
@@ -88,7 +95,7 @@ def retrieve(query_embedding: np.ndarray, top_k: int = TOP_K) -> List[Dict]:
|
|
| 88 |
"expanded_context": expanded,
|
| 89 |
"similarity_score": float(score)
|
| 90 |
})
|
| 91 |
-
|
| 92 |
return results
|
| 93 |
|
| 94 |
|
|
@@ -105,16 +112,16 @@ def _get_expanded_context(judgment_id: str, chunk_text: str) -> str:
|
|
| 105 |
parent_text = _parent_store.get(judgment_id, "")
|
| 106 |
if not parent_text:
|
| 107 |
return chunk_text
|
| 108 |
-
|
| 109 |
# Find chunk position in parent
|
| 110 |
anchor = chunk_text[:80]
|
| 111 |
start_pos = parent_text.find(anchor)
|
| 112 |
if start_pos == -1:
|
| 113 |
return chunk_text
|
| 114 |
-
|
| 115 |
# ~4 chars per token, 1024 tokens = ~4096 chars
|
| 116 |
WINDOW = 4096
|
| 117 |
expand_start = max(0, start_pos - WINDOW // 4)
|
| 118 |
expand_end = min(len(parent_text), start_pos + WINDOW)
|
| 119 |
-
|
| 120 |
return parent_text[expand_start:expand_end]
|
|
|
|
| 22 |
PARENT_PATH = os.getenv("PARENT_PATH", "data/parent_judgments.jsonl")
|
| 23 |
TOP_K = 5
|
| 24 |
|
| 25 |
+
# Similarity threshold for out-of-domain detection.
|
| 26 |
+
# This index uses L2 distance — HIGHER score = FURTHER AWAY = worse match.
|
| 27 |
+
# Legal queries typically score 0.6 - 0.8.
|
| 28 |
+
# Out-of-domain queries (cricket, Bollywood) score 0.9+.
|
| 29 |
+
# Block anything where the best match is above this threshold.
|
| 30 |
+
SIMILARITY_THRESHOLD = 0.85
|
| 31 |
+
|
| 32 |
|
| 33 |
def _load_resources():
|
| 34 |
"""Load index, metadata and parent store. Called once at module import."""
|
| 35 |
+
|
| 36 |
print("Loading FAISS index...")
|
| 37 |
index = faiss.read_index(INDEX_PATH)
|
| 38 |
print(f"Index loaded: {index.ntotal} vectors")
|
| 39 |
+
|
| 40 |
print("Loading chunk metadata...")
|
| 41 |
metadata = []
|
| 42 |
with open(METADATA_PATH, "r", encoding="utf-8") as f:
|
| 43 |
for line in f:
|
| 44 |
metadata.append(json.loads(line))
|
| 45 |
print(f"Metadata loaded: {len(metadata)} chunks")
|
| 46 |
+
|
| 47 |
print("Loading parent judgments...")
|
| 48 |
parent_store = {}
|
| 49 |
with open(PARENT_PATH, "r", encoding="utf-8") as f:
|
|
|
|
| 51 |
parent = json.loads(line)
|
| 52 |
parent_store[parent["judgment_id"]] = parent["text"]
|
| 53 |
print(f"Parent store loaded: {len(parent_store)} judgments")
|
| 54 |
+
|
| 55 |
return index, metadata, parent_store
|
| 56 |
|
| 57 |
_index, _metadata, _parent_store = _load_resources()
|
|
|
|
| 60 |
def retrieve(query_embedding: np.ndarray, top_k: int = TOP_K) -> List[Dict]:
|
| 61 |
"""
|
| 62 |
Find top-k chunks most similar to the query embedding.
|
| 63 |
+
Returns empty list if best score is above SIMILARITY_THRESHOLD
|
| 64 |
+
(meaning the query is likely out of domain — no close match found).
|
| 65 |
+
|
| 66 |
+
L2 distance logic:
|
| 67 |
+
low score = close match = good = let through
|
| 68 |
+
high score = far match = bad = block
|
| 69 |
"""
|
| 70 |
query_vec = query_embedding.reshape(1, -1).astype(np.float32)
|
| 71 |
scores, indices = _index.search(query_vec, top_k)
|
| 72 |
+
|
| 73 |
+
# Block if even the best match is too far away
|
| 74 |
best_score = float(scores[0][0])
|
| 75 |
+
if best_score > SIMILARITY_THRESHOLD:
|
| 76 |
return [] # Out of domain — agent will handle this
|
| 77 |
+
|
| 78 |
results = []
|
| 79 |
for score, idx in zip(scores[0], indices[0]):
|
| 80 |
if idx == -1:
|
| 81 |
continue
|
| 82 |
+
|
| 83 |
chunk = _metadata[idx]
|
| 84 |
expanded = _get_expanded_context(
|
| 85 |
chunk["judgment_id"],
|
| 86 |
chunk["text"]
|
| 87 |
)
|
| 88 |
+
|
| 89 |
results.append({
|
| 90 |
"chunk_id": chunk["chunk_id"],
|
| 91 |
"judgment_id": chunk["judgment_id"],
|
|
|
|
| 95 |
"expanded_context": expanded,
|
| 96 |
"similarity_score": float(score)
|
| 97 |
})
|
| 98 |
+
|
| 99 |
return results
|
| 100 |
|
| 101 |
|
|
|
|
| 112 |
parent_text = _parent_store.get(judgment_id, "")
|
| 113 |
if not parent_text:
|
| 114 |
return chunk_text
|
| 115 |
+
|
| 116 |
# Find chunk position in parent
|
| 117 |
anchor = chunk_text[:80]
|
| 118 |
start_pos = parent_text.find(anchor)
|
| 119 |
if start_pos == -1:
|
| 120 |
return chunk_text
|
| 121 |
+
|
| 122 |
# ~4 chars per token, 1024 tokens = ~4096 chars
|
| 123 |
WINDOW = 4096
|
| 124 |
expand_start = max(0, start_pos - WINDOW // 4)
|
| 125 |
expand_end = min(len(parent_text), start_pos + WINDOW)
|
| 126 |
+
|
| 127 |
return parent_text[expand_start:expand_end]
|
src/verify.py
CHANGED
|
@@ -1,45 +1,73 @@
|
|
| 1 |
"""
|
| 2 |
-
Citation verification
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
- ANY missing → Unverified
|
| 9 |
-
- No quotes in answer → Verified (no verifiable claim made)
|
| 10 |
-
|
| 11 |
-
DOCUMENTED LIMITATION:
|
| 12 |
-
Paraphrased claims that are not quoted pass as Verified.
|
| 13 |
-
Full NLI-based verification is out of scope — documented in README.
|
| 14 |
"""
|
| 15 |
|
| 16 |
import re
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
def extract_quotes(text: str) -> List[str]:
|
| 20 |
-
"""Extract double-quoted phrases of at least 8 characters."""
|
| 21 |
-
return re.findall(r'"([^"]{8,})"', text)
|
| 22 |
|
| 23 |
-
def
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
"""
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"""
|
| 31 |
-
quotes =
|
| 32 |
-
|
| 33 |
if not quotes:
|
| 34 |
-
return
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
if unverified:
|
| 44 |
-
return
|
| 45 |
-
return
|
|
|
|
| 1 |
"""
|
| 2 |
+
Citation verification module.
|
| 3 |
+
Checks whether quoted phrases in LLM answer appear in retrieved context.
|
| 4 |
+
|
| 5 |
+
Deterministic — no ML inference.
|
| 6 |
+
Documented limitation: paraphrases pass as verified because
|
| 7 |
+
exact paraphrase matching requires NLI which is out of scope.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import re
|
| 11 |
+
import unicodedata
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _normalise(text: str) -> str:
|
| 15 |
+
"""Lowercase, strip punctuation, collapse whitespace."""
|
| 16 |
+
text = text.lower()
|
| 17 |
+
text = unicodedata.normalize("NFKD", text)
|
| 18 |
+
text = re.sub(r"[^\w\s]", " ", text)
|
| 19 |
+
text = re.sub(r"\s+", " ", text).strip()
|
| 20 |
+
return text
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
def _extract_quotes(text: str) -> list[str]:
|
| 24 |
+
"""Extract all quoted phrases from text."""
|
| 25 |
+
patterns = [
|
| 26 |
+
r'"([^"]{10,})"', # standard double quotes
|
| 27 |
+
r'\u201c([^\u201d]{10,})\u201d', # curly double quotes
|
| 28 |
+
r"'([^']{10,})'", # single quotes
|
| 29 |
+
]
|
| 30 |
+
quotes = []
|
| 31 |
+
for pattern in patterns:
|
| 32 |
+
found = re.findall(pattern, text)
|
| 33 |
+
quotes.extend(found)
|
| 34 |
+
return quotes
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def verify_citations(answer: str, contexts: list[dict]) -> tuple[bool, list[str]]:
    """
    Verify that every quoted phrase in *answer* appears (after
    normalisation) somewhere in the retrieved *contexts*.

    Returns:
        (verified, unverified_quotes) — verified is True when every
        checkable quote was located; unverified_quotes lists quotes
        that could not be found.

    An answer with no quotes counts as verified: it makes no
    verifiable claim. Quotes whose normalised form is shorter than
    8 characters are skipped as likely extraction artifacts.
    """
    quotes = _extract_quotes(answer)

    if not quotes:
        # Nothing quoted — nothing to disprove.
        return True, []

    # Flatten all context windows into one normalised corpus so each
    # quote needs only a single substring check.
    corpus = " ".join(
        _normalise(ctx.get("text", "") or ctx.get("excerpt", ""))
        for ctx in contexts
    )

    missing = [
        quote
        for quote in quotes
        if len(nq := _normalise(quote)) >= 8 and nq not in corpus
    ]

    if missing:
        return False, missing
    return True, []
|