Upload folder using huggingface_hub
- .dockerignore +9 -0
- .pytest_cache/.gitignore +2 -0
- .pytest_cache/CACHEDIR.TAG +4 -0
- .pytest_cache/README.md +8 -0
- .pytest_cache/v/cache/lastfailed +3 -0
- .pytest_cache/v/cache/nodeids +1 -0
- Dockerfile +23 -0
- README.md +12 -11
- api/__init__.py +0 -0
- api/main.py +164 -0
- bug_log.md +0 -0
- params.yaml +0 -0
- preprocessing/__init__.py +21 -0
- preprocessing/download.py +34 -0
- requirements.txt +14 -0
- src/__init__.py +0 -0
- src/agent.py +184 -0
- src/embed.py +15 -0
- src/llm.py +56 -0
- src/ner.py +119 -0
- src/retrieval.py +120 -0
- src/verify.py +45 -0
- tests/test_api.py +31 -0
.dockerignore
ADDED
@@ -0,0 +1,9 @@
+.env
+__pycache__/
+*.pyc
+.git/
+.dvc/cache/
+data/
+logs/
+*.log
+bug_log.md
.pytest_cache/.gitignore
ADDED
@@ -0,0 +1,2 @@
+# Created by pytest automatically.
+*
.pytest_cache/CACHEDIR.TAG
ADDED
@@ -0,0 +1,4 @@
+Signature: 8a477f597d28d172789f06886806bc55
+# This file is a cache directory tag created by pytest.
+# For information about cache directory tags, see:
+#	https://bford.info/cachedir/spec.html
.pytest_cache/README.md
ADDED
@@ -0,0 +1,8 @@
+# pytest cache directory #
+
+This directory contains data from the pytest's cache plugin,
+which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
+
+**Do not** commit this to version control.
+
+See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed
ADDED
@@ -0,0 +1,3 @@
+{
+  "tests/test_api.py": true
+}
.pytest_cache/v/cache/nodeids
ADDED
@@ -0,0 +1 @@
+[]
Dockerfile
ADDED
@@ -0,0 +1,23 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y git curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first — Docker layer caching
+# If requirements.txt hasn't changed, this layer is reused
+# and pip install is skipped on rebuild. Saves 5+ minutes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy all project files
+COPY . .
+
+# HuggingFace Spaces requires port 7860
+EXPOSE 7860
+
+# Start FastAPI
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,12 +1,13 @@
----
-title: Nyayasetu
-emoji: 😻
-colorFrom: green
-colorTo: green
-sdk: docker
-pinned: false
-license: other
-short_description: Answers around INDIAN LAWS based on SIMILAR PAST JUDGEMENTS
----
+## Performance
 
-
+| Query | Top Result | Cosine Score | Verified |
+|-------|-----------|--------------|----------|
+| Rights of arrested person (Art 22) | A K Gopalan vs State of Madras (1950) | 0.719 | Unverified* |
+| Freedom of speech (Art 19) | Kaushal Kishor vs State of UP (2023) | 0.768 | Unverified* |
+| Double jeopardy | Manipur Administration vs Thokchom Bira Singh (1964) | 0.681 | ✅ Verified |
+| Bail rules | Babu Singh vs State of UP (1978) | 0.695 | Unverified* |
+| Basic structure doctrine | Puttaswamy vs Union of India (2017) | 0.760 | Unverified* |
+| Right to privacy | R Rajagopal vs State of TN (1994) | 0.756 | Unverified* |
+
+*Unverified = LLM paraphrased rather than copied verbatim.
+Answer content is accurate. See Limitations section.
api/__init__.py
ADDED
File without changes
api/main.py
ADDED
@@ -0,0 +1,164 @@
+"""
+NyayaSetu FastAPI application.
+3 endpoints only.
+
+All models loaded at startup — never per request.
+Port 7860 for HuggingFace Spaces compatibility.
+"""
+
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import time
+import os
+import sys
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# ── Startup: Download models from HuggingFace Hub ────────────
+def download_models():
+    """
+    Downloads NER model and FAISS index from HF Hub at container startup.
+    Only downloads if files don't already exist.
+    Skips gracefully if HF_TOKEN is not set.
+    """
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        logger.warning("HF_TOKEN not set — skipping model download. Models must exist locally.")
+        return
+
+    try:
+        from huggingface_hub import snapshot_download
+        repo_id = "CaffeinatedCoding/nyayasetu-models"
+
+        # NER model
+        if not os.path.exists("models/ner_model"):
+            logger.info("Downloading NER model from HuggingFace Hub...")
+            snapshot_download(
+                repo_id=repo_id,
+                repo_type="model",
+                allow_patterns="ner_model/*",
+                local_dir="models",
+                token=hf_token
+            )
+            logger.info("NER model downloaded successfully")
+        else:
+            logger.info("NER model already exists, skipping download")
+
+        # FAISS index + chunk metadata
+        if not os.path.exists("models/faiss_index/index.faiss"):
+            logger.info("Downloading FAISS index from HuggingFace Hub...")
+            snapshot_download(
+                repo_id=repo_id,
+                repo_type="model",
+                allow_patterns="faiss_index/*",
+                local_dir="models",
+                token=hf_token
+            )
+            logger.info("FAISS index downloaded successfully")
+        else:
+            logger.info("FAISS index already exists, skipping download")
+
+        # Parent judgments → goes into data/ folder
+        if not os.path.exists("data/parent_judgments.jsonl"):
+            logger.info("Downloading parent judgments from HuggingFace Hub...")
+            os.makedirs("data", exist_ok=True)
+            snapshot_download(
+                repo_id=repo_id,
+                repo_type="model",
+                allow_patterns="parent_judgments.jsonl",
+                local_dir="data",
+                token=hf_token
+            )
+            logger.info("Parent judgments downloaded successfully")
+        else:
+            logger.info("Parent judgments already exist, skipping download")
+
+    except Exception as e:
+        logger.error(f"Model download failed: {e}")
+        logger.error("App will start but pipeline may fail if models are missing")
+
+# Run at startup before importing pipeline
+download_models()
+
+from src.agent import run_query
+
+app = FastAPI(
+    title="NyayaSetu",
+    description="Indian Legal RAG Agent — Supreme Court Judgments 1950–2024",
+    version="1.0.0"
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+
+# ── Request/Response models ───────────────────────────
+class QueryRequest(BaseModel):
+    query: str
+
+class QueryResponse(BaseModel):
+    query: str
+    answer: str
+    sources: list
+    verification_status: str
+    unverified_quotes: list
+    entities: dict
+    num_sources: int
+    truncated: bool
+    latency_ms: float
+
+
+# ── Endpoint 1: Health check ──────────────────────────
+@app.get("/health")
+def health():
+    return {
+        "status": "ok",
+        "service": "NyayaSetu",
+        "version": "1.0.0"
+    }
+
+
+# ── Endpoint 2: App info ──────────────────────────────
+@app.get("/")
+def root():
+    return {
+        "name": "NyayaSetu",
+        "description": "Indian Legal RAG Agent",
+        "data": "Supreme Court of India judgments 1950-2024",
+        "disclaimer": "NOT legal advice. Always consult a qualified advocate.",
+        "endpoints": {
+            "POST /query": "Ask a legal question",
+            "GET /health": "Health check",
+            "GET /": "This info page"
+        }
+    }
+
+
+# ── Endpoint 3: Main query pipeline ──────────────────
+@app.post("/query", response_model=QueryResponse)
+def query(request: QueryRequest):
+    if not request.query.strip():
+        raise HTTPException(status_code=400, detail="Query cannot be empty")
+
+    if len(request.query) < 10:
+        raise HTTPException(status_code=400, detail="Query too short — minimum 10 characters")
+
+    if len(request.query) > 1000:
+        raise HTTPException(status_code=400, detail="Query too long — maximum 1000 characters")
+
+    start = time.time()
+    try:
+        result = run_query(request.query)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Pipeline error: {str(e)}")
+
+    result["latency_ms"] = round((time.time() - start) * 1000, 2)
+    return result
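
A minimal client sketch for the /query endpoint above, assuming the container is running locally on port 7860. The `requests` dependency and the example question are illustrative and not part of this commit.

import requests

resp = requests.post(
    "http://localhost:7860/query",
    json={"query": "What are the rights of an arrested person under Article 22?"},
    timeout=120,  # first query may be slow while models warm up
)
resp.raise_for_status()
body = resp.json()
print(body["verification_status"], body["num_sources"], body["latency_ms"])
print(body["answer"][:300])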
bug_log.md
ADDED
File without changes
params.yaml
ADDED
File without changes
preprocessing/__init__.py
ADDED
@@ -0,0 +1,21 @@
+# Makes preprocessing a Python package
+
+
+# Dependency list pasted from the blueprint step "4. Update requirements.txt";
+# kept commented out so importing this package does not raise NameError (see requirements.txt):
+# torch
+# transformers
+# sentence-transformers
+# faiss-cpu
+# fastapi
+# uvicorn
+# python-dotenv
+# groq
+# dvc
+# mlflow
+# optuna
+# pytest
+# kagglehub
+# pymupdf
+# tenacity
+# seqeval
preprocessing/download.py
ADDED
@@ -0,0 +1,34 @@
+"""
+Download Indian Supreme Court judgments from Kaggle.
+Uses kagglehub to download directly - no manual zip extraction needed.
+Output: data/raw_judgments.jsonl
+
+WHY kagglehub? Programmatic download - reproducible, no manual steps.
+Anyone cloning this repo can run this script and get the same data.
+"""
+
+import kagglehub
+import json
+import os
+import glob
+
+def download_judgments():
+    print("Downloading SC Judgments dataset from Kaggle...")
+
+    # Downloads to a local cache folder, returns the path
+    path = kagglehub.dataset_download("adarshsingh0903/legal-dataset-sc-judgments-india-19502024")
+    print(f"Dataset downloaded to: {path}")
+
+    # See what files we got
+    all_files = []
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            full_path = os.path.join(root, file)
+            all_files.append(full_path)
+            print(f"  Found: {full_path}")
+
+    print(f"\nTotal files found: {len(all_files)}")
+    return path, all_files
+
+if __name__ == "__main__":
+    path, files = download_judgments()
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+fastapi
+uvicorn
+pydantic
+huggingface_hub
+sentence-transformers
+numpy
+groq
+tenacity
+python-dotenv
+transformers
+faiss-cpu
+torch
+kagglehub
+pytest
src/__init__.py
ADDED
File without changes
src/agent.py
ADDED
@@ -0,0 +1,184 @@
+"""
+NyayaSetu RAG Agent — single-pass function.
+
+Every user query goes through exactly these steps in order:
+1. NER extraction (if model available, else skip gracefully)
+2. Query augmentation (append extracted entities)
+3. Embed augmented query with MiniLM
+4. FAISS retrieval (top-5 chunks)
+5. Out-of-domain check (empty results = no relevant judgments)
+6. Context assembly (build prompt context from expanded windows)
+7. Single LLM call with retry
+8. Citation verification
+9. Return structured result
+
+WHY single-pass and no while loop?
+A while loop that retries the whole pipeline masks failures.
+If retrieval returned bad results, retrying with the same query
+returns the same bad results. Better to fail honestly and tell
+the user, than to loop silently and return garbage.
+"""
+
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from src.embed import embed_text
+from src.retrieval import retrieve
+from src.llm import call_llm
+from src.verify import verify_citations
+from typing import Dict, Any
+
+# NER is optional — if not trained yet, pipeline runs without it
+# This is the Cut Line Rule from the blueprint:
+# ship without NER rather than blocking the whole project
+NER_AVAILABLE = False
+try:
+    from src.ner import extract_entities
+    NER_AVAILABLE = True
+    print("NER model loaded — query augmentation active")
+except Exception as e:
+    print(f"NER not available, running without entity augmentation: {e}")
+
+
+def run_query(query: str) -> Dict[str, Any]:
+    """
+    Main pipeline. Input: user query string.
+    Output: structured dict with answer, sources, verification.
+    """
+
+    # ── Step 1: NER ──────────────────────────────────────────
+    entities = {}
+    augmented_query = query
+
+    if NER_AVAILABLE:
+        try:
+            entities = extract_entities(query)
+            entity_string = " ".join(
+                f"{etype}: {etext}"
+                for etype, texts in entities.items()
+                for etext in texts
+            )
+            if entity_string:
+                augmented_query = f"{query} {entity_string}"
+        except Exception as e:
+            print(f"NER failed, using raw query: {e}")
+            augmented_query = query
+
+    # ── Step 2: Embed ─────────────────────────────────────────
+    query_embedding = embed_text(augmented_query)
+
+    # ── Step 3: Retrieve ──────────────────────────────────────
+    retrieved_chunks = retrieve(query_embedding, top_k=5)
+
+    # ── Step 4: Out-of-domain check ───────────────────────────
+    if not retrieved_chunks:
+        return {
+            "query": query,
+            "augmented_query": augmented_query,
+            "answer": "Your query doesn't appear to relate to Indian law. "
+                      "NyayaSetu can answer questions about Supreme Court judgments, "
+                      "constitutional rights, statutes, and legal provisions. "
+                      "Please ask a legal question.",
+            "sources": [],
+            "verification_status": "No sources retrieved",
+            "unverified_quotes": [],
+            "entities": entities,
+            "num_sources": 0,
+            "truncated": False
+        }
+
+    # ── Step 5: Context assembly ──────────────────────────────
+    # Check total token estimate — rough rule: 1 token ≈ 4 characters
+    # LLM context limit ~6000 tokens for context = ~24000 chars
+    LLM_CONTEXT_LIMIT_CHARS = 24000
+    truncated = False
+
+    context_parts = []
+    total_chars = 0
+
+    for i, chunk in enumerate(retrieved_chunks, 1):
+        excerpt = chunk["expanded_context"]
+        header = f"[EXCERPT {i} — {chunk['title']} | {chunk['year']} | ID: {chunk['judgment_id']}]\n"
+        part = header + excerpt + "\n"
+
+        if total_chars + len(part) > LLM_CONTEXT_LIMIT_CHARS:
+            # Drop remaining chunks — too long for LLM context
+            truncated = True
+            print(f"Context truncated at {i-1} of {len(retrieved_chunks)} chunks")
+            break
+
+        context_parts.append(part)
+        total_chars += len(part)
+
+    context = "\n".join(context_parts)
+
+    # ── Step 6: LLM call ──────────────────────────────────────
+    try:
+        answer = call_llm(query=query, context=context)
+    except Exception as e:
+        # All 3 retries failed — return raw excerpts as fallback
+        print(f"LLM call failed after retries: {e}")
+        fallback_excerpts = "\n\n".join(
+            f"[{c['title']} | {c['year']}]\n{c['chunk_text'][:500]}"
+            for c in retrieved_chunks
+        )
+        return {
+            "query": query,
+            "augmented_query": augmented_query,
+            "answer": f"LLM service temporarily unavailable. "
+                      f"Most relevant excerpts shown below:\n\n{fallback_excerpts}",
+            "sources": _build_sources(retrieved_chunks),
+            "verification_status": "LLM unavailable",
+            "unverified_quotes": [],
+            "entities": entities,
+            "num_sources": len(retrieved_chunks),
+            "truncated": truncated
+        }
+
+    # ── Step 7: Citation verification ─────────────────────────
+    verification_status, unverified_quotes = verify_citations(answer, retrieved_chunks)
+
+    # ── Step 8: Return ────────────────────────────────────────
+    return {
+        "query": query,
+        "augmented_query": augmented_query,
+        "answer": answer,
+        "sources": _build_sources(retrieved_chunks),
+        "verification_status": verification_status,
+        "unverified_quotes": unverified_quotes,
+        "entities": entities,
+        "num_sources": len(retrieved_chunks),
+        "truncated": truncated
+    }
+
+
+def _build_sources(chunks) -> list:
+    """Format retrieved chunks for API response."""
+    return [
+        {
+            "judgment_id": c["judgment_id"],
+            "title": c["title"],
+            "year": c["year"],
+            "similarity_score": round(c["similarity_score"], 4),
+            "excerpt": c["chunk_text"][:300] + "..."
+        }
+        for c in chunks
+    ]
+
+
+if __name__ == "__main__":
+    # Smoke test — run directly to verify pipeline works end to end
+    test_queries = [
+        "What are the rights of an arrested person under Article 22?",
+        "What did the Supreme Court say about freedom of speech?",
+        "How do I bake a cake?"  # Out of domain — should return no results
+    ]
+
+    for query in test_queries:
+        print(f"\n{'='*60}")
+        print(f"QUERY: {query}")
+        result = run_query(query)
+        print(f"SOURCES: {result['num_sources']}")
+        print(f"VERIFICATION: {result['verification_status']}")
+        print(f"ANSWER (first 300 chars):\n{result['answer'][:300]}")
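
For reference, the shape of a successful run_query result, with every value below invented for illustration:

from src.agent import run_query

result = run_query("What are the rights of an arrested person under Article 22?")
# Illustrative result shape (values invented):
# {
#   "query": "What are the rights of an arrested person under Article 22?",
#   "augmented_query": "... PROVISION: Article 22",
#   "answer": '... "shall be informed ... of the grounds ..." (ID: ...) ...',
#   "sources": [{"judgment_id": "...", "title": "A K Gopalan vs State of Madras",
#                "year": 1950, "similarity_score": 0.719, "excerpt": "..."}],
#   "verification_status": "Verified",
#   "unverified_quotes": [],
#   "entities": {"PROVISION": ["Article 22"]},
#   "num_sources": 5,
#   "truncated": False
# }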
src/embed.py
ADDED
@@ -0,0 +1,15 @@
+"""
+Embedding module. Loads MiniLM once at startup, never per request.
+"""
+from sentence_transformers import SentenceTransformer
+import numpy as np
+
+MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+
+print("Loading embedding model...")
+_model = SentenceTransformer(MODEL_NAME)
+print("Embedding model ready.")
+
+def embed_text(text: str) -> np.ndarray:
+    """Embed a single string. Returns shape (384,)"""
+    return _model.encode(text, normalize_embeddings=True)
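
A quick sanity check for embed_text (a sketch; importing the module loads the real model, so the dependencies must be installed):

import numpy as np
from src.embed import embed_text

v = embed_text("right to privacy under Article 21")
print(v.shape)                   # (384,): MiniLM-L6-v2 output size
print(float(np.linalg.norm(v)))  # ~1.0 because of normalize_embeddings=True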
src/llm.py
ADDED
@@ -0,0 +1,56 @@
+"""
+LLM module. Single Groq API call with tenacity retry.
+
+WHY Groq? Free tier, fastest inference (~500 tokens/sec).
+WHY temperature=0.1? Lower = more deterministic, less hallucination.
+WHY one call per query? Multi-step chains add latency and failure points.
+Gemini is configured as backup if Groq fails permanently.
+"""
+
+import os
+from groq import Groq
+from tenacity import retry, stop_after_attempt, wait_exponential
+from dotenv import load_dotenv
+
+load_dotenv()
+
+_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+
+SYSTEM_PROMPT = """You are NyayaSetu, an Indian legal research assistant.
+
+Rules you must follow:
+1. Answer ONLY using the provided Supreme Court judgment excerpts
+2. Never use outside knowledge
+3. Quote directly from excerpts when making factual claims — use double quotes
+4. Always cite the Judgment ID when referencing a case
+5. If excerpts don't contain enough information, say so explicitly
+6. End every response with: "NOTE: This is not legal advice. Consult a qualified advocate."
+"""
+
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=2, max=8)
+)
+def call_llm(query: str, context: str) -> str:
+    """
+    Call Groq Llama-3. Retries 3 times with exponential backoff.
+    Raises tenacity.RetryError after all retries fail — caller handles this.
+    """
+    user_message = f"""QUESTION: {query}
+
+SUPREME COURT JUDGMENT EXCERPTS:
+{context}
+
+Answer based only on the excerpts above. Cite judgment IDs."""
+
+    response = _client.chat.completions.create(
+        model="llama-3.3-70b-versatile",
+        messages=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": user_message}
+        ],
+        temperature=0.1,
+        max_tokens=800
+    )
+
+    return response.choices[0].message.content
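
A standalone sketch of how this retry policy behaves when every attempt fails: tenacity's default is to raise RetryError wrapping the last exception, which is what agent.py catches. The flaky function below is invented test data.

from tenacity import retry, stop_after_attempt, wait_exponential, RetryError

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=8))
def flaky_call():
    raise RuntimeError("simulated Groq outage")

try:
    flaky_call()
except RetryError as err:
    # Reached only after 3 attempts (exponential waits clamped between 2s and 8s)
    print("gave up:", err.last_attempt.exception())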
src/ner.py
ADDED
@@ -0,0 +1,119 @@
+"""
+NER inference module.
+Loads fine-tuned DistilBERT and extracts legal entities from query text.
+
+Loaded once at FastAPI startup — never per request.
+Called before FAISS retrieval to augment the query with extracted entities.
+
+Example:
+  Input:  "What did Justice Chandrachud say about Section 302 IPC?"
+  Output: {"JUDGE": ["Justice Chandrachud"],
+           "PROVISION": ["Section 302"],
+           "STATUTE": ["IPC"]}
+
+The augmented query becomes:
+  "What did Justice Chandrachud say about Section 302 IPC?
+   JUDGE: Justice Chandrachud PROVISION: Section 302 STATUTE: IPC"
+
+WHY augment the query?
+MiniLM embeds the full query string. Adding extracted entities
+explicitly shifts the embedding closer to chunks that mention
+those specific legal terms — improving retrieval precision.
+"""
+
+import os
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
+
+NER_MODEL_PATH = os.getenv("NER_MODEL_PATH", "models/ner_model")
+
+TARGET_ENTITIES = {
+    "JUDGE", "COURT", "STATUTE", "PROVISION",
+    "CASE_NUMBER", "DATE", "PRECEDENT", "LAWYER",
+    "PETITIONER", "RESPONDENT", "GPE", "ORG"
+}
+
+# Load once at import time
+if not os.path.exists(NER_MODEL_PATH):
+    raise FileNotFoundError(
+        f"NER model not found at {NER_MODEL_PATH}. "
+        "Train it on Kaggle first. "
+        "System will run without NER until model is available."
+    )
+
+print(f"Loading NER model from {NER_MODEL_PATH}...")
+_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_PATH)
+_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_PATH)
+
+_ner_pipeline = pipeline(
+    "ner",
+    model=_model,
+    tokenizer=_tokenizer,
+    aggregation_strategy="simple"
+)
+print("NER model ready.")
+
+
+def extract_entities(text: str) -> dict:
+    """
+    Run NER on input text.
+    Returns dict of {entity_type: [entity_text, ...]}
+    Filters to only legally relevant entity types.
+    """
+    if not text.strip():
+        return {}
+
+    try:
+        results = _ner_pipeline(text)
+    except Exception as e:
+        print(f"NER inference failed: {e}")
+        return {}
+
+    entities = {}
+    for result in results:
+        entity_type = result["entity_group"]
+        entity_text = result["word"].strip()
+
+        if entity_type not in TARGET_ENTITIES:
+            continue
+        if len(entity_text) < 2:  # Skip single characters
+            continue
+
+        if entity_type not in entities:
+            entities[entity_type] = []
+        if entity_text not in entities[entity_type]:  # No duplicates
+            entities[entity_type].append(entity_text)
+
+    return entities
+
+
+def augment_query(query: str, entities: dict) -> str:
+    """
+    Append extracted entities to query string.
+    Returns augmented query for embedding.
+    """
+    if not entities:
+        return query
+
+    entity_string = " ".join(
+        f"{etype}: {etext}"
+        for etype, texts in entities.items()
+        for etext in texts
+    )
+
+    return f"{query} {entity_string}"
+
+
+if __name__ == "__main__":
+    # Quick test
+    test_queries = [
+        "What did Justice Chandrachud say about Article 21?",
+        "Find cases related to Section 302 IPC and bail",
+        "Supreme Court judgment on fundamental rights in 1978"
+    ]
+
+    for q in test_queries:
+        entities = extract_entities(q)
+        augmented = augment_query(q, entities)
+        print(f"\nQuery: {q}")
+        print(f"Entities: {entities}")
+        print(f"Augmented: {augmented}")
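
For reference, extract_entities consumes the standard aggregated output of the transformers "ner" pipeline; a hypothetical result for the docstring's example query would look like this (scores and offsets invented):

# What _ner_pipeline(text) returns with aggregation_strategy="simple":
results = [
    {"entity_group": "JUDGE", "word": "Justice Chandrachud",
     "score": 0.98, "start": 9, "end": 28},
    {"entity_group": "PROVISION", "word": "Section 302",
     "score": 0.95, "start": 44, "end": 55},
    {"entity_group": "STATUTE", "word": "IPC",
     "score": 0.93, "start": 56, "end": 59},
]
# extract_entities then folds this into
# {"JUDGE": ["Justice Chandrachud"], "PROVISION": ["Section 302"], "STATUTE": ["IPC"]}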
src/retrieval.py
ADDED
@@ -0,0 +1,120 @@
+"""
+FAISS retrieval module.
+
+Loads the FAISS index and chunk metadata once at startup.
+Given a query embedding, returns the top-k most similar chunks
+plus an expanded context window from the parent judgment.
+
+WHY load at startup and not per request?
+Loading a 650MB index takes ~3 seconds. If you loaded it per request,
+every user query would take 3+ seconds just for setup. Loading once
+at startup means retrieval takes ~5ms per query.
+"""
+
+import json
+import numpy as np
+import faiss
+import os
+from typing import List, Dict
+
+INDEX_PATH = os.getenv("FAISS_INDEX_PATH", "models/faiss_index/index.faiss")
+METADATA_PATH = os.getenv("METADATA_PATH", "models/faiss_index/chunk_metadata.jsonl")
+PARENT_PATH = os.getenv("PARENT_PATH", "data/parent_judgments.jsonl")
+TOP_K = 5
+
+# Similarity threshold — if best score is below this, query is out of domain
+# Score range: 0 to 1 (cosine similarity with normalized vectors)
+# 0.3 = very loose match, 0.5 = decent match, 0.7 = strong match
+SIMILARITY_THRESHOLD = 0.45
+
+def _load_resources():
+    """Load index, metadata and parent store. Called once at module import."""
+
+    print("Loading FAISS index...")
+    index = faiss.read_index(INDEX_PATH)
+    print(f"Index loaded: {index.ntotal} vectors")
+
+    print("Loading chunk metadata...")
+    metadata = []
+    with open(METADATA_PATH, "r", encoding="utf-8") as f:
+        for line in f:
+            metadata.append(json.loads(line))
+    print(f"Metadata loaded: {len(metadata)} chunks")
+
+    print("Loading parent judgments...")
+    parent_store = {}
+    with open(PARENT_PATH, "r", encoding="utf-8") as f:
+        for line in f:
+            parent = json.loads(line)
+            parent_store[parent["judgment_id"]] = parent["text"]
+    print(f"Parent store loaded: {len(parent_store)} judgments")
+
+    return index, metadata, parent_store
+
+_index, _metadata, _parent_store = _load_resources()
+
+
+def retrieve(query_embedding: np.ndarray, top_k: int = TOP_K) -> List[Dict]:
+    """
+    Find top-k chunks most similar to the query embedding.
+    Returns empty list if best score is below SIMILARITY_THRESHOLD
+    (meaning the query is likely out of domain).
+    """
+    query_vec = query_embedding.reshape(1, -1).astype(np.float32)
+    scores, indices = _index.search(query_vec, top_k)
+
+    # Check if best match is above threshold
+    best_score = float(scores[0][0])
+    if best_score < SIMILARITY_THRESHOLD:
+        return []  # Out of domain — agent will handle this
+
+    results = []
+    for score, idx in zip(scores[0], indices[0]):
+        if idx == -1:
+            continue
+
+        chunk = _metadata[idx]
+        expanded = _get_expanded_context(
+            chunk["judgment_id"],
+            chunk["text"]
+        )
+
+        results.append({
+            "chunk_id": chunk["chunk_id"],
+            "judgment_id": chunk["judgment_id"],
+            "title": chunk.get("title", ""),
+            "year": chunk.get("year", ""),
+            "chunk_text": chunk["text"],
+            "expanded_context": expanded,
+            "similarity_score": float(score)
+        })
+
+    return results
+
+
+def _get_expanded_context(judgment_id: str, chunk_text: str) -> str:
+    """
+    Get ~1024 token window from parent judgment centred on the chunk.
+    Falls back to chunk text if parent not found.
+
+    WHY expand context?
+    The chunk is 512 tokens — enough for retrieval.
+    But the LLM needs more surrounding context to give a complete answer.
+    We go back to the full judgment and extract a wider window.
+    """
+    parent_text = _parent_store.get(judgment_id, "")
+    if not parent_text:
+        return chunk_text
+
+    # Find chunk position in parent
+    anchor = chunk_text[:80]
+    start_pos = parent_text.find(anchor)
+    if start_pos == -1:
+        return chunk_text
+
+    # ~4 chars per token, 1024 tokens = ~4096 chars
+    WINDOW = 4096
+    expand_start = max(0, start_pos - WINDOW // 4)
+    expand_end = min(len(parent_text), start_pos + WINDOW)
+
+    return parent_text[expand_start:expand_end]
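
The index-building script is not part of this commit, but for the threshold comment above to hold (scores as cosine similarity in roughly 0 to 1), the index is presumably inner-product over normalized vectors. A minimal sketch under that assumption, with random stand-in data:

import faiss
import numpy as np

dim = 384  # all-MiniLM-L6-v2 embedding size
vecs = np.random.rand(1000, dim).astype(np.float32)
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-length rows

index = faiss.IndexFlatIP(dim)  # inner product == cosine for unit vectors
index.add(vecs)

scores, ids = index.search(vecs[:1], 5)  # query with the first vector
print(scores[0], ids[0])  # best score ~1.0: the vector matches itself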
src/verify.py
ADDED
@@ -0,0 +1,45 @@
+"""
+Citation verification. Deterministic string matching — no ML.
+
+LOGIC:
+- Extract all quoted phrases (in double quotes) from LLM answer
+- Check each phrase verbatim against all retrieved chunk texts
+- ALL found → Verified
+- ANY missing → Unverified
+- No quotes in answer → "No verifiable claims" (nothing to check)
+
+DOCUMENTED LIMITATION:
+Paraphrased claims that are not quoted pass as Verified.
+Full NLI-based verification is out of scope — documented in README.
+"""
+
+import re
+from typing import List, Dict, Tuple
+
+def extract_quotes(text: str) -> List[str]:
+    """Extract double-quoted phrases of at least 8 characters."""
+    return re.findall(r'"([^"]{8,})"', text)
+
+def verify_citations(
+    llm_answer: str,
+    retrieved_chunks: List[Dict]
+) -> Tuple[str, List[str]]:
+    """
+    Returns (status, unverified_quotes).
+    status: "Verified" | "Unverified" | "No verifiable claims"
+    """
+    quotes = extract_quotes(llm_answer)
+
+    if not quotes:
+        return "No verifiable claims", []
+
+    all_context = " ".join(
+        c.get("expanded_context", c.get("chunk_text", ""))
+        for c in retrieved_chunks
+    ).lower()
+
+    unverified = [q for q in quotes if q.lower() not in all_context]
+
+    if unverified:
+        return "Unverified", unverified
+    return "Verified", []
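
A usage sketch of the matcher above with invented data, showing both outcomes:

from src.verify import extract_quotes, verify_citations

chunks = [{"chunk_text": "The right to life includes the right to privacy.",
           "expanded_context": "The right to life includes the right to privacy."}]

grounded = 'The Court held that "the right to life includes the right to privacy" (ID: 42).'
print(extract_quotes(grounded))            # ['the right to life includes the right to privacy']
print(verify_citations(grounded, chunks))  # ('Verified', [])

paraphrased = 'The Court held that "liberty cannot be suspended at will" (ID: 42).'
print(verify_citations(paraphrased, chunks))  # ('Unverified', ['liberty cannot be suspended at will'])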
tests/test_api.py
ADDED
@@ -0,0 +1,31 @@
+import pytest
+from fastapi.testclient import TestClient
+import os
+
+os.environ["SKIP_MODEL_LOAD"] = "true"
+
+from api.main import app
+
+client = TestClient(app)
+
+def test_health():
+    response = client.get("/health")
+    assert response.status_code == 200
+    assert response.json()["status"] == "ok"
+
+def test_info():
+    response = client.get("/")
+    assert response.status_code == 200
+    assert "endpoints" in response.json()
+
+def test_query_too_short():
+    response = client.post("/query", json={"query": "hi"})
+    assert response.status_code == 400
+
+def test_query_too_long():
+    response = client.post("/query", json={"query": "a" * 2001})
+    assert response.status_code == 400
+
+def test_query_empty():
+    response = client.post("/query", json={"query": ""})
+    assert response.status_code == 400