Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- api/main.py +36 -5
- src/citation_graph.py +193 -0
api/main.py
CHANGED
|
@@ -21,8 +21,8 @@ logger = logging.getLogger(__name__)
|
|
| 21 |
|
| 22 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
|
|
|
|
| 24 |
def download_models():
|
| 25 |
-
|
| 26 |
hf_token = os.getenv("HF_TOKEN")
|
| 27 |
if not hf_token:
|
| 28 |
logger.warning("HF_TOKEN not set — skipping model download.")
|
|
@@ -30,35 +30,61 @@ def download_models():
|
|
| 30 |
try:
|
| 31 |
from huggingface_hub import snapshot_download, hf_hub_download
|
| 32 |
repo_id = "CaffeinatedCoding/nyayasetu-models"
|
|
|
|
| 33 |
if not os.path.exists("models/ner_model"):
|
| 34 |
logger.info("Downloading NER model...")
|
| 35 |
-
snapshot_download(
|
|
|
|
|
|
|
|
|
|
| 36 |
logger.info("NER model downloaded")
|
| 37 |
else:
|
| 38 |
logger.info("NER model already exists")
|
|
|
|
| 39 |
if not os.path.exists("models/faiss_index/index.faiss"):
|
| 40 |
logger.info("Downloading FAISS index...")
|
| 41 |
os.makedirs("models/faiss_index", exist_ok=True)
|
| 42 |
-
hf_hub_download(repo_id=repo_id, filename="faiss_index/index.faiss",
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
logger.info("FAISS index downloaded")
|
| 45 |
else:
|
| 46 |
logger.info("FAISS index already exists")
|
|
|
|
| 47 |
if not os.path.exists("data/parent_judgments.jsonl"):
|
| 48 |
logger.info("Downloading parent judgments...")
|
| 49 |
os.makedirs("data", exist_ok=True)
|
| 50 |
-
hf_hub_download(repo_id=repo_id, filename="parent_judgments.jsonl",
|
|
|
|
| 51 |
logger.info("Parent judgments downloaded")
|
| 52 |
else:
|
| 53 |
logger.info("Parent judgments already exist")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
logger.error(f"Model download failed: {e}")
|
| 56 |
|
|
|
|
| 57 |
download_models()
|
| 58 |
|
| 59 |
from src.ner import load_ner_model
|
| 60 |
load_ner_model()
|
| 61 |
|
|
|
|
|
|
|
|
|
|
| 62 |
AGENT_VERSION = os.getenv("AGENT_VERSION", "v2")
|
| 63 |
|
| 64 |
if AGENT_VERSION == "v2":
|
|
@@ -77,10 +103,12 @@ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], all
|
|
| 77 |
if os.path.exists("frontend"):
|
| 78 |
app.mount("/static", StaticFiles(directory="frontend"), name="static")
|
| 79 |
|
|
|
|
| 80 |
class QueryRequest(BaseModel):
|
| 81 |
query: str
|
| 82 |
session_id: Optional[str] = None
|
| 83 |
|
|
|
|
| 84 |
class QueryResponse(BaseModel):
|
| 85 |
query: str
|
| 86 |
answer: str
|
|
@@ -92,16 +120,19 @@ class QueryResponse(BaseModel):
|
|
| 92 |
truncated: bool
|
| 93 |
latency_ms: float
|
| 94 |
|
|
|
|
| 95 |
@app.get("/")
|
| 96 |
def serve_frontend():
|
| 97 |
if os.path.exists("frontend/index.html"):
|
| 98 |
return FileResponse("frontend/index.html")
|
| 99 |
return {"name": "NyayaSetu", "version": "2.0.0", "agent": AGENT_VERSION}
|
| 100 |
|
|
|
|
| 101 |
@app.get("/health")
|
| 102 |
def health():
|
| 103 |
return {"status": "ok", "service": "NyayaSetu", "version": "2.0.0", "agent": AGENT_VERSION}
|
| 104 |
|
|
|
|
| 105 |
@app.post("/query", response_model=QueryResponse)
|
| 106 |
def query(request: QueryRequest):
|
| 107 |
if not request.query.strip():
|
|
|
|
| 21 |
|
| 22 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
|
| 24 |
+
|
| 25 |
def download_models():
|
|
|
|
| 26 |
hf_token = os.getenv("HF_TOKEN")
|
| 27 |
if not hf_token:
|
| 28 |
logger.warning("HF_TOKEN not set — skipping model download.")
|
|
|
|
| 30 |
try:
|
| 31 |
from huggingface_hub import snapshot_download, hf_hub_download
|
| 32 |
repo_id = "CaffeinatedCoding/nyayasetu-models"
|
| 33 |
+
|
| 34 |
if not os.path.exists("models/ner_model"):
|
| 35 |
logger.info("Downloading NER model...")
|
| 36 |
+
snapshot_download(
|
| 37 |
+
repo_id=repo_id, repo_type="model",
|
| 38 |
+
allow_patterns="ner_model/*", local_dir="models", token=hf_token
|
| 39 |
+
)
|
| 40 |
logger.info("NER model downloaded")
|
| 41 |
else:
|
| 42 |
logger.info("NER model already exists")
|
| 43 |
+
|
| 44 |
if not os.path.exists("models/faiss_index/index.faiss"):
|
| 45 |
logger.info("Downloading FAISS index...")
|
| 46 |
os.makedirs("models/faiss_index", exist_ok=True)
|
| 47 |
+
hf_hub_download(repo_id=repo_id, filename="faiss_index/index.faiss",
|
| 48 |
+
repo_type="model", local_dir="models", token=hf_token)
|
| 49 |
+
hf_hub_download(repo_id=repo_id, filename="faiss_index/chunk_metadata.jsonl",
|
| 50 |
+
repo_type="model", local_dir="models", token=hf_token)
|
| 51 |
logger.info("FAISS index downloaded")
|
| 52 |
else:
|
| 53 |
logger.info("FAISS index already exists")
|
| 54 |
+
|
| 55 |
if not os.path.exists("data/parent_judgments.jsonl"):
|
| 56 |
logger.info("Downloading parent judgments...")
|
| 57 |
os.makedirs("data", exist_ok=True)
|
| 58 |
+
hf_hub_download(repo_id=repo_id, filename="parent_judgments.jsonl",
|
| 59 |
+
repo_type="model", local_dir="data", token=hf_token)
|
| 60 |
logger.info("Parent judgments downloaded")
|
| 61 |
else:
|
| 62 |
logger.info("Parent judgments already exist")
|
| 63 |
+
|
| 64 |
+
# Download citation graph artifacts — only if Kaggle run has completed
|
| 65 |
+
os.makedirs("data", exist_ok=True)
|
| 66 |
+
for fname in ["citation_graph.json", "reverse_citation_graph.json", "title_to_id.json"]:
|
| 67 |
+
if not os.path.exists(f"data/{fname}"):
|
| 68 |
+
logger.info(f"Downloading {fname}...")
|
| 69 |
+
try:
|
| 70 |
+
hf_hub_download(repo_id=repo_id, filename=fname,
|
| 71 |
+
repo_type="model", local_dir="data", token=hf_token)
|
| 72 |
+
logger.info(f"{fname} downloaded")
|
| 73 |
+
except Exception as fe:
|
| 74 |
+
logger.warning(f"{fname} not on Hub yet — skipping: {fe}")
|
| 75 |
+
|
| 76 |
except Exception as e:
|
| 77 |
logger.error(f"Model download failed: {e}")
|
| 78 |
|
| 79 |
+
|
| 80 |
download_models()
|
| 81 |
|
| 82 |
from src.ner import load_ner_model
|
| 83 |
load_ner_model()
|
| 84 |
|
| 85 |
+
from src.citation_graph import load_citation_graph
|
| 86 |
+
load_citation_graph()
|
| 87 |
+
|
| 88 |
AGENT_VERSION = os.getenv("AGENT_VERSION", "v2")
|
| 89 |
|
| 90 |
if AGENT_VERSION == "v2":
|
|
|
|
| 103 |
if os.path.exists("frontend"):
|
| 104 |
app.mount("/static", StaticFiles(directory="frontend"), name="static")
|
| 105 |
|
| 106 |
+
|
| 107 |
class QueryRequest(BaseModel):
|
| 108 |
query: str
|
| 109 |
session_id: Optional[str] = None
|
| 110 |
|
| 111 |
+
|
| 112 |
class QueryResponse(BaseModel):
|
| 113 |
query: str
|
| 114 |
answer: str
|
|
|
|
| 120 |
truncated: bool
|
| 121 |
latency_ms: float
|
| 122 |
|
| 123 |
+
|
| 124 |
@app.get("/")
|
| 125 |
def serve_frontend():
|
| 126 |
if os.path.exists("frontend/index.html"):
|
| 127 |
return FileResponse("frontend/index.html")
|
| 128 |
return {"name": "NyayaSetu", "version": "2.0.0", "agent": AGENT_VERSION}
|
| 129 |
|
| 130 |
+
|
| 131 |
@app.get("/health")
|
| 132 |
def health():
|
| 133 |
return {"status": "ok", "service": "NyayaSetu", "version": "2.0.0", "agent": AGENT_VERSION}
|
| 134 |
|
| 135 |
+
|
| 136 |
@app.post("/query", response_model=QueryResponse)
|
| 137 |
def query(request: QueryRequest):
|
| 138 |
if not request.query.strip():
|
src/citation_graph.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Precedent Chain Builder — Runtime Module.
|
| 3 |
+
|
| 4 |
+
Loads citation graph built offline by preprocessing/build_citation_graph.py.
|
| 5 |
+
At query time, enriches retrieved chunks with cited predecessor judgments.
|
| 6 |
+
|
| 7 |
+
WHY:
|
| 8 |
+
Indian SC judgments build on each other. A 1984 judgment establishing
|
| 9 |
+
a key principle was itself built on a 1971 judgment. Showing the user
|
| 10 |
+
the reasoning chain across cases makes NyayaSetu feel like a legal
|
| 11 |
+
researcher, not a search engine.
|
| 12 |
+
|
| 13 |
+
The graph is loaded once at startup and kept in memory.
|
| 14 |
+
Lookup is O(1) dict access — negligible runtime cost.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import json
|
| 19 |
+
import re
|
| 20 |
+
import logging
|
| 21 |
+
from typing import List, Dict, Optional
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
# ── Graph store ───────────────────────────────────────────
|
| 26 |
+
_graph = {} # judgment_id -> [citation_strings]
|
| 27 |
+
_reverse_graph = {} # citation_string -> [judgment_ids]
|
| 28 |
+
_title_to_id = {} # normalised_title -> judgment_id
|
| 29 |
+
_parent_store = {} # judgment_id -> text (loaded from parent_judgments.jsonl)
|
| 30 |
+
_loaded = False
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_citation_graph(
    graph_path: str = "data/citation_graph.json",
    reverse_path: str = "data/reverse_citation_graph.json",
    title_path: str = "data/title_to_id.json",
    parent_path: str = "data/parent_judgments.jsonl"
):
    """
    Load all citation graph artifacts once at startup.

    Call from api/main.py after download_models(). Fails gracefully if
    files are not found: missing artifacts are logged and skipped, and any
    unexpected error disables the precedent chain instead of crashing.

    Side effects: populates the module-level _graph, _reverse_graph,
    _title_to_id and _parent_store dicts and sets _loaded.
    """
    global _graph, _reverse_graph, _title_to_id, _parent_store, _loaded

    try:
        if os.path.exists(graph_path):
            # judgment_id -> [citation_strings]; explicit UTF-8 so legal
            # text with non-ASCII characters loads identically on Windows.
            with open(graph_path, encoding="utf-8") as f:
                _graph = json.load(f)
            logger.info(f"Citation graph loaded: {len(_graph)} judgments")
        else:
            logger.warning(f"Citation graph not found at {graph_path}")

        if os.path.exists(reverse_path):
            with open(reverse_path, encoding="utf-8") as f:
                _reverse_graph = json.load(f)
            logger.info(f"Reverse citation graph loaded: {len(_reverse_graph)} citations")

        if os.path.exists(title_path):
            with open(title_path, encoding="utf-8") as f:
                _title_to_id = json.load(f)
            logger.info(f"Title index loaded: {len(_title_to_id)} titles")

        # Load parent judgments for text retrieval (JSONL: one judgment per line).
        if os.path.exists(parent_path):
            with open(parent_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        j = json.loads(line)
                    except json.JSONDecodeError:
                        # Skip malformed lines; don't abort the whole load.
                        continue
                    if not isinstance(j, dict):
                        continue
                    jid = j.get("judgment_id", "")
                    if jid:
                        _parent_store[jid] = j.get("text", "")
            logger.info(f"Parent store loaded: {len(_parent_store)} judgments")

        _loaded = True

    except Exception as e:
        logger.error(f"Citation graph load failed: {e}. Precedent chain disabled.")
        _loaded = False
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _resolve_citation_to_judgment(citation_string: str) -> Optional[str]:
    """
    Try to match a citation string to a judgment_id.

    Uses multiple strategies in order of reliability; returns None when no
    strategy matches.
    """
    if not citation_string:
        return None

    # Strategy 1: exact hit in the reverse graph (most reliable).
    refs = _reverse_graph.get(citation_string)
    if refs:
        return refs[0]

    # Strategy 2: normalise (lowercase, strip punctuation, cap at 50 chars)
    # and check the title index.
    normalised = re.sub(r'[^\w\s]', '', citation_string.lower())[:50]
    if normalised in _title_to_id:
        return _title_to_id[normalised]

    # Strategy 3: prefix substring match against every indexed title.
    # The length guard and the 20-char prefix are loop-invariant, so they
    # are hoisted here instead of being recomputed per title.
    if len(normalised) > 10:
        prefix = normalised[:20]
        for title, jid in _title_to_id.items():
            if prefix in title:
                return jid

    return None
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def get_precedent_chain(
    judgment_ids: List[str],
    max_precedents: int = 3,
    max_citations_per_judgment: int = 3
) -> List[Dict]:
    """
    Given a list of retrieved judgment IDs, return their cited predecessors.

    Args:
        judgment_ids: IDs of judgments already retrieved by FAISS
        max_precedents: maximum number of precedent chunks to return
        max_citations_per_judgment: how many citations to follow per source
            judgment (generalises the previously hard-coded limit of 3)

    Returns:
        List of precedent dicts with same structure as regular chunks,
        plus 'is_precedent': True and 'cited_by' field.
    """
    if not _loaded or not _graph:
        return []

    precedents: List[Dict] = []
    # Never re-add a judgment the caller already retrieved, or one we
    # already emitted as a precedent.
    seen_ids = set(judgment_ids)

    for jid in judgment_ids:
        for citation_ref in _graph.get(jid, [])[:max_citations_per_judgment]:
            resolved_id = _resolve_citation_to_judgment(citation_ref)
            if not resolved_id or resolved_id in seen_ids:
                continue

            # Full judgment text from the parent store; skip unknown ids.
            text = _parent_store.get(resolved_id, "")
            if not text:
                continue

            seen_ids.add(resolved_id)

            precedents.append({
                "judgment_id": resolved_id,
                "chunk_id": f"{resolved_id}_precedent",
                # Excerpt only — first 1500 chars of the judgment text.
                "text": text[:1500].strip(),
                "title": f"Precedent: {citation_ref[:80]}",
                # Assumes IDs embed the year as the second "_" field —
                # TODO confirm against the offline graph builder.
                "year": resolved_id.split("_")[1] if "_" in resolved_id else "",
                "source_type": "case_law",
                "is_precedent": True,
                "cited_by": jid,
                "citation_ref": citation_ref,
                "similarity_score": 0.5  # precedents are added, not ranked
            })

            if len(precedents) >= max_precedents:
                # Early return replaces the original nested double-break.
                logger.info(f"Precedent chain: added {len(precedents)} predecessor judgments")
                return precedents

    if precedents:
        logger.info(f"Precedent chain: added {len(precedents)} predecessor judgments")

    return precedents
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def get_citation_count(judgment_id: str) -> int:
    """How many times has this judgment been cited by others."""
    # Resolve every outgoing citation in the graph and tally the ones
    # that point back at the requested judgment.
    return sum(
        1
        for outgoing in _graph.values()
        for ref in outgoing
        if _resolve_citation_to_judgment(ref) == judgment_id
    )
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def is_loaded() -> bool:
    """Report whether load_citation_graph() completed successfully."""
    return _loaded
|