# Spaces:
#
# CaffeinatedCoding
# /
#
# nyayasetu
#
# Running
#
# File size: 6,364 Bytes
#
# 8efa523

"""
Precedent Chain Builder — Runtime Module.

Loads citation graph built offline by preprocessing/build_citation_graph.py.
At query time, enriches retrieved chunks with cited predecessor judgments.

WHY:
Indian SC judgments build on each other. A 1984 judgment establishing
a key principle was itself built on a 1971 judgment. Showing the user
the reasoning chain across cases makes NyayaSetu feel like a legal
researcher, not a search engine.

The graph is loaded once at startup and kept in memory.
Exact lookups are O(1) dict access; only the fuzzy title fallback
scans the title index linearly.
"""

import os
import json
import re
import logging
from typing import List, Dict, Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ── Graph store ───────────────────────────────────────────
# Module-level state. Everything starts empty and is filled in by
# load_citation_graph(); the lookup functions only read it.
_graph: Dict[str, List[str]] = {}           # judgment_id -> [citation_strings]
_reverse_graph: Dict[str, List[str]] = {}   # citation_string -> [judgment_ids]
_title_to_id: Dict[str, str] = {}     # normalised_title -> judgment_id
_parent_store: Dict[str, str] = {}    # judgment_id -> text (loaded from parent_judgments.jsonl)
_loaded: bool = False  # True only after load_citation_graph() finishes without raising


def load_citation_graph(
    graph_path: str = "data/citation_graph.json",
    reverse_path: str = "data/reverse_citation_graph.json",
    title_path: str = "data/title_to_id.json",
    parent_path: str = "data/parent_judgments.jsonl"
):
    """
    Load all citation graph artifacts once at startup.
    Call from api/main.py after download_models().
    Fails gracefully if files not found.

    Args:
        graph_path: JSON file mapping judgment_id -> list of citation strings.
        reverse_path: JSON file mapping citation string -> list of judgment IDs.
        title_path: JSON file mapping normalised title -> judgment_id.
        parent_path: JSONL file with one object per line; the "judgment_id"
            and "text" keys of each object are read.

    Side effects:
        Rebinds the module-level stores for every artifact whose file exists
        (a missing file leaves the corresponding store untouched) and sets
        ``_loaded``. Any exception is logged and leaves ``_loaded`` False
        instead of propagating.
    """
    global _graph, _reverse_graph, _title_to_id, _parent_store, _loaded

    try:
        # JSON is UTF-8 by specification; passing the encoding explicitly keeps
        # decoding independent of the platform's default locale.
        if os.path.exists(graph_path):
            with open(graph_path, encoding="utf-8") as f:
                _graph = json.load(f)
            logger.info(f"Citation graph loaded: {len(_graph)} judgments")
        else:
            logger.warning(f"Citation graph not found at {graph_path}")

        if os.path.exists(reverse_path):
            with open(reverse_path, encoding="utf-8") as f:
                _reverse_graph = json.load(f)
            logger.info(f"Reverse citation graph loaded: {len(_reverse_graph)} citations")

        if os.path.exists(title_path):
            with open(title_path, encoding="utf-8") as f:
                _title_to_id = json.load(f)
            logger.info(f"Title index loaded: {len(_title_to_id)} titles")

        # Load parent judgments for text retrieval
        if os.path.exists(parent_path):
            # Build into a fresh dict and rebind at the end, so a reload does
            # not keep stale entries from an earlier call (same behaviour as
            # the three JSON stores above).
            parents = {}
            skipped = 0
            with open(parent_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                        jid = record.get("judgment_id", "")
                        if jid:
                            parents[jid] = record.get("text", "")
                    except (ValueError, AttributeError, TypeError):
                        # ValueError: malformed JSON; AttributeError: the line
                        # is valid JSON but not an object; TypeError: the id
                        # is not hashable. Skip the line, but keep count so
                        # data problems are visible in the logs.
                        skipped += 1
            _parent_store = parents
            if skipped:
                logger.warning(
                    f"Parent store: skipped {skipped} malformed lines in {parent_path}"
                )
            logger.info(f"Parent store loaded: {len(_parent_store)} judgments")

        _loaded = True

    except Exception as e:
        # Startup boundary: never let a bad artifact take the API down.
        logger.error(f"Citation graph load failed: {e}. Precedent chain disabled.")
        _loaded = False


def _resolve_citation_to_judgment(citation_string: str) -> Optional[str]:
    """
    Map a raw citation string onto a judgment_id, or None if nothing matches.

    Three lookups are attempted, most reliable first:
      1. the citation string as an exact key of the reverse graph
         (first listed judgment wins);
      2. the lower-cased, punctuation-stripped citation, cut to 50
         characters, as an exact key of the title index;
      3. the first 20 characters of that normalised form as a substring of
         any indexed title (only when the normalised form is longer than
         10 characters). This step walks the whole title index.
    """
    if not citation_string:
        return None

    # 1. Exact hit in the reverse graph.
    direct_hits = _reverse_graph.get(citation_string)
    if direct_hits:
        return direct_hits[0]

    # 2. Exact hit on the normalised title.
    lowered = citation_string.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    key = without_punctuation[:50]
    try:
        return _title_to_id[key]
    except KeyError:
        pass

    # 3. Prefix-as-substring scan; too-short keys would match almost anything.
    if len(key) <= 10:
        return None
    prefix = key[:20]
    return next(
        (jid for title, jid in _title_to_id.items() if prefix in title),
        None,
    )


def get_precedent_chain(
    judgment_ids: List[str],
    max_precedents: int = 3,
    max_citations_per_judgment: int = 3
) -> List[Dict]:
    """
    Given a list of retrieved judgment IDs, return their cited predecessors.

    Args:
        judgment_ids: IDs of judgments already retrieved by FAISS
        max_precedents: maximum number of precedent chunks to return;
            zero or a negative value yields an empty list
        max_citations_per_judgment: how many of each judgment's citations
            (taken from the start of its citation list) are considered;
            citations that cannot be resolved still count against this limit

    Returns:
        List of precedent dicts with same structure as regular chunks,
        plus 'is_precedent': True and 'cited_by' field. Empty when the graph
        is not loaded, when no IDs are given, or when nothing resolves to a
        judgment with stored text.
    """
    if not _loaded or not _graph:
        return []

    # A non-positive budget must return nothing. The cap below is only checked
    # after an append, so without this guard one precedent would slip through.
    if not judgment_ids or max_precedents <= 0:
        return []

    # A negative limit would turn the slice below into "all but the last N".
    per_judgment_cap = max(max_citations_per_judgment, 0)

    precedents = []
    # Seeded with the retrieved IDs so a judgment is never offered as its own
    # precedent, and extended as precedents are added to avoid duplicates.
    seen_ids = set(judgment_ids)

    for jid in judgment_ids:
        citations = _graph.get(jid) or []

        for citation_ref in citations[:per_judgment_cap]:
            resolved_id = _resolve_citation_to_judgment(citation_ref)

            if not resolved_id or resolved_id in seen_ids:
                continue

            # Get text from parent store; a precedent without text is useless
            text = _parent_store.get(resolved_id, "")
            if not text:
                continue

            seen_ids.add(resolved_id)

            # Excerpt is simply the first 1500 characters, whitespace-trimmed
            excerpt = text[:1500].strip()

            precedents.append({
                "judgment_id": resolved_id,
                "chunk_id": f"{resolved_id}_precedent",
                "text": excerpt,
                "title": f"Precedent: {citation_ref[:80]}",
                # Second "_"-separated field of the ID, if there is one
                "year": resolved_id.split("_")[1] if "_" in resolved_id else "",
                "source_type": "case_law",
                "is_precedent": True,
                "cited_by": jid,
                "citation_ref": citation_ref,
                "similarity_score": 0.5  # precedents are added, not ranked
            })

            if len(precedents) >= max_precedents:
                break

        if len(precedents) >= max_precedents:
            break

    if precedents:
        logger.info(f"Precedent chain: added {len(precedents)} predecessor judgments")

    return precedents


def get_citation_count(judgment_id: str) -> int:
    """
    How many times has this judgment been cited by others.

    Walks every citation list in the loaded graph, resolves each citation
    string to a judgment_id and counts the ones that resolve to
    ``judgment_id``. Repeated citations are counted each time they occur.

    Args:
        judgment_id: ID of the judgment whose incoming citations are counted.

    Returns:
        The number of matching citations; 0 for an empty/None ID or when the
        graph holds no citations.
    """
    # Unresolvable citations resolve to None, so a None ID would otherwise
    # "match" every one of them.
    if not judgment_id:
        return 0

    # Resolution may fall back to a scan of the whole title index, so resolve
    # each distinct citation string only once per call.
    resolved_by_citation = {}
    count = 0
    for citations in _graph.values():
        for citation in citations or ():
            if citation not in resolved_by_citation:
                resolved_by_citation[citation] = _resolve_citation_to_judgment(citation)
            if resolved_by_citation[citation] == judgment_id:
                count += 1
    return count


def is_loaded() -> bool:
    """Return the module's load flag: True once load_citation_graph() has
    completed without raising, False before that or after a failed load."""
    return _loaded