"""
Precedent Chain Builder — Runtime Module.
Loads citation graph built offline by preprocessing/build_citation_graph.py.
At query time, enriches retrieved chunks with cited predecessor judgments.
WHY:
Indian SC judgments build on each other. A 1984 judgment establishing
a key principle was itself built on a 1971 judgment. Showing the user
the reasoning chain across cases makes NyayaSetu feel like a legal
researcher, not a search engine.
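The graph is loaded once at startup and kept in memory.
Exact-match lookups are O(1) dict access — negligible runtime cost; only the
partial-title fallback used when resolving a citation scans the title index
linearly.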
"""
import os
import json
import re
import logging
from typing import List, Dict, Optional
logger = logging.getLogger(__name__)
# ── Graph store ───────────────────────────────────────────
# Module-level state filled in by load_citation_graph() and read by the lookup
# functions in this module. Every container starts empty, so lookups made
# before loading simply find nothing.
_graph: Dict[str, List[str]] = {}  # judgment_id -> [citation_strings]
_reverse_graph: Dict[str, List[str]] = {}  # citation_string -> [judgment_ids]
_title_to_id: Dict[str, str] = {}  # normalised_title -> judgment_id
_parent_store: Dict[str, str] = {}  # judgment_id -> text (loaded from parent_judgments.jsonl)
# Set to True when load_citation_graph() finishes without raising, False when
# it fails; is_loaded() reports this flag.
_loaded: bool = False
def load_citation_graph(
    graph_path: str = "data/citation_graph.json",
    reverse_path: str = "data/reverse_citation_graph.json",
    title_path: str = "data/title_to_id.json",
    parent_path: str = "data/parent_judgments.jsonl"
):
    """
    Load all citation graph artifacts once at startup.

    Call from api/main.py after download_models().
    Fails gracefully: a missing file is skipped (only the main graph logs a
    warning), and any unexpected error is logged and leaves the module in
    the "not loaded" state instead of propagating to the caller.

    Args:
        graph_path: JSON file mapping judgment_id -> list of citation strings.
        reverse_path: JSON file mapping citation string -> list of judgment ids.
        title_path: JSON file mapping normalised title -> judgment_id.
        parent_path: JSONL file, one object per line, from which the
            "judgment_id" and "text" fields are read.

    Returns:
        None. Results are stored in the module-level graph store and the
        outcome is reported through is_loaded().
    """
    global _graph, _reverse_graph, _title_to_id, _parent_store, _loaded
    try:
        # All files are read as UTF-8 explicitly so that decoding does not
        # depend on the platform's locale encoding.
        if os.path.exists(graph_path):
            with open(graph_path, encoding="utf-8") as f:
                _graph = json.load(f)
            logger.info(f"Citation graph loaded: {len(_graph)} judgments")
        else:
            logger.warning(f"Citation graph not found at {graph_path}")

        if os.path.exists(reverse_path):
            with open(reverse_path, encoding="utf-8") as f:
                _reverse_graph = json.load(f)
            logger.info(f"Reverse citation graph loaded: {len(_reverse_graph)} citations")

        if os.path.exists(title_path):
            with open(title_path, encoding="utf-8") as f:
                _title_to_id = json.load(f)
            logger.info(f"Title index loaded: {len(_title_to_id)} titles")

        # Load parent judgments for text retrieval
        if os.path.exists(parent_path):
            # Build into a fresh dict and publish it only after the whole file
            # has been read: a reload then drops stale entries, and a failure
            # part-way through does not leave a half-filled store behind.
            parent_store = {}
            skipped = 0
            with open(parent_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                        jid = record.get("judgment_id", "")
                        if jid:
                            parent_store[jid] = record.get("text", "")
                    except (ValueError, AttributeError, TypeError, RecursionError):
                        # ValueError: malformed JSON; AttributeError: the line
                        # is not a JSON object; TypeError: unhashable id.
                        # One bad line must not abort the whole load.
                        skipped += 1
            _parent_store = parent_store
            logger.info(f"Parent store loaded: {len(_parent_store)} judgments")
            if skipped:
                logger.warning(f"Parent store: skipped {skipped} malformed lines in {parent_path}")

        _loaded = True
    except Exception as e:
        # Top-level boundary: the precedent chain is an optional enrichment,
        # so a load failure disables it rather than crashing startup.
        logger.error(f"Citation graph load failed: {e}. Precedent chain disabled.")
        _loaded = False
def _resolve_citation_to_judgment(citation_string: str) -> Optional[str]:
    """
    Try to match a citation string to a judgment_id.

    Uses multiple strategies in order of reliability:
      1. exact key lookup in the reverse citation graph (first listed id);
      2. exact lookup of the normalised citation in the title index;
      3. substring match of the normalised citation's first 20 characters
         against every title in the index (first hit wins).

    Args:
        citation_string: raw citation text as stored in the citation graph.

    Returns:
        The matched judgment_id, or None when the input is empty or no
        strategy finds a match.
    """
    if not citation_string:
        return None

    # Strategy 1: Check reverse graph directly
    # NOTE(review): if the reverse graph lists the judgments that *cite* this
    # string rather than the judgment it refers to, refs[0] is a citing
    # judgment, not the cited one — confirm against the offline builder.
    refs = _reverse_graph.get(citation_string)
    if refs:
        return refs[0]

    # Strategy 2: Normalise and check title index
    # Normalisation: lowercase, drop everything that is neither a word
    # character nor whitespace, keep at most the first 50 characters.
    normalised = re.sub(r'[^\w\s]', '', citation_string.lower())[:50]
    if normalised in _title_to_id:
        return _title_to_id[normalised]

    # Strategy 3: Partial match on title index
    # The length guard and the 20-character prefix do not depend on the title
    # being inspected, so both are computed once; citations that normalise to
    # 10 characters or fewer skip the scan entirely.
    if len(normalised) > 10:
        prefix = normalised[:20]
        for title, jid in _title_to_id.items():
            if prefix in title:
                return jid

    return None
def get_precedent_chain(
    judgment_ids: List[str],
    max_precedents: int = 3,
    max_citations_per_judgment: int = 3
) -> List[Dict]:
    """
    Given a list of retrieved judgment IDs, return their cited predecessors.

    Args:
        judgment_ids: IDs of judgments already retrieved by FAISS
        max_precedents: maximum number of precedent chunks to return;
            zero or a negative value yields an empty list
        max_citations_per_judgment: how many of each judgment's citations
            (taken from the start of its citation list) are considered;
            negative values are treated as zero

    Returns:
        List of precedent dicts with same structure as regular chunks,
        plus 'is_precedent': True and 'cited_by' field. Empty when the graph
        is not loaded. A judgment appears at most once and never duplicates
        one of the input judgment_ids.
    """
    # max_precedents <= 0 must short-circuit here: the limit is otherwise only
    # reached after something has been appended.
    if not _loaded or not _graph or max_precedents <= 0:
        return []

    per_judgment_cap = max(max_citations_per_judgment, 0)
    precedents: List[Dict] = []
    seen_ids = set(judgment_ids)

    for jid in judgment_ids:
        if len(precedents) >= max_precedents:
            break
        citations = _graph.get(jid, [])
        for citation_ref in citations[:per_judgment_cap]:
            if len(precedents) >= max_precedents:
                break

            resolved_id = _resolve_citation_to_judgment(citation_ref)
            if not resolved_id or resolved_id in seen_ids:
                continue

            # Get text from parent store; a precedent without text is useless
            # to the caller, so it is skipped (and not marked as seen).
            text = _parent_store.get(resolved_id, "")
            if not text:
                continue
            seen_ids.add(resolved_id)

            # Excerpt is simply the first 1500 characters of the stored text,
            # with surrounding whitespace removed; no header is skipped.
            excerpt = text[:1500].strip()
            precedents.append({
                "judgment_id": resolved_id,
                "chunk_id": f"{resolved_id}_precedent",
                "text": excerpt,
                "title": f"Precedent: {citation_ref[:80]}",
                # Second "_"-separated field of the id; presumably ids look
                # like <prefix>_<year>_... — TODO confirm with the id scheme.
                "year": resolved_id.split("_")[1] if "_" in resolved_id else "",
                "source_type": "case_law",
                "is_precedent": True,
                "cited_by": jid,
                "citation_ref": citation_ref,
                "similarity_score": 0.5  # precedents are added, not ranked
            })

    if precedents:
        logger.info(f"Precedent chain: added {len(precedents)} predecessor judgments")
    return precedents
def get_citation_count(judgment_id: str) -> int:
    """
    How many times has this judgment been cited by others.

    Walks every citation list in the loaded graph, resolves each citation to
    a judgment id and counts the ones that resolve to judgment_id. Every
    occurrence counts, including repeats within one judgment's list.

    Args:
        judgment_id: id of the judgment whose incoming citations are counted.

    Returns:
        Number of matching citation occurrences; 0 when the graph is empty.
    """
    # The same citation string can appear in many judgments and resolving it
    # may fall back to a scan of the title index, so each distinct string is
    # resolved once per call. Assumes citation entries are hashable strings.
    resolved_cache: Dict[str, Optional[str]] = {}
    count = 0
    for citations in _graph.values():
        for citation in citations:
            if citation not in resolved_cache:
                resolved_cache[citation] = _resolve_citation_to_judgment(citation)
            if resolved_cache[citation] == judgment_id:
                count += 1
    return count
def is_loaded() -> bool:
    """Return True when the last load_citation_graph() call finished without error."""
    return _loaded