File size: 6,364 Bytes
8efa523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
Precedent Chain Builder — Runtime Module.

Loads citation graph built offline by preprocessing/build_citation_graph.py.
At query time, enriches retrieved chunks with cited predecessor judgments.

WHY:
Indian SC judgments build on each other. A 1984 judgment establishing
a key principle was itself built on a 1971 judgment. Showing the user
the reasoning chain across cases makes NyayaSetu feel like a legal
researcher, not a search engine.

The graph is loaded once at startup and kept in memory.
Exact lookups are O(1) dict access; the partial-title fallback in
_resolve_citation_to_judgment scans the title index linearly.
"""

import os
import json
import re
import logging
from typing import List, Dict, Optional

logger = logging.getLogger(__name__)

# ── Graph store ───────────────────────────────────────────
# Module-level caches. load_citation_graph() fills them; the lookup
# functions below only read them.
_graph: Dict[str, List[str]] = {}           # judgment_id -> [citation_strings]
_reverse_graph: Dict[str, List[str]] = {}   # citation_string -> [judgment_ids]
_title_to_id: Dict[str, str] = {}     # normalised_title -> judgment_id
_parent_store: Dict[str, str] = {}    # judgment_id -> text (loaded from parent_judgments.jsonl)
# Set True when load_citation_graph() finishes without raising, False when it fails.
_loaded: bool = False


def _load_json_mapping(path: str, label: str, unit: str) -> Optional[dict]:
    """
    Read one JSON artifact that must decode to an object (dict).

    Args:
        path: location of the JSON file
        label: artifact name used in log messages
        unit: noun for the entries, used in the "loaded" log line

    Returns:
        The decoded dict, or None when the file does not exist.

    Raises:
        ValueError: the file decoded to something other than a dict
            (json.JSONDecodeError, a ValueError subclass, for bad JSON).
        OSError: the file exists but cannot be read.
    """
    if not os.path.exists(path):
        logger.warning("%s not found at %s", label, path)
        return None

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # Callers use .get() / .items() on these, so anything but a dict would
    # only blow up later at query time. Fail here instead.
    if not isinstance(data, dict):
        raise ValueError(f"{label} at {path} is not a JSON object")

    logger.info("%s loaded: %d %s", label, len(data), unit)
    return data


def load_citation_graph(
    graph_path: str = "data/citation_graph.json",
    reverse_path: str = "data/reverse_citation_graph.json",
    title_path: str = "data/title_to_id.json",
    parent_path: str = "data/parent_judgments.jsonl"
):
    """
    Load all citation graph artifacts once at startup.
    Call from api/main.py after download_models().

    Missing files are logged and skipped; the corresponding store keeps
    its previous contents. Any other failure (unreadable file, invalid
    JSON, wrong top-level type) is logged and disables the precedent
    chain by leaving the module marked as not loaded.

    Args:
        graph_path: JSON object, judgment_id -> list of citation strings
        reverse_path: JSON object, citation string -> list of judgment_ids
        title_path: JSON object, normalised title -> judgment_id
        parent_path: JSONL file, one object per line with "judgment_id"
            and "text" keys

    Returns:
        None. Results are stored in module-level state.
    """
    global _graph, _reverse_graph, _title_to_id, _parent_store, _loaded

    try:
        graph = _load_json_mapping(graph_path, "Citation graph", "judgments")
        if graph is not None:
            _graph = graph

        reverse = _load_json_mapping(
            reverse_path, "Reverse citation graph", "citations"
        )
        if reverse is not None:
            _reverse_graph = reverse

        titles = _load_json_mapping(title_path, "Title index", "titles")
        if titles is not None:
            _title_to_id = titles

        # Load parent judgments for text retrieval
        if os.path.exists(parent_path):
            skipped = 0
            with open(parent_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                        jid = record.get("judgment_id", "")
                        if jid:
                            _parent_store[jid] = record.get("text", "")
                    except (ValueError, AttributeError, TypeError):
                        # ValueError: malformed JSON; AttributeError: the
                        # line is not an object; TypeError: unhashable id.
                        # One bad line must not abort the whole load.
                        skipped += 1
            if skipped:
                logger.warning(
                    "Parent store: skipped %d malformed lines in %s",
                    skipped, parent_path,
                )
            logger.info("Parent store loaded: %d judgments", len(_parent_store))
        else:
            logger.warning("Parent judgments not found at %s", parent_path)

        _loaded = True

    except Exception as e:
        # Startup boundary: never let a bad artifact crash the API.
        logger.error("Citation graph load failed: %s. Precedent chain disabled.", e)
        _loaded = False


def _resolve_citation_to_judgment(citation_string: str) -> Optional[str]:
    """
    Map a raw citation string to a judgment_id, or None if nothing matches.

    Three lookups are attempted, most reliable first:
      1. the citation string as a key of the reverse graph (first entry wins)
      2. the normalised string as a key of the title index
      3. the first 20 characters of the normalised string as a substring
         of any indexed title (only when the normalised string is longer
         than 10 characters)
    """
    if not citation_string:
        return None

    # 1. Exact hit in the reverse graph; an empty list falls through.
    direct_hits = _reverse_graph.get(citation_string)
    if direct_hits:
        return direct_hits[0]

    # 2. Lower-case, drop punctuation, cap at 50 chars, then exact title match.
    key = re.sub(r'[^\w\s]', '', citation_string.lower())[:50]
    try:
        return _title_to_id[key]
    except KeyError:
        pass

    # 3. Prefix-as-substring scan over every indexed title.
    if len(key) > 10:
        prefix = key[:20]
        return next(
            (jid for title, jid in _title_to_id.items() if prefix in title),
            None,
        )

    return None


def get_precedent_chain(
    judgment_ids: List[str],
    max_precedents: int = 3
) -> List[Dict]:
    """
    Given a list of retrieved judgment IDs, return their cited predecessors.

    Args:
        judgment_ids: IDs of judgments already retrieved by FAISS
        max_precedents: maximum number of precedent chunks to return;
            zero or a negative value yields an empty list

    Returns:
        List of precedent dicts with same structure as regular chunks,
        plus 'is_precedent': True and 'cited_by' field. Empty when the
        graph is not loaded, no IDs are given, or nothing resolves to a
        judgment with stored text.
    """
    if not _loaded or not _graph:
        return []
    # The cap is otherwise only checked after an append, so a non-positive
    # cap would still let one precedent through.
    if not judgment_ids or max_precedents <= 0:
        return []

    per_judgment_cap = 3  # only the first 3 citations of each judgment are tried
    precedents = []
    # Seeded with the input IDs so an already-retrieved judgment is never
    # returned again as its own precedent.
    seen_ids = set(judgment_ids)

    for jid in judgment_ids:
        if len(precedents) >= max_precedents:
            break

        citations = _graph.get(jid) or []

        for citation_ref in citations[:per_judgment_cap]:
            resolved_id = _resolve_citation_to_judgment(citation_ref)

            if not resolved_id or resolved_id in seen_ids:
                continue

            # Get text from parent store; without text there is nothing to show
            text = _parent_store.get(resolved_id, "")
            if not text:
                continue

            seen_ids.add(resolved_id)

            # Excerpt is simply the first 1500 characters of the stored text
            excerpt = text[:1500].strip()

            precedents.append({
                "judgment_id": resolved_id,
                "chunk_id": f"{resolved_id}_precedent",
                "text": excerpt,
                "title": f"Precedent: {citation_ref[:80]}",
                # Second "_"-separated field of the id — presumably the year;
                # verify against the id format produced by preprocessing.
                "year": resolved_id.split("_")[1] if "_" in resolved_id else "",
                "source_type": "case_law",
                "is_precedent": True,
                "cited_by": jid,
                "citation_ref": citation_ref,
                "similarity_score": 0.5  # precedents are added, not ranked
            })

            if len(precedents) >= max_precedents:
                break

    if precedents:
        logger.info(f"Precedent chain: added {len(precedents)} predecessor judgments")

    return precedents


def get_citation_count(judgment_id: str) -> int:
    """
    Count the citation entries in the graph that resolve to this judgment.

    Every citation string of every judgment is resolved on each call, so
    the cost grows with the size of the whole graph.
    """
    return sum(
        1
        for cited_refs in _graph.values()
        for ref in cited_refs
        if _resolve_citation_to_judgment(ref) == judgment_id
    )


def is_loaded() -> bool:
    """Report whether the last load_citation_graph() call finished without error."""
    return _loaded