File size: 4,647 Bytes
968e24d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# """
# Annotate paragraphs with legal sections using JudgmentSegmenter
# Creates paragraph_index_with_sections.jsonl
# """

# import json
# from pathlib import Path
# from collections import defaultdict
# from tqdm import tqdm

# from judgement_segmenter import JudgmentSegmenter


# INPUT_INDEX = Path("data/processed/indexed/paragraph_index.jsonl")
# OUTPUT_INDEX = Path("data/processed/indexed/paragraph_index_with_sections.jsonl")


# def annotate_paragraphs():
#     print("=" * 70)
#     print("NyayLens – Annotating Paragraphs with Sections")
#     print("=" * 70)

#     # Load paragraphs grouped by judgment
#     judgments = defaultdict(list)

#     with open(INPUT_INDEX, "r", encoding="utf-8") as f:
#         for line in f:
#             p = json.loads(line)
#             judgments[p["judgment_id"]].append(p)

#     print(f"βœ“ Loaded {len(judgments):,} judgments")

#     segmenter = JudgmentSegmenter()

#     with open(OUTPUT_INDEX, "w", encoding="utf-8") as writer:
#         for judgment_id, paras in tqdm(judgments.items(), desc="Annotating"):
#             # Preserve original order
#             paras = sorted(paras, key=lambda x: (x["page_no"], x["id"]))

#             texts = [p["text"] for p in paras]

#             sections = segmenter.segment(texts)

#             # Default all to unknown
#             section_labels = [
#                 ("unknown", 0.0) for _ in paras
#             ]

#             # Apply section labels
#             for sec in sections:
#                 for i in range(sec.start_para_idx, sec.end_para_idx + 1):
#                     section_labels[i] = (sec.type, sec.confidence)

#             # Write annotated paragraphs
#             for p, (sec_type, sec_conf) in zip(paras, section_labels):
#                 p_out = dict(p)
#                 p_out["section"] = sec_type
#                 p_out["section_conf"] = sec_conf

#                 writer.write(json.dumps(p_out, ensure_ascii=False) + "\n")

#     print("\nβœ“ Annotation complete")
#     print(f"βœ“ Output written to: {OUTPUT_INDEX}")


# if __name__ == "__main__":
#     annotate_paragraphs()
"""
Annotate paragraphs with legal sections using JudgmentSegmenter
PRESERVES ORIGINAL IDs AND ORDER
"""

import json
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

from judgement_segmenter import JudgmentSegmenter


# Both index files live in the same directory: the input is the JSONL
# paragraph index that annotate_paragraphs() reads, the output is the
# section-annotated copy it writes.
_INDEXED_DIR = Path("data/processed/indexed")
INPUT_INDEX = _INDEXED_DIR / "paragraph_index.jsonl"
OUTPUT_INDEX = _INDEXED_DIR / "paragraph_index_with_sections.jsonl"


def annotate_paragraphs(input_path=None, output_path=None):
    """Annotate every indexed paragraph with a legal section label.

    Reads a JSONL paragraph index, groups the paragraphs by their
    ``judgment_id``, runs ``JudgmentSegmenter.segment`` on each judgment's
    paragraph texts, and writes every paragraph back out with two added
    keys: ``section`` and ``section_conf``. Paragraphs not covered by any
    returned section are labelled ``("unknown", 0.0)``. Output lines are
    written in the same order as the input lines, and all original fields
    are kept.

    Args:
        input_path: JSONL file to read. Defaults to ``INPUT_INDEX``.
        output_path: JSONL file to write. Defaults to ``OUTPUT_INDEX``.
            Missing parent directories are created.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a paragraph lacks ``judgment_id`` or ``text``.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the standard pipeline locations.
    input_path = INPUT_INDEX if input_path is None else Path(input_path)
    output_path = OUTPUT_INDEX if output_path is None else Path(output_path)

    print("=" * 70)
    print("NyayLens – Annotating Paragraphs with Sections")
    print("=" * 70)

    # Load paragraphs in file order. Blank lines are skipped because
    # json.loads() would otherwise raise on them and abort the whole run.
    with open(input_path, "r", encoding="utf-8") as f:
        all_paragraphs = [json.loads(line) for line in f if line.strip()]

    print(f"βœ“ Loaded {len(all_paragraphs):,} paragraphs")

    # Group by judgment. The groups hold references to the same dicts as
    # all_paragraphs, so annotating a paragraph through its group also
    # annotates it at its original position; no index bookkeeping needed.
    judgments = defaultdict(list)
    for para in all_paragraphs:
        judgments[para["judgment_id"]].append(para)

    segmenter = JudgmentSegmenter()

    for paras in tqdm(judgments.values(), desc="Annotating"):
        sections = segmenter.segment([p["text"] for p in paras])

        # Every paragraph starts as "unknown" until a section claims it.
        section_labels = [("unknown", 0.0)] * len(paras)

        for sec in sections:
            # Section bounds are inclusive paragraph indices. Clamp both
            # ends: an index past the end is ignored, and a negative index
            # must not wrap around to label paragraphs at the tail.
            first = max(sec.start_para_idx, 0)
            last = min(sec.end_para_idx, len(paras) - 1)
            for i in range(first, last + 1):
                section_labels[i] = (sec.type, sec.confidence)

        for para, (sec_type, sec_conf) in zip(paras, section_labels):
            para["section"] = sec_type
            para["section_conf"] = sec_conf

    # Write in original input order.
    print("\nWriting annotated paragraphs...")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as writer:
        writer.writelines(
            json.dumps(para, ensure_ascii=False) + "\n"
            for para in all_paragraphs
        )

    print(f"βœ“ Output written to: {output_path}")
    print("=" * 70)


# Run the annotation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    annotate_paragraphs()