NyayLens-API / src /segmentation /annotate_paragraphs.py
Sai Pranav Reddy
Clean lightweight deployment
968e24d
# """
# Annotate paragraphs with legal sections using JudgmentSegmenter
# Creates paragraph_index_with_sections.jsonl
# """
# import json
# from pathlib import Path
# from collections import defaultdict
# from tqdm import tqdm
# from judgement_segmenter import JudgmentSegmenter
# INPUT_INDEX = Path("data/processed/indexed/paragraph_index.jsonl")
# OUTPUT_INDEX = Path("data/processed/indexed/paragraph_index_with_sections.jsonl")
# def annotate_paragraphs():
# print("=" * 70)
# print("NyayLens – Annotating Paragraphs with Sections")
# print("=" * 70)
# # Load paragraphs grouped by judgment
# judgments = defaultdict(list)
# with open(INPUT_INDEX, "r", encoding="utf-8") as f:
# for line in f:
# p = json.loads(line)
# judgments[p["judgment_id"]].append(p)
# print(f"✓ Loaded {len(judgments):,} judgments")
# segmenter = JudgmentSegmenter()
# with open(OUTPUT_INDEX, "w", encoding="utf-8") as writer:
# for judgment_id, paras in tqdm(judgments.items(), desc="Annotating"):
# # Preserve original order
# paras = sorted(paras, key=lambda x: (x["page_no"], x["id"]))
# texts = [p["text"] for p in paras]
# sections = segmenter.segment(texts)
# # Default all to unknown
# section_labels = [
# ("unknown", 0.0) for _ in paras
# ]
# # Apply section labels
# for sec in sections:
# for i in range(sec.start_para_idx, sec.end_para_idx + 1):
# section_labels[i] = (sec.type, sec.confidence)
# # Write annotated paragraphs
# for p, (sec_type, sec_conf) in zip(paras, section_labels):
# p_out = dict(p)
# p_out["section"] = sec_type
# p_out["section_conf"] = sec_conf
# writer.write(json.dumps(p_out, ensure_ascii=False) + "\n")
# print("\n✓ Annotation complete")
# print(f"✓ Output written to: {OUTPUT_INDEX}")
# if __name__ == "__main__":
# annotate_paragraphs()
"""
Annotate paragraphs with legal sections using JudgmentSegmenter
PRESERVES ORIGINAL IDs AND ORDER
"""
import json
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from judgement_segmenter import JudgmentSegmenter
# Paragraph index read by annotate_paragraphs(): one JSON object per line.
# The path is relative, so it is resolved against the current working directory.
INPUT_INDEX = Path("data/processed/indexed/paragraph_index.jsonl")
# Destination file: the same paragraphs with "section" and "section_conf" added.
OUTPUT_INDEX = Path("data/processed/indexed/paragraph_index_with_sections.jsonl")
def annotate_paragraphs():
    """Label every indexed paragraph with the judgment section it belongs to.

    Reads ``INPUT_INDEX`` (one JSON object per line), groups the paragraphs
    by their ``judgment_id``, passes each group's ``text`` values to
    ``JudgmentSegmenter.segment`` and writes every paragraph to
    ``OUTPUT_INDEX`` with two extra keys, ``section`` and ``section_conf``.

    Paragraphs not covered by any returned section are labelled
    ``("unknown", 0.0)``. When sections overlap, the one returned later
    wins. Output lines appear in the same order as the input lines and all
    original keys are kept as they were.

    Raises:
        OSError: If either index file cannot be opened.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a paragraph lacks ``judgment_id`` or ``text``.
    """
    print("=" * 70)
    print("NyayLens – Annotating Paragraphs with Sections")
    print("=" * 70)

    # Load paragraphs in file order. Blank lines (for example a trailing
    # empty line) are skipped instead of crashing json.loads.
    with open(INPUT_INDEX, "r", encoding="utf-8") as f:
        all_paragraphs = [json.loads(line) for line in f if line.strip()]
    print(f"✓ Loaded {len(all_paragraphs):,} paragraphs")

    # Group the *positions* of the paragraphs by judgment so each result can
    # be written back to the slot the paragraph came from.
    judgments = defaultdict(list)
    for idx, p in enumerate(all_paragraphs):
        judgments[p["judgment_id"]].append(idx)

    segmenter = JudgmentSegmenter()

    # One slot per input paragraph; filled by original position.
    annotations = [None] * len(all_paragraphs)

    for indices in tqdm(judgments.values(), desc="Annotating"):
        paras = [all_paragraphs[i] for i in indices]
        sections = segmenter.segment([p["text"] for p in paras])

        section_labels = [("unknown", 0.0)] * len(paras)
        for sec in sections:
            # start/end are treated as inclusive paragraph offsets within
            # this judgment. Clamp both ends: a negative start must not wrap
            # around to the end of the list, and an oversized end must not
            # run past it.
            first = max(sec.start_para_idx, 0)
            last = min(sec.end_para_idx, len(paras) - 1)
            for i in range(first, last + 1):
                section_labels[i] = (sec.type, sec.confidence)

        # Copy each paragraph so the loaded objects are left untouched.
        for orig_idx, p, (sec_type, sec_conf) in zip(indices, paras, section_labels):
            annotations[orig_idx] = {
                **p,
                "section": sec_type,
                "section_conf": sec_conf,
            }

    print("\nWriting annotated paragraphs...")
    with open(OUTPUT_INDEX, "w", encoding="utf-8") as writer:
        writer.writelines(
            json.dumps(p_out, ensure_ascii=False) + "\n" for p_out in annotations
        )
    print(f"✓ Output written to: {OUTPUT_INDEX}")
    print("=" * 70)
if __name__ == "__main__":
    # Run the annotation only when executed as a script, not on import.
    annotate_paragraphs()