# Spaces: Running
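# --- Superseded earlier version (commented out, kept for reference only) ---
# It sorted each judgment's paragraphs by (page_no, id) and wrote them grouped
# by judgment, so the output order could differ from the input order.
# """
# Annotate paragraphs with legal sections using JudgmentSegmenter.
# Creates paragraph_index_with_sections.jsonl.
# """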
| # import json | |
| # from pathlib import Path | |
| # from collections import defaultdict | |
| # from tqdm import tqdm | |
| # from judgement_segmenter import JudgmentSegmenter | |
| # INPUT_INDEX = Path("data/processed/indexed/paragraph_index.jsonl") | |
| # OUTPUT_INDEX = Path("data/processed/indexed/paragraph_index_with_sections.jsonl") | |
| # def annotate_paragraphs(): | |
| # print("=" * 70) | |
# print("NyayLens — Annotating Paragraphs with Sections")
| # print("=" * 70) | |
| # # Load paragraphs grouped by judgment | |
| # judgments = defaultdict(list) | |
| # with open(INPUT_INDEX, "r", encoding="utf-8") as f: | |
| # for line in f: | |
| # p = json.loads(line) | |
| # judgments[p["judgment_id"]].append(p) | |
# print(f"✓ Loaded {len(judgments):,} judgments")
| # segmenter = JudgmentSegmenter() | |
| # with open(OUTPUT_INDEX, "w", encoding="utf-8") as writer: | |
| # for judgment_id, paras in tqdm(judgments.items(), desc="Annotating"): | |
| # # Preserve original order | |
| # paras = sorted(paras, key=lambda x: (x["page_no"], x["id"])) | |
| # texts = [p["text"] for p in paras] | |
| # sections = segmenter.segment(texts) | |
| # # Default all to unknown | |
| # section_labels = [ | |
| # ("unknown", 0.0) for _ in paras | |
| # ] | |
| # # Apply section labels | |
| # for sec in sections: | |
| # for i in range(sec.start_para_idx, sec.end_para_idx + 1): | |
| # section_labels[i] = (sec.type, sec.confidence) | |
| # # Write annotated paragraphs | |
| # for p, (sec_type, sec_conf) in zip(paras, section_labels): | |
| # p_out = dict(p) | |
| # p_out["section"] = sec_type | |
| # p_out["section_conf"] = sec_conf | |
| # writer.write(json.dumps(p_out, ensure_ascii=False) + "\n") | |
# print("\n✓ Annotation complete")
# print(f"✓ Output written to: {OUTPUT_INDEX}")
| # if __name__ == "__main__": | |
| # annotate_paragraphs() | |
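"""
Annotate paragraphs with legal sections using JudgmentSegmenter.

Reads paragraph_index.jsonl and writes paragraph_index_with_sections.jsonl,
adding "section" and "section_conf" to every paragraph while preserving the
original paragraph IDs and the original line order.
"""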
| import json | |
| from pathlib import Path | |
| from collections import defaultdict | |
| from tqdm import tqdm | |
| from judgement_segmenter import JudgmentSegmenter | |
# Paragraph index read by this script: one JSON object per line.
INPUT_INDEX: Path = Path("data/processed/indexed/paragraph_index.jsonl")
# Destination file: the same records with section annotations added.
OUTPUT_INDEX: Path = Path("data/processed/indexed/paragraph_index_with_sections.jsonl")
def _section_labels(num_paras, sections):
    """Return one ``(section_type, confidence)`` label per paragraph position.

    Every position starts as ``("unknown", 0.0)``. Each section then labels the
    inclusive range ``start_para_idx..end_para_idx``; when sections overlap,
    the one that comes later in ``sections`` wins.
    """
    labels = [("unknown", 0.0)] * num_paras
    for sec in sections:
        # Clamp both ends to valid positions. Without the lower clamp a
        # negative start index would wrap around and label paragraphs at the
        # end of the judgment.
        first = max(sec.start_para_idx, 0)
        stop = min(sec.end_para_idx + 1, num_paras)
        for pos in range(first, stop):
            labels[pos] = (sec.type, sec.confidence)
    return labels


def annotate_paragraphs(
    input_index=None,
    output_index=None,
    segmenter=None,
    *,
    show_progress=True,
):
    """Annotate every paragraph with its section and write the result as JSONL.

    Paragraphs are grouped by ``judgment_id`` (keeping file order inside each
    group), the texts of each group are passed to ``segmenter.segment``, and
    each paragraph is written back with two extra keys, ``"section"`` and
    ``"section_conf"``. Output lines appear in the same order as input lines.

    Args:
        input_index: Path of the paragraph JSONL file to read. Defaults to
            ``INPUT_INDEX``.
        output_index: Path of the annotated JSONL file to write. Defaults to
            ``OUTPUT_INDEX``. Missing parent directories are created.
        segmenter: Object with a ``segment(texts)`` method returning items
            that expose ``start_para_idx``, ``end_para_idx``, ``type`` and
            ``confidence``. Defaults to a new ``JudgmentSegmenter()``.
        show_progress: Wrap the per-judgment loop in a ``tqdm`` progress bar.

    Raises:
        KeyError: If a paragraph lacks ``"judgment_id"`` or ``"text"``.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    src = INPUT_INDEX if input_index is None else Path(input_index)
    dst = OUTPUT_INDEX if output_index is None else Path(output_index)
    if segmenter is None:
        segmenter = JudgmentSegmenter()

    print("=" * 70)
    print("NyayLens — Annotating Paragraphs with Sections")
    print("=" * 70)

    # Load paragraphs in file order; blank lines (e.g. a trailing newline at
    # the end of the file) are skipped instead of crashing json.loads.
    with open(src, "r", encoding="utf-8") as f:
        paragraphs = [json.loads(line) for line in f if line.strip()]
    print(f"✓ Loaded {len(paragraphs):,} paragraphs")

    # Remember, per judgment, the original positions of its paragraphs so the
    # labels can be put back exactly where each paragraph came from.
    positions_by_judgment = defaultdict(list)
    for pos, para in enumerate(paragraphs):
        positions_by_judgment[para["judgment_id"]].append(pos)

    labels = [("unknown", 0.0)] * len(paragraphs)
    groups = positions_by_judgment.values()
    if show_progress:
        groups = tqdm(groups, desc="Annotating")
    for positions in groups:
        texts = [paragraphs[pos]["text"] for pos in positions]
        group_labels = _section_labels(len(texts), segmenter.segment(texts))
        for pos, label in zip(positions, group_labels):
            labels[pos] = label

    # Write in the original order; each record is a copy of the input
    # paragraph with the two annotation keys added.
    print("\nWriting annotated paragraphs...")
    dst.parent.mkdir(parents=True, exist_ok=True)
    with open(dst, "w", encoding="utf-8") as writer:
        for para, (sec_type, sec_conf) in zip(paragraphs, labels):
            record = {**para, "section": sec_type, "section_conf": sec_conf}
            writer.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"✓ Output written to: {dst}")
    print("=" * 70)
# Script entry point: run the annotation only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    annotate_paragraphs()