Spaces:

SaiPranav09
/

NyayLens-API

Running

File size: 4,647 Bytes

968e24d

# """
# Annotate paragraphs with legal sections using JudgmentSegmenter
# Creates paragraph_index_with_sections.jsonl
# """

# import json
# from pathlib import Path
# from collections import defaultdict
# from tqdm import tqdm

# from judgement_segmenter import JudgmentSegmenter


# INPUT_INDEX = Path("data/processed/indexed/paragraph_index.jsonl")
# OUTPUT_INDEX = Path("data/processed/indexed/paragraph_index_with_sections.jsonl")


# def annotate_paragraphs():
#     print("=" * 70)
#     print("NyayLens – Annotating Paragraphs with Sections")
#     print("=" * 70)

#     # Load paragraphs grouped by judgment
#     judgments = defaultdict(list)

#     with open(INPUT_INDEX, "r", encoding="utf-8") as f:
#         for line in f:
#             p = json.loads(line)
#             judgments[p["judgment_id"]].append(p)

#     print(f"✓ Loaded {len(judgments):,} judgments")

#     segmenter = JudgmentSegmenter()

#     with open(OUTPUT_INDEX, "w", encoding="utf-8") as writer:
#         for judgment_id, paras in tqdm(judgments.items(), desc="Annotating"):
#             # Preserve original order
#             paras = sorted(paras, key=lambda x: (x["page_no"], x["id"]))

#             texts = [p["text"] for p in paras]

#             sections = segmenter.segment(texts)

#             # Default all to unknown
#             section_labels = [
#                 ("unknown", 0.0) for _ in paras
#             ]

#             # Apply section labels
#             for sec in sections:
#                 for i in range(sec.start_para_idx, sec.end_para_idx + 1):
#                     section_labels[i] = (sec.type, sec.confidence)

#             # Write annotated paragraphs
#             for p, (sec_type, sec_conf) in zip(paras, section_labels):
#                 p_out = dict(p)
#                 p_out["section"] = sec_type
#                 p_out["section_conf"] = sec_conf

#                 writer.write(json.dumps(p_out, ensure_ascii=False) + "\n")

#     print("\n✓ Annotation complete")
#     print(f"✓ Output written to: {OUTPUT_INDEX}")


# if __name__ == "__main__":
#     annotate_paragraphs()
"""
Annotate paragraphs with legal sections using JudgmentSegmenter
PRESERVES ORIGINAL IDs AND ORDER
"""

import json
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

from judgement_segmenter import JudgmentSegmenter


# Paragraph index consumed by the annotator; read line by line as JSON.
INPUT_INDEX: Path = Path("data/processed/indexed/paragraph_index.jsonl")

# Destination for the same paragraphs with section labels attached.
OUTPUT_INDEX: Path = Path(
    "data/processed/indexed/paragraph_index_with_sections.jsonl"
)


def _load_paragraphs(index_path):
    """Read a JSON Lines paragraph index into a list, keeping file order.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of being handed to ``json.loads``, which would raise on them.
    """
    with open(index_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _label_paragraphs(num_paras, sections):
    """Return one ``(section_type, confidence)`` tuple per paragraph.

    Every paragraph starts as ``("unknown", 0.0)``. Each section then labels
    the inclusive range ``start_para_idx..end_para_idx``; later sections
    overwrite earlier ones where they overlap. The range is clamped to
    ``[0, num_paras)`` so an out-of-range section can neither raise
    ``IndexError`` nor wrap around via negative indexing.
    """
    labels = [("unknown", 0.0)] * num_paras
    for sec in sections:
        first = max(sec.start_para_idx, 0)
        last = min(sec.end_para_idx, num_paras - 1)
        for i in range(first, last + 1):
            labels[i] = (sec.type, sec.confidence)
    return labels


def annotate_paragraphs():
    """Label every paragraph in INPUT_INDEX and write OUTPUT_INDEX.

    Paragraphs are grouped by ``judgment_id``; the ``text`` of each group is
    passed to ``JudgmentSegmenter.segment`` and the returned sections are
    mapped back onto the paragraphs. Each output record is a copy of the
    input record with two extra keys, ``section`` and ``section_conf``.
    Records are written in the same order in which they were read.

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
        KeyError: if a record lacks ``judgment_id`` or ``text``.
    """
    print("=" * 70)
    print("NyayLens – Annotating Paragraphs with Sections")
    print("=" * 70)

    all_paragraphs = _load_paragraphs(INPUT_INDEX)
    print(f"✓ Loaded {len(all_paragraphs):,} paragraphs")

    # Group the *positions* of paragraphs by judgment. Within each group the
    # positions stay in file order, which is the order the segmenter sees.
    judgments = defaultdict(list)
    for idx, p in enumerate(all_paragraphs):
        judgments[p["judgment_id"]].append(idx)

    segmenter = JudgmentSegmenter()

    # Filled by original position so the output order matches the input order
    # regardless of the order in which judgments are processed.
    annotations = [None] * len(all_paragraphs)

    for indices in tqdm(judgments.values(), desc="Annotating"):
        texts = [all_paragraphs[i]["text"] for i in indices]
        sections = segmenter.segment(texts)
        section_labels = _label_paragraphs(len(indices), sections)

        for orig_idx, (sec_type, sec_conf) in zip(indices, section_labels):
            p_out = dict(all_paragraphs[orig_idx])  # copy; input stays untouched
            p_out["section"] = sec_type
            p_out["section_conf"] = sec_conf
            annotations[orig_idx] = p_out

    print("\nWriting annotated paragraphs...")
    with open(OUTPUT_INDEX, "w", encoding="utf-8") as writer:
        for p_out in annotations:
            writer.write(json.dumps(p_out, ensure_ascii=False) + "\n")

    print(f"✓ Output written to: {OUTPUT_INDEX}")
    print("=" * 70)


if __name__ == "__main__":
    annotate_paragraphs()