# NyayLens-API/src/indexing/paragraph_indexer.py
# Sai Pranav Reddy
# Clean lightweight deployment
# 968e24d
"""
Paragraph-level indexer for NyayLens RAG
- Page-aware
- Content-stable paragraph IDs
- Legal-aware filtering
- Streaming JSONL output (memory safe)
"""
import hashlib
import json
import re
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple

from tqdm import tqdm
class ParagraphIndexer:
    """Index legal judgments at paragraph level with stable IDs and metadata.

    Every ``*.txt`` file in ``texts_dir`` is split into paragraphs (on blank
    lines and ``<<<PAGE:n>>>`` markers) and each kept paragraph is written as
    one JSON object per line to ``paragraph_index.jsonl`` in ``output_dir``.
    Summary statistics are written to ``index_stats.json``.
    """

    # Legal keywords worth preserving even in short paragraphs
    LEGAL_KEYWORDS = {
        'held', 'order', 'appeal', 'writ', 'judgment', 'decree',
        'petition', 'application', 'allowed', 'dismissed', 'granted',
        'rejected', 'disposed', 'quashed', 'set aside',
        'affirmed', 'reversed', 'remanded', 'bail', 'custody',
        'interim', 'stay', 'injunction', 'no costs'
    }

    # Page boundary marker, e.g. "<<<PAGE:12>>>"; group 1 is the page number.
    PAGE_MARKER_PATTERN = re.compile(r'<<<PAGE:(\d+)>>>')

    # Paragraphs shorter than this many characters are kept only when they
    # contain one of LEGAL_KEYWORDS.
    MIN_PARAGRAPH_CHARS = 50

    def __init__(self, texts_dir: Path, output_dir: Path):
        """
        Remember the input/output locations and create ``output_dir``.

        Args:
            texts_dir: Directory containing the ``*.txt`` judgment files.
            output_dir: Directory for the index and stats files; created
                (including parents) if it does not exist.
        """
        self.texts_dir = Path(texts_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.index_file = self.output_dir / "paragraph_index.jsonl"
        self.stats_file = self.output_dir / "index_stats.json"

    @staticmethod
    def _contains_legal_keyword(text: str) -> bool:
        """
        Return True if *text* contains any LEGAL_KEYWORDS entry.

        Matching is case-insensitive (via lower-casing) and anchored on word
        boundaries, so "order" does not match inside "reorder".
        """
        text_l = text.lower()
        return any(
            re.search(rf"\b{re.escape(kw)}\b", text_l)
            for kw in ParagraphIndexer.LEGAL_KEYWORDS
        )

    @staticmethod
    def _stable_paragraph_id(judgment_id: str, page_no: Optional[int], text: str) -> str:
        """
        Content-stable ID:
        - same paragraph text => same ID
        - survives re-indexing

        The ID has the form ``{judgment_id}_p{page}_{hash}`` where ``hash`` is
        the first 16 hex digits of the SHA-1 of the UTF-8 text and ``page`` is
        ``"unk"`` when *page_no* is None.
        """
        # SHA-1 is used purely as a content fingerprint, not for security.
        h = hashlib.sha1(text.encode("utf-8")).hexdigest()[:16]
        page_str = page_no if page_no is not None else "unk"
        return f"{judgment_id}_p{page_str}_{h}"

    def _strip_header(self, content: str) -> str:
        """
        Remove extractor quality header safely.

        When the content contains at least two separator runs of 70 "="
        characters, everything up to and including the second one is dropped.
        Otherwise the content is returned unchanged apart from stripping
        surrounding whitespace.
        """
        sep = "=" * 70
        if sep in content:
            parts = content.split(sep, 2)
            # Exactly three parts means two separators were found.
            if len(parts) == 3:
                return parts[2].strip()
        return content.strip()

    def _iter_paragraphs(self, content: str) -> Iterable[Tuple[Optional[int], str]]:
        """
        Yield ``(page_no, paragraph_text)`` pairs in document order.

        Paragraphs are separated by blank lines and by page markers. The page
        number is None until the first page marker has been seen. Text that
        follows a page marker on the same line is kept as the start of the
        next paragraph instead of being discarded.
        """
        current_page = None
        buffer = []
        for line in content.splitlines():
            stripped = line.strip()
            page_match = self.PAGE_MARKER_PATTERN.match(stripped)
            if page_match:
                # Flush buffer before page change
                if buffer:
                    yield current_page, "\n".join(buffer).strip()
                    buffer = []
                current_page = int(page_match.group(1))
                # Keep whatever follows the marker; a marker-only line leaves
                # an empty remainder and falls into the blank-line branch.
                line = stripped[page_match.end():]
            if not line.strip():
                if buffer:
                    yield current_page, "\n".join(buffer).strip()
                    buffer = []
                continue
            buffer.append(line)
        if buffer:
            yield current_page, "\n".join(buffer).strip()

    def index_judgment(self, text_file: Path, writer) -> int:
        """
        Index a single judgment file.

        Args:
            text_file: Path to the UTF-8 judgment text; its stem is used as
                the judgment ID.
            writer: Object with a ``write(str)`` method that receives one
                JSON line per kept paragraph.

        Returns number of paragraphs indexed.
        """
        with open(text_file, "r", encoding="utf-8") as f:
            content = self._strip_header(f.read())
        judgment_id = text_file.stem
        para_count = 0
        # Base ID -> number of times it has been produced for this judgment.
        seen_ids = {}
        for page_no, para in self._iter_paragraphs(content):
            if not para:
                continue
            # Keep substantial OR legally important short paragraphs
            if len(para) < self.MIN_PARAGRAPH_CHARS and not self._contains_legal_keyword(para):
                continue
            # Paragraphs before the first page marker are recorded as page -1.
            page = page_no if page_no is not None else -1
            para_id = self._stable_paragraph_id(judgment_id, page, para)
            # Identical text repeated on the same page would otherwise share
            # one ID; later occurrences get a numeric suffix so every record
            # ID is unique while the first occurrence keeps the plain ID.
            occurrence = seen_ids.get(para_id, 0) + 1
            seen_ids[para_id] = occurrence
            if occurrence > 1:
                para_id = f"{para_id}_{occurrence}"
            record = {
                "id": para_id,
                "judgment_id": judgment_id,
                "page_no": page,
                "text": para,
                "char_count": len(para),
                "word_count": len(para.split())
            }
            writer.write(json.dumps(record, ensure_ascii=False) + "\n")
            para_count += 1
        return para_count

    def build_full_index(self) -> Dict[str, float]:
        """
        Index every ``*.txt`` file in ``texts_dir`` into ``index_file``.

        Files are processed in sorted order. A file that raises is reported
        and skipped so one bad judgment does not abort the whole run; the
        number of such files is recorded under ``failed_judgments``.

        Returns the statistics dict that is also written to ``stats_file``.
        """
        text_files = sorted(self.texts_dir.glob("*.txt"))
        print(f"Indexing {len(text_files):,} judgments...")
        total_paragraphs = 0
        failed_judgments = 0
        with open(self.index_file, "w", encoding="utf-8") as writer:
            for text_file in tqdm(text_files, desc="Indexing"):
                try:
                    total_paragraphs += self.index_judgment(text_file, writer)
                except Exception as e:
                    # Deliberate best-effort boundary: report and continue.
                    failed_judgments += 1
                    print(f"❌ Failed indexing {text_file.name}: {e}")
        stats = {
            "total_judgments": len(text_files),
            "total_paragraphs": total_paragraphs,
            "avg_paragraphs_per_judgment":
                total_paragraphs / len(text_files) if text_files else 0,
            "failed_judgments": failed_judgments,
        }
        with open(self.stats_file, "w", encoding="utf-8") as f:
            json.dump(stats, f, indent=2)
        print("\n✓ Paragraph indexing complete")
        print(f" Total paragraphs: {total_paragraphs:,}")
        if failed_judgments:
            print(f" Failed judgments: {failed_judgments:,}")
        print(f" Output: {self.index_file}")
        return stats
if __name__ == "__main__":
    # Script entry point: index the default extracted-text directory and
    # report the average number of paragraphs kept per judgment.
    source_dir = Path("data/processed/extracted/texts")
    target_dir = Path("data/processed/indexed")
    stats = ParagraphIndexer(texts_dir=source_dir, output_dir=target_dir).build_full_index()
    print(f"\nAverage paragraphs per judgment: {stats['avg_paragraphs_per_judgment']:.1f}")