# File size: 3,297 Bytes
# 968e24d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import argparse
from pathlib import Path
import logging
import time

from extraction.pdf_extractor import LegalJudgmentExtractor
from segmentation.judgement_segmenter import JudgmentSegmenter

# Root logging setup: INFO and above, one "<time> - <level> - <message>" line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the code in this file.
logger = logging.getLogger(__name__)

class NyayLensPipeline:
    """Unified ingestion pipeline for NyayLens.

    Chains a ``LegalJudgmentExtractor`` (PDF -> extraction result with a
    ``'text'`` entry) and a ``JudgmentSegmenter`` (paragraphs -> sections).
    Embedding and indexing are not performed here.
    """

    def __init__(self, raw_dir="data/raw", processed_dir="data/processed"):
        """Set up directories and pipeline components.

        Args:
            raw_dir: Directory searched recursively for PDFs in batch mode.
            processed_dir: Base output directory; the extractor is given
                ``processed_dir / "extracted"`` as its output directory.
        """
        self.raw_dir = Path(raw_dir)
        self.processed_dir = Path(processed_dir)

        logger.info("Initializing NyayLens Pipeline components...")
        self.extractor = LegalJudgmentExtractor(output_dir=self.processed_dir / "extracted")
        self.segmenter = JudgmentSegmenter()

    def ingest_pdf(self, pdf_path: Path):
        """Process a single PDF end-to-end.

        Args:
            pdf_path: Location of the PDF; a plain string is accepted too.

        Returns:
            True when text was extracted and segmented, False when the
            extractor produced no usable text. Exceptions raised by the
            extractor or segmenter propagate to the caller.
        """
        pdf_path = Path(pdf_path)
        logger.info(f"--- Starting Ingestion: {pdf_path.name} ---")
        start_time = time.perf_counter()

        # 1. Extraction
        logger.info("Step 1: Extracting text...")
        extraction_result = self.extractor.extract_pdf(pdf_path)
        try:
            text = extraction_result['text']
        except (KeyError, TypeError):
            # A None result or a result without 'text' is an extraction
            # failure to report, not a reason to crash.
            text = None
        # Whitespace-only output carries nothing to segment, so it counts
        # as a failed extraction as well.
        if not text or not text.strip():
            logger.error(f"Extraction failed for {pdf_path.name}")
            return False

        # Optional: Save extraction
        self.extractor.save_extraction(pdf_path, extraction_result)

        # 2. Segmentation
        logger.info("Step 2: Segmenting judgment...")
        # Simple paragraph split for segmentation
        paragraphs = text.split('\n\n')
        sections = self.segmenter.segment(paragraphs)

        logger.info(f"Found {len(sections)} distinct sections.")

        # 3. Next steps would hook into `create_embeddings.py` and `create_sqlite_index.py`
        logger.info("Step 3: Ready for Embeddings & Indexing (Batch process recommended)")

        elapsed = time.perf_counter() - start_time
        logger.info(f"--- Successfully processed {pdf_path.name} in {elapsed:.2f}s ---")
        return True

    def process_directory(self, limit: int = None):
        """Process all PDFs found under the raw directory.

        Args:
            limit: Maximum number of PDFs to process. ``None`` means no
                limit; ``0`` (or a negative value) processes nothing.

        Returns:
            The number of PDFs that were ingested successfully.
        """
        # One pass with a case-insensitive suffix test: separate "*.pdf" and
        # "*.PDF" globs double-count files where matching is case-insensitive
        # and miss mixed-case suffixes such as ".Pdf" elsewhere. Sorting makes
        # the order (and therefore what `limit` selects) reproducible.
        pdfs = sorted(
            candidate
            for candidate in self.raw_dir.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() == ".pdf"
        )

        if limit is not None:
            pdfs = pdfs[:max(limit, 0)]

        logger.info(f"Found {len(pdfs)} PDFs to process.")

        success = 0
        for pdf in pdfs:
            try:
                ingested = self.ingest_pdf(pdf)
            except Exception:
                # Batch boundary: record the failure with its traceback and
                # move on so one bad PDF does not abort the whole run.
                logger.exception(f"Unhandled error while processing {pdf.name}")
                continue
            if ingested:
                success += 1

        logger.info(f"Batch completed. {success}/{len(pdfs)} successful.")
        return success

def main(argv=None):
    """Command-line entry point for the ingestion pipeline.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        Exit status: 0 on success, 1 when a single-PDF ingestion reports
        failure. Usage errors terminate with status 2 via ``parser.error``.
    """
    parser = argparse.ArgumentParser(description="NyayLens Unified Ingestion Pipeline")
    parser.add_argument("--pdf", type=str, help="Path to a single PDF to ingest")
    parser.add_argument("--batch", action="store_true", help="Process all PDFs in data/raw")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of PDFs in batch")

    args = parser.parse_args(argv)

    # Validate before building the pipeline: constructing it initializes the
    # extractor and segmenter, which is wasted work on a usage mistake.
    if not args.pdf and not args.batch:
        parser.error("Please specify --pdf <path> or --batch. Use --help for options.")

    if args.pdf:
        pdf_path = Path(args.pdf)
        if not pdf_path.is_file():
            parser.error(f"PDF not found: {pdf_path}")
        # Surface an ingestion failure through the exit status.
        return 0 if NyayLensPipeline().ingest_pdf(pdf_path) else 1

    NyayLensPipeline().process_directory(limit=args.limit)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())