# Space: khagu/setu (status: Running) - file size: 6,665 bytes
"""
Example usage of the PDF Processor utility
Demonstrates how to extract sentences from Nepali PDFs and prepare for bias detection
"""

from utility.pdf_processor import PDFProcessor
import json


def example_pdf_processing():
    """Example 1: extract sentences from a single PDF with LLM refinement on.

    Prints a banner, runs ``PDFProcessor.process_pdf`` on a placeholder path
    and then prints either the numbered sentences or the reported error.
    """
    rule = "=" * 60
    print(rule)
    print("Example 1: Basic PDF Processing")
    print(rule)

    # Placeholder path: point it at a real document before running.
    outcome = PDFProcessor().process_pdf(
        pdf_path="path/to/your/nepali_document.pdf",
        refine_with_llm=True,  # ask for the LLM refinement pass
    )

    # Failure path first, so the success path reads straight down.
    if not outcome["success"]:
        print(f"✗ Error: {outcome['error']}")
        return

    print("\n✓ Successfully processed PDF")
    print(f"  Total sentences extracted: {outcome['total_sentences']}")
    print("\nExtracted sentences:")
    for number, text in enumerate(outcome["sentences"], start=1):
        print(f"  {number}. {text}")


def example_pdf_processing_without_llm():
    """Example 2: process a PDF with LLM refinement switched off.

    Same flow as the basic example, but ``refine_with_llm=False`` is passed
    and only the sentence count is reported on success.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 2: PDF Processing (No LLM Refinement)")
    print(rule)

    # Skipping refinement trades sentence clean-up for speed.
    outcome = PDFProcessor().process_pdf(
        pdf_path="path/to/your/nepali_document.pdf",
        refine_with_llm=False,
    )

    if not outcome["success"]:
        print(f"✗ Error: {outcome['error']}")
        return

    print("\n✓ Successfully processed PDF (without LLM refinement)")
    print(f"  Total sentences: {outcome['total_sentences']}")
    print("  Processing time: Faster (no LLM calls)")


def example_batch_processing():
    """Example 3: Process multiple PDFs

    Runs every path in a fixed list through one shared ``PDFProcessor`` and
    prints a one-line status per file.

    Returns:
        dict: Maps each PDF path to a dict with the keys ``"success"``,
        ``"total_sentences"`` and ``"sentences"``. When a result does not
        carry the sentence fields, they default to ``0`` and ``[]``.
    """
    print("\n" + "=" * 60)
    print("Example 3: Batch PDF Processing")
    print("=" * 60)

    processor = PDFProcessor()
    pdf_files = [
        "path/to/document1.pdf",
        "path/to/document2.pdf",
        "path/to/document3.pdf",
    ]

    all_results = {}

    for pdf_path in pdf_files:
        result = processor.process_pdf(
            pdf_path=pdf_path,
            refine_with_llm=True
        )

        # The summary is recorded before the success check, so the sentence
        # fields are read with defaults: indexing them directly would raise
        # KeyError and abort the whole batch if a failed result omits them.
        # NOTE(review): assumes process_pdf returns a dict-like object with
        # .get() - confirm against PDFProcessor.
        all_results[pdf_path] = {
            "success": result["success"],
            "total_sentences": result.get("total_sentences", 0),
            "sentences": result.get("sentences", []),
        }

        if result["success"]:
            print(f"✓ {pdf_path}: {all_results[pdf_path]['total_sentences']} sentences")
        else:
            print(f"✗ {pdf_path}: {result['error']}")

    return all_results


def example_prepare_for_bias_detection():
    """Example 4: turn extracted sentences into a bias-detection request.

    Processes a placeholder PDF with LLM refinement, wraps the resulting
    sentences in a ``BatchBiasDetectionRequest`` (threshold 0.7) and prints a
    short preview of the payload.

    Returns:
        The ``BatchBiasDetectionRequest`` on success, or ``None`` when the
        processor reports a failure.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 4: Prepare for Bias Detection API")
    print(rule)

    outcome = PDFProcessor().process_pdf(
        pdf_path="path/to/your/nepali_document.pdf",
        refine_with_llm=True,
    )

    if not outcome["success"]:
        print(f"✗ Error: {outcome['error']}")
        return None

    extracted = outcome["sentences"]

    # Imported here so the schema module is only loaded when there is
    # actually something to send.
    from api.schemas import BatchBiasDetectionRequest

    request = BatchBiasDetectionRequest(
        texts=extracted,
        confidence_threshold=0.7,
    )

    print(f"\n✓ Prepared {len(extracted)} sentences for bias detection")
    print("\nAPI Payload Preview:")
    print(f"  - Number of texts: {len(request.texts)}")
    print(f"  - Confidence threshold: {request.confidence_threshold}")
    print("\nExample sentences to be analyzed:")
    # Preview at most the first three texts.
    for number, text in enumerate(request.texts[:3], start=1):
        print(f"  {number}. {text}")

    return request


def example_direct_api_integration():
    """Example 5: print a walkthrough of the API integration pattern.

    Runs no processing; it only writes a banner followed by a static text
    block listing the endpoints, the usage flow and sample cURL commands.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 5: API Integration Pattern")
    print(rule)

    # Kept flush-left so the printed text carries no extra indentation.
    walkthrough = """
When integrating with FastAPI endpoints:

1. In /api/routes/pdf_processing.py:
   - POST /api/v1/process-pdf
     * Upload PDF file
     * Returns extracted sentences
   
   - POST /api/v1/process-pdf-to-bias
     * Upload PDF file
     * Returns bias detection results directly
   
   - GET /api/v1/pdf-health
     * Check PDF processor service status

2. Usage flow:
   a) User uploads PDF via /api/v1/process-pdf
   b) PDFProcessor extracts text and segments into sentences
   c) Mistral LLM refines sentences (optional)
   d) Return list of sentences
   
   OR
   
   a) User uploads PDF via /api/v1/process-pdf-to-bias
   b) PDFProcessor extracts and refines sentences
   c) Bias detection model analyzes each sentence
   d) Return complete bias analysis results

3. Example cURL commands:

   # Extract sentences only
   curl -X POST "http://localhost:8000/api/v1/process-pdf" \\
     -F "file=@document.pdf" \\
     -F "refine_with_llm=true"
   
   # Extract and analyze bias
   curl -X POST "http://localhost:8000/api/v1/process-pdf-to-bias" \\
     -F "file=@document.pdf" \\
     -F "refine_with_llm=true" \\
     -F "confidence_threshold=0.7"
    """
    print(walkthrough)


def example_error_handling():
    """Example 6: show how failed runs surface through the result dict.

    Feeds two sample paths (labelled "File not found" and "Image-based PDF
    (no OCR)") to the processor with LLM refinement disabled and prints, for
    each, either the reported error or the extracted sentence count.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 6: Error Handling")
    print(rule)

    processor = PDFProcessor()

    # (path, label) pairs; the label is only used for the printed heading.
    scenarios = [
        ("nonexistent.pdf", "File not found"),
        ("image_pdf.pdf", "Image-based PDF (no OCR)"),
    ]

    for path, label in scenarios:
        print(f"\nTest: {label}")
        outcome = processor.process_pdf(pdf_path=path, refine_with_llm=False)

        if outcome["success"]:
            print(f"  Success: {outcome['total_sentences']} sentences extracted")
        else:
            print(f"  Handled error: {outcome['error']}")


if __name__ == "__main__":
    # Opening banner: blank line, rule, title, rule.
    print("\n" + "=" * 60, "PDF Processor Usage Examples", "=" * 60, sep="\n")

    # Only the walkthrough below runs by default; the other examples use
    # placeholder PDF paths and are left commented out.
    print("\nNote: Update file paths to your actual PDF files to run examples\n")

    example_direct_api_integration()

    # Uncomment to run other examples:
    # example_pdf_processing()
    # example_pdf_processing_without_llm()
    # example_batch_processing()
    # example_prepare_for_bias_detection()
    # example_error_handling()

    # Closing banner pointing at the longer documentation.
    print(
        "\n" + "=" * 60,
        "For more information, see docs/pdf_processing.md",
        "=" * 60,
        sep="\n",
    )