# Spaces: khagu / setu (Running)
# File size: 5,934 Bytes
"""
PDF Processing Routes
Handles Nepali PDF uploads and processing for bias detection
"""

from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
from api.core.deps import get_current_user
from api.schemas import (
    PDFProcessingResponse,
    PDFToBiasDetectionRequest,
    PDFToBiasDetectionResponse,
    BiasResult,
)
from typing import List, Optional
import logging
from utility.pdf_processor import PDFProcessor
from .bias_detection import run_bias_detection

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Router on which the endpoints in this module are registered.
router = APIRouter()

# Initialize PDF Processor
# One PDFProcessor instance is created at import time and reused by every
# handler in this module.
pdf_processor = PDFProcessor()


@router.post("/process-pdf", response_model=PDFProcessingResponse)
async def process_pdf(
    file: UploadFile = File(...),
    refine_with_llm: bool = Form(default=True),
    user: dict = Depends(get_current_user)
):
    """
    Upload a Nepali PDF and extract sentences.

    - **file**: PDF file to process (required); the name must end in `.pdf`
      (case-insensitive)
    - **refine_with_llm**: Whether to refine sentences using Mistral LLM (default: True)

    Returns:
    - Extracted sentences as a list
    - Total number of sentences
    - Raw extracted text (optional)

    Errors:
    - 400 if the file name is missing or not a `.pdf`, the upload is empty,
      or the processor reports failure
    - 500 for any other unexpected error
    """
    # `user` is not read here; the dependency is declared so that
    # get_current_user runs for every request to this endpoint.
    try:
        # The filename is typed as optional, so normalise a missing name to ""
        # (rejected with 400 below) instead of letting `None.endswith` fall
        # into the generic 500 handler. Lower-casing accepts ".PDF" too.
        filename = file.filename or ""
        if not filename.lower().endswith(".pdf"):
            raise HTTPException(
                status_code=400,
                detail="Only PDF files are supported"
            )

        logger.info("Processing PDF: %s", filename)

        # Read file contents
        contents = await file.read()

        if not contents:
            raise HTTPException(
                status_code=400,
                detail="Empty file provided"
            )

        # Process PDF
        # NOTE(review): this looks like a synchronous call made from an async
        # handler; if it is slow it will block the event loop — confirm.
        result = pdf_processor.process_pdf_from_bytes(
            pdf_bytes=contents,
            refine_with_llm=refine_with_llm
        )

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=result["error"]
            )

        logger.info(
            "Successfully processed %s: %s sentences",
            filename,
            result["total_sentences"],
        )

        return PDFProcessingResponse(
            success=True,
            sentences=result["sentences"],
            total_sentences=result["total_sentences"],
            raw_text=result["raw_text"],
            filename=filename
        )

    except HTTPException:
        # Deliberate 4xx responses raised above pass through unchanged.
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("PDF processing error: %s", e)
        # NOTE(review): str(e) is returned to the client and may expose
        # internal details; kept for backward compatibility.
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/process-pdf-to-bias", response_model=PDFToBiasDetectionResponse)
async def process_pdf_to_bias(
    file: UploadFile = File(...),
    refine_with_llm: bool = Form(default=True),
    confidence_threshold: float = Form(default=0.7),
    user: dict = Depends(get_current_user)
):
    """
    Upload a Nepali PDF, extract sentences, and directly analyze for bias.

    - **file**: PDF file to process (required); the name must end in `.pdf`
      (case-insensitive)
    - **refine_with_llm**: Whether to refine sentences using Mistral LLM (default: True)
    - **confidence_threshold**: Confidence threshold for bias detection (default: 0.7)

    Returns:
    - Bias detection results for all extracted sentences
    - Summary statistics (biased_count, neutral_count)

    Errors:
    - 400 if the file name is missing or not a `.pdf`, the upload is empty,
      or the processor reports failure
    - 500 for any other unexpected error
    """
    # `user` is not read here; the dependency is declared so that
    # get_current_user runs for every request to this endpoint.
    try:
        # The filename is typed as optional, so normalise a missing name to ""
        # (rejected with 400 below) instead of letting `None.endswith` fall
        # into the generic 500 handler. Lower-casing accepts ".PDF" too.
        filename = file.filename or ""
        if not filename.lower().endswith(".pdf"):
            raise HTTPException(
                status_code=400,
                detail="Only PDF files are supported"
            )

        logger.info("Processing PDF for bias detection: %s", filename)

        # Read file contents
        contents = await file.read()

        if not contents:
            raise HTTPException(
                status_code=400,
                detail="Empty file provided"
            )

        # Step 1: Process PDF
        # NOTE(review): this looks like a synchronous call made from an async
        # handler; if it is slow it will block the event loop — confirm.
        pdf_result = pdf_processor.process_pdf_from_bytes(
            pdf_bytes=contents,
            refine_with_llm=refine_with_llm
        )

        if not pdf_result["success"]:
            raise HTTPException(
                status_code=400,
                detail=pdf_result["error"]
            )

        sentences = pdf_result["sentences"]
        logger.info("Extracted %s sentences from %s", len(sentences), filename)

        # Step 2: Analyze bias for extracted sentences
        # The sentences are joined into one space-separated string because
        # run_bias_detection is called with a single text argument.
        # NOTE(review): an empty `sentences` list yields "" here — confirm
        # that run_bias_detection handles empty text.
        combined_text = " ".join(sentences)
        bias_result = run_bias_detection(combined_text, confidence_threshold)

        logger.info(
            "Bias detection completed: %s biased, %s neutral",
            bias_result.biased_count,
            bias_result.neutral_count,
        )

        # The counts come from the detector's result, not from
        # len(sentences) computed above.
        return PDFToBiasDetectionResponse(
            success=True,
            total_sentences=bias_result.total_sentences,
            biased_count=bias_result.biased_count,
            neutral_count=bias_result.neutral_count,
            results=bias_result.results,
            filename=filename
        )

    except HTTPException:
        # Deliberate 4xx responses raised above pass through unchanged.
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("PDF to bias detection error: %s", e)
        # NOTE(review): str(e) is returned to the client and may expose
        # internal details; kept for backward compatibility.
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/pdf-health")
async def pdf_processor_health():
    """
    Report the health of the PDF processing service.

    The service is "healthy" when the processor's LLM client object is set,
    "degraded" when it is None, and "unhealthy" (with the error text) when
    the check itself raises.
    """
    try:
        # The only probe performed: is the underlying LLM client object set?
        mistral_ready = pdf_processor.llm_client.client is not None
    except Exception as exc:
        logger.error(f"Health check failed: {exc}")
        return {
            "status": "unhealthy",
            "error": str(exc)
        }

    if mistral_ready:
        overall_status, connection_state = "healthy", "connected"
    else:
        overall_status, connection_state = "degraded", "disconnected"

    # Extraction and segmentation are always reported as available; only
    # LLM refinement follows the probe result.
    feature_flags = {
        "pdf_extraction": True,
        "sentence_segmentation": True,
        "llm_refinement": mistral_ready
    }

    return {
        "status": overall_status,
        "pdf_processor": "ready",
        "mistral_client": connection_state,
        "features": feature_flags
    }