Spaces:
Build error
Build error
| """ | |
| FastAPI route handlers for PDF processing API. | |
| """ | |
| from pathlib import Path | |
| from typing import Optional | |
| from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks | |
| from fastapi.responses import FileResponse | |
| from app.api.schemas import ( | |
| ProcessingRequest, | |
| ProcessingResult, | |
| UploadResponse, | |
| BatchProcessingRequest, | |
| BatchProcessingResult, | |
| MappingConfig, | |
| MappingEntry, | |
| TemplateValidation, | |
| FormFieldInfo, | |
| ExtractedData, | |
| HealthResponse | |
| ) | |
| from app.api.processor_service import processor_service | |
| from app.config import settings | |
| from app.utils.logging import get_logger | |
| from app.utils.exceptions import PDFProcessorError, FileValidationError | |
# Module-level logger, named after this module.
logger = get_logger(__name__)

# Router for the PDF endpoints; every path is served under /api/v1.
# NOTE(review): no route decorators are visible in this view of the file;
# confirm the handlers below are actually registered on this router.
router = APIRouter(tags=["pdf"], prefix="/api/v1")
async def health_check():
    """Report API health and which optional services are available.

    Returns:
        HealthResponse with a fixed ``"healthy"`` status, the configured
        application version, the processor service's OCR flag, and
        whether a Google API key is configured (i.e. is not ``None``).
    """
    payload = {
        "status": "healthy",
        "version": settings.app_version,
        "ocr_available": processor_service.ocr_available,
        # Only the presence of the key is reported, never its value.
        "gemini_available": settings.google_api_key is not None,
    }
    return HealthResponse(**payload)
async def upload_files(
    source_pdf: UploadFile = File(..., description="T1 tax return PDF to extract data from"),
    template_pdf: UploadFile = File(..., description="Target PDF form to fill")
):
    """
    Store a source T1 PDF and a target template PDF in a new session.

    Args:
        source_pdf: Uploaded PDF the data will be extracted from.
        template_pdf: Uploaded PDF form that will be filled.

    Returns:
        UploadResponse carrying the new session ID and both filenames.

    Raises:
        HTTPException: 400 when the service raises FileValidationError,
            500 for any other failure.
    """
    # Processed in this order: source first, then template.
    uploads = (
        (source_pdf, "source"),
        (template_pdf, "template"),
    )
    try:
        session = processor_service.create_session()

        for upload, role in uploads:
            payload = await upload.read()
            processor_service.save_uploaded_file(
                session, payload, upload.filename, role
            )

        response = UploadResponse(
            session_id=session.session_id,
            source_filename=source_pdf.filename,
            template_filename=template_pdf.filename,
            message="Files uploaded successfully. Use /process to extract and fill data."
        )
        return response
    except FileValidationError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Upload failed: {e}")
        raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
async def process_pdfs(request: ProcessingRequest):
    """
    Run extraction and template filling for a previously uploaded session.

    Args:
        request: Carries the session ID, the line numbers to extract, and
            the OCR / flatten options forwarded to the processor service.

    Returns:
        ProcessingResult with status ``"success"``, the extracted data,
        the mapped fields, the output filename, and the session's
        accumulated errors and warnings.

    Raises:
        HTTPException: 400 on FileValidationError, 422 on
            PDFProcessorError, 500 on any other exception.
    """
    try:
        session = processor_service.get_session(request.session_id)
        outcome = processor_service.process(
            session=session,
            line_numbers=request.line_numbers,
            use_ocr=request.use_ocr,
            flatten=request.flatten_output
        )

        raw = outcome["extracted_data"]
        session_ref = session.session_id
        extracted_model = ExtractedData(
            line_values=raw["line_values"],
            extraction_method=raw["extraction_method"],
            has_text_content=raw["has_text"],
            # The text preview is optional in the service result.
            raw_text_preview=raw.get("text_preview")
        )

        return ProcessingResult(
            session_id=session_ref,
            status="success",
            extracted_data=extracted_model,
            mapped_fields=outcome["mapped_fields"],
            output_filename=outcome["output_filename"],
            errors=session.errors,
            warnings=session.warnings
        )
    except FileValidationError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except PDFProcessorError as e:
        logger.error(f"Processing error: {e}")
        raise HTTPException(status_code=422, detail=str(e))
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
async def download_result(session_id: str):
    """
    Download the filled PDF for a completed session.

    Args:
        session_id: Identifier of the session whose output is requested.

    Returns:
        FileResponse streaming the output PDF, named
        ``filled_t1_<first 8 chars of session_id>.pdf``.

    Raises:
        HTTPException: 404 when the service raises FileValidationError or
            reports no output path for the session; 500 for any other
            failure.
    """
    try:
        session = processor_service.get_session(session_id)
        output_path = processor_service.get_output_path(session)
        if not output_path:
            raise HTTPException(
                status_code=404,
                detail="Output file not found. Ensure processing is complete."
            )
        return FileResponse(
            path=output_path,
            filename=f"filled_t1_{session_id[:8]}.pdf",
            media_type="application/pdf"
        )
    except HTTPException:
        # HTTPException is an Exception subclass. Without this clause the
        # 404 raised above would fall into the generic handler below and
        # reach the client as a 500.
        raise
    except FileValidationError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.error(f"Download error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
async def batch_process(request: BatchProcessingRequest):
    """
    Process several uploaded sessions with one shared set of options.

    Each session is handled independently. Any exception raised for one
    session is logged and recorded as a ``"failed"`` result carrying the
    error text, and the remaining sessions are still processed.

    Args:
        request: Carries the session IDs plus the line numbers and the
            OCR / flatten options applied to every session.

    Returns:
        BatchProcessingResult with the total number of requested
        sessions, success and failure counts, and per-session results.
    """
    outcomes = []
    tally = {"success": 0, "failed": 0}

    for session_id in request.session_ids:
        try:
            session = processor_service.get_session(session_id)
            processed = processor_service.process(
                session=session,
                line_numbers=request.line_numbers,
                use_ocr=request.use_ocr,
                flatten=request.flatten_output
            )
            raw = processed["extracted_data"]
            entry = ProcessingResult(
                session_id=session_id,
                status="success",
                extracted_data=ExtractedData(
                    line_values=raw["line_values"],
                    extraction_method=raw["extraction_method"],
                    has_text_content=raw["has_text"],
                    raw_text_preview=raw.get("text_preview")
                ),
                mapped_fields=processed["mapped_fields"],
                output_filename=processed["output_filename"],
                errors=session.errors,
                warnings=session.warnings
            )
            bucket = "success"
        except Exception as e:
            # Deliberately broad: one bad session must not abort the batch.
            logger.error(f"Batch processing failed for {session_id}: {e}")
            entry = ProcessingResult(
                session_id=session_id,
                status="failed",
                errors=[str(e)]
            )
            bucket = "failed"

        outcomes.append(entry)
        tally[bucket] += 1

    return BatchProcessingResult(
        total=len(request.session_ids),
        successful=tally["success"],
        failed=tally["failed"],
        results=outcomes
    )
async def get_template_fields(session_id: str):
    """
    List the form fields of a session's template PDF and validate mappings.

    Args:
        session_id: Identifier of the session whose template is inspected.

    Returns:
        TemplateValidation listing every template field, the total field
        count, and the mapped field names that are absent from the
        template. ``valid`` is true only when no mapped field is missing.

    Raises:
        HTTPException: 400 on FileValidationError, 500 on any other
            exception.
    """
    try:
        session = processor_service.get_session(session_id)
        fields = processor_service.get_template_fields(session)

        described = []
        for name, info in fields.items():
            raw_value = info.get("value")
            # Falsy values (missing, empty, zero) are reported as None.
            shown_value = str(raw_value) if raw_value else None
            described.append(
                FormFieldInfo(
                    name=name,
                    field_type=str(info.get("type", "Unknown")),
                    current_value=shown_value
                )
            )

        # Mapped target fields that the template does not contain.
        mappings = processor_service.get_all_mappings()
        wanted = {mapping["field"] for mapping in mappings.values()}
        present = set(fields.keys())
        missing = list(wanted - present)

        return TemplateValidation(
            valid=not missing,
            total_fields=len(fields),
            fields=described,
            missing_fields=missing
        )
    except FileValidationError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Template fields error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
async def get_mappings():
    """
    Return every T1 line to field mapping currently held by the service.

    The value comes straight from ``processor_service.get_all_mappings()``
    and is returned without modification.
    """
    current_mappings = processor_service.get_all_mappings()
    return current_mappings
async def update_mappings(config: MappingConfig):
    """
    Apply a set of T1 line to field mappings and persist the configuration.

    Args:
        config: Holds the mapping entries to apply. Each entry supplies a
            line number, a field name, and an optional description.

    Returns:
        A dict with a ``message`` stating how many mappings were updated.

    Raises:
        HTTPException: 500 when updating or saving fails.
    """
    try:
        entries = config.mappings
        for item in entries:
            # A missing or empty description is stored as an empty string.
            note = item.description or ""
            processor_service.update_mapping(item.line_number, item.field_name, note)

        # Persist only after every entry has been applied.
        processor_service.save_mapping_config()

        summary = f"Updated {len(config.mappings)} mappings"
        return {"message": summary}
    except Exception as e:
        logger.error(f"Mapping update error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
async def cleanup_session(session_id: str, background_tasks: BackgroundTasks):
    """
    Schedule removal of a session's files and data.

    The cleanup is queued as a background task and this handler returns
    immediately; the response only confirms that it was scheduled.

    Args:
        session_id: Identifier of the session to clean up.
        background_tasks: Task queue that the cleanup call is added to.

    Returns:
        A dict with a ``message`` confirming the cleanup was scheduled.

    Raises:
        HTTPException: 500 when scheduling the task fails.
    """
    try:
        background_tasks.add_task(processor_service.cleanup_session, session_id)
        confirmation = f"Session {session_id} cleanup scheduled"
        return {"message": confirmation}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))