Document Text Extraction

"""
FastAPI web service for document text extraction.

Provides REST API endpoints for uploading and processing documents.
"""

import json
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

import uvicorn
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles

from src.inference import DocumentInference

# Directory the trained model is loaded from by get_inference_pipeline().
MODEL_PATH = "models/document_ner_model"

# Single source of truth for the upload types the service accepts. Both the
# upload validation and the /supported-formats endpoint are derived from it so
# the two can never drift apart.
SUPPORTED_FORMATS: List[Dict[str, str]] = [
    {"extension": ".pdf", "description": "PDF documents"},
    {"extension": ".docx", "description": "Microsoft Word documents"},
    {"extension": ".png", "description": "PNG images"},
    {"extension": ".jpg", "description": "JPEG images"},
    {"extension": ".jpeg", "description": "JPEG images"},
    {"extension": ".tiff", "description": "TIFF images"},
    {"extension": ".bmp", "description": "BMP images"},
]
ALLOWED_EXTENSIONS = frozenset(fmt["extension"] for fmt in SUPPORTED_FORMATS)

# Entity labels advertised by the /supported-formats endpoint.
ENTITY_TYPES: List[str] = [
    "Name",
    "Date",
    "InvoiceNo",
    "Amount",
    "Address",
    "Phone",
    "Email",
]

# Initialize FastAPI app
app = FastAPI(
    title="Document Text Extraction API",
    description="Extract structured information from documents using Small Language Model (SLM)",
    version="1.0.0",
)

# Add CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination; FastAPI's CORS documentation advises listing
# origins explicitly when credentials are allowed. Left unchanged so existing
# clients keep working -- confirm whether this is intended for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global inference pipeline (created lazily, then reused for every request)
inference_pipeline: Optional[DocumentInference] = None


def get_inference_pipeline() -> DocumentInference:
    """Get or initialize the inference pipeline.

    The pipeline is built from ``MODEL_PATH`` on first use and cached in the
    module-level ``inference_pipeline`` variable.

    Returns:
        The cached ``DocumentInference`` instance.

    Raises:
        HTTPException: 503 if the model directory does not exist or the
            model fails to load.
    """
    global inference_pipeline

    if inference_pipeline is None:
        if not Path(MODEL_PATH).exists():
            raise HTTPException(
                status_code=503,
                detail="Model not found. Please train the model first by running training_pipeline.py",
            )
        try:
            inference_pipeline = DocumentInference(MODEL_PATH)
        except Exception as e:
            raise HTTPException(
                status_code=503,
                detail=f"Failed to load model: {str(e)}",
            ) from e

    return inference_pipeline


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize the model on startup.

    A load failure is only reported, never raised, so the server still
    starts; the next call to ``get_inference_pipeline`` retries the load.
    """
    try:
        get_inference_pipeline()
        print("Model loaded successfully on startup")
    except Exception as e:
        print(f"Failed to load model on startup: {e}")
        print("Model will be loaded on first request")


@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve the main HTML interface."""
    # NOTE(review): this literal contains only bare text and no HTML markup,
    # although the route declares an HTMLResponse -- confirm that the page
    # template was not lost. The text is reproduced exactly as found.
    html_content = """ Document Text Extraction

Upload File

Enter Text

"""
    return html_content


@app.get("/health")
async def health_check():
    """Health check endpoint.

    Returns a ``status``/``message`` dict; ``status`` is ``"healthy"`` when
    the inference pipeline can be obtained and ``"unhealthy"`` otherwise.
    """
    try:
        get_inference_pipeline()
    except Exception as e:
        return {"status": "unhealthy", "message": str(e)}
    return {"status": "healthy", "message": "Model loaded successfully"}


@app.post("/extract-from-file")
async def extract_from_file(file: UploadFile = File(...)):
    """Extract structured information from an uploaded file.

    The upload is copied to a temporary file (keeping its extension), passed
    to ``process_document`` and the temporary file is removed afterwards.

    Args:
        file: The uploaded document; its extension must be one of
            ``ALLOWED_EXTENSIONS``.

    Returns:
        A ``JSONResponse`` wrapping whatever ``process_document`` returns.

    Raises:
        HTTPException: 400 for a missing file or unsupported extension,
            503 if the model is unavailable, 500 for any other failure.
    """
    if not file:
        raise HTTPException(status_code=400, detail="No file provided")

    # Check file type. The filename may be absent, so fall back to an empty
    # name, which yields an empty suffix and a clean 400 below.
    file_extension = Path(file.filename or "").suffix.lower()
    if file_extension not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Unsupported file type: {file_extension}. "
                f"Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
            ),
        )

    temp_file_path: Optional[str] = None
    try:
        # Save uploaded file temporarily. This happens inside the try block
        # so the file is still removed if the copy itself fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
            temp_file_path = temp_file.name
            shutil.copyfileobj(file.file, temp_file)

        # Process the document
        inference = get_inference_pipeline()
        result = inference.process_document(temp_file_path)
        return JSONResponse(content=result)
    except HTTPException:
        # Keep the status chosen upstream (e.g. 503 for a missing model)
        # instead of masking it as a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Clean up temporary file; best effort, but only ignore OS errors.
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass


@app.post("/extract-from-text")
async def extract_from_text(request: Dict[str, str]):
    """Extract structured information from text.

    Args:
        request: JSON object whose ``"text"`` entry holds the text to process.

    Returns:
        A ``JSONResponse`` wrapping whatever ``process_text_directly`` returns.

    Raises:
        HTTPException: 400 if the text is missing or blank, 503 if the model
            is unavailable, 500 for any other failure.
    """
    text = request.get("text", "").strip()
    if not text:
        raise HTTPException(status_code=400, detail="No text provided")

    try:
        # Process the text
        inference = get_inference_pipeline()
        result = inference.process_text_directly(text)
        return JSONResponse(content=result)
    except HTTPException:
        # Keep the status chosen upstream (e.g. 503 for a missing model).
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/supported-formats")
async def get_supported_formats():
    """Get list of supported file formats and the advertised entity types."""
    return {
        "supported_formats": SUPPORTED_FORMATS,
        "entity_types": ENTITY_TYPES,
    }


@app.get("/model-info")
async def get_model_info():
    """Get information about the loaded model.

    Returns:
        A dict with the model path and the ``model_name``, ``max_length``,
        ``entity_labels`` and ``num_labels`` values read from the pipeline's
        ``config``.

    Raises:
        HTTPException: 503 if the pipeline or its configuration cannot be
            read.
    """
    try:
        inference = get_inference_pipeline()
        config = inference.config
        return {
            "model_path": inference.model_path,
            "model_name": config.model_name,
            "max_length": config.max_length,
            "entity_labels": config.entity_labels,
            "num_labels": config.num_labels,
        }
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"Model not loaded: {str(e)}") from e


def main() -> None:
    """Run the FastAPI server."""
    print("Starting Document Text Extraction API Server...")
    print("Server will be available at: http://localhost:8000")
    print("Web interface: http://localhost:8000")
    print("API docs: http://localhost:8000/docs")
    print("Health check: http://localhost:8000/health")

    uvicorn.run(
        "api.app:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
        log_level="info",
    )


if __name__ == "__main__":
    main()

Document Text Extraction

Extraction Results