Spaces:
Sleeping
Sleeping
| # main.py - FastAPI Web Service for Advanced Multi-Language OCR | |
| # Integrates main6_pix2text.py functionality with REST API endpoints | |
| import os | |
| import json | |
| import shutil | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Optional, Dict, Any, List | |
| import uvicorn | |
| from fastapi import FastAPI, File, UploadFile, HTTPException, status | |
| from fastapi.responses import HTMLResponse, JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel, Field | |
| # Import our OCR functionality | |
| from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text | |
| # Import evaluation functionality | |
| from eval import evaluate_ocr_accuracy, clean_control_characters | |
# Initialize FastAPI app.
# `description` is written as Markdown (bold markers, `##` headings, lists),
# so its lines are kept unindented inside the string literal. The remaining
# keyword arguments are plain metadata passed to the FastAPI constructor.
app = FastAPI(
    title="Advanced Multi-Language OCR API",
    description="""
🔍 **Advanced OCR System for Multi-Language Text Extraction**
This API provides sophisticated text extraction from PDF documents containing:
- **English** text
- **Bangla** (Bengali) text
- **Mathematical expressions** and formulas
## Features
- Upload PDF files for processing
- Intelligent content classification
- Pix2Text integration for advanced math extraction
- Character-by-character analysis
- Comprehensive extraction reports
## Usage
1. Upload a PDF using the `/extract` endpoint
2. Get extracted text with detailed analysis
3. Files are saved with organized naming convention
""",
    version="1.0.0",
    contact={
        "name": "Advanced OCR System",
        "url": "https://github.com/ashfaqbracu/aaladinai",
    },
)
# Add CORS middleware.
# The API is open to every origin, method and header. Credentials (cookies,
# Authorization headers) are NOT allowed: a wildcard origin combined with
# credentials is not a valid CORS configuration, and nothing in this file
# reads cookies or auth headers. If credentialed cross-origin calls are ever
# needed, replace the wildcards with explicit origins first.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Exception handler for JSON decode errors
# NOTE(review): no registration of this handler (e.g. via
# `app.exception_handler`) is visible in this part of the file - confirm it
# is wired up where intended.
async def json_decode_error_handler(request, exc):
    """
    Handle JSON decode errors that occur due to control characters in request body.

    Args:
        request: The incoming request; not inspected by this handler.
        exc: The exception being handled. Its ``detail`` attribute is used
            when present, otherwise the exception's own string form.

    Returns:
        A 400 ``JSONResponse`` with an explanatory payload when the error
        text contains "JSON decode error".

    Raises:
        The original ``exc``, unchanged, for every other error.
    """
    # Resolve the error text once so the check and the payload always agree.
    # The guard matters: an exception without `.detail` must fall through to
    # the re-raise below rather than fail with AttributeError.
    detail_text = str(exc.detail) if hasattr(exc, "detail") else str(exc)

    if "JSON decode error" not in detail_text:
        # For other 422 errors, propagate the original error
        raise exc

    return JSONResponse(
        status_code=400,
        content={
            "error": "Invalid characters in request",
            "message": "The request contains invalid control characters. Please ensure your text data is properly encoded and does not contain control characters.",
            "suggestion": "Try cleaning your input text to remove any control characters before sending the request.",
            "details": detail_text,
        },
    )
# Create directories
def create_directories():
    """Ensure the upload and output folders exist.

    Creates ``documents`` and ``extracted`` relative to the current working
    directory when they are missing, and prints one confirmation line per
    folder whether or not it already existed.
    """
    for folder_name in ("documents", "extracted"):
        folder = Path(folder_name)
        folder.mkdir(exist_ok=True)
        print(f"✅ Created/verified directory: {folder_name}")
# Initialize directories on startup.
# This is a module-level call, so the folders are created (and the
# confirmation lines printed) as soon as this module is imported.
create_directories()
# Pydantic models for API responses
class ExtractionResult(BaseModel):
    """Model for OCR extraction results."""

    # Built by extract_text_from_pdf. On success every field is populated and
    # error_message is None; on an unexpected failure success is False, the
    # text and path fields are empty strings, extraction_summary is {} and
    # error_message carries the failure text.
    success: bool = Field(description="Whether the extraction was successful")
    # Original upload name as sent by the client (not the sanitized name).
    filename: str = Field(description="Name of the processed file")
    extracted_text: str = Field(description="Full extracted text content")
    # The three paths below are server-side paths under the "extracted" folder.
    text_file_path: str = Field(description="Path to the saved text file")
    json_file_path: str = Field(description="Path to the detailed JSON results")
    analysis_file_path: str = Field(description="Path to the analysis report")
    # Parsed contents of the analysis JSON file written by the OCR step.
    extraction_summary: Dict[str, Any] = Field(
        description="Summary of extraction statistics"
    )
    processing_time: float = Field(description="Time taken for processing (seconds)")
    error_message: Optional[str] = Field(
        default=None, description="Error message if extraction failed"
    )
class HealthCheck(BaseModel):
    """Model for health check response."""

    # health_check sets this to "healthy" or "unhealthy".
    status: str = Field(description="Service status")
    message: str = Field(description="Status message")
    # True when initialize_pix2text() returned something other than None.
    pix2text_available: bool = Field(description="Whether Pix2Text is available")
    # Keyed by folder name ("documents", "extracted").
    directories: Dict[str, bool] = Field(description="Directory availability status")
class FileInfo(BaseModel):
    """Model for file information."""

    # NOTE(review): this model is not referenced anywhere in the visible part
    # of this file - confirm whether it is still needed.
    filename: str = Field(description="Name of the file")
    size: int = Field(description="File size in bytes")
    content_type: str = Field(description="MIME content type")
    # Stored as a string; the expected timestamp format is not shown here.
    upload_time: str = Field(description="Upload timestamp")
class EvaluationRequest(BaseModel):
    """Model for OCR evaluation request with optional name."""

    # NOTE(review): evaluate_ocr_extraction takes `evaluation_name` as a plain
    # parameter and does not use this model in the visible code - confirm
    # whether it is still needed.
    evaluation_name: Optional[str] = Field(
        default=None, description="Optional name for the evaluation"
    )
class EvaluationResult(BaseModel):
    """Model for OCR evaluation results."""

    # Built by evaluate_ocr_extraction. On success the score and metric fields
    # are copied from the dict returned by evaluate_ocr_accuracy. On an
    # unexpected failure success is False, both scores are 0.0, the metric
    # dicts are {}, detailed_diff is [] and the failure text appears in both
    # error_message and evaluation_summary["recommendations"].
    success: bool = Field(description="Whether the evaluation was successful")
    evaluation_name: Optional[str] = Field(
        default=None, description="Name of the evaluation"
    )
    overall_accuracy: float = Field(description="Overall accuracy score (0-100)")
    similarity_score: float = Field(description="Text similarity score (0-100)")
    character_metrics: Dict[str, Any] = Field(
        description="Character-level accuracy metrics"
    )
    word_metrics: Dict[str, Any] = Field(description="Word-level accuracy metrics")
    line_metrics: Dict[str, Any] = Field(description="Line-level accuracy metrics")
    language_specific: Dict[str, Any] = Field(
        description="Language-specific accuracy metrics"
    )
    text_statistics: Dict[str, Any] = Field(description="Text comparison statistics")
    detailed_diff: List[Dict[str, str]] = Field(
        description="Detailed diff showing changes"
    )
    # The endpoint reads a "grade" key from this dict when logging.
    evaluation_summary: Dict[str, Any] = Field(
        description="Summary with grade and recommendations"
    )
    processing_time: float = Field(description="Time taken for evaluation (seconds)")
    error_message: Optional[str] = Field(
        default=None, description="Error message if evaluation failed"
    )
# Utility functions
def get_safe_filename(filename: str) -> str:
    """Generate a safe filename with timestamp.

    The stem keeps only alphanumeric characters, ``-`` and ``_``; everything
    else (spaces, punctuation, path separators) is dropped. The extension
    keeps only its alphanumeric characters. A second-resolution timestamp is
    appended to the stem.

    Args:
        filename: Client-supplied file name; may contain arbitrary characters.

    Returns:
        ``<stem>_<YYYYMMDD_HHMMSS><.ext>``. When no stem character survives
        sanitization the stem is ``file``; when no extension character
        survives the extension is omitted.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    name, ext = os.path.splitext(filename)

    # Drop every character that is not alphanumeric, "-" or "_".
    safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_"))
    if not safe_name:
        # Avoid names that begin with "_" and carry no identifying stem.
        safe_name = "file"

    # The extension is client-controlled as well; keep only its letters/digits.
    ext_chars = "".join(c for c in ext if c.isalnum())
    safe_ext = f".{ext_chars}" if ext_chars else ""

    return f"{safe_name}_{timestamp}{safe_ext}"
def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
    """Generate the output filename for one extraction artifact.

    Naming convention, for an input ``<base>.pdf``:

    - ``"txt"``      -> ``<base>_extract.txt``
    - ``"json"``     -> ``<base>_extract.json``
    - ``"analysis"`` -> ``<base>_extract_analysis.json``

    The analysis report gets its own name so it cannot share a path with the
    detailed JSON results.

    Args:
        pdf_filename: Name of the source PDF; only its extension is stripped.
        file_type: One of ``"txt"``, ``"json"`` or ``"analysis"``.

    Returns:
        The artifact filename (no directory). Unknown ``file_type`` values
        fall back to the ``txt`` name.
    """
    base_name = os.path.splitext(pdf_filename)[0]
    suffixes = {
        "txt": "_extract.txt",
        "json": "_extract.json",
        "analysis": "_extract_analysis.json",
    }
    return f"{base_name}{suffixes.get(file_type, suffixes['txt'])}"
# API Routes

# Static markup for the welcome page. Kept at module level so the handler
# below stays short; the text is served verbatim.
_WELCOME_PAGE_HTML = """
<!DOCTYPE html>
<html>
<head>
<title>Advanced Multi-Language OCR API</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }
.container { background: white; padding: 30px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }
.feature { background: #ecf0f1; padding: 15px; margin: 10px 0; border-radius: 5px; }
.highlight { color: #e74c3c; font-weight: bold; }
.button { background: #3498db; color: white; padding: 10px 20px; text-decoration: none; border-radius: 5px; display: inline-block; margin: 10px 5px; }
.stats { display: flex; gap: 20px; margin: 20px 0; }
.stat { background: #2ecc71; color: white; padding: 15px; border-radius: 5px; text-align: center; flex: 1; }
</style>
</head>
<body>
<div class="container">
<h1>🔍 Advanced Multi-Language OCR API</h1>
<p>Welcome to the <span class="highlight">most advanced OCR system</span> for extracting text from PDFs containing mixed languages and mathematical expressions!</p>
<div class="stats">
<div class="stat">
<h3>🌐 Multi-Language</h3>
<p>English + Bangla + Math</p>
</div>
<div class="stat">
<h3>🧠 AI-Powered</h3>
<p>Pix2Text Integration</p>
</div>
<div class="stat">
<h3>📊 Detailed Analysis</h3>
<p>Character-level Classification</p>
</div>
</div>
<h2>🚀 Key Features</h2>
<div class="feature">
<strong>📄 PDF Processing:</strong> Upload PDFs and get comprehensive text extraction
</div>
<div class="feature">
<strong>🔤 Language Detection:</strong> Automatic classification of English, Bangla, and mathematical content
</div>
<div class="feature">
<strong>🧮 Mathematical Expressions:</strong> Advanced LaTeX and formula recognition using Pix2Text
</div>
<div class="feature">
<strong>📈 Detailed Reports:</strong> Character analysis, confidence scores, and extraction statistics
</div>
<div class="feature">
<strong>💾 Organized Storage:</strong> Automatic file organization with clear naming conventions
</div>
<div class="feature">
<strong>📊 Accuracy Evaluation:</strong> Upload text files to compare OCR results with baseline and get detailed accuracy metrics
</div>
<h2>📋 API Endpoints</h2>
<p><strong>GET /:</strong> This welcome page</p>
<p><strong>GET /health:</strong> Service health status</p>
<p><strong>POST /extract:</strong> Upload PDF and extract text</p>
<p><strong>POST /eval:</strong> Upload two text files to evaluate OCR accuracy</p>
<h2>🔗 Quick Actions</h2>
<a href="/docs" class="button">📖 API Documentation (Swagger)</a>
<a href="/redoc" class="button">📚 Alternative Docs (ReDoc)</a>
<a href="/health" class="button">🏥 Health Check</a>
<hr style="margin: 30px 0;">
<p style="text-align: center; color: #7f8c8d;">
<strong>Advanced Multi-Language OCR System</strong><br>
Powered by Tesseract, Pix2Text, and FastAPI<br>
<a href="https://github.com/ashfaqbracu/aaladinai" style="color: #3498db;">GitHub Repository</a>
</p>
</div>
</body>
</html>
"""


# Registered as GET / (the path the page itself advertises). The HTML response
# class makes the returned string go out as text/html rather than as a
# JSON-encoded string.
@app.get("/", response_class=HTMLResponse)
async def root():
    """
    🏠 **Main Route - Welcome Page**
    Returns a welcome page with information about the OCR API service.
    """
    return _WELCOME_PAGE_HTML
# Registered as GET /health, the path advertised on the welcome page.
# Always answers with a HealthCheck body: status "healthy" when the probe
# ran to completion, "unhealthy" (with the exception text in `message`) when
# anything inside it raised.
@app.get("/health", response_model=HealthCheck)
async def health_check():
    """
    🏥 **Health Check**
    Check the health status of the OCR service and its dependencies.
    """
    try:
        # Check if Pix2Text is available.
        # NOTE(review): initialize_pix2text() is called on every probe; if it
        # loads a model each time this may be costly - confirm it is cached.
        pix2text_available = initialize_pix2text() is not None

        # Check directories, recreating any that are missing. is_dir() is
        # used so that a regular file with the same name is not reported as
        # an available directory.
        directories_status = {}
        for directory in ("documents", "extracted"):
            folder = Path(directory)
            if not folder.is_dir():
                folder.mkdir(parents=True, exist_ok=True)
            directories_status[directory] = folder.is_dir()

        return HealthCheck(
            status="healthy",
            message="OCR service is running and ready to process files",
            pix2text_available=pix2text_available,
            directories=directories_status,
        )
    except Exception as e:
        # Deliberately broad: a health probe should report failure in its
        # body instead of raising.
        return HealthCheck(
            status="unhealthy",
            message=f"Service health check failed: {str(e)}",
            pix2text_available=False,
            directories={"documents": False, "extracted": False},
        )
# Registered as POST /extract, the path advertised on the welcome page.
#
# Maintainer notes:
# - Validation failures (wrong content type, missing or non-.pdf filename)
#   raise HTTPException(400). Any other failure is reported in the body as an
#   ExtractionResult with success=False.
# - NOTE(review): `media_type` on File() describes the request body's media
#   type in the generated schema - confirm "application/pdf" is intended
#   here rather than the multipart default.
# - NOTE(review): the OCR call below is synchronous inside an async handler,
#   so the event loop is occupied until it returns.
@app.post("/extract", response_model=ExtractionResult)
async def extract_text_from_pdf(
    file: UploadFile = File(
        ..., description="PDF file to extract text from", media_type="application/pdf"
    ),
):
    """
    📄 **Extract Text from PDF**
    Upload a PDF file and extract text using advanced multi-language OCR.
    ## Process:
    1. **Upload**: PDF is saved to `documents/` folder
    2. **Processing**: Advanced OCR with language detection
    3. **Extraction**: Text, JSON, and analysis files generated
    4. **Storage**: Results saved to `extracted/` folder with naming convention
    ## Naming Convention:
    - Input: `document.pdf` → Output: `document_extract.txt`, `document_extract.json`
    ## Returns:
    - Complete extracted text
    - Detailed extraction metadata
    - Processing statistics and analysis
    """
    start_time = datetime.now()
    # An upload can arrive without a filename; normalise to "" so the checks
    # below produce a clean 400 instead of an AttributeError.
    original_name = file.filename or ""
    try:
        # Validate file type
        if file.content_type != "application/pdf":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid file type. Expected PDF, got {file.content_type}",
            )
        if not original_name.lower().endswith(".pdf"):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="File must have .pdf extension",
            )

        # The folders are created at import time, but may have been removed
        # since; make sure they exist before writing.
        documents_dir = Path("documents")
        extracted_dir = Path("extracted")
        for folder in (documents_dir, extracted_dir):
            folder.mkdir(parents=True, exist_ok=True)

        # Generate safe filename and save uploaded file
        safe_filename = get_safe_filename(original_name)
        documents_path = documents_dir / safe_filename
        with open(documents_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        print(f"📁 File saved to: {documents_path}")

        # Output paths follow the naming convention of get_extraction_filename
        text_path = extracted_dir / get_extraction_filename(safe_filename, "txt")
        json_path = extracted_dir / get_extraction_filename(safe_filename, "json")
        analysis_path = extracted_dir / get_extraction_filename(
            safe_filename, "analysis"
        )

        print(f"🔄 Starting OCR processing for: {safe_filename}")
        # Process the PDF using our advanced OCR system; results are written
        # to the three output paths and read back below.
        extract_all_text_advanced_pix2text(
            pdf_path=str(documents_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read the extracted text
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        # Read the analysis for summary
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_data = json.load(f)

        processing_time = (datetime.now() - start_time).total_seconds()
        print(f"✅ OCR processing completed in {processing_time:.2f} seconds")
        print(f"📊 Extracted {len(extracted_text)} characters")

        return ExtractionResult(
            success=True,
            filename=original_name,
            extracted_text=extracted_text,
            text_file_path=str(text_path),
            json_file_path=str(json_path),
            analysis_file_path=str(analysis_path),
            extraction_summary=analysis_data,
            processing_time=processing_time,
            error_message=None,
        )
    except HTTPException:
        # Re-raise HTTP exceptions
        raise
    except Exception as e:
        # Handle unexpected errors: report them in the response body
        error_message = f"OCR processing failed: {str(e)}"
        print(f"❌ {error_message}")
        processing_time = (datetime.now() - start_time).total_seconds()
        return ExtractionResult(
            success=False,
            filename=original_name or "unknown",
            extracted_text="",
            text_file_path="",
            json_file_path="",
            analysis_file_path="",
            extraction_summary={},
            processing_time=processing_time,
            error_message=error_message,
        )
def _ensure_txt_upload(upload: UploadFile, label: str) -> None:
    """Reject an upload whose filename is missing or does not end in ``.txt``.

    Args:
        upload: The uploaded file to check.
        label: Human-readable role of the file ("Extracted" or "Baseline"),
            used at the start of the error message.

    Raises:
        HTTPException: 400 with the detail "<label> file must be a .txt file".
    """
    if not (upload.filename or "").lower().endswith(".txt"):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"{label} file must be a .txt file",
        )


# Registered as POST /eval, the path advertised on the welcome page.
#
# Maintainer notes:
# - Validation failures (non-.txt or missing filename, unreadable upload,
#   empty baseline, an "error" key in the evaluation output) raise
#   HTTPException(400). Any other failure is reported in the body as an
#   EvaluationResult with success=False.
# - Both texts are decoded as UTF-8 with undecodable bytes replaced, then
#   passed through clean_control_characters before evaluation.
@app.post("/eval", response_model=EvaluationResult)
async def evaluate_ocr_extraction(
    extracted_file: UploadFile = File(
        ..., description="Text file containing OCR extracted text"
    ),
    baseline_file: UploadFile = File(
        ..., description="Text file containing ground truth baseline text"
    ),
    evaluation_name: Optional[str] = None,
):
    """
    📊 **Evaluate OCR Extraction Accuracy**
    Compare extracted text file with ground truth baseline file to measure OCR accuracy.
    ## Features:
    - **File-based input**: Upload two text files for comparison
    - **Character-level accuracy**: Precise character matching and edit distance
    - **Word-level accuracy**: Word matching and error rates
    - **Line-level accuracy**: Line comparison and similarity scores
    - **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
    - **Overall grading**: Letter grade system (A+ to F)
    - **Detailed diff**: Character-by-character comparison
    - **Recommendations**: Suggestions for improving OCR accuracy
    ## Input:
    - `extracted_file`: Text file with OCR extracted content (.txt format)
    - `baseline_file`: Text file with ground truth content (.txt format)
    - `evaluation_name`: Optional name for the evaluation
    ## Output:
    - Comprehensive accuracy metrics
    - Performance grading
    - Detailed comparison analysis
    - Improvement recommendations
    """
    start_time = datetime.now()
    try:
        print(f"🔍 Starting OCR evaluation: {evaluation_name or 'Unnamed'}")

        # Validate file types
        _ensure_txt_upload(extracted_file, "Extracted")
        _ensure_txt_upload(baseline_file, "Baseline")

        # Read file contents
        try:
            extracted_content = await extracted_file.read()
            baseline_content = await baseline_file.read()
            # Decode with UTF-8 and handle potential encoding issues
            extracted_text = extracted_content.decode("utf-8", errors="replace")
            baseline_text = baseline_content.decode("utf-8", errors="replace")
            print("📁 Files read successfully:")
            print(
                f" Extracted file: {extracted_file.filename} ({len(extracted_text)} characters)"
            )
            print(
                f" Baseline file: {baseline_file.filename} ({len(baseline_text)} characters)"
            )
        except Exception as e:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Failed to read file contents: {str(e)}",
            ) from e

        # Validate that baseline text is not empty
        if not baseline_text.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Baseline file cannot be empty",
            )

        # Clean input texts to prevent any issues
        # (clean_control_characters is imported at the top of the file)
        extracted_text_clean = clean_control_characters(extracted_text)
        baseline_text_clean = clean_control_characters(baseline_text)
        print(
            f"📝 Text lengths after cleaning - Extracted: {len(extracted_text_clean)}, Baseline: {len(baseline_text_clean)}"
        )

        # Perform evaluation with cleaned texts
        evaluation_results = evaluate_ocr_accuracy(
            extracted_text=extracted_text_clean,
            baseline_text=baseline_text_clean,
        )

        # Check for evaluation errors
        if "error" in evaluation_results:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=evaluation_results["error"],
            )

        processing_time = (datetime.now() - start_time).total_seconds()
        print(f"✅ Evaluation completed in {processing_time:.3f} seconds")
        print(f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%")
        print(f"🎯 Grade: {evaluation_results['evaluation_summary']['grade']}")

        return EvaluationResult(
            success=True,
            evaluation_name=evaluation_name,
            overall_accuracy=evaluation_results["overall_accuracy"],
            similarity_score=evaluation_results["similarity_score"],
            character_metrics=evaluation_results["character_metrics"],
            word_metrics=evaluation_results["word_metrics"],
            line_metrics=evaluation_results["line_metrics"],
            language_specific=evaluation_results["language_specific"],
            text_statistics=evaluation_results["text_statistics"],
            detailed_diff=evaluation_results["detailed_diff"],
            evaluation_summary=evaluation_results["evaluation_summary"],
            processing_time=processing_time,
            error_message=None,
        )
    except HTTPException:
        # Re-raise HTTP exceptions
        raise
    except Exception as e:
        # Handle unexpected errors: report them in the response body
        error_message = f"Evaluation failed: {str(e)}"
        print(f"❌ {error_message}")
        processing_time = (datetime.now() - start_time).total_seconds()
        return EvaluationResult(
            success=False,
            evaluation_name=evaluation_name,
            overall_accuracy=0.0,
            similarity_score=0.0,
            character_metrics={},
            word_metrics={},
            line_metrics={},
            language_specific={},
            text_statistics={},
            detailed_diff=[],
            evaluation_summary={
                "grade": "F (Error)",
                "recommendations": [error_message],
            },
            processing_time=processing_time,
            error_message=error_message,
        )
# Development server configuration
if __name__ == "__main__":
    # The port comes from the PORT environment variable when one is set and
    # falls back to 7860 otherwise. It is resolved first so that the URLs
    # printed below match the port the server actually binds to.
    port = int(os.environ.get("PORT", 7860))
    base_url = f"http://localhost:{port}"

    print("🚀 Starting Advanced Multi-Language OCR API Server...")
    print(f"📖 API Documentation: {base_url}/docs")
    print(f"🏠 Main Page: {base_url}/")
    print(f"🏥 Health Check: {base_url}/health")

    # "main:app" is an import string: it assumes this module is importable
    # as `main` and exposes the application object as `app`.
    uvicorn.run("main:app", host="0.0.0.0", port=port, log_level="info")