# app.py - Gradio Interface for Hugging Face Spaces
"""Gradio web UI for PDF OCR extraction and OCR accuracy evaluation.

Importing this module has side effects, executed in this order:

1. system dependencies (Tesseract, Poppler tools, pdf2image) are probed and
   their status is printed;
2. the ``documents``, ``extracted`` and ``temp`` directories are created in
   the current working directory;
3. ``initialize_pix2text()`` is called and its outcome is printed.

Running the module as a script additionally builds the interface and
launches it on ``0.0.0.0:7860``.
"""
import importlib.util
import json
import os
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Tuple

import gradio as gr

# Import our OCR functionality
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
from eval import evaluate_ocr_accuracy, clean_control_characters


# ---------------------------------------------------------------------------
# Static UI text. Kept at module level (column 0) so the Markdown/HTML/CSS
# payloads carry no accidental leading indentation.
# ---------------------------------------------------------------------------

_CUSTOM_CSS = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.header {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 10px;
    margin-bottom: 2rem;
}
"""

# The wrapper carries class="header" so the `.header` rule in _CUSTOM_CSS
# actually applies to the banner.
_HEADER_HTML = """
<div class="header">
    <h1>🔍 Advanced Multi-Language OCR System</h1>
    <p>Extract text from PDFs containing English, Bangla, and Mathematical expressions</p>
    <p>Powered by Tesseract, Pix2Text, and Advanced Language Detection</p>
</div>
"""

_OCR_TAB_MARKDOWN = """
## 📄 PDF Text Extraction

Upload a PDF file to extract text using advanced multi-language OCR technology.

**Features:**
- 🌐 Multi-language support (English, Bangla, Mathematical expressions)
- 🧮 Advanced mathematical formula recognition with Pix2Text
- 📊 Detailed character-by-character analysis
- 🏷️ Automatic content classification
"""

_EVAL_TAB_MARKDOWN = """
## 📊 OCR Accuracy Evaluation

Compare extracted text with ground truth baseline to measure OCR accuracy.

**Features:**
- 🎯 Character, word, and line-level accuracy metrics
- 🌐 Language-specific accuracy analysis
- 📈 Overall grading system (A+ to F)
- 💡 Improvement recommendations
"""

_ABOUT_TAB_MARKDOWN = """
## 🔍 Advanced Multi-Language OCR System

### 🌟 Overview
This system provides state-of-the-art OCR capabilities for documents containing mixed languages and mathematical expressions.

### 🚀 Key Features

#### 📄 Multi-Language OCR
- **English**: Advanced text recognition with high accuracy
- **Bangla**: Native Bengali script support with proper Unicode handling
- **Mathematical**: LaTeX and formula recognition using Pix2Text

#### 🧮 Advanced Math Processing
- Integration with **Pix2Text** for superior mathematical expression recognition
- LaTeX output for mathematical formulas
- Support for complex equations and symbols

#### 📊 Comprehensive Analysis
- Character-by-character classification and confidence scoring
- Language detection and content categorization
- Detailed extraction statistics and reports

#### 🎯 Accuracy Evaluation
- Compare extracted text with ground truth baseline
- Character, word, and line-level accuracy metrics
- Language-specific performance analysis
- Grading system with improvement recommendations

### 🛠️ Technology Stack
- **OCR Engine**: Tesseract with custom language models
- **Math Recognition**: Pix2Text for advanced mathematical expressions
- **Language Detection**: Custom algorithms for multi-language content
- **Backend**: FastAPI with async processing
- **Frontend**: Gradio for interactive web interface

### 📝 Usage Tips

#### For Best OCR Results:
1. **File Quality**: Use high-resolution PDF files (300 DPI or higher)
2. **Text Clarity**: Ensure text is clear and not blurry or distorted
3. **Language**: The system works best with properly formatted text
4. **Mathematical Content**: Complex formulas are processed using specialized Pix2Text models

#### For Accurate Evaluation:
1. **File Format**: Upload plain text files (.txt) in UTF-8 encoding
2. **Content Matching**: Ensure baseline file corresponds to the same source document
3. **Text Cleaning**: The system automatically cleans control characters

### 🔗 Links
- **GitHub Repository**: [aaladin-ocr](https://github.com/ashfaqbracu/aaladin-ocr)
- **Documentation**: Available in the repository
- **Issues/Support**: Report issues on GitHub

### 📧 Contact
For questions or support, please visit our GitHub repository or create an issue.

---
**Developed with ❤️ for advanced document processing and OCR accuracy.**
"""


def check_system_dependencies():
    """Print the availability of Tesseract, Poppler tools and pdf2image.

    Purely diagnostic: nothing is returned and nothing is raised for a
    missing dependency; each check only prints a status line. The current
    ``PATH`` is printed last to help debug lookup failures.
    """
    print("🔍 Checking system dependencies...")

    # Check Tesseract by actually running it, so a broken install is reported too.
    try:
        result = subprocess.run(
            ["tesseract", "--version"], capture_output=True, text=True
        )
        if result.returncode == 0:
            print("✅ Tesseract is available")
        else:
            print("❌ Tesseract check failed")
    except FileNotFoundError:
        print("❌ Tesseract not found in PATH")

    # Check Poppler. shutil.which performs the PATH lookup in-process, so the
    # result does not depend on a `which` executable being installed.
    for tool in ("pdftoppm", "pdfinfo"):
        if shutil.which(tool):
            print(f"✅ {tool} is available")
        else:
            print(f"❌ {tool} not found")

    # Check pdf2image without importing it.
    try:
        if importlib.util.find_spec("pdf2image") is not None:
            print("✅ pdf2image is available")
        else:
            print("❌ pdf2image module not found")
    except Exception as e:
        print(f"❌ pdf2image check failed: {e}")

    print(f"📍 PATH: {os.environ.get('PATH', 'NOT SET')}")


# Run dependency check on startup
check_system_dependencies()


# Initialize directories
def create_directories():
    """Create the ``documents``, ``extracted`` and ``temp`` directories.

    Paths are relative to the current working directory; directories that
    already exist are left untouched.
    """
    for directory in ("documents", "extracted", "temp"):
        Path(directory).mkdir(exist_ok=True)


create_directories()


def _resolve_upload_path(upload) -> str:
    """Return the filesystem path behind a Gradio file input value.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing
    the path through a ``.name`` attribute.
    NOTE(review): which of the two forms arrives depends on the installed
    Gradio version and the ``gr.File`` ``type`` setting - confirm against the
    pinned version.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def _format_processing_error(error_msg: str) -> str:
    """Turn a raw exception message into a user-facing error report.

    Messages mentioning Poppler (or pdf2image's "unable to get page count")
    and messages mentioning Tesseract get targeted troubleshooting text; any
    other message is returned with a generic prefix. The original message is
    always included.
    """
    lowered = error_msg.lower()

    if "poppler" in lowered or "unable to get page count" in lowered:
        return "\n".join(
            [
                "❌ PDF Processing Error: Poppler not found",
                "",
                "🔧 This error occurs because Poppler (PDF utilities) is not "
                "properly installed.",
                "",
                "📋 For Hugging Face Spaces:",
                "1. Ensure your setup.sh script runs during deployment",
                "2. Check that poppler-utils is installed in the container",
                "3. Verify the setup logs show successful poppler installation",
                "",
                "💡 The setup.sh script should install these packages:",
                "- poppler-utils",
                "- libpoppler-cpp-dev",
                "- pkg-config",
                "",
                # Interpolated explicitly: this used to be a plain (non-f)
                # string, which showed the literal text "{error_msg}".
                f"🚨 Original error: {error_msg}",
                "",
                "🔄 Try restarting the space if this persists.",
            ]
        )

    if "tesseract" in lowered:
        return "\n".join(
            [
                "❌ OCR Engine Error: Tesseract issue",
                "",
                "🔧 This error is related to Tesseract OCR engine.",
                "",
                "📋 Possible solutions:",
                "1. Check Tesseract installation in setup.sh",
                "2. Verify language data files are available",
                "3. Ensure proper permissions on tessdata directory",
                "",
                f"🚨 Original error: {error_msg}",
            ]
        )

    return f"❌ Error processing PDF: {error_msg}"


def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
    """Run OCR on an uploaded PDF and return the results for display.

    The upload is copied into ``temp/`` under a timestamped name, passed to
    ``extract_all_text_advanced_pix2text``, and the three files that call is
    asked to write under ``extracted/`` (text, JSON, analysis JSON) are read
    back. The temporary PDF copy is removed afterwards whether or not
    processing succeeded; the files under ``extracted/`` are kept.

    Args:
        pdf_file: Gradio file input value (path, or object with ``.name``),
            or ``None`` when nothing was uploaded.

    Returns:
        Tuple of (extracted_text, json_results, analysis_results). On failure
        the first element is an error message and the other two are empty
        strings; exceptions are not propagated to the caller.
    """
    if pdf_file is None:
        return "❌ No file uploaded", "", ""

    pdf_path = None  # set once the temp copy's location is known
    try:
        source_path = _resolve_upload_path(pdf_file)

        # Generate timestamp for unique naming
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Derive output names from the original filename
        original_name = os.path.basename(source_path)
        base_name = os.path.splitext(original_name)[0]
        stem = f"{base_name}_{timestamp}"

        pdf_path = Path("temp") / f"{stem}.pdf"
        text_path = Path("extracted") / f"{stem}_extract.txt"
        json_path = Path("extracted") / f"{stem}_extract.json"
        analysis_path = Path("extracted") / f"{stem}_analysis.json"

        # Copy uploaded file to our temp directory
        shutil.copy2(source_path, pdf_path)

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(pdf_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read results
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(json_path, "r", encoding="utf-8") as f:
            json_results = json.load(f)
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_results = json.load(f)

        # Format results for display
        json_display = json.dumps(json_results, indent=2, ensure_ascii=False)
        analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False)

        return extracted_text, json_display, analysis_display

    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return _format_processing_error(str(e)), "", ""

    finally:
        # Best-effort removal of the temp copy, on success and failure alike.
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                pass  # never copied, or already gone


def _format_evaluation_summary(results: dict) -> str:
    """Render the evaluation result mapping as the human-readable summary.

    Reads the ``evaluation_summary``, ``overall_accuracy``,
    ``similarity_score``, ``character_metrics``, ``word_metrics``,
    ``line_metrics`` and ``language_specific`` entries; a missing entry
    raises ``KeyError``.
    """
    chars = results["character_metrics"]
    words = results["word_metrics"]
    lines = results["line_metrics"]
    languages = results["language_specific"]
    recommendations = "\n".join(
        f"• {rec}" for rec in results["evaluation_summary"]["recommendations"]
    )

    return "\n".join(
        [
            "📊 **OCR Evaluation Results**",
            "",
            f"🎯 **Overall Grade: {results['evaluation_summary']['grade']}**",
            f"📈 **Overall Accuracy: {results['overall_accuracy']:.2f}%**",
            f"🔍 **Similarity Score: {results['similarity_score']:.2f}%**",
            "",
            "📝 **Character Metrics:**",
            f"- Total Characters: {chars['total_chars']}",
            f"- Correct Characters: {chars['correct_chars']}",
            f"- Character Accuracy: {chars['accuracy']:.2f}%",
            "",
            "📄 **Word Metrics:**",
            f"- Total Words: {words['total_words']}",
            f"- Correct Words: {words['correct_words']}",
            f"- Word Accuracy: {words['accuracy']:.2f}%",
            "",
            "📋 **Line Metrics:**",
            f"- Total Lines: {lines['total_lines']}",
            f"- Correct Lines: {lines['correct_lines']}",
            f"- Line Accuracy: {lines['accuracy']:.2f}%",
            "",
            "🌐 **Language-Specific Accuracy:**",
            f"- English: {languages['english']['accuracy']:.2f}%",
            f"- Bangla: {languages['bangla']['accuracy']:.2f}%",
            f"- Mathematical: {languages['math']['accuracy']:.2f}%",
            "",
            "💡 **Recommendations:**",
            recommendations,
        ]
    )


def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]:
    """Compare an extracted text file with a baseline and report accuracy.

    Both files are read as UTF-8, passed through ``clean_control_characters``
    and handed to ``evaluate_ocr_accuracy``.

    Args:
        extracted_file: Gradio file input value for the OCR output text.
        baseline_file: Gradio file input value for the ground-truth text.

    Returns:
        Tuple of (evaluation_summary, detailed_results) where the second
        element is the full result mapping as indented JSON. On failure the
        first element is an error message and the second is an empty string;
        exceptions are not propagated to the caller.
    """
    if extracted_file is None or baseline_file is None:
        return "❌ Please upload both files", ""

    try:
        # Read file contents
        with open(_resolve_upload_path(extracted_file), "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(_resolve_upload_path(baseline_file), "r", encoding="utf-8") as f:
            baseline_text = f.read()

        # Perform evaluation on cleaned texts
        results = evaluate_ocr_accuracy(
            extracted_text=clean_control_characters(extracted_text),
            baseline_text=clean_control_characters(baseline_text),
        )

        # The evaluator reports failures through an "error" entry.
        if "error" in results:
            return f"❌ Evaluation error: {results['error']}", ""

        summary = _format_evaluation_summary(results)
        detailed = json.dumps(results, indent=2, ensure_ascii=False)
        return summary, detailed

    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"❌ Error during evaluation: {str(e)}", ""


# Create Gradio Interface
def create_interface():
    """Build and return the Gradio ``Blocks`` app.

    The app has three tabs: OCR processing (wired to ``process_pdf_ocr``),
    OCR evaluation (wired to ``evaluate_ocr_files``) and a static About page.
    The app is only constructed here, not launched.
    """
    with gr.Blocks(
        title="🔍 Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
        css=_CUSTOM_CSS,
    ) as app:
        # Header
        gr.HTML(_HEADER_HTML)

        with gr.Tabs():
            # OCR Processing Tab
            with gr.Tab("📄 OCR Processing", id="ocr"):
                gr.Markdown(_OCR_TAB_MARKDOWN)

                with gr.Row():
                    with gr.Column():
                        pdf_input = gr.File(
                            label="📎 Upload PDF File",
                            file_types=[".pdf"],
                            file_count="single",
                        )
                        process_btn = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )

                with gr.Row():
                    with gr.Column():
                        extracted_output = gr.Textbox(
                            label="📝 Extracted Text",
                            lines=15,
                            max_lines=20,
                            placeholder="Extracted text will appear here...",
                        )
                    with gr.Column():
                        json_output = gr.Textbox(
                            label="📋 Detailed JSON Results",
                            lines=8,
                            max_lines=15,
                            placeholder="JSON results will appear here...",
                        )
                        analysis_output = gr.Textbox(
                            label="📊 Analysis Report",
                            lines=7,
                            max_lines=10,
                            placeholder="Analysis report will appear here...",
                        )

                # Connect OCR processing
                process_btn.click(
                    fn=process_pdf_ocr,
                    inputs=[pdf_input],
                    outputs=[extracted_output, json_output, analysis_output],
                )

            # Evaluation Tab
            with gr.Tab("📊 OCR Evaluation", id="eval"):
                gr.Markdown(_EVAL_TAB_MARKDOWN)

                with gr.Row():
                    with gr.Column():
                        extracted_file = gr.File(
                            label="📄 Extracted Text File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                    with gr.Column():
                        baseline_file = gr.File(
                            label="📋 Baseline/Ground Truth File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )

                evaluate_btn = gr.Button(
                    "🔍 Evaluate Accuracy", variant="primary", size="lg"
                )

                with gr.Row():
                    with gr.Column():
                        eval_summary = gr.Textbox(
                            label="📊 Evaluation Summary",
                            lines=20,
                            max_lines=25,
                            placeholder="Evaluation summary will appear here...",
                        )
                    with gr.Column():
                        eval_detailed = gr.Textbox(
                            label="📋 Detailed Results (JSON)",
                            lines=20,
                            max_lines=25,
                            placeholder="Detailed evaluation results will appear here...",
                        )

                # Connect evaluation
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_file, baseline_file],
                    outputs=[eval_summary, eval_detailed],
                )

            # About Tab
            with gr.Tab("ℹ️ About", id="about"):
                gr.Markdown(_ABOUT_TAB_MARKDOWN)

    return app


# Initialize Pix2Text on startup
print("🚀 Initializing Pix2Text model...")
pix2text_model = None  # stays None when initialization raises
try:
    pix2text_model = initialize_pix2text()
    if pix2text_model:
        print("✅ Pix2Text initialized successfully")
    else:
        print("⚠️ Pix2Text initialization failed - math extraction may be limited")
except Exception as e:
    print(f"⚠️ Pix2Text initialization error: {e}")


# Create and launch the interface
if __name__ == "__main__":
    app = create_interface()

    # Launch with proper configuration for Hugging Face Spaces
    app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)