Spaces:
Sleeping
Sleeping
| # app.py - Gradio Interface for Hugging Face Spaces | |
| import gradio as gr | |
| import os | |
| import json | |
| import shutil | |
| import subprocess | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Tuple | |
| # Import our OCR functionality | |
| from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text | |
| from eval import evaluate_ocr_accuracy, clean_control_characters | |
def check_system_dependencies():
    """Print a startup report on the external tools the OCR pipeline relies on.

    Checks, in order: the ``tesseract`` binary, the Poppler command-line tools
    ``pdftoppm`` and ``pdfinfo``, and the ``pdf2image`` Python package, then
    prints the current ``PATH``. The function is purely diagnostic: it never
    raises for a missing or broken dependency and returns ``None``.
    """
    print("๐ Checking system dependencies...")

    # Check Tesseract: actually run the binary so a broken install is reported,
    # not just a missing one. The timeout keeps a hung binary from blocking startup.
    try:
        result = subprocess.run(
            ["tesseract", "--version"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except FileNotFoundError:
        print("โ Tesseract not found in PATH")
    except (OSError, subprocess.SubprocessError):
        # Permission problems, timeouts, etc. are reported, not propagated.
        print("โ Tesseract check failed")
    else:
        if result.returncode == 0:
            print("โ Tesseract is available")
        else:
            print("โ Tesseract check failed")

    # Check Poppler: shutil.which searches PATH in-process, so the result does
    # not depend on an external `which` executable being installed.
    for tool in ("pdftoppm", "pdfinfo"):
        if shutil.which(tool) is not None:
            print(f"โ {tool} is available")
        else:
            print(f"โ {tool} not found")

    # Check pdf2image without importing it (find_spec only locates the module).
    try:
        import importlib.util

        if importlib.util.find_spec("pdf2image") is not None:
            print("โ pdf2image is available")
        else:
            print("โ pdf2image module not found")
    except Exception as e:
        print(f"โ pdf2image check failed: {e}")

    print(f"๐ PATH: {os.environ.get('PATH', 'NOT SET')}")


# Run dependency check on startup
check_system_dependencies()
# Initialize directories
def create_directories():
    """Make sure the working folders exist, relative to the current directory.

    Creates ``documents``, ``extracted`` and ``temp``; folders that are
    already present are left as they are.
    """
    for folder_name in ("documents", "extracted", "temp"):
        folder = Path(folder_name)
        folder.mkdir(exist_ok=True)


create_directories()
def _describe_processing_error(error_msg: str) -> str:
    """Turn a raw exception message into the user-facing text shown in the UI."""
    lowered = error_msg.lower()

    # Provide specific guidance for common errors
    if "poppler" in lowered or "unable to get page count" in lowered:
        # Must be an f-string: otherwise the user sees a literal "{error_msg}".
        return f"""โ PDF Processing Error: Poppler not found
๐ง This error occurs because Poppler (PDF utilities) is not properly installed.
๐ For Hugging Face Spaces:
1. Ensure your setup.sh script runs during deployment
2. Check that poppler-utils is installed in the container
3. Verify the setup logs show successful poppler installation
๐ก The setup.sh script should install these packages:
- poppler-utils
- libpoppler-cpp-dev
- pkg-config
๐จ Original error: {error_msg}
๐ Try restarting the space if this persists."""

    if "tesseract" in lowered:
        return f"""โ OCR Engine Error: Tesseract issue
๐ง This error is related to Tesseract OCR engine.
๐ Possible solutions:
1. Check Tesseract installation in setup.sh
2. Verify language data files are available
3. Ensure proper permissions on tessdata directory
๐จ Original error: {error_msg}"""

    return f"โ Error processing PDF: {error_msg}"


def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
    """
    Process uploaded PDF file and extract text using advanced OCR.

    The upload is copied into ``temp/`` under a timestamped name, run through
    ``extract_all_text_advanced_pix2text``, and the three result files written
    to ``extracted/`` are read back for display. The temporary copy is removed
    whether or not processing succeeds.

    Args:
        pdf_file: Gradio file input. Either a path (``str`` / ``os.PathLike``)
            or an object exposing the temporary file path as ``.name``;
            ``None`` when nothing was uploaded.

    Returns:
        Tuple of (extracted_text, json_results, analysis_results). On failure
        the first element is an error message and the other two are empty
        strings; this function does not raise.
    """
    if pdf_file is None:
        return "โ No file uploaded", "", ""

    pdf_path = None
    try:
        # Depending on the Gradio version and the component's `type`, the
        # upload arrives as a plain path or as a wrapper with `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            source_path = os.fspath(pdf_file)
        else:
            source_path = pdf_file.name

        # Generate timestamp for unique naming
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Get original filename
        original_name = os.path.basename(source_path)
        base_name = os.path.splitext(original_name)[0]

        # Create unique filenames
        pdf_filename = f"{base_name}_{timestamp}.pdf"
        text_filename = f"{base_name}_{timestamp}_extract.txt"
        json_filename = f"{base_name}_{timestamp}_extract.json"
        analysis_filename = f"{base_name}_{timestamp}_analysis.json"

        # Create paths
        pdf_path = Path("temp") / pdf_filename
        text_path = Path("extracted") / text_filename
        json_path = Path("extracted") / json_filename
        analysis_path = Path("extracted") / analysis_filename

        # Copy uploaded file to our temp directory
        shutil.copy2(source_path, pdf_path)

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(pdf_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read results
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(json_path, "r", encoding="utf-8") as f:
            json_results = json.load(f)
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_results = json.load(f)

        # Format results for display
        json_display = json.dumps(json_results, indent=2, ensure_ascii=False)
        analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False)

        return extracted_text, json_display, analysis_display
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return _describe_processing_error(str(e)), "", ""
    finally:
        # Clean up temp file on success *and* failure so failed runs do not
        # accumulate copies. Best effort: a missing file is not an error.
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                pass
def _read_uploaded_text(upload) -> str:
    """Return the UTF-8 text of an upload given as a path or a `.name` wrapper."""
    # Depending on the Gradio version and the component's `type`, the upload
    # arrives as a plain path or as a wrapper exposing the path via `.name`.
    if isinstance(upload, (str, os.PathLike)):
        path = os.fspath(upload)
    else:
        path = upload.name
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with baseline.

    Both files are read as UTF-8, passed through ``clean_control_characters``
    and compared with ``evaluate_ocr_accuracy``.

    Args:
        extracted_file: Gradio file input (extracted text file). Either a path
            or an object exposing the path as ``.name``; ``None`` if missing.
        baseline_file: Gradio file input (baseline/ground truth text file),
            same accepted forms as ``extracted_file``.

    Returns:
        Tuple of (evaluation_summary, detailed_results). ``detailed_results``
        is the full result dict as indented JSON. On any failure the first
        element is an error message and the second is an empty string; this
        function does not raise.
    """
    if extracted_file is None or baseline_file is None:
        return "โ Please upload both files", ""

    try:
        # Read file contents
        extracted_text = _read_uploaded_text(extracted_file)
        baseline_text = _read_uploaded_text(baseline_file)

        # Clean texts
        extracted_text_clean = clean_control_characters(extracted_text)
        baseline_text_clean = clean_control_characters(baseline_text)

        # Perform evaluation
        results = evaluate_ocr_accuracy(
            extracted_text=extracted_text_clean,
            baseline_text=baseline_text_clean,
        )

        # The evaluator reports its own failures via an "error" key.
        if "error" in results:
            return f"โ Evaluation error: {results['error']}", ""

        # Create summary (chr(10) is a newline; it joins the recommendation bullets)
        summary = f"""
๐ **OCR Evaluation Results**
๐ฏ **Overall Grade: {results["evaluation_summary"]["grade"]}**
๐ **Overall Accuracy: {results["overall_accuracy"]:.2f}%**
๐ **Similarity Score: {results["similarity_score"]:.2f}%**
๐ **Character Metrics:**
- Total Characters: {results["character_metrics"]["total_chars"]}
- Correct Characters: {results["character_metrics"]["correct_chars"]}
- Character Accuracy: {results["character_metrics"]["accuracy"]:.2f}%
๐ **Word Metrics:**
- Total Words: {results["word_metrics"]["total_words"]}
- Correct Words: {results["word_metrics"]["correct_words"]}
- Word Accuracy: {results["word_metrics"]["accuracy"]:.2f}%
๐ **Line Metrics:**
- Total Lines: {results["line_metrics"]["total_lines"]}
- Correct Lines: {results["line_metrics"]["correct_lines"]}
- Line Accuracy: {results["line_metrics"]["accuracy"]:.2f}%
๐ **Language-Specific Accuracy:**
- English: {results["language_specific"]["english"]["accuracy"]:.2f}%
- Bangla: {results["language_specific"]["bangla"]["accuracy"]:.2f}%
- Mathematical: {results["language_specific"]["math"]["accuracy"]:.2f}%
๐ก **Recommendations:**
{chr(10).join(f"โข {rec}" for rec in results["evaluation_summary"]["recommendations"])}
"""

        # Detailed results
        detailed = json.dumps(results, indent=2, ensure_ascii=False)
        return summary, detailed
    except Exception as e:
        # UI boundary: unreadable files, bad encodings and missing result keys
        # are shown to the user rather than raised.
        error_msg = f"โ Error during evaluation: {str(e)}"
        return error_msg, ""
# Create Gradio Interface
def create_interface():
    """Create the main Gradio interface.

    Builds a ``gr.Blocks`` app with a styled header and three tabs: OCR
    processing (wired to ``process_pdf_ocr``), OCR evaluation (wired to
    ``evaluate_ocr_files``) and a static About page.

    Returns:
        The assembled ``gr.Blocks`` object. It is not launched here.
    """
    with gr.Blocks(
        title="๐ Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
        css="""
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.header {
text-align: center;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 2rem;
border-radius: 10px;
margin-bottom: 2rem;
}
""",
    ) as app:
        # Header (styled by the .header rule in the css above)
        gr.HTML("""
<div class="header">
<h1>๐ Advanced Multi-Language OCR System</h1>
<p>Extract text from PDFs containing English, Bangla, and Mathematical expressions</p>
<p>Powered by Tesseract, Pix2Text, and Advanced Language Detection</p>
</div>
""")
        with gr.Tabs():
            # OCR Processing Tab
            with gr.Tab("๐ OCR Processing", id="ocr"):
                gr.Markdown("""
## ๐ PDF Text Extraction
Upload a PDF file to extract text using advanced multi-language OCR technology.
**Features:**
- ๐ Multi-language support (English, Bangla, Mathematical expressions)
- ๐งฎ Advanced mathematical formula recognition with Pix2Text
- ๐ Detailed character-by-character analysis
- ๐ท๏ธ Automatic content classification
""")
                # Input row: a single PDF upload and the button that starts extraction.
                with gr.Row():
                    with gr.Column():
                        pdf_input = gr.File(
                            label="๐ Upload PDF File",
                            file_types=[".pdf"],
                            file_count="single",
                        )
                        process_btn = gr.Button(
                            "๐ Extract Text", variant="primary", size="lg"
                        )
                # Output row: extracted text in one column, JSON and analysis in the other.
                with gr.Row():
                    with gr.Column():
                        extracted_output = gr.Textbox(
                            label="๐ Extracted Text",
                            lines=15,
                            max_lines=20,
                            placeholder="Extracted text will appear here...",
                        )
                    with gr.Column():
                        json_output = gr.Textbox(
                            label="๐ Detailed JSON Results",
                            lines=8,
                            max_lines=15,
                            placeholder="JSON results will appear here...",
                        )
                        analysis_output = gr.Textbox(
                            label="๐ Analysis Report",
                            lines=7,
                            max_lines=10,
                            placeholder="Analysis report will appear here...",
                        )
                # Connect OCR processing: the three return values of
                # process_pdf_ocr fill the three textboxes in this order.
                process_btn.click(
                    fn=process_pdf_ocr,
                    inputs=[pdf_input],
                    outputs=[extracted_output, json_output, analysis_output],
                )
            # Evaluation Tab
            with gr.Tab("๐ OCR Evaluation", id="eval"):
                gr.Markdown("""
## ๐ OCR Accuracy Evaluation
Compare extracted text with ground truth baseline to measure OCR accuracy.
**Features:**
- ๐ฏ Character, word, and line-level accuracy metrics
- ๐ Language-specific accuracy analysis
- ๐ Overall grading system (A+ to F)
- ๐ก Improvement recommendations
""")
                # Input row: the OCR output and the ground-truth text, side by side.
                with gr.Row():
                    with gr.Column():
                        extracted_file = gr.File(
                            label="๐ Extracted Text File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                    with gr.Column():
                        baseline_file = gr.File(
                            label="๐ Baseline/Ground Truth File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                evaluate_btn = gr.Button(
                    "๐ Evaluate Accuracy", variant="primary", size="lg"
                )
                # Output row: human-readable summary next to the raw JSON results.
                with gr.Row():
                    with gr.Column():
                        eval_summary = gr.Textbox(
                            label="๐ Evaluation Summary",
                            lines=20,
                            max_lines=25,
                            placeholder="Evaluation summary will appear here...",
                        )
                    with gr.Column():
                        eval_detailed = gr.Textbox(
                            label="๐ Detailed Results (JSON)",
                            lines=20,
                            max_lines=25,
                            placeholder="Detailed evaluation results will appear here...",
                        )
                # Connect evaluation: evaluate_ocr_files returns (summary, detailed).
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_file, baseline_file],
                    outputs=[eval_summary, eval_detailed],
                )
            # About Tab (static Markdown only; no callbacks are attached)
            with gr.Tab("โน๏ธ About", id="about"):
                gr.Markdown("""
## ๐ Advanced Multi-Language OCR System
### ๐ Overview
This system provides state-of-the-art OCR capabilities for documents containing mixed languages and mathematical expressions.
### ๐ Key Features
#### ๐ Multi-Language OCR
- **English**: Advanced text recognition with high accuracy
- **Bangla**: Native Bengali script support with proper Unicode handling
- **Mathematical**: LaTeX and formula recognition using Pix2Text
#### ๐งฎ Advanced Math Processing
- Integration with **Pix2Text** for superior mathematical expression recognition
- LaTeX output for mathematical formulas
- Support for complex equations and symbols
#### ๐ Comprehensive Analysis
- Character-by-character classification and confidence scoring
- Language detection and content categorization
- Detailed extraction statistics and reports
#### ๐ฏ Accuracy Evaluation
- Compare extracted text with ground truth baseline
- Character, word, and line-level accuracy metrics
- Language-specific performance analysis
- Grading system with improvement recommendations
### ๐ ๏ธ Technology Stack
- **OCR Engine**: Tesseract with custom language models
- **Math Recognition**: Pix2Text for advanced mathematical expressions
- **Language Detection**: Custom algorithms for multi-language content
- **Backend**: FastAPI with async processing
- **Frontend**: Gradio for interactive web interface
### ๐ Usage Tips
#### For Best OCR Results:
1. **File Quality**: Use high-resolution PDF files (300 DPI or higher)
2. **Text Clarity**: Ensure text is clear and not blurry or distorted
3. **Language**: The system works best with properly formatted text
4. **Mathematical Content**: Complex formulas are processed using specialized Pix2Text models
#### For Accurate Evaluation:
1. **File Format**: Upload plain text files (.txt) in UTF-8 encoding
2. **Content Matching**: Ensure baseline file corresponds to the same source document
3. **Text Cleaning**: The system automatically cleans control characters
### ๐ Links
- **GitHub Repository**: [aaladin-ocr](https://github.com/ashfaqbracu/aaladin-ocr)
- **Documentation**: Available in the repository
- **Issues/Support**: Report issues on GitHub
### ๐ง Contact
For questions or support, please visit our GitHub repository or create an issue.
---
**Developed with โค๏ธ for advanced document processing and OCR accuracy.**
""")
    return app
# Initialize Pix2Text on startup
# A failure here is reported but never aborts startup. `pix2text_model` is
# always bound afterwards (None when initialization raised), so later code can
# test it instead of hitting a NameError.
print("๐ Initializing Pix2Text model...")
try:
    pix2text_model = initialize_pix2text()
except Exception as e:
    pix2text_model = None
    print(f"โ ๏ธ Pix2Text initialization error: {e}")
else:
    if pix2text_model:
        print("โ Pix2Text initialized successfully")
    else:
        # A falsy return value is treated as a soft failure.
        print("โ ๏ธ Pix2Text initialization failed - math extraction may be limited")
# Create and launch the interface
if __name__ == "__main__":
    # Build the UI only when run as a script, not when imported.
    app = create_interface()

    # Launch with proper configuration for Hugging Face Spaces
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )