Spaces:

Ash2749
/

advanced-multilang-ocr

Sleeping

File size: 17,978 Bytes

# app.py - Gradio Interface for Hugging Face Spaces
import gradio as gr
import os
import json
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Tuple

# Import our OCR functionality
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
from eval import evaluate_ocr_accuracy, clean_control_characters

def check_system_dependencies():
    """Print a startup report on the external tools the OCR pipeline relies on.

    Checks, in order:

    1. the ``tesseract`` executable, by running ``tesseract --version``;
    2. the Poppler command-line tools ``pdftoppm`` and ``pdfinfo``, by looking
       them up on ``PATH``;
    3. whether the ``pdf2image`` module can be located;

    and finally prints the current ``PATH``.

    The function is purely diagnostic: a missing dependency is reported on
    stdout, never raised, and the return value is always ``None``.
    """
    print("🔍 Checking system dependencies...")

    # Check Tesseract
    try:
        result = subprocess.run(
            ["tesseract", "--version"], capture_output=True, text=True
        )
    except FileNotFoundError:
        print("❌ Tesseract not found in PATH")
    except OSError as e:
        # The binary was found but could not be started (e.g. it is not
        # executable). This is only a diagnostic, so report instead of crashing.
        print(f"❌ Tesseract check failed: {e}")
    else:
        if result.returncode == 0:
            print("✅ Tesseract is available")
        else:
            print("❌ Tesseract check failed")

    # Check Poppler
    # shutil.which performs the PATH lookup in-process, so the result does not
    # depend on an external `which` binary being installed.
    for tool in ("pdftoppm", "pdfinfo"):
        if shutil.which(tool) is not None:
            print(f"✅ {tool} is available")
        else:
            print(f"❌ {tool} not found")

    # Check pdf2image
    try:
        import importlib.util

        # find_spec locates the module without importing it.
        if importlib.util.find_spec("pdf2image") is not None:
            print("✅ pdf2image is available")
        else:
            print("❌ pdf2image module not found")
    except Exception as e:
        print(f"❌ pdf2image check failed: {e}")

    print(f"📍 PATH: {os.environ.get('PATH', 'NOT SET')}")

# Run dependency check on startup: this executes once, at import time, so the
# report is printed before the interface is built.
check_system_dependencies()


# Initialize directories
def create_directories():
    """Ensure the storage folders exist, relative to the current directory.

    Creates ``documents``, ``extracted`` and ``temp``; folders that are
    already present are left untouched.
    """
    for folder_name in ("documents", "extracted", "temp"):
        folder = Path(folder_name)
        folder.mkdir(exist_ok=True)


# Executed at import time so the storage folders exist before any upload is handled.
create_directories()


def _format_ocr_error(error_msg: str) -> str:
    """Turn a raw exception message into the user-facing error text."""
    lowered = error_msg.lower()

    # Provide specific guidance for common errors
    if "poppler" in lowered or "unable to get page count" in lowered:
        return f"""❌ PDF Processing Error: Poppler not found
            
🔧 This error occurs because Poppler (PDF utilities) is not properly installed.

📋 For Hugging Face Spaces:
1. Ensure your setup.sh script runs during deployment
2. Check that poppler-utils is installed in the container
3. Verify the setup logs show successful poppler installation

💡 The setup.sh script should install these packages:
   - poppler-utils
   - libpoppler-cpp-dev
   - pkg-config

🚨 Original error: {error_msg}

🔄 Try restarting the space if this persists."""

    if "tesseract" in lowered:
        return f"""❌ OCR Engine Error: Tesseract issue
            
🔧 This error is related to Tesseract OCR engine.

📋 Possible solutions:
1. Check Tesseract installation in setup.sh
2. Verify language data files are available
3. Ensure proper permissions on tessdata directory

🚨 Original error: {error_msg}"""

    return f"❌ Error processing PDF: {error_msg}"


def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
    """
    Process uploaded PDF file and extract text using advanced OCR.

    The upload is copied into ``temp/`` under a timestamped name and passed to
    ``extract_all_text_advanced_pix2text``, which writes a text file and two
    JSON files into ``extracted/``. Those three files are read back and
    returned as display strings. The temporary copy is removed afterwards,
    whether or not processing succeeded; the files in ``extracted/`` are kept.

    Args:
        pdf_file: The uploaded file, given either as an object exposing the
            path in a ``.name`` attribute or as a plain path string. ``None``
            means nothing was uploaded.

    Returns:
        Tuple of (extracted_text, json_results, analysis_results). On failure
        the first element is an error message and the other two are empty
        strings; no exception is propagated to the caller.
    """
    if pdf_file is None:
        return "❌ No file uploaded", "", ""

    # Stays None until the temp path is known, so `finally` can tell whether
    # there is anything to clean up.
    pdf_path = None

    try:
        # Accept both a plain path and a file-like object carrying `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            source_path = os.fspath(pdf_file)
        else:
            source_path = pdf_file.name

        # Generate timestamp for unique naming
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Get original filename
        original_name = os.path.basename(source_path)
        base_name = os.path.splitext(original_name)[0]

        # Create unique filenames
        pdf_filename = f"{base_name}_{timestamp}.pdf"
        text_filename = f"{base_name}_{timestamp}_extract.txt"
        json_filename = f"{base_name}_{timestamp}_extract.json"
        analysis_filename = f"{base_name}_{timestamp}_analysis.json"

        # Create paths
        pdf_path = Path("temp") / pdf_filename
        text_path = Path("extracted") / text_filename
        json_path = Path("extracted") / json_filename
        analysis_path = Path("extracted") / analysis_filename

        # Copy uploaded file to our temp directory
        shutil.copy2(source_path, pdf_path)

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(pdf_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read results
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        with open(json_path, "r", encoding="utf-8") as f:
            json_results = json.load(f)

        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_results = json.load(f)

        # Format results for display
        json_display = json.dumps(json_results, indent=2, ensure_ascii=False)
        analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False)

        return extracted_text, json_display, analysis_display

    except Exception as e:
        # UI boundary: convert any failure into a readable message instead of
        # letting it propagate.
        return _format_ocr_error(str(e)), "", ""

    finally:
        # Clean up temp file on success and failure alike. Removal is
        # best-effort: the copy may never have been created.
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                pass


def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]:
    """
    Score an OCR output file against a ground-truth file.

    Both uploads are read as UTF-8, passed through
    ``clean_control_characters`` and compared with ``evaluate_ocr_accuracy``.

    Args:
        extracted_file: Upload holding the OCR output; its path is read from
            the ``.name`` attribute.
        baseline_file: Upload holding the baseline/ground-truth text; its path
            is read from the ``.name`` attribute.

    Returns:
        Tuple of (evaluation_summary, detailed_results). When an upload is
        missing, the evaluation reports an error, or anything raises, the
        first element is an error message and the second is an empty string.
    """
    uploads = (extracted_file, baseline_file)
    if any(upload is None for upload in uploads):
        return "❌ Please upload both files", ""

    try:
        # Read both files in order: OCR output first, then the baseline.
        raw_texts = []
        for upload in uploads:
            with open(upload.name, "r", encoding="utf-8") as handle:
                raw_texts.append(handle.read())
        ocr_raw, truth_raw = raw_texts

        # Clean both texts before they are compared.
        ocr_clean = clean_control_characters(ocr_raw)
        truth_clean = clean_control_characters(truth_raw)

        results = evaluate_ocr_accuracy(
            extracted_text=ocr_clean,
            baseline_text=truth_clean,
        )

        # The evaluator signals failure through an "error" key.
        if "error" in results:
            return f"❌ Evaluation error: {results['error']}", ""

        # Human-readable summary
        summary = f"""
📊 **OCR Evaluation Results**

🎯 **Overall Grade: {results["evaluation_summary"]["grade"]}**
📈 **Overall Accuracy: {results["overall_accuracy"]:.2f}%**
🔍 **Similarity Score: {results["similarity_score"]:.2f}%**

📝 **Character Metrics:**
- Total Characters: {results["character_metrics"]["total_chars"]}
- Correct Characters: {results["character_metrics"]["correct_chars"]}
- Character Accuracy: {results["character_metrics"]["accuracy"]:.2f}%

📄 **Word Metrics:**
- Total Words: {results["word_metrics"]["total_words"]}
- Correct Words: {results["word_metrics"]["correct_words"]}
- Word Accuracy: {results["word_metrics"]["accuracy"]:.2f}%

📋 **Line Metrics:**
- Total Lines: {results["line_metrics"]["total_lines"]}
- Correct Lines: {results["line_metrics"]["correct_lines"]}
- Line Accuracy: {results["line_metrics"]["accuracy"]:.2f}%

🌐 **Language-Specific Accuracy:**
- English: {results["language_specific"]["english"]["accuracy"]:.2f}%
- Bangla: {results["language_specific"]["bangla"]["accuracy"]:.2f}%
- Mathematical: {results["language_specific"]["math"]["accuracy"]:.2f}%

💡 **Recommendations:**
{chr(10).join(f"• {rec}" for rec in results["evaluation_summary"]["recommendations"])}
"""

        # Full result structure as pretty-printed JSON
        return summary, json.dumps(results, indent=2, ensure_ascii=False)

    except Exception as exc:
        # UI boundary: report the failure as text rather than raising.
        return f"❌ Error during evaluation: {exc!s}", ""


# Create Gradio Interface
def create_interface() -> gr.Blocks:
    """Build the Gradio Blocks application and return it without launching.

    The layout is a header banner followed by three tabs:

    - "OCR Processing": a PDF upload and a button wired to ``process_pdf_ocr``,
      whose three return values fill the text, JSON and analysis boxes.
    - "OCR Evaluation": two text-file uploads and a button wired to
      ``evaluate_ocr_files``, whose two return values fill the summary and
      detailed-results boxes.
    - "About": static Markdown describing the system.

    Returns:
        The constructed ``gr.Blocks`` instance; the caller is responsible for
        calling ``launch()`` on it.
    """

    with gr.Blocks(
        title="🔍 Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        .header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 2rem;
            border-radius: 10px;
            margin-bottom: 2rem;
        }
        """,
    ) as app:
        # Header (styled by the `.header` CSS rule passed to gr.Blocks above)
        gr.HTML("""
        <div class="header">
            <h1>🔍 Advanced Multi-Language OCR System</h1>
            <p>Extract text from PDFs containing English, Bangla, and Mathematical expressions</p>
            <p>Powered by Tesseract, Pix2Text, and Advanced Language Detection</p>
        </div>
        """)

        with gr.Tabs():
            # OCR Processing Tab
            with gr.Tab("📄 OCR Processing", id="ocr"):
                gr.Markdown("""
                ## 📄 PDF Text Extraction
                
                Upload a PDF file to extract text using advanced multi-language OCR technology.
                
                **Features:**
                - 🌐 Multi-language support (English, Bangla, Mathematical expressions)
                - 🧮 Advanced mathematical formula recognition with Pix2Text
                - 📊 Detailed character-by-character analysis
                - 🏷️ Automatic content classification
                """)

                # Input row: the upload widget and the trigger button
                with gr.Row():
                    with gr.Column():
                        pdf_input = gr.File(
                            label="📎 Upload PDF File",
                            file_types=[".pdf"],
                            file_count="single",
                        )

                        process_btn = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )

                # Output row: extracted text on the left, JSON and analysis on the right
                with gr.Row():
                    with gr.Column():
                        extracted_output = gr.Textbox(
                            label="📝 Extracted Text",
                            lines=15,
                            max_lines=20,
                            placeholder="Extracted text will appear here...",
                        )

                    with gr.Column():
                        json_output = gr.Textbox(
                            label="📋 Detailed JSON Results",
                            lines=8,
                            max_lines=15,
                            placeholder="JSON results will appear here...",
                        )

                        analysis_output = gr.Textbox(
                            label="📊 Analysis Report",
                            lines=7,
                            max_lines=10,
                            placeholder="Analysis report will appear here...",
                        )

                # Connect OCR processing: the three outputs are listed in the
                # same order as the tuple returned by process_pdf_ocr.
                process_btn.click(
                    fn=process_pdf_ocr,
                    inputs=[pdf_input],
                    outputs=[extracted_output, json_output, analysis_output],
                )

            # Evaluation Tab
            with gr.Tab("📊 OCR Evaluation", id="eval"):
                gr.Markdown("""
                ## 📊 OCR Accuracy Evaluation
                
                Compare extracted text with ground truth baseline to measure OCR accuracy.
                
                **Features:**
                - 🎯 Character, word, and line-level accuracy metrics
                - 🌐 Language-specific accuracy analysis
                - 📈 Overall grading system (A+ to F)
                - 💡 Improvement recommendations
                """)

                # Input row: OCR output on the left, ground truth on the right
                with gr.Row():
                    with gr.Column():
                        extracted_file = gr.File(
                            label="📄 Extracted Text File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )

                    with gr.Column():
                        baseline_file = gr.File(
                            label="📋 Baseline/Ground Truth File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )

                evaluate_btn = gr.Button(
                    "🔍 Evaluate Accuracy", variant="primary", size="lg"
                )

                # Output row: readable summary on the left, raw JSON on the right
                with gr.Row():
                    with gr.Column():
                        eval_summary = gr.Textbox(
                            label="📊 Evaluation Summary",
                            lines=20,
                            max_lines=25,
                            placeholder="Evaluation summary will appear here...",
                        )

                    with gr.Column():
                        eval_detailed = gr.Textbox(
                            label="📋 Detailed Results (JSON)",
                            lines=20,
                            max_lines=25,
                            placeholder="Detailed evaluation results will appear here...",
                        )

                # Connect evaluation: inputs and outputs follow the parameter
                # and return order of evaluate_ocr_files.
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_file, baseline_file],
                    outputs=[eval_summary, eval_detailed],
                )

            # About Tab (static content only; no event handlers)
            with gr.Tab("ℹ️ About", id="about"):
                gr.Markdown("""
                ## 🔍 Advanced Multi-Language OCR System
                
                ### 🌟 Overview
                This system provides state-of-the-art OCR capabilities for documents containing mixed languages and mathematical expressions.
                
                ### 🚀 Key Features
                
                #### 📄 Multi-Language OCR
                - **English**: Advanced text recognition with high accuracy
                - **Bangla**: Native Bengali script support with proper Unicode handling
                - **Mathematical**: LaTeX and formula recognition using Pix2Text
                
                #### 🧮 Advanced Math Processing
                - Integration with **Pix2Text** for superior mathematical expression recognition
                - LaTeX output for mathematical formulas
                - Support for complex equations and symbols
                
                #### 📊 Comprehensive Analysis
                - Character-by-character classification and confidence scoring
                - Language detection and content categorization
                - Detailed extraction statistics and reports
                
                #### 🎯 Accuracy Evaluation
                - Compare extracted text with ground truth baseline
                - Character, word, and line-level accuracy metrics
                - Language-specific performance analysis
                - Grading system with improvement recommendations
                
                ### 🛠️ Technology Stack
                - **OCR Engine**: Tesseract with custom language models
                - **Math Recognition**: Pix2Text for advanced mathematical expressions
                - **Language Detection**: Custom algorithms for multi-language content
                - **Backend**: FastAPI with async processing
                - **Frontend**: Gradio for interactive web interface
                
                ### 📝 Usage Tips
                
                #### For Best OCR Results:
                1. **File Quality**: Use high-resolution PDF files (300 DPI or higher)
                2. **Text Clarity**: Ensure text is clear and not blurry or distorted
                3. **Language**: The system works best with properly formatted text
                4. **Mathematical Content**: Complex formulas are processed using specialized Pix2Text models
                
                #### For Accurate Evaluation:
                1. **File Format**: Upload plain text files (.txt) in UTF-8 encoding
                2. **Content Matching**: Ensure baseline file corresponds to the same source document
                3. **Text Cleaning**: The system automatically cleans control characters
                
                ### 🔗 Links
                - **GitHub Repository**: [aaladin-ocr](https://github.com/ashfaqbracu/aaladin-ocr)
                - **Documentation**: Available in the repository
                - **Issues/Support**: Report issues on GitHub
                
                ### 📧 Contact
                For questions or support, please visit our GitHub repository or create an issue.
                
                ---
                
                **Developed with ❤️ for advanced document processing and OCR accuracy.**
                """)

        return app


# Warm up Pix2Text once at import time. Any failure is reported on stdout and
# start-up continues.
print("🚀 Initializing Pix2Text model...")
try:
    pix2text_model = initialize_pix2text()
    # A falsy return value is treated as a failed initialization.
    print(
        "✅ Pix2Text initialized successfully"
        if pix2text_model
        else "⚠️ Pix2Text initialization failed - math extraction may be limited"
    )
except Exception as init_error:
    print(f"⚠️ Pix2Text initialization error: {init_error}")

# Create and launch the interface
# (only when this file is executed as a script; importing it does not start the server)
if __name__ == "__main__":
    app = create_interface()

    # Launch with proper configuration for Hugging Face Spaces:
    # bind to all network interfaces (0.0.0.0) on port 7860, do not create a
    # public share link, and enable Gradio's error display.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)