Spaces:

Ash2749
/

math-ocr

Running

File size: 27,019 Bytes

c139f95

# app.py - Gradio Interface for Advanced Multi-Language OCR System
# Hugging Face Spaces compatible application

import os
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Tuple
import gradio as gr

# Set up logging first
import logging

# Configure logging exactly once, before anything below needs to log.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Try to import our OCR functionality with error handling.
# OCR_AVAILABLE records whether the real OCR/evaluation modules could be
# imported. When they cannot, same-named stand-ins are defined below so the
# rest of this module can still be loaded and the UI can start.
try:
    from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
    from eval import evaluate_ocr_accuracy, clean_control_characters

    OCR_AVAILABLE = True
    logger.info("✅ OCR modules imported successfully")
except ImportError as e:
    logger.error(f"❌ OCR modules not available: {e}")
    OCR_AVAILABLE = False

    # Create dummy functions as fallbacks
    def extract_all_text_advanced_pix2text(*args, **kwargs):
        """Fallback stub: always raises because the OCR backend is missing."""
        raise RuntimeError(
            "OCR functionality not available due to missing dependencies"
        )

    def initialize_pix2text():
        """Fallback stub: there is no model to load, so return None."""
        return None

    def evaluate_ocr_accuracy(*args, **kwargs):
        """Fallback stub: always raises because the evaluation module is missing."""
        raise RuntimeError(
            "Evaluation functionality not available due to missing dependencies"
        )

    def clean_control_characters(text):
        """Fallback stub: return the text unchanged."""
        return text


# Create necessary directories
def create_directories():
    """Make sure every file-storage folder exists, logging each one."""
    for folder in ("documents", "extracted", "temp"):
        # exist_ok=True: an already-present folder is not an error.
        Path(folder).mkdir(exist_ok=True)
        logger.info(f"✅ Created/verified directory: {folder}")


# Initialize directories
create_directories()

# Initialize Pix2Text model at startup with error handling.
# PIX2TEXT_MODEL stays None unless the loader runs and returns something.
logger.info("🚀 Initializing Pix2Text model...")
PIX2TEXT_MODEL = None
if not OCR_AVAILABLE:
    logger.warning("⚠️ OCR modules not available - running in demo mode")
else:
    try:
        PIX2TEXT_MODEL = initialize_pix2text()
    except Exception as init_error:
        logger.error(f"❌ Failed to initialize Pix2Text: {init_error}")
    else:
        if PIX2TEXT_MODEL:
            logger.info("✅ Pix2Text model loaded successfully")
        else:
            logger.warning("⚠️ Pix2Text model not available, using fallback OCR")


def get_safe_filename(filename: str) -> str:
    """Return *filename* stripped to safe characters, with a timestamp added.

    Only alphanumeric characters, hyphens and underscores of the stem are
    kept (spaces and other symbols are dropped). The current local time,
    formatted as ``YYYYMMDD_HHMMSS``, is appended before the unchanged
    extension.
    """
    stem, extension = os.path.splitext(filename)
    allowed_punctuation = {"-", "_"}
    kept_chars = [ch for ch in stem if ch in allowed_punctuation or ch.isalnum()]
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "".join(kept_chars) + "_" + stamp + extension


def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
    """Generate the output file name for one kind of extraction artifact.

    Convention: ``[pdf_filename]_extract.[extension]`` for the text and JSON
    outputs. The analysis output is also JSON, so it gets its own
    ``_extract_analysis.json`` suffix; otherwise it would resolve to the same
    path as the JSON output and one file would overwrite the other.

    Args:
        pdf_filename: Name of the source PDF; its extension is dropped.
        file_type: ``"txt"``, ``"json"`` or ``"analysis"``. Any other value
            falls back to the text-file name.

    Returns:
        The file name (no directory component is added).
    """
    base_name = os.path.splitext(pdf_filename)[0]
    suffixes = {
        "txt": "_extract.txt",
        "json": "_extract.json",
        "analysis": "_extract_analysis.json",
    }
    return f"{base_name}{suffixes.get(file_type, '_extract.txt')}"


def extract_text_from_pdf(pdf_file) -> Tuple[str, str, str, str]:
    """
    Extract text from uploaded PDF file using advanced OCR.

    Args:
        pdf_file: The upload as delivered by the UI. Either a filesystem
            path (``str`` / ``os.PathLike``, as passed by a Gradio file
            input in ``type="filepath"`` mode) or an object exposing the
            path as ``.name``. ``None`` means nothing was uploaded.

    Returns:
        - extracted_text: The full extracted text
        - summary_text: A summary of the extraction process
        - text_file_path: Path to the text file (for download)
        - json_file_path: Path to the JSON file (for download)

        On any failure the first two items carry the error message and both
        paths are empty strings; no exception propagates to the caller.
    """
    if pdf_file is None:
        return "❌ No file uploaded", "Please upload a PDF file", "", ""

    try:
        start_time = datetime.now()

        # Get the uploaded file path. A plain path string has no ``.name``
        # attribute, so handle path-like and file-like uploads separately.
        pdf_path = (
            os.fspath(pdf_file)
            if isinstance(pdf_file, (str, os.PathLike))
            else pdf_file.name
        )
        filename = os.path.basename(pdf_path)

        logger.info(f"📄 Processing uploaded file: {filename}")

        # Generate safe filename
        safe_filename = get_safe_filename(filename)

        # Copy uploaded file to documents directory
        documents_path = Path("documents") / safe_filename
        shutil.copy2(pdf_path, documents_path)

        # Generate output filenames and their full paths under "extracted"
        output_dir = Path("extracted")
        text_path = output_dir / get_extraction_filename(safe_filename, "txt")
        json_path = output_dir / get_extraction_filename(safe_filename, "json")
        analysis_path = output_dir / get_extraction_filename(
            safe_filename, "analysis"
        )

        logger.info("🔄 Starting OCR processing...")

        # Check if OCR functionality is available
        if not OCR_AVAILABLE:
            return (
                """❌ **OCR functionality not available**

This appears to be a demo environment where the OCR dependencies are not fully installed.

**Missing components:**
- OpenCV (cv2) for image processing
- Tesseract OCR for text recognition
- Pix2Text for mathematical expression extraction

**To use this system:**
1. Deploy to Hugging Face Spaces with proper dependencies
2. Or install missing packages locally:
   ```bash
   pip install opencv-python pytesseract pix2text
   apt-get install tesseract-ocr tesseract-ocr-ben poppler-utils
   ```

**Demo Features Available:**
- Interface navigation and design preview
- File upload testing (files are validated but not processed)
- System architecture demonstration
""",
                "OCR dependencies not available in this environment",
                "",
                "",
            )

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(documents_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read the extracted text
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        # Read the analysis for summary
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_data = json.load(f)

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds()

        # Look up each nested section once; missing sections read as empty.
        type_counts = analysis_data.get("type_distribution", {})
        confidence = analysis_data.get("confidence_stats", {})
        methods = analysis_data.get("extraction_methods", {})

        # Create summary
        summary = f"""
📊 **OCR Processing Complete!**

⏱️ **Processing Time:** {processing_time:.2f} seconds
📄 **Original File:** {filename}
📝 **Extracted Characters:** {len(extracted_text):,}

🔤 **Text Distribution:**
- English regions: {type_counts.get("english", 0)}
- Bangla regions: {type_counts.get("bangla", 0)}
- Math regions: {type_counts.get("math", 0)}
- Mixed regions: {type_counts.get("mixed", 0)}

📈 **Quality Metrics:**
- Total text regions: {analysis_data.get("total_regions", 0)}
- Pages processed: {analysis_data.get("total_pages", 0)}
- Average confidence: {confidence.get("avg", 0):.1f}%

🔧 **Extraction Methods:**
- Pix2Text (Math): {methods.get("pix2text", 0)} regions
- Tesseract (Text): {methods.get("tesseract", 0)} regions

✅ **Status:** Extraction completed successfully!
"""

        logger.info(f"✅ OCR processing completed in {processing_time:.2f} seconds")

        return extracted_text, summary, str(text_path), str(json_path)

    except Exception as e:
        # UI boundary: report the failure in the output boxes instead of
        # raising, but keep the traceback in the log for diagnosis.
        error_message = f"❌ **Error during OCR processing:**\n\n{str(e)}"
        logger.exception(f"OCR processing failed: {e}")
        return error_message, error_message, "", ""


def evaluate_ocr_files(
    extracted_file, baseline_file, evaluation_name: str = ""
) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with baseline.

    Args:
        extracted_file: Upload holding the OCR output text. Either a
            filesystem path (``str`` / ``os.PathLike``, as passed by a
            Gradio file input in ``type="filepath"`` mode) or an object
            exposing the path as ``.name``.
        baseline_file: Upload holding the ground-truth text, same forms.
        evaluation_name: Optional label shown in the results and the log.

    Returns:
        - results_text: Formatted evaluation results
        - summary_text: Summary of the evaluation

        On any failure both items carry an error message; no exception
        propagates to the caller.
    """
    if extracted_file is None or baseline_file is None:
        return "❌ Please upload both files for evaluation", "Missing files"

    try:
        start_time = datetime.now()

        # Read file contents. A plain path string has no ``.name``
        # attribute, so handle path-like and file-like uploads separately.
        contents = []
        for upload in (extracted_file, baseline_file):
            upload_path = (
                os.fspath(upload)
                if isinstance(upload, (str, os.PathLike))
                else upload.name
            )
            with open(upload_path, "r", encoding="utf-8") as f:
                contents.append(f.read())
        extracted_text, baseline_text = contents

        logger.info(f"📊 Starting evaluation: {evaluation_name or 'Unnamed'}")
        logger.info(f"Extracted text length: {len(extracted_text)} characters")
        logger.info(f"Baseline text length: {len(baseline_text)} characters")

        # Check if evaluation functionality is available
        if not OCR_AVAILABLE:
            return (
                """❌ **Evaluation functionality not available**

This appears to be a demo environment where the evaluation dependencies are not fully installed.

**Missing components:**
- Text processing utilities
- Evaluation algorithms
- Statistical analysis functions

**To use this system:**
1. Deploy to Hugging Face Spaces with proper dependencies
2. Or install missing packages locally

**Demo Features Available:**
- Interface navigation and design preview
- File upload testing (files are validated but not processed)
- System architecture demonstration
""",
                "Evaluation dependencies not available in this environment",
            )

        # Clean input texts
        extracted_text_clean = clean_control_characters(extracted_text)
        baseline_text_clean = clean_control_characters(baseline_text)

        # Perform evaluation
        evaluation_results = evaluate_ocr_accuracy(
            extracted_text=extracted_text_clean,
            baseline_text=baseline_text_clean,
        )

        # Check for evaluation errors
        if "error" in evaluation_results:
            return (
                f"❌ **Evaluation Error:** {evaluation_results['error']}",
                "Error occurred",
            )

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds()

        # Pull each result section out once so the templates stay readable.
        overall_accuracy = evaluation_results["overall_accuracy"]
        verdict = evaluation_results["evaluation_summary"]
        char_metrics = evaluation_results["character_metrics"]
        word_metrics = evaluation_results["word_metrics"]
        line_metrics = evaluation_results["line_metrics"]
        per_language = evaluation_results["language_specific"]
        text_stats = evaluation_results["text_statistics"]

        # The name line is left empty when no evaluation name was given.
        name_line = (
            f"📝 **Evaluation Name:** {evaluation_name}" if evaluation_name else ""
        )

        # Format results
        results_text = f"""
📊 **OCR Evaluation Results**
{name_line}

🎯 **Overall Performance**
- **Overall Accuracy:** {overall_accuracy:.2f}%
- **Similarity Score:** {evaluation_results["similarity_score"]:.2f}%
- **Grade:** {verdict["grade"]}

📝 **Character-Level Analysis**
- **Character Accuracy:** {char_metrics["character_accuracy"]:.2f}%
- **Character Error Rate:** {char_metrics["character_error_rate"]:.2f}%
- **Edit Distance:** {char_metrics["edit_distance"]}
- **Total Characters:** {char_metrics["total_characters"]:,}

📚 **Word-Level Analysis**
- **Word Accuracy:** {word_metrics["word_accuracy"]:.2f}%
- **Word Error Rate:** {word_metrics["word_error_rate"]:.2f}%
- **Correct Words:** {word_metrics["correct_words"]} / {word_metrics["total_words"]}
- **Missing Words:** {word_metrics["missing_words"]}
- **Extra Words:** {word_metrics["extra_words"]}

📄 **Line-Level Analysis**
- **Line Accuracy:** {line_metrics["line_accuracy"]:.2f}%
- **Average Line Similarity:** {line_metrics["average_line_similarity"]:.2f}%
- **Lines Matched:** {line_metrics["lines_matched"]} / {line_metrics["total_lines"]}

🌐 **Language-Specific Accuracy**
- **English:** {per_language.get("english_accuracy", "N/A")}%
- **Bangla:** {per_language.get("bangla_accuracy", "N/A")}%
- **Mathematics:** {per_language.get("math_accuracy", "N/A")}%
- **Numbers:** {per_language.get("number_accuracy", "N/A")}%

📈 **Text Statistics**
- **Extracted Length:** {text_stats["extracted_length"]:,} characters
- **Baseline Length:** {text_stats["baseline_length"]:,} characters
- **Extracted Words:** {text_stats["extracted_words"]:,}
- **Baseline Words:** {text_stats["baseline_words"]:,}

💡 **Recommendations**
"""

        # Append the numbered recommendations, one per line.
        results_text += "".join(
            f"{i}. {rec}\n" for i, rec in enumerate(verdict["recommendations"], 1)
        )

        # Create summary
        summary = f"""
🎯 **Evaluation Summary**

⏱️ **Processing Time:** {processing_time:.3f} seconds
📊 **Overall Score:** {overall_accuracy:.2f}%
🏆 **Grade:** {verdict["grade"]}
📝 **Character Accuracy:** {char_metrics["character_accuracy"]:.2f}%
📚 **Word Accuracy:** {word_metrics["word_accuracy"]:.2f}%

✅ **Evaluation completed successfully!**
"""

        logger.info(f"✅ Evaluation completed in {processing_time:.3f} seconds")
        logger.info(f"📊 Overall accuracy: {overall_accuracy:.2f}%")

        return results_text, summary

    except Exception as e:
        # UI boundary: report the failure in the output boxes instead of
        # raising, but keep the traceback in the log for diagnosis.
        error_message = f"❌ **Error during evaluation:**\n\n{str(e)}"
        logger.exception(f"Evaluation failed: {e}")
        return error_message, error_message


# Create Gradio interface
def create_gradio_interface():
    """Create and configure the Gradio interface.

    Builds a ``gr.Blocks`` app with a header, three tabs (PDF text
    extraction, OCR accuracy evaluation, about/help) and a footer, and wires
    the two action buttons to ``extract_text_from_pdf`` and
    ``evaluate_ocr_files``.

    Returns:
        The constructed ``gr.Blocks`` object; the caller is expected to
        launch it.
    """

    # Custom CSS for better styling
    # (.output-text and .summary-box are attached below via elem_classes)
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        font-size: 14px;
    }
    .summary-box {
        background-color: #f0f8ff;
        border: 1px solid #d0e7ff;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    """

    # Components are created inside nested context managers; the order of
    # creation below is the order in which they are laid out.
    with gr.Blocks(
        css=css, title="Advanced Multi-Language OCR System", theme=gr.themes.Soft()
    ) as app:
        # Header
        gr.Markdown("""
        # 🔍 Advanced Multi-Language OCR System
        
        **Powered by Pix2Text, Tesseract, and FastAPI**
        
        Extract text from PDFs containing **English**, **Bangla**, and **Mathematical expressions** with high accuracy.
        Evaluate OCR performance with comprehensive metrics and detailed analysis.
        """)

        with gr.Tabs():
            # Tab 1: OCR Extraction
            with gr.Tab("📄 PDF Text Extraction"):
                gr.Markdown("""
                ### Upload a PDF and extract text using advanced multi-language OCR
                
                **Features:**
                - 🌐 **Multi-language support**: English, Bangla (Bengali), and Mathematical expressions
                - 🧮 **Advanced Math Recognition**: Pix2Text integration for LaTeX and mathematical formulas
                - 📊 **Detailed Analysis**: Character-level classification and confidence scores
                - 💾 **Download Results**: Get extracted text and detailed JSON analysis
                """)

                # Top row: upload + button on the left, summary on the right.
                with gr.Row():
                    with gr.Column(scale=1):
                        # NOTE(review): with type="filepath" the handler is
                        # presumably given the upload as a path string rather
                        # than a file object; confirm against the installed
                        # Gradio version.
                        pdf_input = gr.File(
                            label="📄 Upload PDF File",
                            file_types=[".pdf"],
                            type="filepath",
                        )
                        extract_btn = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )

                    with gr.Column(scale=2):
                        extraction_summary = gr.Textbox(
                            label="📊 Extraction Summary",
                            lines=15,
                            elem_classes=["summary-box"],
                        )

                # Middle row: the full extracted text.
                with gr.Row():
                    extracted_text_output = gr.Textbox(
                        label="📝 Extracted Text",
                        lines=20,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Bottom row: download slots, hidden until a result exists.
                with gr.Row():
                    text_file_download = gr.File(
                        label="📥 Download Text File", visible=False
                    )
                    json_file_download = gr.File(
                        label="📥 Download JSON Analysis", visible=False
                    )

                # Connect extraction functionality
                # Step 1 (click): run the extraction; its four return values
                # fill the text box, the summary box and the two file slots.
                # Step 2 (then): re-read the two file slots and show each one
                # only when it holds a non-empty value (the handler returns
                # "" for both paths on failure).
                extract_btn.click(
                    fn=extract_text_from_pdf,
                    inputs=[pdf_input],
                    outputs=[
                        extracted_text_output,
                        extraction_summary,
                        text_file_download,
                        json_file_download,
                    ],
                ).then(
                    lambda text_path, json_path: (
                        gr.update(
                            visible=bool(text_path),
                            value=text_path if text_path else None,
                        ),
                        gr.update(
                            visible=bool(json_path),
                            value=json_path if json_path else None,
                        ),
                    ),
                    inputs=[text_file_download, json_file_download],
                    outputs=[text_file_download, json_file_download],
                )

            # Tab 2: OCR Evaluation
            with gr.Tab("📊 OCR Accuracy Evaluation"):
                gr.Markdown("""
                ### Compare OCR extracted text with ground truth baseline for accuracy analysis
                
                **Evaluation Features:**
                - 🎯 **Character-level accuracy**: Precise character matching and edit distance
                - 📚 **Word-level accuracy**: Word matching and error rates  
                - 📄 **Line-level accuracy**: Line comparison and similarity scores
                - 🌐 **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
                - 🏆 **Grading system**: Letter grades from A+ to F with recommendations
                """)

                # Top row: the two uploads, name and button on the left,
                # summary on the right.
                with gr.Row():
                    with gr.Column():
                        extracted_file_input = gr.File(
                            label="📄 OCR Extracted Text File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        baseline_file_input = gr.File(
                            label="📑 Ground Truth Baseline File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        evaluation_name_input = gr.Textbox(
                            label="📝 Evaluation Name (Optional)",
                            placeholder="e.g., Math Document Test #1",
                        )
                        evaluate_btn = gr.Button(
                            "📊 Evaluate Accuracy", variant="primary", size="lg"
                        )

                    with gr.Column():
                        evaluation_summary = gr.Textbox(
                            label="🎯 Evaluation Summary",
                            lines=10,
                            elem_classes=["summary-box"],
                        )

                # Bottom row: the detailed report.
                with gr.Row():
                    evaluation_results = gr.Textbox(
                        label="📈 Detailed Evaluation Results",
                        lines=25,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Connect evaluation functionality
                # The handler returns (results_text, summary), matching the
                # order of `outputs` below.
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[
                        extracted_file_input,
                        baseline_file_input,
                        evaluation_name_input,
                    ],
                    outputs=[evaluation_results, evaluation_summary],
                )

            # Tab 3: About & Help (static Markdown only, no event handlers)
            with gr.Tab("ℹ️ About & Help"):
                gr.Markdown("""
                ## 🔍 Advanced Multi-Language OCR System
                
                This application provides state-of-the-art Optical Character Recognition (OCR) for documents containing mixed languages and mathematical expressions.
                
                ### 🌟 Key Features
                
                #### 📄 **PDF Text Extraction**
                - **Multi-language Support**: Simultaneously process English and Bangla (Bengali) text
                - **Mathematical Recognition**: Advanced extraction of mathematical formulas and equations using Pix2Text
                - **Intelligent Classification**: Automatic detection and classification of text regions by language/content type
                - **High Accuracy**: Optimized preprocessing and multiple OCR engines for best results
                - **Detailed Analysis**: Character-by-character analysis with confidence scores and language distribution
                
                #### 📊 **OCR Accuracy Evaluation**
                - **Comprehensive Metrics**: Character, word, and line-level accuracy measurements
                - **Language-Specific Analysis**: Separate accuracy scores for different languages and mathematical content
                - **Edit Distance Calculation**: Precise measurement of text differences using Levenshtein distance
                - **Grading System**: Letter grades (A+ to F) with improvement recommendations
                - **Detailed Comparison**: Side-by-side diff analysis showing insertions, deletions, and matches
                
                ### 🛠️ **Technology Stack**
                
                - **Pix2Text**: Advanced mathematical expression recognition
                - **Tesseract OCR**: Multi-language text recognition with Bengali support
                - **OpenCV**: Image preprocessing and enhancement
                - **PDF2Image**: High-quality PDF to image conversion
                - **FastAPI**: RESTful API backend
                - **Gradio**: Interactive web interface
                
                ### 📝 **Usage Instructions**
                
                #### **For PDF Text Extraction:**
                1. Upload a PDF file using the file picker
                2. Click "🚀 Extract Text" to start processing
                3. Review the extraction summary for statistics
                4. Copy the extracted text or download the files
                5. Download the JSON file for detailed analysis data
                
                #### **For OCR Evaluation:**
                1. Upload the OCR-extracted text file (what you want to evaluate)
                2. Upload the ground truth baseline file (the correct text)
                3. Optionally provide an evaluation name for identification
                4. Click "📊 Evaluate Accuracy" to run the comparison
                5. Review the detailed metrics and recommendations
                
                ### 🎯 **Accuracy Grading System**
                
                - **A+ (95-100%)**: Excellent - Professional-grade accuracy
                - **A (90-94%)**: Very Good - High-quality results with minor errors
                - **B (80-89%)**: Good - Acceptable for most applications
                - **C (70-79%)**: Fair - May require manual review
                - **D (60-69%)**: Poor - Significant improvements needed
                - **F (<60%)**: Very Poor - Major issues requiring attention
                
                ### 📚 **Supported Languages & Content**
                
                - **English**: Full Latin alphabet with punctuation and symbols
                - **Bangla (Bengali)**: Complete Bengali Unicode range (U+0980-U+09FF)
                - **Mathematical Expressions**: 
                  - Basic arithmetic operators (+, -, ×, ÷, =)
                  - Greek letters (α, β, γ, δ, π, θ, λ, μ, Ω, etc.)
                  - Mathematical symbols (∑, ∫, √, ∞, ∂, →, ≤, ≥, etc.)
                  - Subscripts and superscripts
                  - Functions and equations
                  - LaTeX-style expressions
                
                ### 🔧 **Tips for Best Results**
                
                1. **PDF Quality**: Use high-resolution PDFs (300+ DPI) for better accuracy
                2. **Text Clarity**: Ensure text is not blurry, skewed, or low contrast
                3. **Language Consistency**: Mixed-language documents work best when languages are clearly separated
                4. **Mathematical Content**: Complex equations may require manual verification
                5. **File Size**: Larger documents may take longer to process
                
                ### 🐛 **Troubleshooting**
                
                - **Empty Results**: Check if the PDF contains selectable text or if images need OCR
                - **Low Accuracy**: Try preprocessing the PDF to improve image quality
                - **Mixed Languages**: Ensure the document has clear language boundaries
                - **Mathematical Errors**: Complex formulas may need manual correction
                
                ### 📞 **Support & Feedback**
                
                For issues, suggestions, or contributions, please visit our [GitHub repository](https://github.com/ashfaqbracu/aaladinai).
                
                ---
                
                **Made with ❤️ for advancing multilingual text recognition**
                """)

        # Footer (inside the Blocks context, below the tabs)
        gr.Markdown("""
        ---
        
        **🔗 Links:** [GitHub Repository](https://github.com/ashfaqbracu/aaladinai) | [Documentation](https://github.com/ashfaqbracu/aaladinai#readme)
        
        **⚡ Powered by:** Pix2Text • Tesseract OCR • OpenCV • FastAPI • Gradio
        """)

    return app


# Main execution
if __name__ == "__main__":
    logger.info("🚀 Starting Advanced Multi-Language OCR Gradio Interface...")

    # Launch configuration, gathered in one place
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external access for Hugging Face Spaces
        "server_port": 7860,  # Standard port for Hugging Face Spaces
        "share": False,  # Don't create gradio.live link
        "show_error": True,  # Show detailed error messages
        "max_threads": 4,  # Limit concurrent requests
    }

    # Build the interface, then start serving it
    app = create_gradio_interface()
    app.launch(**launch_options)