Ash2749 commited on
Commit
1a3e965
·
verified ·
1 Parent(s): 6389f50

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +566 -0
  2. eval.py +428 -0
  3. main6_pix2text.py +838 -0
  4. packages.txt +10 -0
  5. requirements.txt +39 -0
app.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Gradio Interface for Advanced Multi-Language OCR System
2
+ # Hugging Face Spaces compatible application
3
+
4
+ import os
5
+ import json
6
+ import shutil
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Tuple
10
+ import gradio as gr
11
+
12
+ # Import our OCR functionality
13
+ from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
14
+ from eval import evaluate_ocr_accuracy, clean_control_characters
15
+
16
+ # Set up logging
17
+ import logging
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
# Create necessary directories
def create_directories():
    """Create (or verify) the working directories used for file storage."""
    # Each request writes into these; mkdir is idempotent with exist_ok.
    for directory in ("documents", "extracted", "temp"):
        Path(directory).mkdir(exist_ok=True)
        logger.info(f"✅ Created/verified directory: {directory}")
30
+
31
+
32
# Make sure the working directories exist before any request arrives.
create_directories()

# Load the Pix2Text model once at import time so every request reuses it.
logger.info("🚀 Initializing Pix2Text model...")
PIX2TEXT_MODEL = initialize_pix2text()
if PIX2TEXT_MODEL:
    logger.info("✅ Pix2Text model loaded successfully")
else:
    logger.warning("⚠️ Pix2Text model not available, using fallback OCR")
42
+
43
+
44
def get_safe_filename(filename: str) -> str:
    """Return a sanitized filename with a timestamp suffix.

    Keeps only alphanumerics, '-' and '_' from the base name and appends a
    YYYYmmdd_HHMMSS timestamp before the original extension.

    Args:
        filename: The original (possibly unsafe) filename.

    Returns:
        A filesystem-safe name such as ``report_20240101_120000.pdf``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    name, ext = os.path.splitext(filename)
    # Remove special characters and replace spaces
    safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_")).rstrip()
    if not safe_name:
        # An all-symbol name would otherwise produce "_<timestamp><ext>".
        safe_name = "file"
    return f"{safe_name}_{timestamp}{ext}"
51
+
52
+
53
def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
    """Generate extraction filename with convention: [pdf_filename]_extract.[extension]"""
    base_name = os.path.splitext(pdf_filename)[0]
    # Both "json" and "analysis" artifacts are JSON; anything unknown is txt.
    extension = {"txt": "txt", "json": "json", "analysis": "json"}.get(file_type, "txt")
    return f"{base_name}_extract.{extension}"
58
+
59
+
60
def extract_text_from_pdf(pdf_file) -> Tuple[str, str, str, str]:
    """
    Extract text from uploaded PDF file using advanced OCR.

    Args:
        pdf_file: Gradio file input. With ``gr.File(type="filepath")`` this is
            a plain path string; older Gradio versions passed a tempfile-like
            object exposing ``.name``. Both are accepted.

    Returns:
        - extracted_text: The full extracted text
        - summary_text: A summary of the extraction process
        - text_file_path: Path to the text file (for download)
        - json_file_path: Path to the JSON file (for download)
    """
    if pdf_file is None:
        return "❌ No file uploaded", "Please upload a PDF file", "", ""

    try:
        start_time = datetime.now()

        # Get the uploaded file path. With type="filepath" the component
        # delivers a str; `pdf_file.name` would raise AttributeError on it.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        filename = os.path.basename(pdf_path)

        logger.info(f"📄 Processing uploaded file: {filename}")

        # Generate safe filename
        safe_filename = get_safe_filename(filename)

        # Copy uploaded file to documents directory
        documents_path = Path("documents") / safe_filename
        shutil.copy2(pdf_path, documents_path)

        # Generate output filenames
        text_filename = get_extraction_filename(safe_filename, "txt")
        json_filename = get_extraction_filename(safe_filename, "json")
        analysis_filename = get_extraction_filename(safe_filename, "analysis")

        # Create full paths for extracted files
        text_path = Path("extracted") / text_filename
        json_path = Path("extracted") / json_filename
        analysis_path = Path("extracted") / analysis_filename

        logger.info("🔄 Starting OCR processing...")

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(documents_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read the extracted text
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        # Read the analysis for summary
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_data = json.load(f)

        # Calculate processing time
        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        # Create summary. NOTE(review): the "{filename}" placeholder below was
        # blanked out in the scraped source; restored — confirm against repo.
        summary = f"""
📊 **OCR Processing Complete!**

⏱️ **Processing Time:** {processing_time:.2f} seconds
📄 **Original File:** {filename}
📝 **Extracted Characters:** {len(extracted_text):,}

🔤 **Text Distribution:**
- English regions: {analysis_data.get("type_distribution", {}).get("english", 0)}
- Bangla regions: {analysis_data.get("type_distribution", {}).get("bangla", 0)}
- Math regions: {analysis_data.get("type_distribution", {}).get("math", 0)}
- Mixed regions: {analysis_data.get("type_distribution", {}).get("mixed", 0)}

📈 **Quality Metrics:**
- Total text regions: {analysis_data.get("total_regions", 0)}
- Pages processed: {analysis_data.get("total_pages", 0)}
- Average confidence: {analysis_data.get("confidence_stats", {}).get("avg", 0):.1f}%

🔧 **Extraction Methods:**
- Pix2Text (Math): {analysis_data.get("extraction_methods", {}).get("pix2text", 0)} regions
- Tesseract (Text): {analysis_data.get("extraction_methods", {}).get("tesseract", 0)} regions

✅ **Status:** Extraction completed successfully!
"""

        logger.info(f"✅ OCR processing completed in {processing_time:.2f} seconds")

        return extracted_text, summary, str(text_path), str(json_path)

    except Exception as e:
        error_message = f"❌ **Error during OCR processing:**\n\n{str(e)}"
        logger.error(f"OCR processing failed: {e}")
        return error_message, error_message, "", ""
155
+
156
+
157
def evaluate_ocr_files(
    extracted_file, baseline_file, evaluation_name: str = ""
) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with baseline.

    Args:
        extracted_file: Gradio file input for the OCR output (.txt). A path
            string with ``type="filepath"`` or an object exposing ``.name``.
        baseline_file: Gradio file input for the ground-truth text (.txt).
        evaluation_name: Optional label included in the report.

    Returns:
        - results_text: Formatted evaluation results
        - summary_text: Summary of the evaluation
    """
    if extracted_file is None or baseline_file is None:
        return "❌ Please upload both files for evaluation", "Missing files"

    try:
        start_time = datetime.now()

        # gr.File(type="filepath") passes plain strings; support the older
        # tempfile-wrapper objects (with .name) as well.
        extracted_path = (
            extracted_file if isinstance(extracted_file, str) else extracted_file.name
        )
        baseline_path = (
            baseline_file if isinstance(baseline_file, str) else baseline_file.name
        )

        # Read file contents
        with open(extracted_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        with open(baseline_path, "r", encoding="utf-8") as f:
            baseline_text = f.read()

        logger.info(f"📊 Starting evaluation: {evaluation_name or 'Unnamed'}")
        logger.info(f"Extracted text length: {len(extracted_text)} characters")
        logger.info(f"Baseline text length: {len(baseline_text)} characters")

        # Clean input texts
        extracted_text_clean = clean_control_characters(extracted_text)
        baseline_text_clean = clean_control_characters(baseline_text)

        # Perform evaluation
        evaluation_results = evaluate_ocr_accuracy(
            extracted_text=extracted_text_clean,
            baseline_text=baseline_text_clean,
        )

        # Check for evaluation errors
        if "error" in evaluation_results:
            return (
                f"❌ **Evaluation Error:** {evaluation_results['error']}",
                "Error occurred",
            )

        # Calculate processing time
        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        # Format results
        results_text = f"""
📊 **OCR Evaluation Results**
{f"📝 **Evaluation Name:** {evaluation_name}" if evaluation_name else ""}

🎯 **Overall Performance**
- **Overall Accuracy:** {evaluation_results["overall_accuracy"]:.2f}%
- **Similarity Score:** {evaluation_results["similarity_score"]:.2f}%
- **Grade:** {evaluation_results["evaluation_summary"]["grade"]}

📝 **Character-Level Analysis**
- **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
- **Character Error Rate:** {evaluation_results["character_metrics"]["character_error_rate"]:.2f}%
- **Edit Distance:** {evaluation_results["character_metrics"]["edit_distance"]}
- **Total Characters:** {evaluation_results["character_metrics"]["total_characters"]:,}

📚 **Word-Level Analysis**
- **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%
- **Word Error Rate:** {evaluation_results["word_metrics"]["word_error_rate"]:.2f}%
- **Correct Words:** {evaluation_results["word_metrics"]["correct_words"]} / {evaluation_results["word_metrics"]["total_words"]}
- **Missing Words:** {evaluation_results["word_metrics"]["missing_words"]}
- **Extra Words:** {evaluation_results["word_metrics"]["extra_words"]}

📄 **Line-Level Analysis**
- **Line Accuracy:** {evaluation_results["line_metrics"]["line_accuracy"]:.2f}%
- **Average Line Similarity:** {evaluation_results["line_metrics"]["average_line_similarity"]:.2f}%
- **Lines Matched:** {evaluation_results["line_metrics"]["lines_matched"]} / {evaluation_results["line_metrics"]["total_lines"]}

🌐 **Language-Specific Accuracy**
- **English:** {evaluation_results["language_specific"].get("english_accuracy", "N/A")}%
- **Bangla:** {evaluation_results["language_specific"].get("bangla_accuracy", "N/A")}%
- **Mathematics:** {evaluation_results["language_specific"].get("math_accuracy", "N/A")}%
- **Numbers:** {evaluation_results["language_specific"].get("number_accuracy", "N/A")}%

📈 **Text Statistics**
- **Extracted Length:** {evaluation_results["text_statistics"]["extracted_length"]:,} characters
- **Baseline Length:** {evaluation_results["text_statistics"]["baseline_length"]:,} characters
- **Extracted Words:** {evaluation_results["text_statistics"]["extracted_words"]:,}
- **Baseline Words:** {evaluation_results["text_statistics"]["baseline_words"]:,}

💡 **Recommendations**
"""

        for i, rec in enumerate(
            evaluation_results["evaluation_summary"]["recommendations"], 1
        ):
            results_text += f"{i}. {rec}\n"

        # Create summary
        summary = f"""
🎯 **Evaluation Summary**

⏱️ **Processing Time:** {processing_time:.3f} seconds
📊 **Overall Score:** {evaluation_results["overall_accuracy"]:.2f}%
🏆 **Grade:** {evaluation_results["evaluation_summary"]["grade"]}
📝 **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
📚 **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%

✅ **Evaluation completed successfully!**
"""

        logger.info(f"✅ Evaluation completed in {processing_time:.3f} seconds")
        logger.info(
            f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%"
        )

        return results_text, summary

    except Exception as e:
        error_message = f"❌ **Error during evaluation:**\n\n{str(e)}"
        logger.error(f"Evaluation failed: {e}")
        return error_message, error_message
277
+
278
+
279
# Create Gradio interface
def create_gradio_interface():
    """Create and configure the Gradio interface.

    Builds a three-tab Blocks app — PDF extraction, OCR evaluation, and an
    About/Help page — and wires the buttons to extract_text_from_pdf and
    evaluate_ocr_files. Returns the Blocks object (not yet launched).
    """

    # Custom CSS for better styling
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        font-size: 14px;
    }
    .summary-box {
        background-color: #f0f8ff;
        border: 1px solid #d0e7ff;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    """

    with gr.Blocks(
        css=css, title="Advanced Multi-Language OCR System", theme=gr.themes.Soft()
    ) as app:
        # Header
        gr.Markdown("""
        # 🔍 Advanced Multi-Language OCR System

        **Powered by Pix2Text, Tesseract, and FastAPI**

        Extract text from PDFs containing **English**, **Bangla**, and **Mathematical expressions** with high accuracy.
        Evaluate OCR performance with comprehensive metrics and detailed analysis.
        """)

        with gr.Tabs():
            # Tab 1: OCR Extraction
            with gr.Tab("📄 PDF Text Extraction"):
                gr.Markdown("""
                ### Upload a PDF and extract text using advanced multi-language OCR

                **Features:**
                - 🌐 **Multi-language support**: English, Bangla (Bengali), and Mathematical expressions
                - 🧮 **Advanced Math Recognition**: Pix2Text integration for LaTeX and mathematical formulas
                - 📊 **Detailed Analysis**: Character-level classification and confidence scores
                - 💾 **Download Results**: Get extracted text and detailed JSON analysis
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 Upload PDF File",
                            file_types=[".pdf"],
                            type="filepath",
                        )
                        extract_btn = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )

                    with gr.Column(scale=2):
                        extraction_summary = gr.Textbox(
                            label="📊 Extraction Summary",
                            lines=15,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    extracted_text_output = gr.Textbox(
                        label="📝 Extracted Text",
                        lines=20,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                with gr.Row():
                    # Hidden until the .then() step below reveals them once
                    # the extraction handler has produced file paths.
                    text_file_download = gr.File(
                        label="📥 Download Text File", visible=False
                    )
                    json_file_download = gr.File(
                        label="📥 Download JSON Analysis", visible=False
                    )

                # Connect extraction functionality.
                # The chained .then() re-reads the two File components (which
                # now hold the paths returned by extract_text_from_pdf) and
                # toggles their visibility based on whether a path was set.
                extract_btn.click(
                    fn=extract_text_from_pdf,
                    inputs=[pdf_input],
                    outputs=[
                        extracted_text_output,
                        extraction_summary,
                        text_file_download,
                        json_file_download,
                    ],
                ).then(
                    lambda text_path, json_path: (
                        gr.update(
                            visible=bool(text_path),
                            value=text_path if text_path else None,
                        ),
                        gr.update(
                            visible=bool(json_path),
                            value=json_path if json_path else None,
                        ),
                    ),
                    inputs=[text_file_download, json_file_download],
                    outputs=[text_file_download, json_file_download],
                )

            # Tab 2: OCR Evaluation
            with gr.Tab("📊 OCR Accuracy Evaluation"):
                gr.Markdown("""
                ### Compare OCR extracted text with ground truth baseline for accuracy analysis

                **Evaluation Features:**
                - 🎯 **Character-level accuracy**: Precise character matching and edit distance
                - 📚 **Word-level accuracy**: Word matching and error rates
                - 📄 **Line-level accuracy**: Line comparison and similarity scores
                - 🌐 **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
                - 🏆 **Grading system**: Letter grades from A+ to F with recommendations
                """)

                with gr.Row():
                    with gr.Column():
                        extracted_file_input = gr.File(
                            label="📄 OCR Extracted Text File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        baseline_file_input = gr.File(
                            label="📑 Ground Truth Baseline File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        evaluation_name_input = gr.Textbox(
                            label="📝 Evaluation Name (Optional)",
                            placeholder="e.g., Math Document Test #1",
                        )
                        evaluate_btn = gr.Button(
                            "📊 Evaluate Accuracy", variant="primary", size="lg"
                        )

                    with gr.Column():
                        evaluation_summary = gr.Textbox(
                            label="🎯 Evaluation Summary",
                            lines=10,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    evaluation_results = gr.Textbox(
                        label="📈 Detailed Evaluation Results",
                        lines=25,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Connect evaluation functionality
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[
                        extracted_file_input,
                        baseline_file_input,
                        evaluation_name_input,
                    ],
                    outputs=[evaluation_results, evaluation_summary],
                )

            # Tab 3: About & Help (static markdown only, no event handlers)
            with gr.Tab("ℹ️ About & Help"):
                gr.Markdown("""
                ## 🔍 Advanced Multi-Language OCR System

                This application provides state-of-the-art Optical Character Recognition (OCR) for documents containing mixed languages and mathematical expressions.

                ### 🌟 Key Features

                #### 📄 **PDF Text Extraction**
                - **Multi-language Support**: Simultaneously process English and Bangla (Bengali) text
                - **Mathematical Recognition**: Advanced extraction of mathematical formulas and equations using Pix2Text
                - **Intelligent Classification**: Automatic detection and classification of text regions by language/content type
                - **High Accuracy**: Optimized preprocessing and multiple OCR engines for best results
                - **Detailed Analysis**: Character-by-character analysis with confidence scores and language distribution

                #### 📊 **OCR Accuracy Evaluation**
                - **Comprehensive Metrics**: Character, word, and line-level accuracy measurements
                - **Language-Specific Analysis**: Separate accuracy scores for different languages and mathematical content
                - **Edit Distance Calculation**: Precise measurement of text differences using Levenshtein distance
                - **Grading System**: Letter grades (A+ to F) with improvement recommendations
                - **Detailed Comparison**: Side-by-side diff analysis showing insertions, deletions, and matches

                ### 🛠️ **Technology Stack**

                - **Pix2Text**: Advanced mathematical expression recognition
                - **Tesseract OCR**: Multi-language text recognition with Bengali support
                - **OpenCV**: Image preprocessing and enhancement
                - **PDF2Image**: High-quality PDF to image conversion
                - **FastAPI**: RESTful API backend
                - **Gradio**: Interactive web interface

                ### 📝 **Usage Instructions**

                #### **For PDF Text Extraction:**
                1. Upload a PDF file using the file picker
                2. Click "🚀 Extract Text" to start processing
                3. Review the extraction summary for statistics
                4. Copy the extracted text or download the files
                5. Download the JSON file for detailed analysis data

                #### **For OCR Evaluation:**
                1. Upload the OCR-extracted text file (what you want to evaluate)
                2. Upload the ground truth baseline file (the correct text)
                3. Optionally provide an evaluation name for identification
                4. Click "📊 Evaluate Accuracy" to run the comparison
                5. Review the detailed metrics and recommendations

                ### 🎯 **Accuracy Grading System**

                - **A+ (95-100%)**: Excellent - Professional-grade accuracy
                - **A (90-94%)**: Very Good - High-quality results with minor errors
                - **B (80-89%)**: Good - Acceptable for most applications
                - **C (70-79%)**: Fair - May require manual review
                - **D (60-69%)**: Poor - Significant improvements needed
                - **F (<60%)**: Very Poor - Major issues requiring attention

                ### 📚 **Supported Languages & Content**

                - **English**: Full Latin alphabet with punctuation and symbols
                - **Bangla (Bengali)**: Complete Bengali Unicode range (U+0980-U+09FF)
                - **Mathematical Expressions**:
                  - Basic arithmetic operators (+, -, ×, ÷, =)
                  - Greek letters (α, β, γ, δ, π, θ, λ, μ, Ω, etc.)
                  - Mathematical symbols (∑, ∫, √, ∞, ∂, →, ≤, ≥, etc.)
                  - Subscripts and superscripts
                  - Functions and equations
                  - LaTeX-style expressions

                ### 🔧 **Tips for Best Results**

                1. **PDF Quality**: Use high-resolution PDFs (300+ DPI) for better accuracy
                2. **Text Clarity**: Ensure text is not blurry, skewed, or low contrast
                3. **Language Consistency**: Mixed-language documents work best when languages are clearly separated
                4. **Mathematical Content**: Complex equations may require manual verification
                5. **File Size**: Larger documents may take longer to process

                ### 🐛 **Troubleshooting**

                - **Empty Results**: Check if the PDF contains selectable text or if images need OCR
                - **Low Accuracy**: Try preprocessing the PDF to improve image quality
                - **Mixed Languages**: Ensure the document has clear language boundaries
                - **Mathematical Errors**: Complex formulas may need manual correction

                ### 📞 **Support & Feedback**

                For issues, suggestions, or contributions, please visit our [GitHub repository](https://github.com/ashfaqbracu/aaladinai).

                ---

                **Made with ❤️ for advancing multilingual text recognition**
                """)

        # Footer
        gr.Markdown("""
        ---

        **🔗 Links:** [GitHub Repository](https://github.com/ashfaqbracu/aaladinai) | [Documentation](https://github.com/ashfaqbracu/aaladinai#readme)

        **⚡ Powered by:** Pix2Text • Tesseract OCR • OpenCV • FastAPI • Gradio
        """)

    return app
548
+
549
+
550
# Main execution
if __name__ == "__main__":
    logger.info("🚀 Starting Advanced Multi-Language OCR Gradio Interface...")

    # Create and launch the interface
    app = create_gradio_interface()

    # Request queuing: the `enable_queue` and `show_tips` launch kwargs were
    # removed in Gradio 4.x and raise TypeError; queue() is the supported API.
    app.queue()

    # Launch configuration
    app.launch(
        server_name="0.0.0.0",  # Allow external access for Hugging Face Spaces
        server_port=7860,  # Standard port for Hugging Face Spaces
        share=False,  # Don't create gradio.live link
        show_error=True,  # Show detailed error messages
        max_threads=4,  # Limit concurrent requests
    )
eval.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # eval.py - OCR Evaluation Methods
2
+ # Comprehensive accuracy evaluation for OCR text extraction
3
+
4
+ import re
5
+ import difflib
6
+ from typing import Dict, List, Any
7
+ from collections import defaultdict
8
+ import unicodedata
9
+
10
+
11
def clean_control_characters(text: str) -> str:
    """
    Remove or replace control characters that can cause JSON encoding issues.
    Properly handles Bangla and other Unicode characters.

    Control characters (Unicode category C*) other than tab/newline/CR are
    replaced with spaces; other unprintable characters are also spaced out.
    Runs of whitespace are then collapsed and the result is stripped.
    """
    if not text:
        return text

    # Defensive: callers may pass raw bytes despite the annotation.
    if isinstance(text, bytes):
        try:
            text = text.decode("utf-8", errors="replace")
        except Exception:
            text = str(text)

    # Build in a list and join once — per-character `str +=` is quadratic.
    parts = []
    for char in text:
        category = unicodedata.category(char)

        # Remove control characters except for common whitespace
        if category.startswith("C") and char not in "\t\n\r":
            parts.append(" ")
        # Keep printable characters including Bangla unicode range
        elif (
            char.isprintable()
            or char in "\t\n\r"
            or "\u0980" <= char <= "\u09ff"  # Bangla
            or "\u0900" <= char <= "\u097f"  # Devanagari
            or "\u0600" <= char <= "\u06ff"  # Arabic
        ):
            parts.append(char)
        else:
            # Replace unprintable characters with space
            parts.append(" ")

    # Clean up multiple spaces and normalize
    cleaned = re.sub(r"\s+", " ", "".join(parts))
    return cleaned.strip()
51
+
52
+
53
def safe_json_serialize(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Ensure all string values in the dictionary are safe for JSON serialization.
    Handles Unicode characters properly for JSON encoding.

    Recurses through dicts and lists; strings are cleaned of control
    characters and, if still not serializable, degraded to ASCII.
    """
    if isinstance(data, dict):
        return {key: safe_json_serialize(value) for key, value in data.items()}
    if isinstance(data, list):
        return [safe_json_serialize(item) for item in data]
    if not isinstance(data, str):
        # Numbers, booleans, None, etc. pass through unchanged.
        return data

    # Strip control characters, then prove the result round-trips via JSON.
    cleaned = clean_control_characters(data)
    try:
        import json

        json.dumps(cleaned, ensure_ascii=False)
    except Exception:
        # Last resort: force ASCII with replacement characters.
        return cleaned.encode("ascii", errors="replace").decode("ascii")
    return cleaned
77
+
78
+
79
def edit_distance(s1: str, s2: str) -> int:
    """
    Calculate edit distance (Levenshtein distance) between two strings.

    Iterative two-row dynamic programming; memory is O(min(len(s1), len(s2))).
    """
    # Keep the shorter string as s2 so the DP row stays small.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if not s2:
        return len(s1)

    row = list(range(len(s2) + 1))
    for i, ch_a in enumerate(s1, start=1):
        next_row = [i]
        for j, ch_b in enumerate(s2):
            cost_insert = row[j + 1] + 1
            cost_delete = next_row[j] + 1
            cost_substitute = row[j] + (ch_a != ch_b)
            next_row.append(min(cost_insert, cost_delete, cost_substitute))
        row = next_row

    return row[-1]
100
+
101
+
102
def normalize_text(text: str) -> str:
    """
    Normalize text for better comparison by:
    - Normalizing Unicode characters (NFKD)
    - Converting to lowercase
    - Collapsing all whitespace runs to single spaces and trimming
    """
    normalized = unicodedata.normalize("NFKD", text).lower()
    return re.sub(r"\s+", " ", normalized).strip()
120
+
121
+
122
def calculate_character_accuracy(extracted: str, baseline: str) -> Dict[str, float]:
    """
    Calculate character-level accuracy metrics.

    Returns a dict with character_accuracy, character_error_rate,
    edit_distance and total_characters — the same key set on every path, so
    downstream formatters that index these keys never KeyError.
    """
    extracted_norm = normalize_text(extracted)
    baseline_norm = normalize_text(baseline)

    total_chars = len(baseline_norm)
    if total_chars == 0:
        # Empty baseline: return the full key set (the original returned only
        # two keys here, which broke callers indexing edit_distance).
        return {
            "character_accuracy": 0.0,
            "character_error_rate": 100.0,
            "edit_distance": len(extracted_norm),
            "total_characters": 0,
        }

    # Calculate edit distance (Levenshtein distance)
    edit_dist = edit_distance(extracted_norm, baseline_norm)

    # Character accuracy = (total_chars - edit_distance) / total_chars,
    # clamped at 0 since edit distance can exceed the baseline length.
    char_accuracy = max(0, (total_chars - edit_dist) / total_chars) * 100
    char_error_rate = (edit_dist / total_chars) * 100

    return {
        "character_accuracy": round(char_accuracy, 2),
        "character_error_rate": round(char_error_rate, 2),
        "edit_distance": edit_dist,
        "total_characters": total_chars,
    }
147
+
148
+
149
def _word_level_edit_distance(a: list, b: list) -> int:
    """Levenshtein distance over whole tokens (words), not characters."""
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    row = list(range(len(b) + 1))
    for i, tok_a in enumerate(a, start=1):
        nxt = [i]
        for j, tok_b in enumerate(b):
            nxt.append(min(row[j + 1] + 1, nxt[j] + 1, row[j] + (tok_a != tok_b)))
        row = nxt
    return row[-1]


def calculate_word_accuracy(extracted: str, baseline: str) -> Dict[str, float]:
    """
    Calculate word-level accuracy metrics.

    Fix: the original computed a *character* edit distance on the joined
    strings and divided it by the word count, so the reported WER was not a
    word error rate at all. WER now uses token-level Levenshtein distance.
    """
    extracted_words = normalize_text(extracted).split()
    baseline_words = normalize_text(baseline).split()

    total_words = len(baseline_words)
    if total_words == 0:
        # Full key set so downstream formatters never KeyError.
        return {
            "word_accuracy": 0.0,
            "word_error_rate": 100.0,
            "correct_words": 0,
            "total_words": 0,
            "missing_words": 0,
            "extra_words": len(set(extracted_words)),
        }

    # Word Error Rate (WER): edit distance counted in whole words.
    word_edit_dist = _word_level_edit_distance(extracted_words, baseline_words)

    # Count exact word matches (set-based: ignores duplicates and order).
    extracted_set = set(extracted_words)
    baseline_set = set(baseline_words)

    correct_words = len(extracted_set.intersection(baseline_set))
    word_accuracy = (correct_words / total_words) * 100

    word_error_rate = (word_edit_dist / total_words) * 100

    return {
        "word_accuracy": round(word_accuracy, 2),
        "word_error_rate": round(word_error_rate, 2),
        "correct_words": correct_words,
        "total_words": total_words,
        "missing_words": len(baseline_set - extracted_set),
        "extra_words": len(extracted_set - baseline_set),
    }
181
+
182
+
183
def calculate_line_accuracy(extracted: str, baseline: str) -> Dict[str, float]:
    """
    Calculate line-level accuracy metrics.

    Each baseline line is matched against its best-scoring extracted line
    (SequenceMatcher ratio); a line counts as matched above 80% similarity.
    Returns the same key set on every path so callers never KeyError.
    """
    extracted_lines = [line.strip() for line in extracted.split("\n") if line.strip()]
    baseline_lines = [line.strip() for line in baseline.split("\n") if line.strip()]

    total_lines = len(baseline_lines)
    if total_lines == 0:
        # Full key set (the original returned only two keys here).
        return {
            "line_accuracy": 0.0,
            "average_line_similarity": 0.0,
            "lines_matched": 0,
            "total_lines": 0,
        }

    # For each baseline line, find its best match among extracted lines.
    matched_lines = 0
    line_similarities = []

    for baseline_line in baseline_lines:
        best_similarity = 0
        for extracted_line in extracted_lines:
            similarity = difflib.SequenceMatcher(
                None, normalize_text(baseline_line), normalize_text(extracted_line)
            ).ratio()
            best_similarity = max(best_similarity, similarity)

        line_similarities.append(best_similarity)
        if best_similarity > 0.8:  # 80% similarity threshold
            matched_lines += 1

    line_accuracy = (matched_lines / total_lines) * 100
    avg_line_similarity = (sum(line_similarities) / len(line_similarities)) * 100

    return {
        "line_accuracy": round(line_accuracy, 2),
        "average_line_similarity": round(avg_line_similarity, 2),
        "lines_matched": matched_lines,
        "total_lines": total_lines,
    }
219
+
220
+
221
def calculate_language_specific_accuracy(
    extracted: str, baseline: str
) -> Dict[str, Any]:
    """
    Calculate accuracy for different language components (English, Bangla, Math).

    Characters are bucketed by type; each bucket's text is scored with the
    character-accuracy metric against the corresponding baseline bucket.
    """

    def classify_char(char):
        # Order matters: Bangla range first (it also contains Bangla digits).
        if "\u0980" <= char <= "\u09ff":  # Bangla unicode range
            return "bangla"
        if char.isascii() and char.isalpha():
            return "english"
        if char.isdigit():
            return "number"
        if char in "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬αβγδεζηθικλμνξοπρστυφχψω":
            return "math"
        return "other"

    def bucket_by_type(text):
        # Group characters by classification, joined back into strings.
        grouped = defaultdict(list)
        for ch in text:
            grouped[classify_char(ch)].append(ch)
        return {kind: "".join(chars) for kind, chars in grouped.items()}

    extracted_by_type = bucket_by_type(extracted)
    baseline_by_type = bucket_by_type(baseline)

    language_accuracy = {}
    for lang_type in ("english", "bangla", "math", "number"):
        extracted_text = extracted_by_type.get(lang_type, "")
        baseline_text = baseline_by_type.get(lang_type, "")

        if baseline_text:
            char_metrics = calculate_character_accuracy(extracted_text, baseline_text)
            language_accuracy[f"{lang_type}_accuracy"] = char_metrics[
                "character_accuracy"
            ]
        else:
            # No baseline content of this type: perfect if we also found none.
            language_accuracy[f"{lang_type}_accuracy"] = (
                100.0 if not extracted_text else 0.0
            )

    return language_accuracy
269
+
270
+
271
def calculate_similarity_score(extracted: str, baseline: str) -> float:
    """
    Calculate overall similarity score using sequence matcher.
    """
    matcher = difflib.SequenceMatcher(
        None, normalize_text(extracted), normalize_text(baseline)
    )
    return round(matcher.ratio() * 100, 2)
279
+
280
+
281
def generate_detailed_diff(extracted: str, baseline: str) -> List[Dict[str, str]]:
    """
    Generate a detailed diff showing insertions, deletions, and matches.
    """
    diff_lines = difflib.unified_diff(
        normalize_text(baseline).splitlines(keepends=True),
        normalize_text(extracted).splitlines(keepends=True),
        fromfile="baseline",
        tofile="extracted",
        lineterm="",
    )

    diff_result = []
    for line in diff_lines:
        # Drop unified-diff file headers and hunk markers.
        if line.startswith(("---", "+++", "@@")):
            continue
        if line.startswith("-"):
            entry = {"type": "deletion", "content": clean_control_characters(line[1:])}
        elif line.startswith("+"):
            entry = {"type": "insertion", "content": clean_control_characters(line[1:])}
        else:
            entry = {"type": "match", "content": clean_control_characters(line)}
        diff_result.append(entry)

    return diff_result
311
+
312
+
313
def evaluate_ocr_accuracy(extracted_text: str, baseline_text: str) -> Dict[str, Any]:
    """
    Comprehensive OCR accuracy evaluation.

    Args:
        extracted_text: The text extracted by OCR
        baseline_text: The ground truth text

    Returns:
        Dictionary containing various accuracy metrics, or an ``{"error": ...}``
        dict for degenerate inputs (never raises for empty texts).
    """
    # Degenerate inputs are reported as error dicts rather than raising.
    if not extracted_text and not baseline_text:
        return {"error": "Both texts are empty"}

    if not baseline_text:
        return {"error": "Baseline text is empty"}

    # Clean control characters from input texts
    extracted_text = clean_control_characters(extracted_text)
    baseline_text = clean_control_characters(baseline_text)

    # Calculate all metrics
    char_metrics = calculate_character_accuracy(extracted_text, baseline_text)
    word_metrics = calculate_word_accuracy(extracted_text, baseline_text)
    line_metrics = calculate_line_accuracy(extracted_text, baseline_text)
    lang_metrics = calculate_language_specific_accuracy(extracted_text, baseline_text)
    similarity_score = calculate_similarity_score(extracted_text, baseline_text)
    detailed_diff = generate_detailed_diff(extracted_text, baseline_text)

    # Calculate overall score (weighted average): characters weigh most,
    # then words, lines, and raw sequence similarity.
    overall_score = (
        char_metrics["character_accuracy"] * 0.4
        + word_metrics["word_accuracy"] * 0.3
        + line_metrics["line_accuracy"] * 0.2
        + similarity_score * 0.1
    )

    result = {
        "overall_accuracy": round(overall_score, 2),
        "similarity_score": similarity_score,
        "character_metrics": char_metrics,
        "word_metrics": word_metrics,
        "line_metrics": line_metrics,
        "language_specific": lang_metrics,
        "text_statistics": {
            "extracted_length": len(extracted_text),
            "baseline_length": len(baseline_text),
            "extracted_words": len(extracted_text.split()),
            "baseline_words": len(baseline_text.split()),
            "extracted_lines": len(extracted_text.split("\n")),
            "baseline_lines": len(baseline_text.split("\n")),
        },
        "detailed_diff": detailed_diff[:50],  # Limit to first 50 diff items
        "evaluation_summary": {
            "grade": get_accuracy_grade(overall_score),
            "recommendations": get_recommendations(
                char_metrics, word_metrics, lang_metrics
            ),
        },
    }

    # Clean all string values to ensure JSON safety
    return safe_json_serialize(result)
376
+
377
+
378
def get_accuracy_grade(score: float) -> str:
    """Convert accuracy score to letter grade."""
    # Highest threshold wins; anything below 60 falls through to F.
    grade_scale = (
        (95, "A+ (Excellent)"),
        (90, "A (Very Good)"),
        (80, "B (Good)"),
        (70, "C (Fair)"),
        (60, "D (Poor)"),
    )
    for threshold, grade in grade_scale:
        if score >= threshold:
            return grade
    return "F (Very Poor)"
392
+
393
+
394
def get_recommendations(
    char_metrics: Dict, word_metrics: Dict, lang_metrics: Dict
) -> List[str]:
    """Generate recommendations based on accuracy metrics."""
    # Each rule pairs a failing condition with advice. A default of 100 for
    # the language-specific scores means "no data" and raises no complaint.
    checks = [
        (
            char_metrics["character_accuracy"] < 80,
            "Consider improving image preprocessing (noise reduction, contrast adjustment)",
        ),
        (
            word_metrics["word_accuracy"] < 70,
            "Word-level accuracy is low - check language model configuration",
        ),
        (
            lang_metrics.get("bangla_accuracy", 100) < 80,
            "Bangla text accuracy is low - ensure Bengali language pack is installed",
        ),
        (
            lang_metrics.get("math_accuracy", 100) < 70,
            "Mathematical expression accuracy is low - consider tuning Pix2Text parameters",
        ),
        (
            lang_metrics.get("english_accuracy", 100) < 85,
            "English text accuracy could be improved - check OCR engine settings",
        ),
    ]

    recommendations = [message for failed, message in checks if failed]
    if not recommendations:
        recommendations.append("Excellent accuracy! No specific improvements needed.")
    return recommendations
main6_pix2text.py ADDED
@@ -0,0 +1,838 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import pytesseract
3
+ from pytesseract import Output
4
+ from pdf2image import convert_from_path
5
+ import numpy as np
6
+ import json
7
+ from tqdm import tqdm
8
+ import unicodedata
9
+ from collections import defaultdict
10
+ from PIL import Image
11
+ import logging
12
+
13
+
14
+ try:
15
+ from pix2text import Pix2Text
16
+
17
+ PIX2TEXT_AVAILABLE = True
18
+ print("Pix2Text imported successfully for advanced math extraction")
19
+ except ImportError:
20
+ PIX2TEXT_AVAILABLE = False
21
+ print("Pix2Text not available. Install with: pip install pix2text")
22
+ print(" Falling back to traditional OCR for math expressions")
23
+
24
+
25
+ logging.basicConfig(level=logging.INFO)
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ----------------------------
30
+ # STEP 1: Enhanced Character Classification
31
+ # ----------------------------
32
def classify_character(char):
    """
    Classify a single character as English, Bangla, Math, or Other.
    Enhanced for better math detection.
    """
    if not char or char.isspace():
        return "space"

    # Bangla Unicode block.
    if "\u0980" <= char <= "\u09ff":
        return "bangla"

    # Explicit mathematical symbols, operators, and Greek letters.
    math_chars = set(
        "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬"
        "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
        "±≈≠≡⇒⇔∘∗⊕⊗⊙⊥∥∦∝∞"
    )
    if char in math_chars:
        return "math"

    # Extended Unicode blocks that hold mathematical notation.
    math_ranges = (
        ("\u2200", "\u22ff"),  # Mathematical Operators
        ("\u2190", "\u21ff"),  # Arrows
        ("\u0370", "\u03ff"),  # Greek and Coptic
        ("\u2070", "\u209f"),  # Superscripts and Subscripts
        ("\u27c0", "\u27ef"),  # Miscellaneous Mathematical Symbols-A
        ("\u2980", "\u29ff"),  # Miscellaneous Mathematical Symbols-B
    )
    if any(start <= char <= end for start, end in math_ranges):
        return "math"

    # Numbers (also often mathematical).
    if char.isdigit():
        return "number"

    # English letters.
    if char.isascii() and char.isalpha():
        return "english"

    # Mathematical punctuation.
    if char in ".,;:!?()[]{}\"'-_/\\^":
        return "punctuation"

    return "other"
81
+
82
+
83
def classify_text_region(text):
    """
    Enhanced text region classification with better math detection.
    """
    if not text.strip():
        return "empty"

    # Tally character classes for the whole region.
    char_counts = defaultdict(int)
    for char in text:
        char_counts[classify_character(char)] += 1

    # Whitespace does not count toward the classification.
    significant_chars = {k: v for k, v in char_counts.items() if k not in ["space"]}
    if not significant_chars:
        return "empty"

    total_significant = sum(significant_chars.values())
    percentages = {k: v / total_significant for k, v in significant_chars.items()}

    # Digits only half-count toward "math" since they also occur in prose.
    math_indicators = percentages.get("math", 0) + percentages.get("number", 0) * 0.5

    if percentages.get("bangla", 0) > 0.5:
        return "bangla"
    if math_indicators > 0.3 or has_math_patterns(text):
        return "math"
    if percentages.get("english", 0) > 0.5:
        return "english"
    return "mixed"
115
+
116
+
117
def has_math_patterns(text):
    """
    Detect mathematical patterns in text using regex and heuristics.
    """
    import re

    # Matching any single pattern marks the text as mathematical.
    math_patterns = (
        r"\d+[\+\-\*/=]\d+",  # Simple arithmetic
        r"[xy]\^?\d+",  # Variables with powers
        r"\\[a-zA-Z]+",  # LaTeX commands
        r"\$.*?\$",  # LaTeX inline math
        r"[a-zA-Z]\([a-zA-Z,\d\s]+\)",  # Functions like f(x)
        r"\b(sin|cos|tan|log|ln|exp|sqrt|int|sum|lim)\b",  # Math functions
        r"[≤≥≠≈∫∑∂∞]",  # Math symbols
    )
    return any(
        re.search(pattern, text, re.IGNORECASE) for pattern in math_patterns
    )
139
+
140
+
141
+ # ----------------------------
142
+ # STEP 2: Initialize Pix2Text
143
+ # ----------------------------
144
def initialize_pix2text():
    """Initialize Pix2Text model for mathematical expression extraction.

    Tries three construction strategies in order (default config, bare
    constructor, CPU-forced config) and returns the first model that
    initializes, or None when Pix2Text is unavailable or all attempts fail.
    Callers treat None as "fall back to Tesseract".
    """
    # Module-level flag set by the import guard at the top of the file.
    if not PIX2TEXT_AVAILABLE:
        return None

    try:
        # Initialize Pix2Text with specific configuration for math
        # Try different initialization methods
        logger.info("Initializing Pix2Text...")

        # Method 1: Default initialization
        try:
            p2t = Pix2Text.from_config()
            logger.info("✅ Pix2Text initialized with default config")
            return p2t
        except Exception as e1:
            logger.warning(f"Default Pix2Text init failed: {e1}")

        # Method 2: Try with specific config
        try:
            p2t = Pix2Text()
            logger.info("✅ Pix2Text initialized with basic constructor")
            return p2t
        except Exception as e2:
            logger.warning(f"Basic Pix2Text init failed: {e2}")

        # Method 3: Try with minimal config
        # NOTE(review): passing a plain dict positionally to from_config may
        # not match every Pix2Text release's signature — confirm against the
        # installed version; any mismatch is swallowed by the except below.
        try:
            config = {"device": "cpu"}  # Force CPU to avoid CUDA issues
            p2t = Pix2Text.from_config(config)
            logger.info("✅ Pix2Text initialized with CPU config")
            return p2t
        except Exception as e3:
            logger.error(f"All Pix2Text initialization methods failed: {e3}")

        return None

    except Exception as e:
        logger.error(f"❌ Failed to initialize Pix2Text: {e}")
        return None
184
+
185
+
186
+ # ----------------------------
187
+ # STEP 3: Enhanced Image Preprocessing
188
+ # ----------------------------
189
def preprocess_image_advanced(pil_image):
    """Enhanced image preprocessing with multiple techniques."""
    # PIL (RGB) -> OpenCV (BGR) -> grayscale.
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # Remove noise before thresholding.
    denoised = cv2.fastNlMeansDenoising(gray, h=15)

    # Adaptive thresholding copes with uneven page illumination.
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 5
    )

    # Boost contrast, then upscale 2x for better OCR accuracy.
    enhanced = cv2.convertScaleAbs(binary, alpha=1.2, beta=10)
    rows, cols = enhanced.shape
    return cv2.resize(enhanced, (cols * 2, rows * 2), interpolation=cv2.INTER_CUBIC)
212
+
213
+
214
def preprocess_for_pix2text(pil_image, region):
    """
    Special preprocessing for Pix2Text mathematical expression extraction.

    Crops the region (with padding) out of the page image, validates every
    bound, and upscales tiny crops to a minimum size. Returns a PIL image,
    or None when any validation fails — callers fall back to traditional OCR.
    """
    # Convert PIL to numpy array
    img = np.array(pil_image)

    # Crop the specific region
    x, y, w, h = region["left"], region["top"], region["width"], region["height"]

    # Validate region dimensions
    if w <= 0 or h <= 0:
        logger.warning(f"Invalid region dimensions: w={w}, h={h}. Skipping Pix2Text.")
        return None

    # Add padding around the math region for better recognition
    padding = 10
    x_start = max(0, x - padding)
    y_start = max(0, y - padding)
    x_end = min(img.shape[1], x + w + padding)
    y_end = min(img.shape[0], y + h + padding)

    # Validate cropping bounds (a region entirely outside the image would
    # produce an inverted or empty slice).
    if x_end <= x_start or y_end <= y_start:
        logger.warning(
            f"Invalid crop bounds: x({x_start}:{x_end}), y({y_start}:{y_end}). Skipping Pix2Text."
        )
        return None

    cropped = img[y_start:y_end, x_start:x_end]

    # Check if crop resulted in empty image
    if cropped.size == 0:
        logger.warning("Cropped image is empty. Skipping Pix2Text.")
        return None

    # Convert back to PIL Image
    try:
        cropped_pil = Image.fromarray(cropped)
    except Exception as e:
        logger.error(f"Failed to create PIL image from cropped array: {e}")
        return None

    # Ensure minimum size for Pix2Text
    min_size = 32
    if cropped_pil.width <= 0 or cropped_pil.height <= 0:
        logger.warning(
            f"Invalid PIL image dimensions: {cropped_pil.width}x{cropped_pil.height}"
        )
        return None

    if cropped_pil.width < min_size or cropped_pil.height < min_size:
        # Resize maintaining aspect ratio so the smaller side reaches min_size.
        try:
            ratio = max(min_size / cropped_pil.width, min_size / cropped_pil.height)
            new_width = int(cropped_pil.width * ratio)
            new_height = int(cropped_pil.height * ratio)

            # Ensure new dimensions are valid
            if new_width <= 0 or new_height <= 0:
                logger.warning(f"Invalid resized dimensions: {new_width}x{new_height}")
                return None

            cropped_pil = cropped_pil.resize((new_width, new_height), Image.LANCZOS)
        except Exception as e:
            logger.error(f"Failed to resize image: {e}")
            return None

    return cropped_pil
283
+
284
+
285
+ # ----------------------------
286
+ # STEP 4: Text Detection and Line Segmentation
287
+ # ----------------------------
288
def detect_text_regions(image):
    """Detect text regions and classify them by line and character type."""
    data = pytesseract.image_to_data(image, output_type=Output.DICT, lang="eng+ben")

    text_regions = []
    for idx, raw_text in enumerate(data["text"]):
        text = raw_text.strip()
        # Confidence threshold lowered to 25 so faint math is not dropped.
        if not text or int(data["conf"][idx]) <= 25:
            continue

        left = int(data["left"][idx])
        top = int(data["top"][idx])
        width = int(data["width"][idx])
        height = int(data["height"][idx])

        # Skip regions with invalid dimensions.
        if width <= 0 or height <= 0:
            logger.debug(
                f"Skipping region with invalid dimensions: {width}x{height}"
            )
            continue

        # Skip regions that are too small to be meaningful.
        if width < 3 or height < 3:
            logger.debug(f"Skipping tiny region: {width}x{height}")
            continue

        text_regions.append(
            {
                "text": text,
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "confidence": int(data["conf"][idx]),
                "type": classify_text_region(text),
            }
        )

    logger.info(f"Detected {len(text_regions)} valid text regions")
    return text_regions
327
+
328
+
329
def group_regions_by_line(regions, line_tolerance=15):
    """Group text regions into lines with better tolerance for math expressions."""
    if not regions:
        return []

    ordered = sorted(regions, key=lambda r: r["top"])

    lines = []
    line_regions = [ordered[0]]
    # Anchor: the top of the first region that started the current line.
    line_top = ordered[0]["top"]

    for region in ordered[1:]:
        # Tolerance scales with the average height of the line anchor and
        # the candidate region; heights are clamped to >= 1 so a reported
        # zero height cannot break the arithmetic.
        anchor_height = max(1, line_regions[0]["height"])
        candidate_height = max(1, region["height"])
        tolerance = max(line_tolerance, (anchor_height + candidate_height) / 2 * 0.3)

        if abs(region["top"] - line_top) <= tolerance:
            line_regions.append(region)
        else:
            # Close out the current line left-to-right and start a new one.
            line_regions.sort(key=lambda r: r["left"])
            lines.append(line_regions)
            line_regions = [region]
            line_top = region["top"]

    if line_regions:
        line_regions.sort(key=lambda r: r["left"])
        lines.append(line_regions)

    return lines
361
+
362
+
363
+ # ----------------------------
364
+ # STEP 5: Advanced OCR Extractors
365
+ # ----------------------------
366
def extract_english_region(image, region):
    """Extract English text from a specific region with optimized settings."""
    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]

    roi = image[y : y + h, x : x + w]
    if roi.size == 0:
        # Nothing to re-OCR; keep the text the full-page pass produced.
        return region["text"]

    try:
        # PSM 8: treat the crop as a single word; English model only.
        rerun = pytesseract.image_to_string(
            roi, config=r"--oem 3 --psm 8 -l eng"
        ).strip()
    except Exception:
        return region["text"]
    return rerun or region["text"]
380
+
381
+
382
def extract_bangla_region(image, region):
    """Extract Bangla text from a specific region with optimized settings."""
    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]

    roi = image[y : y + h, x : x + w]
    if roi.size == 0:
        # Nothing to re-OCR; keep the text the full-page pass produced.
        return region["text"]

    try:
        # PSM 8: treat the crop as a single word; Bengali model only.
        rerun = pytesseract.image_to_string(
            roi, config=r"--oem 3 --psm 8 -l ben"
        ).strip()
    except Exception:
        return region["text"]
    return rerun or region["text"]
396
+
397
+
398
def extract_math_region_pix2text(pil_image, region, p2t_model):
    """
    Extract mathematical expressions using Pix2Text with fallback to traditional OCR.
    """
    # Without a loaded model only the Tesseract fallback is available.
    if not p2t_model:
        return extract_math_region_traditional(pil_image, region)

    try:
        math_image = preprocess_for_pix2text(pil_image, region)
        if math_image is None:
            logger.warning(
                "Pix2Text preprocessing failed, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        # Run Pix2Text and normalize its (variously shaped) response.
        extracted_text = parse_pix2text_result(p2t_model(math_image))

        if not (extracted_text and extracted_text.strip()):
            logger.warning(
                "⚠️ Pix2Text returned empty result, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        # Reject debug dumps / error text masquerading as content.
        if not is_valid_pix2text_result(extracted_text):
            logger.warning(f"Invalid Pix2Text result: {extracted_text[:100]}...")
            return extract_math_region_traditional(pil_image, region)

        logger.info(f"✅ Pix2Text extracted: {extracted_text[:50]}...")
        return extracted_text.strip()

    except Exception as e:
        logger.error(f"❌ Pix2Text extraction failed: {e}")
        return extract_math_region_traditional(pil_image, region)
439
+
440
+
441
def parse_pix2text_result(result):
    """
    Parse Pix2Text result handling various response formats.
    """
    try:
        if isinstance(result, dict):
            # Look for the usual keys carrying mathematical content first.
            for key in ["text", "formula", "latex", "content", "output"]:
                if key in result and result[key]:
                    return str(result[key])

            # No known key: stringify the whole dict, but drop anything too
            # long to be real content (likely debug info).
            result_str = str(result)
            return "" if len(result_str) > 1000 else result_str

        if isinstance(result, list):
            if not result:
                return ""
            # Keep list items that don't look like logging noise.
            valid_items = [
                str(item).strip()
                for item in result
                if str(item).strip() and not is_debug_content(str(item).strip())
            ]
            return " ".join(valid_items)

        if isinstance(result, str):
            return result
        return str(result)

    except Exception as e:
        logger.error(f"Error parsing Pix2Text result: {e}")
        return ""
482
+
483
+
484
def is_valid_pix2text_result(text):
    """
    Check if the Pix2Text result is valid mathematical content.
    """
    if not text or not text.strip():
        return False

    text = text.strip()

    # Reject obvious debug/error dumps that Pix2Text sometimes emits.
    invalid_patterns = (
        "Page(id=",
        "elements=[]",
        "number=0",
        "Error:",
        "Exception:",
        "Traceback:",
        "DEBUG:",
        "INFO:",
        "WARNING:",
        "ERROR:",
    )
    if any(marker in text for marker in invalid_patterns):
        return False

    # Must have some reasonable length for math content.
    if len(text) < 1:
        return False

    # Accept anything containing at least one mathematical or textual
    # character (letters, digits, operators, common math symbols).
    import re

    return bool(re.search(r"[a-zA-Z0-9=+\-*/(){}[\]^_√∫∑∂πθαβγδλμΩ]", text))
523
+
524
+
525
def is_debug_content(text):
    """
    Check if text appears to be debug/logging content rather than actual content.
    """
    debug_indicators = (
        "Page(",
        "id=",
        "number=",
        "elements=",
        "[])",
        "DEBUG",
        "INFO",
        "WARNING",
        "ERROR",
        "Exception",
        "Traceback",
        'File "',
        "line ",
        " at 0x",
    )
    # A single marker is enough to classify the text as debug noise.
    return any(marker in text for marker in debug_indicators)
551
+
552
+
553
def extract_math_region_traditional(pil_image, region):
    """
    Fallback traditional OCR for mathematical expressions.
    """
    # Work on a grayscale OpenCV copy of the full page image.
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]
    roi = gray[y : y + h, x : x + w]
    if roi.size == 0:
        return region["text"]

    # Whitelist restricted to characters expected in formulas.
    # NOTE(review): Tesseract whitelists are historically unreliable for
    # non-ASCII symbols — confirm the installed Tesseract honors these.
    math_chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇()[]{}.,;:^_αβγδλμθΩ±≈≠≡⇒⇔"
    config = f"--oem 3 --psm 6 -c tessedit_char_whitelist={math_chars}"

    try:
        text = pytesseract.image_to_string(roi, config=config).strip()
    except Exception:
        return region["text"]
    return text or region["text"]
576
+
577
+
578
def extract_mixed_region(pil_image, region, p2t_model):
    """Extract mixed content using multiple approaches."""
    # Grayscale copy for the Tesseract-based extractors.
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    eng_result = extract_english_region(gray, region)
    bangla_result = extract_bangla_region(gray, region)

    # When the region looks mathematical, also consult Pix2Text and keep
    # whichever non-empty candidate produced the most text.
    if has_math_patterns(region["text"]):
        math_result = extract_math_region_pix2text(pil_image, region, p2t_model)
        candidates = [r for r in (eng_result, bangla_result, math_result) if r.strip()]
        return max(candidates, key=len) if candidates else region["text"]

    # Otherwise prefer the longer of the two script-specific readings.
    return bangla_result if len(bangla_result) > len(eng_result) else eng_result
596
+
597
+
598
+ # ----------------------------
599
+ # STEP 6: Character Analysis (unchanged)
600
+ # ----------------------------
601
def analyze_character_by_character(text):
    """Analyze text character by character to identify language patterns."""
    analysis = {
        "characters": [],
        "language_segments": [],
        "total_chars": len(text),
        "language_distribution": defaultdict(int),
    }

    # Record each glyph with its position, class, and Unicode name.
    for position, char in enumerate(text):
        char_type = classify_character(char)
        analysis["characters"].append(
            {
                "char": char,
                "position": position,
                "type": char_type,
                "unicode_name": unicodedata.name(char, "UNKNOWN"),
            }
        )
        analysis["language_distribution"][char_type] += 1

    # Merge consecutive same-type characters into segments; whitespace and
    # punctuation are skipped so segments reflect uninterrupted script runs.
    current_segment = None
    for char_info in analysis["characters"]:
        if char_info["type"] in ["space", "punctuation"]:
            continue

        if current_segment is None or current_segment["type"] != char_info["type"]:
            if current_segment:
                analysis["language_segments"].append(current_segment)
            current_segment = {
                "type": char_info["type"],
                "start": char_info["position"],
                "end": char_info["position"],
                "text": char_info["char"],
            }
        else:
            current_segment["end"] = char_info["position"]
            current_segment["text"] += char_info["char"]

    if current_segment:
        analysis["language_segments"].append(current_segment)

    return analysis
645
+
646
+
647
+ # ----------------------------
648
+ # STEP 7: Main Processing Pipeline
649
+ # ----------------------------
650
def process_page_advanced(page_image, page_num, p2t_model):
    """
    Advanced page processing with Pix2Text integration.

    Args:
        page_image: PIL image of the page (full resolution, un-preprocessed).
        page_num: Zero-based page index recorded in every region result.
        p2t_model: Initialized Pix2Text model, or None for Tesseract-only.

    Returns:
        List of per-region result dicts (text, position, confidence, detected
        type, extraction method, character analysis).
    """
    print(f"Processing page {page_num + 1}...")

    # Preprocess image (denoise, threshold, 2x upscale) for region detection.
    processed_image = preprocess_image_advanced(page_image)

    # Detect text regions
    regions = detect_text_regions(processed_image)

    # Group regions by lines
    lines = group_regions_by_line(regions)

    page_results = []

    for line_num, line in enumerate(lines):
        line_text_parts = []

        for region in line:
            # Choose appropriate extractor based on region type.
            # NOTE: math/mixed extractors receive the ORIGINAL page image
            # (they do their own cropping/preprocessing); english/bangla
            # work on the already-preprocessed image.
            if region["type"] == "english":
                extracted_text = extract_english_region(processed_image, region)
            elif region["type"] == "bangla":
                extracted_text = extract_bangla_region(processed_image, region)
            elif region["type"] == "math":
                extracted_text = extract_math_region_pix2text(
                    page_image, region, p2t_model
                )
            elif region["type"] == "mixed":
                extracted_text = extract_mixed_region(page_image, region, p2t_model)
            else:
                extracted_text = region["text"]

            # Character-by-character analysis
            char_analysis = analyze_character_by_character(extracted_text)

            region_result = {
                "page": page_num,
                "line": line_num,
                "text": extracted_text,
                "original_text": region["text"],
                "position": {
                    "left": region["left"],
                    "top": region["top"],
                    "width": region["width"],
                    "height": region["height"],
                },
                "confidence": region["confidence"],
                "detected_type": region["type"],
                # NOTE(review): labeled "pix2text" whenever the type is math
                # and a model exists, even if extraction silently fell back
                # to Tesseract internally — confirm if accurate labeling is
                # needed downstream.
                "extraction_method": "pix2text"
                if region["type"] == "math" and p2t_model
                else "tesseract",
                "character_analysis": char_analysis,
            }

            page_results.append(region_result)
            line_text_parts.append(extracted_text)

        # Log line information
        if line_text_parts:
            line_text = " ".join(line_text_parts)
            print(f"  Line {line_num + 1}: {line_text[:100]}...")

    return page_results
716
+
717
+
718
def extract_all_text_advanced_pix2text(
    pdf_path, output_text_file, output_json_file, output_analysis_file
):
    """
    Advanced text extraction with Pix2Text integration.

    Converts the PDF to 300-DPI images, runs per-page region extraction, and
    writes three artifacts: a plain-text file, a per-region JSON dump, and a
    summary analysis JSON.

    Args:
        pdf_path: Path to the input PDF.
        output_text_file: Destination for the combined extracted text.
        output_json_file: Destination for the detailed per-region results.
        output_analysis_file: Destination for the summary analysis.
    """
    print("[INFO] Initializing Pix2Text for mathematical expression extraction...")
    p2t_model = initialize_pix2text()

    if p2t_model:
        print("✅ Pix2Text ready for advanced math extraction")
    else:
        print("⚠️ Using traditional OCR for math expressions")

    print("[INFO] Converting PDF to images...")
    pages = convert_from_path(pdf_path, dpi=300)

    all_results = []
    combined_text_parts = []

    for page_num, page_image in enumerate(tqdm(pages, desc="Processing pages")):
        page_results = process_page_advanced(page_image, page_num, p2t_model)
        all_results.extend(page_results)

        # Build page text
        page_text_parts = [result["text"] for result in page_results]
        page_text = " ".join(page_text_parts)
        combined_text_parts.append(page_text)

    # Combine all text (blank line between pages)
    final_text = "\n\n".join(combined_text_parts)

    # Save text file
    with open(output_text_file, "w", encoding="utf-8") as f:
        f.write(final_text)

    # Save detailed JSON results
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    # Create summary analysis
    summary_analysis = create_extraction_summary(all_results)
    with open(output_analysis_file, "w", encoding="utf-8") as f:
        json.dump(summary_analysis, f, ensure_ascii=False, indent=2)

    print("\n[✅] Advanced Pix2Text extraction complete!")
    print(f"→ Text file saved to: {output_text_file}")
    print(f"→ Detailed JSON saved to: {output_json_file}")
    print(f"→ Analysis report saved to: {output_analysis_file}")

    # Print summary (type_distribution is a defaultdict, so missing types
    # simply read as 0 here).
    print("\n📊 Extraction Summary:")
    print(f"  Total text regions: {len(all_results)}")
    print(f"  English regions: {summary_analysis['type_distribution']['english']}")
    print(f"  Bangla regions: {summary_analysis['type_distribution']['bangla']}")
    print(f"  Math regions: {summary_analysis['type_distribution']['math']}")
    print(f"  Mixed regions: {summary_analysis['type_distribution']['mixed']}")

    # Show extraction method statistics
    method_stats = defaultdict(int)
    for result in all_results:
        method_stats[result.get("extraction_method", "unknown")] += 1

    print("\n🔧 Extraction Methods Used:")
    for method, count in method_stats.items():
        print(f"  {method}: {count} regions")
785
+
786
def create_extraction_summary(results):
    """Create a comprehensive summary of the extraction results."""
    summary = {
        "total_regions": len(results),
        "total_pages": len({r["page"] for r in results}),
        "type_distribution": defaultdict(int),
        "character_distribution": defaultdict(int),
        "confidence_stats": {"min": 100, "max": 0, "avg": 0},
        "language_segments_summary": defaultdict(int),
        "extraction_methods": defaultdict(int),
    }

    stats = summary["confidence_stats"]
    confidence_total = 0

    for result in results:
        summary["type_distribution"][result["detected_type"]] += 1
        summary["extraction_methods"][result.get("extraction_method", "unknown")] += 1

        conf = result["confidence"]
        confidence_total += conf
        stats["min"] = min(stats["min"], conf)
        stats["max"] = max(stats["max"], conf)

        # Fold per-region character stats into the global distribution.
        analysis = result["character_analysis"]
        for char_type, count in analysis["language_distribution"].items():
            summary["character_distribution"][char_type] += count
        for segment in analysis["language_segments"]:
            summary["language_segments_summary"][segment["type"]] += 1

    if results:
        stats["avg"] = confidence_total / len(results)

    return summary
825
+
826
+
827
+ # ----------------------
828
+ # MAIN EXECUTION SECTION
829
+ # ----------------------
830
if __name__ == "__main__":
    # Default input/output locations for a standalone command-line run;
    # the PDF path is relative to the working directory.
    pdf_path = r"math102.pdf"
    output_text_file = "math102_pix2text.txt"
    output_json_file = "math102_pix2text.json"
    output_analysis_file = "math102_pix2text_analysis.json"

    extract_all_text_advanced_pix2text(
        pdf_path, output_text_file, output_json_file, output_analysis_file
    )
packages.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ tesseract-ocr
2
+ tesseract-ocr-ben
3
+ tesseract-ocr-eng
4
+ poppler-utils
5
+ libgl1-mesa-glx
6
+ libglib2.0-0
7
+ libsm6
8
+ libxext6
9
+ libfontconfig1
10
+ libxrender1
requirements.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Advanced Multi-Language OCR System
2
+ # Compatible with Hugging Face Spaces
3
+
4
+ # Gradio Web Interface for HuggingFace Spaces
5
+ gradio>=4.0.0
6
+
7
+ # FastAPI Web Service Dependencies (for backend compatibility)
8
+ fastapi>=0.104.0
9
+ uvicorn[standard]>=0.23.0
10
+ python-multipart>=0.0.6
11
+
12
+ # Core OCR Dependencies
13
+ opencv-python>=4.8.0
14
+ pytesseract>=0.3.10
15
+ pdf2image>=1.16.0
16
+ pillow>=9.0.0
17
+ numpy>=1.24.0
18
+ tqdm>=4.65.0
19
+
20
+ # Pix2Text for advanced mathematical expression extraction
21
+ pix2text>=1.0.0
22
+
23
+ # AI/ML Dependencies for Math Extraction
24
+ torch>=2.0.0
25
+ torchvision>=0.15.0
26
+ transformers>=4.20.0
27
+
28
+ # Additional utilities
29
+ unicodedata2>=15.0.0
30
+
31
+ # System dependencies that may be needed for Hugging Face Spaces
32
+ # These are usually pre-installed in HF Spaces but listed for completeness
33
+ # poppler-utils # For pdf2image (system package)
34
+ # tesseract-ocr # Tesseract binary (system package)
35
+ # tesseract-ocr-ben # Bengali language pack (system package)
36
+
37
# Optional: audio support for PyTorch
# torchaudio  # Uncomment if audio features are needed (package name is "torchaudio", not "torch-audio")
# Note: Install a CUDA-compatible PyTorch build for GPU acceleration