# math-ocr / app.py
# Ash2749's picture
# Update app.py
# c4d61af verified
# app.py - Gradio Interface for Advanced Multi-Language OCR System
# Hugging Face Spaces compatible application
import os
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Tuple
import gradio as gr
# Set up logging first so that import problems below are reported.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Try to import the OCR and evaluation functionality. When any of it is
# missing, OCR_AVAILABLE is set to False and same-named stand-ins are defined
# so the rest of the module can still be imported and the interface can start.
try:
    from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
    from eval import evaluate_ocr_accuracy, clean_control_characters

    OCR_AVAILABLE = True
    logger.info("✅ OCR modules imported successfully")
except ImportError as e:
    logger.error(f"❌ OCR modules not available: {e}")
    OCR_AVAILABLE = False

    # Dummy fallbacks: the processing entry points fail loudly, while the
    # harmless helpers degrade to no-ops.
    def extract_all_text_advanced_pix2text(*args, **kwargs):
        """Fallback that always raises because the OCR backend is missing."""
        raise RuntimeError(
            "OCR functionality not available due to missing dependencies"
        )

    def initialize_pix2text():
        """Fallback that reports "no model" by returning None."""
        return None

    def evaluate_ocr_accuracy(*args, **kwargs):
        """Fallback that always raises because the evaluation backend is missing."""
        raise RuntimeError(
            "Evaluation functionality not available due to missing dependencies"
        )

    def clean_control_characters(text):
        """Fallback that returns *text* unchanged."""
        return text
# Working folders used by the application
def create_directories():
    """Make sure the ``documents``, ``extracted`` and ``temp`` folders exist.

    The folders are created relative to the current working directory;
    already existing folders are left untouched. One log line is written
    per folder.
    """
    for folder_name in ("documents", "extracted", "temp"):
        folder = Path(folder_name)
        folder.mkdir(exist_ok=True)
        logger.info(f"✅ Created/verified directory: {folder_name}")
# Prepare the storage folders at import time
create_directories()

# Load the Pix2Text model once at startup. PIX2TEXT_MODEL ends up as None
# whenever the OCR modules are missing or initialization raises.
logger.info("🚀 Initializing Pix2Text model...")
if not OCR_AVAILABLE:
    logger.warning("⚠️ OCR modules not available - running in demo mode")
    PIX2TEXT_MODEL = None
else:
    try:
        PIX2TEXT_MODEL = initialize_pix2text()
        # A falsy result means the initializer produced no usable model.
        if not PIX2TEXT_MODEL:
            logger.warning("⚠️ Pix2Text model not available, using fallback OCR")
        else:
            logger.info("✅ Pix2Text model loaded successfully")
    except Exception as e:
        logger.error(f"❌ Failed to initialize Pix2Text: {e}")
        PIX2TEXT_MODEL = None
def get_safe_filename(filename: str) -> str:
    """Return *filename* with a cleaned stem and a timestamp appended.

    Every character of the stem that is not alphanumeric, ``-`` or ``_`` is
    dropped (spaces included). The current local time, formatted as
    ``YYYYMMDD_HHMMSS``, is joined to the stem with an underscore, and the
    original extension is kept as-is.
    """
    stem, extension = os.path.splitext(filename)
    allowed_punctuation = {"-", "_"}
    cleaned_stem = "".join(
        filter(lambda ch: ch.isalnum() or ch in allowed_punctuation, stem)
    )
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "_".join((cleaned_stem, stamp)) + extension
def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
    """Build the output filename for one extraction artifact of *pdf_filename*.

    The extension of *pdf_filename* is removed and a suffix is appended:

    - ``"txt"``      -> ``[pdf_filename]_extract.txt``
    - ``"json"``     -> ``[pdf_filename]_extract.json``
    - ``"analysis"`` -> ``[pdf_filename]_extract_analysis.json``

    The analysis artifact is JSON as well, so it needs its own suffix:
    sharing ``_extract.json`` with the ``"json"`` artifact would make two
    different outputs resolve to the same file and overwrite each other.
    Any other *file_type* falls back to the ``.txt`` name.
    """
    base_name = os.path.splitext(pdf_filename)[0]
    suffixes = {
        "txt": "_extract.txt",
        "json": "_extract.json",
        "analysis": "_extract_analysis.json",
    }
    return f"{base_name}{suffixes.get(file_type, '_extract.txt')}"
# Returned in place of OCR output when the OCR modules could not be imported.
_OCR_UNAVAILABLE_MESSAGE = """❌ **OCR functionality not available**
This appears to be a demo environment where the OCR dependencies are not fully installed.
**Missing components:**
- OpenCV (cv2) for image processing
- Tesseract OCR for text recognition
- Pix2Text for mathematical expression extraction
**To use this system:**
1. Deploy to Hugging Face Spaces with proper dependencies
2. Or install missing packages locally:
```bash
pip install opencv-python pytesseract pix2text
apt-get install tesseract-ocr tesseract-ocr-ben poppler-utils
```
**Demo Features Available:**
- Interface navigation and design preview
- File upload testing (files are validated but not processed)
- System architecture demonstration
"""


def _format_extraction_summary(
    filename: str, processing_time: float, extracted_text: str, analysis_data: dict
) -> str:
    """Render the extraction summary; missing analysis entries are shown as 0."""
    type_counts = analysis_data.get("type_distribution", {})
    method_counts = analysis_data.get("extraction_methods", {})
    average_confidence = analysis_data.get("confidence_stats", {}).get("avg", 0)
    return f"""
📊 **OCR Processing Complete!**
⏱️ **Processing Time:** {processing_time:.2f} seconds
📄 **Original File:** {filename}
📝 **Extracted Characters:** {len(extracted_text):,}
🔤 **Text Distribution:**
- English regions: {type_counts.get("english", 0)}
- Bangla regions: {type_counts.get("bangla", 0)}
- Math regions: {type_counts.get("math", 0)}
- Mixed regions: {type_counts.get("mixed", 0)}
📈 **Quality Metrics:**
- Total text regions: {analysis_data.get("total_regions", 0)}
- Pages processed: {analysis_data.get("total_pages", 0)}
- Average confidence: {average_confidence:.1f}%
🔧 **Extraction Methods:**
- Pix2Text (Math): {method_counts.get("pix2text", 0)} regions
- Tesseract (Text): {method_counts.get("tesseract", 0)} regions
✅ **Status:** Extraction completed successfully!
"""


def extract_text_from_pdf(pdf_file) -> Tuple[str, str, str, str]:
    """
    Extract text from an uploaded PDF file using the advanced OCR pipeline.

    Args:
        pdf_file: The upload, given either as a plain path string or as an
            object exposing the path through a ``name`` attribute. ``None``
            means nothing was uploaded.

    Returns:
        - extracted_text: The full extracted text (or an error/demo message)
        - summary_text: A summary of the extraction process
        - text_file_path: Path to the text file (for download), "" on failure
        - json_file_path: Path to the JSON file (for download), "" on failure

    Failures are never raised to the caller: any exception is logged with its
    traceback and reported through the first two return values.
    """
    if pdf_file is None:
        return "❌ No file uploaded", "Please upload a PDF file", "", ""

    try:
        start_time = datetime.now()

        # Accept both a plain path and a file-like object carrying the path
        # in ``.name``; the latter was the only form handled originally.
        pdf_path = str(getattr(pdf_file, "name", pdf_file))
        filename = os.path.basename(pdf_path)
        logger.info(f"📄 Processing uploaded file: {filename}")

        # Bail out before touching the disk: without the OCR modules nothing
        # can be processed, so there is no point in keeping a copy of the upload.
        if not OCR_AVAILABLE:
            return (
                _OCR_UNAVAILABLE_MESSAGE,
                "OCR dependencies not available in this environment",
                "",
                "",
            )

        # Keep a timestamped copy of the upload in the documents directory
        safe_filename = get_safe_filename(filename)
        documents_path = Path("documents") / safe_filename
        shutil.copy2(pdf_path, documents_path)

        # Output files live in the extracted directory
        output_dir = Path("extracted")
        text_path = output_dir / get_extraction_filename(safe_filename, "txt")
        json_path = output_dir / get_extraction_filename(safe_filename, "json")
        analysis_path = output_dir / get_extraction_filename(safe_filename, "analysis")

        logger.info("🔄 Starting OCR processing...")
        extract_all_text_advanced_pix2text(
            pdf_path=str(documents_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # The OCR call is expected to have written these files
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_data = json.load(f)

        processing_time = (datetime.now() - start_time).total_seconds()
        summary = _format_extraction_summary(
            filename, processing_time, extracted_text, analysis_data
        )

        logger.info(f"✅ OCR processing completed in {processing_time:.2f} seconds")
        return extracted_text, summary, str(text_path), str(json_path)
    except Exception as e:
        error_message = f"❌ **Error during OCR processing:**\n\n{str(e)}"
        # logger.exception keeps the traceback, which logger.error dropped
        logger.exception(f"OCR processing failed: {e}")
        return error_message, error_message, "", ""
# Returned in place of results when the evaluation modules could not be imported.
_EVALUATION_UNAVAILABLE_MESSAGE = """❌ **Evaluation functionality not available**
This appears to be a demo environment where the evaluation dependencies are not fully installed.
**Missing components:**
- Text processing utilities
- Evaluation algorithms
- Statistical analysis functions
**To use this system:**
1. Deploy to Hugging Face Spaces with proper dependencies
2. Or install missing packages locally
**Demo Features Available:**
- Interface navigation and design preview
- File upload testing (files are validated but not processed)
- System architecture demonstration
"""


def _format_language_accuracy(language_metrics: dict, key: str) -> str:
    """Return the metric followed by ``%``, or plain ``N/A`` when it is absent."""
    value = language_metrics.get(key, "N/A")
    # Appending "%" to the placeholder would render the meaningless "N/A%".
    if value is None or value == "N/A":
        return "N/A"
    return f"{value}%"


def _format_evaluation_report(evaluation_results: dict, evaluation_name: str) -> str:
    """Render the detailed evaluation report, ending with numbered recommendations."""
    verdict = evaluation_results["evaluation_summary"]
    chars = evaluation_results["character_metrics"]
    words = evaluation_results["word_metrics"]
    lines = evaluation_results["line_metrics"]
    languages = evaluation_results["language_specific"]
    stats = evaluation_results["text_statistics"]
    name_line = f"📝 **Evaluation Name:** {evaluation_name}" if evaluation_name else ""

    report = f"""
📊 **OCR Evaluation Results**
{name_line}
🎯 **Overall Performance**
- **Overall Accuracy:** {evaluation_results["overall_accuracy"]:.2f}%
- **Similarity Score:** {evaluation_results["similarity_score"]:.2f}%
- **Grade:** {verdict["grade"]}
📝 **Character-Level Analysis**
- **Character Accuracy:** {chars["character_accuracy"]:.2f}%
- **Character Error Rate:** {chars["character_error_rate"]:.2f}%
- **Edit Distance:** {chars["edit_distance"]}
- **Total Characters:** {chars["total_characters"]:,}
📚 **Word-Level Analysis**
- **Word Accuracy:** {words["word_accuracy"]:.2f}%
- **Word Error Rate:** {words["word_error_rate"]:.2f}%
- **Correct Words:** {words["correct_words"]} / {words["total_words"]}
- **Missing Words:** {words["missing_words"]}
- **Extra Words:** {words["extra_words"]}
📄 **Line-Level Analysis**
- **Line Accuracy:** {lines["line_accuracy"]:.2f}%
- **Average Line Similarity:** {lines["average_line_similarity"]:.2f}%
- **Lines Matched:** {lines["lines_matched"]} / {lines["total_lines"]}
🌐 **Language-Specific Accuracy**
- **English:** {_format_language_accuracy(languages, "english_accuracy")}
- **Bangla:** {_format_language_accuracy(languages, "bangla_accuracy")}
- **Mathematics:** {_format_language_accuracy(languages, "math_accuracy")}
- **Numbers:** {_format_language_accuracy(languages, "number_accuracy")}
📈 **Text Statistics**
- **Extracted Length:** {stats["extracted_length"]:,} characters
- **Baseline Length:** {stats["baseline_length"]:,} characters
- **Extracted Words:** {stats["extracted_words"]:,}
- **Baseline Words:** {stats["baseline_words"]:,}
💡 **Recommendations**
"""
    numbered = (
        f"{i}. {rec}\n" for i, rec in enumerate(verdict["recommendations"], 1)
    )
    return report + "".join(numbered)


def _format_evaluation_summary(evaluation_results: dict, processing_time: float) -> str:
    """Render the short evaluation summary (timing, overall score, grade)."""
    return f"""
🎯 **Evaluation Summary**
⏱️ **Processing Time:** {processing_time:.3f} seconds
📊 **Overall Score:** {evaluation_results["overall_accuracy"]:.2f}%
🏆 **Grade:** {evaluation_results["evaluation_summary"]["grade"]}
📝 **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
📚 **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%
✅ **Evaluation completed successfully!**
"""


def evaluate_ocr_files(
    extracted_file, baseline_file, evaluation_name: str = ""
) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with a baseline.

    Args:
        extracted_file: The OCR output to evaluate, given either as a plain
            path string or as an object exposing the path via ``name``.
        baseline_file: The ground-truth text, in the same form.
        evaluation_name: Optional label included in the report and the log.

    Returns:
        - results_text: Formatted evaluation results
        - summary_text: Summary of the evaluation

    Both files are read as UTF-8. Failures are never raised to the caller:
    any exception is logged with its traceback and returned as the message
    in both positions.
    """
    if extracted_file is None or baseline_file is None:
        return "❌ Please upload both files for evaluation", "Missing files"

    try:
        start_time = datetime.now()

        # Accept both a plain path and a file-like object carrying the path
        # in ``.name``; the latter was the only form handled originally.
        extracted_path = str(getattr(extracted_file, "name", extracted_file))
        baseline_path = str(getattr(baseline_file, "name", baseline_file))

        with open(extracted_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(baseline_path, "r", encoding="utf-8") as f:
            baseline_text = f.read()

        logger.info(f"📊 Starting evaluation: {evaluation_name or 'Unnamed'}")
        logger.info(f"Extracted text length: {len(extracted_text)} characters")
        logger.info(f"Baseline text length: {len(baseline_text)} characters")

        # Without the evaluation modules only the upload itself can be checked
        if not OCR_AVAILABLE:
            return (
                _EVALUATION_UNAVAILABLE_MESSAGE,
                "Evaluation dependencies not available in this environment",
            )

        evaluation_results = evaluate_ocr_accuracy(
            extracted_text=clean_control_characters(extracted_text),
            baseline_text=clean_control_characters(baseline_text),
        )

        # The evaluator reports its own failures through an "error" entry
        if "error" in evaluation_results:
            return (
                f"❌ **Evaluation Error:** {evaluation_results['error']}",
                "Error occurred",
            )

        processing_time = (datetime.now() - start_time).total_seconds()
        results_text = _format_evaluation_report(evaluation_results, evaluation_name)
        summary = _format_evaluation_summary(evaluation_results, processing_time)

        logger.info(f"✅ Evaluation completed in {processing_time:.3f} seconds")
        logger.info(
            f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%"
        )
        return results_text, summary
    except Exception as e:
        error_message = f"❌ **Error during evaluation:**\n\n{str(e)}"
        # logger.exception keeps the traceback, which logger.error dropped
        logger.exception(f"Evaluation failed: {e}")
        return error_message, error_message
# Gradio user interface
def create_gradio_interface():
    """Build the three-tab Gradio Blocks application and return it.

    The tabs are: PDF text extraction (wired to ``extract_text_from_pdf``),
    OCR accuracy evaluation (wired to ``evaluate_ocr_files``) and a static
    About & Help page. The returned Blocks object is not launched here.
    """
    # --- Static texts, kept apart from the layout code below ---------------
    custom_css = """
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.output-text {
font-family: 'Courier New', monospace;
font-size: 14px;
}
.summary-box {
background-color: #f0f8ff;
border: 1px solid #d0e7ff;
border-radius: 8px;
padding: 16px;
margin: 8px 0;
}
"""

    header_markdown = """
# 🔍 Advanced Multi-Language OCR System
**Powered by Pix2Text, Tesseract, and FastAPI**
Extract text from PDFs containing **English**, **Bangla**, and **Mathematical expressions** with high accuracy.
Evaluate OCR performance with comprehensive metrics and detailed analysis.
"""

    extraction_intro_markdown = """
### Upload a PDF and extract text using advanced multi-language OCR
**Features:**
- 🌐 **Multi-language support**: English, Bangla (Bengali), and Mathematical expressions
- 🧮 **Advanced Math Recognition**: Pix2Text integration for LaTeX and mathematical formulas
- 📊 **Detailed Analysis**: Character-level classification and confidence scores
- 💾 **Download Results**: Get extracted text and detailed JSON analysis
"""

    evaluation_intro_markdown = """
### Compare OCR extracted text with ground truth baseline for accuracy analysis
**Evaluation Features:**
- 🎯 **Character-level accuracy**: Precise character matching and edit distance
- 📚 **Word-level accuracy**: Word matching and error rates
- 📄 **Line-level accuracy**: Line comparison and similarity scores
- 🌐 **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
- 🏆 **Grading system**: Letter grades from A+ to F with recommendations
"""

    about_markdown = """
## 🔍 Advanced Multi-Language OCR System
This application provides state-of-the-art Optical Character Recognition (OCR) for documents containing mixed languages and mathematical expressions.
### 🌟 Key Features
#### 📄 **PDF Text Extraction**
- **Multi-language Support**: Simultaneously process English and Bangla (Bengali) text
- **Mathematical Recognition**: Advanced extraction of mathematical formulas and equations using Pix2Text
- **Intelligent Classification**: Automatic detection and classification of text regions by language/content type
- **High Accuracy**: Optimized preprocessing and multiple OCR engines for best results
- **Detailed Analysis**: Character-by-character analysis with confidence scores and language distribution
#### 📊 **OCR Accuracy Evaluation**
- **Comprehensive Metrics**: Character, word, and line-level accuracy measurements
- **Language-Specific Analysis**: Separate accuracy scores for different languages and mathematical content
- **Edit Distance Calculation**: Precise measurement of text differences using Levenshtein distance
- **Grading System**: Letter grades (A+ to F) with improvement recommendations
- **Detailed Comparison**: Side-by-side diff analysis showing insertions, deletions, and matches
### 🛠️ **Technology Stack**
- **Pix2Text**: Advanced mathematical expression recognition
- **Tesseract OCR**: Multi-language text recognition with Bengali support
- **OpenCV**: Image preprocessing and enhancement
- **PDF2Image**: High-quality PDF to image conversion
- **FastAPI**: RESTful API backend
- **Gradio**: Interactive web interface
### 📝 **Usage Instructions**
#### **For PDF Text Extraction:**
1. Upload a PDF file using the file picker
2. Click "🚀 Extract Text" to start processing
3. Review the extraction summary for statistics
4. Copy the extracted text or download the files
5. Download the JSON file for detailed analysis data
#### **For OCR Evaluation:**
1. Upload the OCR-extracted text file (what you want to evaluate)
2. Upload the ground truth baseline file (the correct text)
3. Optionally provide an evaluation name for identification
4. Click "📊 Evaluate Accuracy" to run the comparison
5. Review the detailed metrics and recommendations
### 🎯 **Accuracy Grading System**
- **A+ (95-100%)**: Excellent - Professional-grade accuracy
- **A (90-94%)**: Very Good - High-quality results with minor errors
- **B (80-89%)**: Good - Acceptable for most applications
- **C (70-79%)**: Fair - May require manual review
- **D (60-69%)**: Poor - Significant improvements needed
- **F (<60%)**: Very Poor - Major issues requiring attention
### 📚 **Supported Languages & Content**
- **English**: Full Latin alphabet with punctuation and symbols
- **Bangla (Bengali)**: Complete Bengali Unicode range (U+0980-U+09FF)
- **Mathematical Expressions**:
- Basic arithmetic operators (+, -, ×, ÷, =)
- Greek letters (α, β, γ, δ, π, θ, λ, μ, Ω, etc.)
- Mathematical symbols (∑, ∫, √, ∞, ∂, →, ≤, ≥, etc.)
- Subscripts and superscripts
- Functions and equations
- LaTeX-style expressions
### 🔧 **Tips for Best Results**
1. **PDF Quality**: Use high-resolution PDFs (300+ DPI) for better accuracy
2. **Text Clarity**: Ensure text is not blurry, skewed, or low contrast
3. **Language Consistency**: Mixed-language documents work best when languages are clearly separated
4. **Mathematical Content**: Complex equations may require manual verification
5. **File Size**: Larger documents may take longer to process
### 🐛 **Troubleshooting**
- **Empty Results**: Check if the PDF contains selectable text or if images need OCR
- **Low Accuracy**: Try preprocessing the PDF to improve image quality
- **Mixed Languages**: Ensure the document has clear language boundaries
- **Mathematical Errors**: Complex formulas may need manual correction
### 📞 **Support & Feedback**
For issues, suggestions, or contributions, please visit our [GitHub repository](https://github.com/ashfaqbracu/aaladinai).
---
**Made with ❤️ for advancing multilingual text recognition**
"""

    footer_markdown = """
---
**🔗 Links:** [GitHub Repository](https://github.com/ashfaqbracu/aaladinai) | [Documentation](https://github.com/ashfaqbracu/aaladinai#readme)
**⚡ Powered by:** Pix2Text • Tesseract OCR • OpenCV • FastAPI • Gradio
"""

    def _reveal_downloads(text_path, json_path):
        """Show each download widget only when it actually holds a file."""
        text_update = gr.update(
            visible=bool(text_path),
            value=text_path if text_path else None,
        )
        json_update = gr.update(
            visible=bool(json_path),
            value=json_path if json_path else None,
        )
        return text_update, json_update

    # --- Layout -------------------------------------------------------------
    with gr.Blocks(
        css=custom_css,
        title="Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
    ) as interface:
        # Page header
        gr.Markdown(header_markdown)

        with gr.Tabs():
            # First tab: PDF upload and text extraction
            with gr.Tab("📄 PDF Text Extraction"):
                gr.Markdown(extraction_intro_markdown)

                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 Upload PDF File",
                            file_types=[".pdf"],
                            type="filepath",
                        )
                        extract_btn = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )
                    with gr.Column(scale=2):
                        extraction_summary = gr.Textbox(
                            label="📊 Extraction Summary",
                            lines=15,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    extracted_text_output = gr.Textbox(
                        label="📝 Extracted Text",
                        lines=20,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Download widgets stay hidden until an extraction produced files
                with gr.Row():
                    text_file_download = gr.File(
                        label="📥 Download Text File", visible=False
                    )
                    json_file_download = gr.File(
                        label="📥 Download JSON Analysis", visible=False
                    )

                # Run the extraction first, then reveal whichever downloads exist
                download_widgets = [text_file_download, json_file_download]
                extraction_event = extract_btn.click(
                    fn=extract_text_from_pdf,
                    inputs=[pdf_input],
                    outputs=[
                        extracted_text_output,
                        extraction_summary,
                        text_file_download,
                        json_file_download,
                    ],
                )
                extraction_event.then(
                    _reveal_downloads,
                    inputs=download_widgets,
                    outputs=download_widgets,
                )

            # Second tab: accuracy evaluation against a baseline
            with gr.Tab("📊 OCR Accuracy Evaluation"):
                gr.Markdown(evaluation_intro_markdown)

                with gr.Row():
                    with gr.Column():
                        extracted_file_input = gr.File(
                            label="📄 OCR Extracted Text File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        baseline_file_input = gr.File(
                            label="📑 Ground Truth Baseline File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        evaluation_name_input = gr.Textbox(
                            label="📝 Evaluation Name (Optional)",
                            placeholder="e.g., Math Document Test #1",
                        )
                        evaluate_btn = gr.Button(
                            "📊 Evaluate Accuracy", variant="primary", size="lg"
                        )
                    with gr.Column():
                        evaluation_summary = gr.Textbox(
                            label="🎯 Evaluation Summary",
                            lines=10,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    evaluation_results = gr.Textbox(
                        label="📈 Detailed Evaluation Results",
                        lines=25,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Wire the evaluate button to the evaluation function
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[
                        extracted_file_input,
                        baseline_file_input,
                        evaluation_name_input,
                    ],
                    outputs=[evaluation_results, evaluation_summary],
                )

            # Third tab: static documentation
            with gr.Tab("ℹ️ About & Help"):
                gr.Markdown(about_markdown)

        # Page footer
        gr.Markdown(footer_markdown)

    return interface
# Script entry point: build the interface and serve it
if __name__ == "__main__":
    logger.info("🚀 Starting Advanced Multi-Language OCR Gradio Interface...")

    # Build the interface
    app = create_gradio_interface()

    # Server settings, collected in one place before launching
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external access for Hugging Face Spaces
        "server_port": 7860,  # Standard port for Hugging Face Spaces
        "share": False,  # Don't create gradio.live link
        "show_error": True,  # Show detailed error messages
        "max_threads": 4,  # Limit concurrent requests
    }
    app.launch(**launch_options)