# Provenance (Hugging Face Spaces file-page header, kept for reference):
# uploaded by Ash2749 -- "Update app.py", commit 0302b73 (verified)
# app.py - Gradio Interface for Hugging Face Spaces
import gradio as gr
import os
import json
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Tuple
# Import our OCR functionality
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
from eval import evaluate_ocr_accuracy, clean_control_characters
def check_system_dependencies():
    """Check and report system dependencies status.

    Verifies the external tools the OCR pipeline relies on -- the Tesseract
    binary, the Poppler PDF utilities (pdftoppm/pdfinfo) and the pdf2image
    Python module -- printing one status line per dependency plus the current
    PATH. Purely informational: never raises and returns None, so startup
    continues even when something is missing.
    """
    print("๐Ÿ” Checking system dependencies...")

    # Tesseract: actually run `--version` rather than just locating the
    # binary, so a present-but-broken install is also reported.
    try:
        result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True)
        if result.returncode == 0:
            print("โœ… Tesseract is available")
        else:
            print("โŒ Tesseract check failed")
    except FileNotFoundError:
        print("โŒ Tesseract not found in PATH")

    # Poppler: shutil.which() replaces the old `which` subprocess call --
    # portable (works even where a `which` binary is absent, e.g. minimal
    # containers or Windows) and spawns no extra process.
    for tool in ('pdftoppm', 'pdfinfo'):
        if shutil.which(tool):
            print(f"โœ… {tool} is available")
        else:
            print(f"โŒ {tool} not found")

    # pdf2image: presence check via find_spec only, so we don't pay the cost
    # of actually importing it (and its Pillow dependency) here.
    try:
        import importlib.util
        if importlib.util.find_spec("pdf2image") is not None:
            print("โœ… pdf2image is available")
        else:
            print("โŒ pdf2image module not found")
    except Exception as e:
        print(f"โŒ pdf2image check failed: {e}")

    # PATH is the usual culprit when a tool is installed but "not found".
    print(f"๐Ÿ“ PATH: {os.environ.get('PATH', 'NOT SET')}")
# Run dependency check on startup so missing system tools (Tesseract,
# Poppler, pdf2image) surface immediately in the Space's deployment logs.
check_system_dependencies()
# Initialize directories
def create_directories():
    """Ensure the working directories used for uploads and results exist."""
    for folder in ("documents", "extracted", "temp"):
        Path(folder).mkdir(exist_ok=True)


create_directories()
def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
    """
    Process uploaded PDF file and extract text using advanced OCR.

    Args:
        pdf_file: Gradio file input (object exposing a ``.name`` temp path),
            or None when the user has not uploaded anything.

    Returns:
        Tuple of (extracted_text, json_results, analysis_results). On error
        the first element carries a human-readable message and the other two
        are empty strings.
    """
    if pdf_file is None:
        return "โŒ No file uploaded", "", ""

    pdf_path = None  # set once the upload is copied; cleaned up in `finally`
    try:
        # Timestamped names keep repeated uploads of the same file distinct.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        original_name = os.path.basename(pdf_file.name)
        base_name = os.path.splitext(original_name)[0]

        pdf_path = Path("temp") / f"{base_name}_{timestamp}.pdf"
        text_path = Path("extracted") / f"{base_name}_{timestamp}_extract.txt"
        json_path = Path("extracted") / f"{base_name}_{timestamp}_extract.json"
        analysis_path = Path("extracted") / f"{base_name}_{timestamp}_analysis.json"

        # Copy the Gradio temp upload into our own temp dir so the OCR run
        # works on a stable path whose lifetime we control.
        shutil.copy2(pdf_file.name, pdf_path)

        # Run the advanced multi-language OCR pipeline; it writes all three
        # output files (plain text, per-character JSON, analysis report).
        extract_all_text_advanced_pix2text(
            pdf_path=str(pdf_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read back the results the pipeline produced.
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(json_path, "r", encoding="utf-8") as f:
            json_results = json.load(f)
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_results = json.load(f)

        # Pretty-print the JSON payloads for display in the UI textboxes.
        json_display = json.dumps(json_results, indent=2, ensure_ascii=False)
        analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False)

        return extracted_text, json_display, analysis_display
    except Exception as e:
        error_msg = str(e)
        # Provide specific guidance for common deployment errors.
        if (
            "poppler" in error_msg.lower()
            or "unable to get page count" in error_msg.lower()
        ):
            # BUG FIX: this template was a plain string, so "{error_msg}" was
            # shown literally to the user; it must be an f-string like the
            # tesseract branch below.
            error_msg = f"""โŒ PDF Processing Error: Poppler not found
๐Ÿ”ง This error occurs because Poppler (PDF utilities) is not properly installed.
๐Ÿ“‹ For Hugging Face Spaces:
1. Ensure your setup.sh script runs during deployment
2. Check that poppler-utils is installed in the container
3. Verify the setup logs show successful poppler installation
๐Ÿ’ก The setup.sh script should install these packages:
- poppler-utils
- libpoppler-cpp-dev
- pkg-config
๐Ÿšจ Original error: {error_msg}
๐Ÿ”„ Try restarting the space if this persists."""
        elif "tesseract" in error_msg.lower():
            error_msg = f"""โŒ OCR Engine Error: Tesseract issue
๐Ÿ”ง This error is related to Tesseract OCR engine.
๐Ÿ“‹ Possible solutions:
1. Check Tesseract installation in setup.sh
2. Verify language data files are available
3. Ensure proper permissions on tessdata directory
๐Ÿšจ Original error: {error_msg}"""
        else:
            error_msg = f"โŒ Error processing PDF: {error_msg}"
        return error_msg, "", ""
    finally:
        # Best-effort cleanup of the copied upload on success AND failure
        # (the original removed it only on success, leaking files whenever
        # the OCR step raised).
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                pass
def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with a baseline.

    Args:
        extracted_file: Gradio file input holding the OCR-extracted text.
        baseline_file: Gradio file input holding the ground-truth text.

    Returns:
        Tuple of (evaluation_summary, detailed_results); on failure the
        summary carries an error message and the details are empty.
    """
    # Guard clause: both files are required before anything can be compared.
    if extracted_file is None or baseline_file is None:
        return "โŒ Please upload both files", ""

    try:
        # Load both documents as UTF-8 text.
        ocr_text = Path(extracted_file.name).read_text(encoding="utf-8")
        truth_text = Path(baseline_file.name).read_text(encoding="utf-8")

        # Strip control characters so formatting noise doesn't skew scoring,
        # then run the full accuracy evaluation.
        results = evaluate_ocr_accuracy(
            extracted_text=clean_control_characters(ocr_text),
            baseline_text=clean_control_characters(truth_text),
        )

        if "error" in results:
            return f"โŒ Evaluation error: {results['error']}", ""

        # Human-readable summary for the UI.
        summary = f"""
๐Ÿ“Š **OCR Evaluation Results**
๐ŸŽฏ **Overall Grade: {results["evaluation_summary"]["grade"]}**
๐Ÿ“ˆ **Overall Accuracy: {results["overall_accuracy"]:.2f}%**
๐Ÿ” **Similarity Score: {results["similarity_score"]:.2f}%**
๐Ÿ“ **Character Metrics:**
- Total Characters: {results["character_metrics"]["total_chars"]}
- Correct Characters: {results["character_metrics"]["correct_chars"]}
- Character Accuracy: {results["character_metrics"]["accuracy"]:.2f}%
๐Ÿ“„ **Word Metrics:**
- Total Words: {results["word_metrics"]["total_words"]}
- Correct Words: {results["word_metrics"]["correct_words"]}
- Word Accuracy: {results["word_metrics"]["accuracy"]:.2f}%
๐Ÿ“‹ **Line Metrics:**
- Total Lines: {results["line_metrics"]["total_lines"]}
- Correct Lines: {results["line_metrics"]["correct_lines"]}
- Line Accuracy: {results["line_metrics"]["accuracy"]:.2f}%
๐ŸŒ **Language-Specific Accuracy:**
- English: {results["language_specific"]["english"]["accuracy"]:.2f}%
- Bangla: {results["language_specific"]["bangla"]["accuracy"]:.2f}%
- Mathematical: {results["language_specific"]["math"]["accuracy"]:.2f}%
๐Ÿ’ก **Recommendations:**
{chr(10).join(f"โ€ข {rec}" for rec in results["evaluation_summary"]["recommendations"])}
"""
        # Full machine-readable result set for the second textbox.
        return summary, json.dumps(results, indent=2, ensure_ascii=False)
    except Exception as e:
        return f"โŒ Error during evaluation: {str(e)}", ""
# Create Gradio Interface
def create_interface():
    """Create the main Gradio interface.

    Builds a three-tab Blocks app -- "OCR Processing" (upload a PDF and run
    process_pdf_ocr), "OCR Evaluation" (compare two .txt files via
    evaluate_ocr_files) and a static "About" page -- and returns the
    assembled, not-yet-launched gr.Blocks application.
    """
    with gr.Blocks(
        title="๐Ÿ” Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
        # Page-level CSS: app-wide font plus the gradient banner used by
        # the header <div class="header"> below.
        css="""
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.header {
text-align: center;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 2rem;
border-radius: 10px;
margin-bottom: 2rem;
}
""",
    ) as app:
        # Header banner (styled via the .header CSS class above)
        gr.HTML("""
<div class="header">
<h1>๐Ÿ” Advanced Multi-Language OCR System</h1>
<p>Extract text from PDFs containing English, Bangla, and Mathematical expressions</p>
<p>Powered by Tesseract, Pix2Text, and Advanced Language Detection</p>
</div>
""")
        with gr.Tabs():
            # OCR Processing Tab: PDF upload on the left, three result
            # textboxes (text / JSON / analysis) below.
            with gr.Tab("๐Ÿ“„ OCR Processing", id="ocr"):
                gr.Markdown("""
## ๐Ÿ“„ PDF Text Extraction
Upload a PDF file to extract text using advanced multi-language OCR technology.
**Features:**
- ๐ŸŒ Multi-language support (English, Bangla, Mathematical expressions)
- ๐Ÿงฎ Advanced mathematical formula recognition with Pix2Text
- ๐Ÿ“Š Detailed character-by-character analysis
- ๐Ÿท๏ธ Automatic content classification
""")
                with gr.Row():
                    with gr.Column():
                        # Single-PDF upload only; file_types restricts the picker.
                        pdf_input = gr.File(
                            label="๐Ÿ“Ž Upload PDF File",
                            file_types=[".pdf"],
                            file_count="single",
                        )
                        process_btn = gr.Button(
                            "๐Ÿš€ Extract Text", variant="primary", size="lg"
                        )
                with gr.Row():
                    with gr.Column():
                        extracted_output = gr.Textbox(
                            label="๐Ÿ“ Extracted Text",
                            lines=15,
                            max_lines=20,
                            placeholder="Extracted text will appear here...",
                        )
                    with gr.Column():
                        json_output = gr.Textbox(
                            label="๐Ÿ“‹ Detailed JSON Results",
                            lines=8,
                            max_lines=15,
                            placeholder="JSON results will appear here...",
                        )
                        analysis_output = gr.Textbox(
                            label="๐Ÿ“Š Analysis Report",
                            lines=7,
                            max_lines=10,
                            placeholder="Analysis report will appear here...",
                        )
                # Connect OCR processing: one click runs the pipeline and
                # fills all three output boxes.
                process_btn.click(
                    fn=process_pdf_ocr,
                    inputs=[pdf_input],
                    outputs=[extracted_output, json_output, analysis_output],
                )
            # Evaluation Tab: two .txt uploads (OCR output vs. ground truth),
            # producing a summary and a detailed JSON report.
            with gr.Tab("๐Ÿ“Š OCR Evaluation", id="eval"):
                gr.Markdown("""
## ๐Ÿ“Š OCR Accuracy Evaluation
Compare extracted text with ground truth baseline to measure OCR accuracy.
**Features:**
- ๐ŸŽฏ Character, word, and line-level accuracy metrics
- ๐ŸŒ Language-specific accuracy analysis
- ๐Ÿ“ˆ Overall grading system (A+ to F)
- ๐Ÿ’ก Improvement recommendations
""")
                with gr.Row():
                    with gr.Column():
                        extracted_file = gr.File(
                            label="๐Ÿ“„ Extracted Text File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                    with gr.Column():
                        baseline_file = gr.File(
                            label="๐Ÿ“‹ Baseline/Ground Truth File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                evaluate_btn = gr.Button(
                    "๐Ÿ” Evaluate Accuracy", variant="primary", size="lg"
                )
                with gr.Row():
                    with gr.Column():
                        eval_summary = gr.Textbox(
                            label="๐Ÿ“Š Evaluation Summary",
                            lines=20,
                            max_lines=25,
                            placeholder="Evaluation summary will appear here...",
                        )
                    with gr.Column():
                        eval_detailed = gr.Textbox(
                            label="๐Ÿ“‹ Detailed Results (JSON)",
                            lines=20,
                            max_lines=25,
                            placeholder="Detailed evaluation results will appear here...",
                        )
                # Connect evaluation: compares the two files and fills both boxes.
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_file, baseline_file],
                    outputs=[eval_summary, eval_detailed],
                )
            # About Tab: static project documentation (no interactive widgets).
            with gr.Tab("โ„น๏ธ About", id="about"):
                gr.Markdown("""
## ๐Ÿ” Advanced Multi-Language OCR System
### ๐ŸŒŸ Overview
This system provides state-of-the-art OCR capabilities for documents containing mixed languages and mathematical expressions.
### ๐Ÿš€ Key Features
#### ๐Ÿ“„ Multi-Language OCR
- **English**: Advanced text recognition with high accuracy
- **Bangla**: Native Bengali script support with proper Unicode handling
- **Mathematical**: LaTeX and formula recognition using Pix2Text
#### ๐Ÿงฎ Advanced Math Processing
- Integration with **Pix2Text** for superior mathematical expression recognition
- LaTeX output for mathematical formulas
- Support for complex equations and symbols
#### ๐Ÿ“Š Comprehensive Analysis
- Character-by-character classification and confidence scoring
- Language detection and content categorization
- Detailed extraction statistics and reports
#### ๐ŸŽฏ Accuracy Evaluation
- Compare extracted text with ground truth baseline
- Character, word, and line-level accuracy metrics
- Language-specific performance analysis
- Grading system with improvement recommendations
### ๐Ÿ› ๏ธ Technology Stack
- **OCR Engine**: Tesseract with custom language models
- **Math Recognition**: Pix2Text for advanced mathematical expressions
- **Language Detection**: Custom algorithms for multi-language content
- **Backend**: FastAPI with async processing
- **Frontend**: Gradio for interactive web interface
### ๐Ÿ“ Usage Tips
#### For Best OCR Results:
1. **File Quality**: Use high-resolution PDF files (300 DPI or higher)
2. **Text Clarity**: Ensure text is clear and not blurry or distorted
3. **Language**: The system works best with properly formatted text
4. **Mathematical Content**: Complex formulas are processed using specialized Pix2Text models
#### For Accurate Evaluation:
1. **File Format**: Upload plain text files (.txt) in UTF-8 encoding
2. **Content Matching**: Ensure baseline file corresponds to the same source document
3. **Text Cleaning**: The system automatically cleans control characters
### ๐Ÿ”— Links
- **GitHub Repository**: [aaladin-ocr](https://github.com/ashfaqbracu/aaladin-ocr)
- **Documentation**: Available in the repository
- **Issues/Support**: Report issues on GitHub
### ๐Ÿ“ง Contact
For questions or support, please visit our GitHub repository or create an issue.
---
**Developed with โค๏ธ for advanced document processing and OCR accuracy.**
""")
    return app
# Initialize Pix2Text on startup so the math-OCR model is warm before the
# first request arrives; failure is deliberately non-fatal (the app still
# serves, with degraded math extraction).
print("๐Ÿš€ Initializing Pix2Text model...")
try:
    pix2text_model = initialize_pix2text()
    # initialize_pix2text() returns a falsy value when model setup failed.
    if pix2text_model:
        print("โœ… Pix2Text initialized successfully")
    else:
        print("โš ๏ธ Pix2Text initialization failed - math extraction may be limited")
except Exception as e:
    # Broad catch is intentional here: model download/load can fail in many
    # ways and none of them should prevent the UI from starting.
    print(f"โš ๏ธ Pix2Text initialization error: {e}")
# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    # Bind to all interfaces on port 7860 -- the configuration Hugging Face
    # Spaces expects -- and surface handler errors in the UI.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )