# Provenance (Hugging Face Spaces file-page header, kept for reference):
# uploaded by Ash2749 -- "Update app.py", commit 0302b73 (verified)
# app.py - Gradio Interface for Hugging Face Spaces
import gradio as gr
import os
import json
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Tuple
# Import our OCR functionality
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
from eval import evaluate_ocr_accuracy, clean_control_characters
def check_system_dependencies():
    """Check and report system dependencies status.

    Verifies the external tools the OCR pipeline relies on -- the Tesseract
    binary, the Poppler PDF utilities (pdftoppm/pdfinfo) and the pdf2image
    Python module -- printing one status line per dependency plus the current
    PATH. Purely informational: never raises and returns None, so startup
    continues even when something is missing.
    """
    print("๐Ÿ” Checking system dependencies...")

    # Tesseract: actually run `--version` rather than just locating the
    # binary, so a present-but-broken install is also reported.
    try:
        result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True)
        if result.returncode == 0:
            print("โœ… Tesseract is available")
        else:
            print("โŒ Tesseract check failed")
    except FileNotFoundError:
        print("โŒ Tesseract not found in PATH")

    # Poppler: shutil.which() replaces the old `which` subprocess call --
    # portable (works even where a `which` binary is absent, e.g. minimal
    # containers or Windows) and spawns no extra process.
    for tool in ('pdftoppm', 'pdfinfo'):
        if shutil.which(tool):
            print(f"โœ… {tool} is available")
        else:
            print(f"โŒ {tool} not found")

    # pdf2image: presence check via find_spec only, so we don't pay the cost
    # of actually importing it (and its Pillow dependency) here.
    try:
        import importlib.util
        if importlib.util.find_spec("pdf2image") is not None:
            print("โœ… pdf2image is available")
        else:
            print("โŒ pdf2image module not found")
    except Exception as e:
        print(f"โŒ pdf2image check failed: {e}")

    # PATH is the usual culprit when a tool is installed but "not found".
    print(f"๐Ÿ“ PATH: {os.environ.get('PATH', 'NOT SET')}")
# Run dependency check on startup so missing system tools (Tesseract,
# Poppler, pdf2image) surface immediately in the Space's deployment logs.
check_system_dependencies()
# Initialize directories
def create_directories():
    """Ensure the working directories used for uploads and results exist."""
    for folder in ("documents", "extracted", "temp"):
        Path(folder).mkdir(exist_ok=True)


create_directories()
def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
    """
    Process uploaded PDF file and extract text using advanced OCR.

    Args:
        pdf_file: Gradio file input (object exposing a ``.name`` temp path),
            or None when the user has not uploaded anything.

    Returns:
        Tuple of (extracted_text, json_results, analysis_results). On error
        the first element carries a human-readable message and the other two
        are empty strings.
    """
    if pdf_file is None:
        return "โŒ No file uploaded", "", ""

    pdf_path = None  # set once the upload is copied; cleaned up in `finally`
    try:
        # Timestamped names keep repeated uploads of the same file distinct.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        original_name = os.path.basename(pdf_file.name)
        base_name = os.path.splitext(original_name)[0]

        pdf_path = Path("temp") / f"{base_name}_{timestamp}.pdf"
        text_path = Path("extracted") / f"{base_name}_{timestamp}_extract.txt"
        json_path = Path("extracted") / f"{base_name}_{timestamp}_extract.json"
        analysis_path = Path("extracted") / f"{base_name}_{timestamp}_analysis.json"

        # Copy the Gradio temp upload into our own temp dir so the OCR run
        # works on a stable path whose lifetime we control.
        shutil.copy2(pdf_file.name, pdf_path)

        # Run the advanced multi-language OCR pipeline; it writes all three
        # output files (plain text, per-character JSON, analysis report).
        extract_all_text_advanced_pix2text(
            pdf_path=str(pdf_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read back the results the pipeline produced.
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()
        with open(json_path, "r", encoding="utf-8") as f:
            json_results = json.load(f)
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_results = json.load(f)

        # Pretty-print the JSON payloads for display in the UI textboxes.
        json_display = json.dumps(json_results, indent=2, ensure_ascii=False)
        analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False)

        return extracted_text, json_display, analysis_display
    except Exception as e:
        error_msg = str(e)
        # Provide specific guidance for common deployment errors.
        if (
            "poppler" in error_msg.lower()
            or "unable to get page count" in error_msg.lower()
        ):
            # BUG FIX: this template was a plain string, so "{error_msg}" was
            # shown literally to the user; it must be an f-string like the
            # tesseract branch below.
            error_msg = f"""โŒ PDF Processing Error: Poppler not found
๐Ÿ”ง This error occurs because Poppler (PDF utilities) is not properly installed.
๐Ÿ“‹ For Hugging Face Spaces:
1. Ensure your setup.sh script runs during deployment
2. Check that poppler-utils is installed in the container
3. Verify the setup logs show successful poppler installation
๐Ÿ’ก The setup.sh script should install these packages:
- poppler-utils
- libpoppler-cpp-dev
- pkg-config
๐Ÿšจ Original error: {error_msg}
๐Ÿ”„ Try restarting the space if this persists."""
        elif "tesseract" in error_msg.lower():
            error_msg = f"""โŒ OCR Engine Error: Tesseract issue
๐Ÿ”ง This error is related to Tesseract OCR engine.
๐Ÿ“‹ Possible solutions:
1. Check Tesseract installation in setup.sh
2. Verify language data files are available
3. Ensure proper permissions on tessdata directory
๐Ÿšจ Original error: {error_msg}"""
        else:
            error_msg = f"โŒ Error processing PDF: {error_msg}"
        return error_msg, "", ""
    finally:
        # Best-effort cleanup of the copied upload on success AND failure
        # (the original removed it only on success, leaking files whenever
        # the OCR step raised).
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                pass
def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with a baseline.

    Args:
        extracted_file: Gradio file input holding the OCR-extracted text.
        baseline_file: Gradio file input holding the ground-truth text.

    Returns:
        Tuple of (evaluation_summary, detailed_results); on failure the
        summary carries an error message and the details are empty.
    """
    # Guard clause: both files are required before anything can be compared.
    if extracted_file is None or baseline_file is None:
        return "โŒ Please upload both files", ""

    try:
        # Load both documents as UTF-8 text.
        ocr_text = Path(extracted_file.name).read_text(encoding="utf-8")
        truth_text = Path(baseline_file.name).read_text(encoding="utf-8")

        # Strip control characters so formatting noise doesn't skew scoring,
        # then run the full accuracy evaluation.
        results = evaluate_ocr_accuracy(
            extracted_text=clean_control_characters(ocr_text),
            baseline_text=clean_control_characters(truth_text),
        )

        if "error" in results:
            return f"โŒ Evaluation error: {results['error']}", ""

        # Human-readable summary for the UI.
        summary = f"""
๐Ÿ“Š **OCR Evaluation Results**
๐ŸŽฏ **Overall Grade: {results["evaluation_summary"]["grade"]}**
๐Ÿ“ˆ **Overall Accuracy: {results["overall_accuracy"]:.2f}%**
๐Ÿ” **Similarity Score: {results["similarity_score"]:.2f}%**
๐Ÿ“ **Character Metrics:**
- Total Characters: {results["character_metrics"]["total_chars"]}
- Correct Characters: {results["character_metrics"]["correct_chars"]}
- Character Accuracy: {results["character_metrics"]["accuracy"]:.2f}%
๐Ÿ“„ **Word Metrics:**
- Total Words: {results["word_metrics"]["total_words"]}
- Correct Words: {results["word_metrics"]["correct_words"]}
- Word Accuracy: {results["word_metrics"]["accuracy"]:.2f}%
๐Ÿ“‹ **Line Metrics:**
- Total Lines: {results["line_metrics"]["total_lines"]}
- Correct Lines: {results["line_metrics"]["correct_lines"]}
- Line Accuracy: {results["line_metrics"]["accuracy"]:.2f}%
๐ŸŒ **Language-Specific Accuracy:**
- English: {results["language_specific"]["english"]["accuracy"]:.2f}%
- Bangla: {results["language_specific"]["bangla"]["accuracy"]:.2f}%
- Mathematical: {results["language_specific"]["math"]["accuracy"]:.2f}%
๐Ÿ’ก **Recommendations:**
{chr(10).join(f"โ€ข {rec}" for rec in results["evaluation_summary"]["recommendations"])}
"""
        # Full machine-readable result set for the second textbox.
        return summary, json.dumps(results, indent=2, ensure_ascii=False)
    except Exception as e:
        return f"โŒ Error during evaluation: {str(e)}", ""
# Create Gradio Interface
def create_interface():
    """Create the main Gradio interface.

    Builds a three-tab Blocks app -- "OCR Processing" (upload a PDF and run
    process_pdf_ocr), "OCR Evaluation" (compare two .txt files via
    evaluate_ocr_files) and a static "About" page -- and returns the
    assembled, not-yet-launched gr.Blocks application.
    """
    with gr.Blocks(
        title="๐Ÿ” Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
        # Page-level CSS: app-wide font plus the gradient banner used by
        # the header <div class="header"> below.
        css="""
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.header {
text-align: center;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 2rem;
border-radius: 10px;
margin-bottom: 2rem;
}
""",
    ) as app:
        # Header banner (styled via the .header CSS class above)
        gr.HTML("""
<div class="header">
<h1>๐Ÿ” Advanced Multi-Language OCR System</h1>
<p>Extract text from PDFs containing English, Bangla, and Mathematical expressions</p>
<p>Powered by Tesseract, Pix2Text, and Advanced Language Detection</p>
</div>
""")
        with gr.Tabs():
            # OCR Processing Tab: PDF upload on the left, three result
            # textboxes (text / JSON / analysis) below.
            with gr.Tab("๐Ÿ“„ OCR Processing", id="ocr"):
                gr.Markdown("""
## ๐Ÿ“„ PDF Text Extraction
Upload a PDF file to extract text using advanced multi-language OCR technology.
**Features:**
- ๐ŸŒ Multi-language support (English, Bangla, Mathematical expressions)
- ๐Ÿงฎ Advanced mathematical formula recognition with Pix2Text
- ๐Ÿ“Š Detailed character-by-character analysis
- ๐Ÿท๏ธ Automatic content classification
""")
                with gr.Row():
                    with gr.Column():
                        # Single-PDF upload only; file_types restricts the picker.
                        pdf_input = gr.File(
                            label="๐Ÿ“Ž Upload PDF File",
                            file_types=[".pdf"],
                            file_count="single",
                        )
                        process_btn = gr.Button(
                            "๐Ÿš€ Extract Text", variant="primary", size="lg"
                        )
                with gr.Row():
                    with gr.Column():
                        extracted_output = gr.Textbox(
                            label="๐Ÿ“ Extracted Text",
                            lines=15,
                            max_lines=20,
                            placeholder="Extracted text will appear here...",
                        )
                    with gr.Column():
                        json_output = gr.Textbox(
                            label="๐Ÿ“‹ Detailed JSON Results",
                            lines=8,
                            max_lines=15,
                            placeholder="JSON results will appear here...",
                        )
                        analysis_output = gr.Textbox(
                            label="๐Ÿ“Š Analysis Report",
                            lines=7,
                            max_lines=10,
                            placeholder="Analysis report will appear here...",
                        )
                # Connect OCR processing: one click runs the pipeline and
                # fills all three output boxes.
                process_btn.click(
                    fn=process_pdf_ocr,
                    inputs=[pdf_input],
                    outputs=[extracted_output, json_output, analysis_output],
                )
            # Evaluation Tab: two .txt uploads (OCR output vs. ground truth),
            # producing a summary and a detailed JSON report.
            with gr.Tab("๐Ÿ“Š OCR Evaluation", id="eval"):
                gr.Markdown("""
## ๐Ÿ“Š OCR Accuracy Evaluation
Compare extracted text with ground truth baseline to measure OCR accuracy.
**Features:**
- ๐ŸŽฏ Character, word, and line-level accuracy metrics
- ๐ŸŒ Language-specific accuracy analysis
- ๐Ÿ“ˆ Overall grading system (A+ to F)
- ๐Ÿ’ก Improvement recommendations
""")
                with gr.Row():
                    with gr.Column():
                        extracted_file = gr.File(
                            label="๐Ÿ“„ Extracted Text File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                    with gr.Column():
                        baseline_file = gr.File(
                            label="๐Ÿ“‹ Baseline/Ground Truth File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )
                evaluate_btn = gr.Button(
                    "๐Ÿ” Evaluate Accuracy", variant="primary", size="lg"
                )
                with gr.Row():
                    with gr.Column():
                        eval_summary = gr.Textbox(
                            label="๐Ÿ“Š Evaluation Summary",
                            lines=20,
                            max_lines=25,
                            placeholder="Evaluation summary will appear here...",
                        )
                    with gr.Column():
                        eval_detailed = gr.Textbox(
                            label="๐Ÿ“‹ Detailed Results (JSON)",
                            lines=20,
                            max_lines=25,
                            placeholder="Detailed evaluation results will appear here...",
                        )
                # Connect evaluation: compares the two files and fills both boxes.
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_file, baseline_file],
                    outputs=[eval_summary, eval_detailed],
                )
            # About Tab: static project documentation (no interactive widgets).
            with gr.Tab("โ„น๏ธ About", id="about"):
                gr.Markdown("""
## ๐Ÿ” Advanced Multi-Language OCR System
### ๐ŸŒŸ Overview
This system provides state-of-the-art OCR capabilities for documents containing mixed languages and mathematical expressions.
### ๐Ÿš€ Key Features
#### ๐Ÿ“„ Multi-Language OCR
- **English**: Advanced text recognition with high accuracy
- **Bangla**: Native Bengali script support with proper Unicode handling
- **Mathematical**: LaTeX and formula recognition using Pix2Text
#### ๐Ÿงฎ Advanced Math Processing
- Integration with **Pix2Text** for superior mathematical expression recognition
- LaTeX output for mathematical formulas
- Support for complex equations and symbols
#### ๐Ÿ“Š Comprehensive Analysis
- Character-by-character classification and confidence scoring
- Language detection and content categorization
- Detailed extraction statistics and reports
#### ๐ŸŽฏ Accuracy Evaluation
- Compare extracted text with ground truth baseline
- Character, word, and line-level accuracy metrics
- Language-specific performance analysis
- Grading system with improvement recommendations
### ๐Ÿ› ๏ธ Technology Stack
- **OCR Engine**: Tesseract with custom language models
- **Math Recognition**: Pix2Text for advanced mathematical expressions
- **Language Detection**: Custom algorithms for multi-language content
- **Backend**: FastAPI with async processing
- **Frontend**: Gradio for interactive web interface
### ๐Ÿ“ Usage Tips
#### For Best OCR Results:
1. **File Quality**: Use high-resolution PDF files (300 DPI or higher)
2. **Text Clarity**: Ensure text is clear and not blurry or distorted
3. **Language**: The system works best with properly formatted text
4. **Mathematical Content**: Complex formulas are processed using specialized Pix2Text models
#### For Accurate Evaluation:
1. **File Format**: Upload plain text files (.txt) in UTF-8 encoding
2. **Content Matching**: Ensure baseline file corresponds to the same source document
3. **Text Cleaning**: The system automatically cleans control characters
### ๐Ÿ”— Links
- **GitHub Repository**: [aaladin-ocr](https://github.com/ashfaqbracu/aaladin-ocr)
- **Documentation**: Available in the repository
- **Issues/Support**: Report issues on GitHub
### ๐Ÿ“ง Contact
For questions or support, please visit our GitHub repository or create an issue.
---
**Developed with โค๏ธ for advanced document processing and OCR accuracy.**
""")
    return app
# Initialize Pix2Text on startup so the math-OCR model is warm before the
# first request arrives; failure is deliberately non-fatal (the app still
# serves, with degraded math extraction).
print("๐Ÿš€ Initializing Pix2Text model...")
try:
    pix2text_model = initialize_pix2text()
    # initialize_pix2text() returns a falsy value when model setup failed.
    if pix2text_model:
        print("โœ… Pix2Text initialized successfully")
    else:
        print("โš ๏ธ Pix2Text initialization failed - math extraction may be limited")
except Exception as e:
    # Broad catch is intentional here: model download/load can fail in many
    # ways and none of them should prevent the UI from starting.
    print(f"โš ๏ธ Pix2Text initialization error: {e}")
# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    # Bind to all interfaces on port 7860 -- the configuration Hugging Face
    # Spaces expects -- and surface handler errors in the UI.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )