# app.py - Gradio Interface for Hugging Face Spaces import gradio as gr import os import json import shutil import subprocess from datetime import datetime from pathlib import Path from typing import Tuple # Import our OCR functionality from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text from eval import evaluate_ocr_accuracy, clean_control_characters def check_system_dependencies(): """Check and report system dependencies status.""" print("đ Checking system dependencies...") # Check Tesseract try: result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True) if result.returncode == 0: print("â Tesseract is available") else: print("â Tesseract check failed") except FileNotFoundError: print("â Tesseract not found in PATH") # Check Poppler poppler_tools = ['pdftoppm', 'pdfinfo'] for tool in poppler_tools: try: result = subprocess.run(['which', tool], capture_output=True, text=True) if result.returncode == 0: print(f"â {tool} is available") else: print(f"â {tool} not found") except FileNotFoundError: print(f"â {tool} not available") # Check pdf2image try: import importlib.util if importlib.util.find_spec("pdf2image") is not None: print("â pdf2image is available") else: print("â pdf2image module not found") except Exception as e: print(f"â pdf2image check failed: {e}") print(f"đ PATH: {os.environ.get('PATH', 'NOT SET')}") # Run dependency check on startup check_system_dependencies() # Initialize directories def create_directories(): """Create necessary directories for file storage.""" directories = ["documents", "extracted", "temp"] for directory in directories: Path(directory).mkdir(exist_ok=True) create_directories() def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]: """ Process uploaded PDF file and extract text using advanced OCR. Args: pdf_file: Gradio file input (temporary file path) Returns: Tuple of (extracted_text, json_results, analysis_results) """ if pdf_file is None: return "â No file uploaded", "", "" try: # Generate timestamp for unique naming timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Get original filename original_name = os.path.basename(pdf_file.name) base_name = os.path.splitext(original_name)[0] # Create unique filenames pdf_filename = f"{base_name}_{timestamp}.pdf" text_filename = f"{base_name}_{timestamp}_extract.txt" json_filename = f"{base_name}_{timestamp}_extract.json" analysis_filename = f"{base_name}_{timestamp}_analysis.json" # Create paths pdf_path = Path("temp") / pdf_filename text_path = Path("extracted") / text_filename json_path = Path("extracted") / json_filename analysis_path = Path("extracted") / analysis_filename # Copy uploaded file to our temp directory shutil.copy2(pdf_file.name, pdf_path) # Process the PDF using our advanced OCR system extract_all_text_advanced_pix2text( pdf_path=str(pdf_path), output_text_file=str(text_path), output_json_file=str(json_path), output_analysis_file=str(analysis_path), ) # Read results with open(text_path, "r", encoding="utf-8") as f: extracted_text = f.read() with open(json_path, "r", encoding="utf-8") as f: json_results = json.load(f) with open(analysis_path, "r", encoding="utf-8") as f: analysis_results = json.load(f) # Format results for display json_display = json.dumps(json_results, indent=2, ensure_ascii=False) analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False) # Clean up temp file try: os.remove(pdf_path) except Exception: pass return extracted_text, json_display, analysis_display except Exception as e: error_msg = str(e) # Provide specific guidance for common errors if ( "poppler" in error_msg.lower() or "unable to get page count" in error_msg.lower() ): error_msg = """â PDF Processing Error: Poppler not found đ§ This error occurs because Poppler (PDF utilities) is not properly installed. đ For Hugging Face Spaces: 1. Ensure your setup.sh script runs during deployment 2. Check that poppler-utils is installed in the container 3. Verify the setup logs show successful poppler installation đĄ The setup.sh script should install these packages: - poppler-utils - libpoppler-cpp-dev - pkg-config đ¨ Original error: {error_msg} đ Try restarting the space if this persists.""" elif "tesseract" in error_msg.lower(): error_msg = f"""â OCR Engine Error: Tesseract issue đ§ This error is related to Tesseract OCR engine. đ Possible solutions: 1. Check Tesseract installation in setup.sh 2. Verify language data files are available 3. Ensure proper permissions on tessdata directory đ¨ Original error: {error_msg}""" else: error_msg = f"â Error processing PDF: {error_msg}" return error_msg, "", "" def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]: """ Evaluate OCR accuracy by comparing extracted text with baseline. Args: extracted_file: Gradio file input (extracted text file) baseline_file: Gradio file input (baseline/ground truth text file) Returns: Tuple of (evaluation_summary, detailed_results) """ if extracted_file is None or baseline_file is None: return "â Please upload both files", "" try: # Read file contents with open(extracted_file.name, "r", encoding="utf-8") as f: extracted_text = f.read() with open(baseline_file.name, "r", encoding="utf-8") as f: baseline_text = f.read() # Clean texts extracted_text_clean = clean_control_characters(extracted_text) baseline_text_clean = clean_control_characters(baseline_text) # Perform evaluation results = evaluate_ocr_accuracy( extracted_text=extracted_text_clean, baseline_text=baseline_text_clean, ) if "error" in results: return f"â Evaluation error: {results['error']}", "" # Create summary summary = f""" đ **OCR Evaluation Results** đ¯ **Overall Grade: {results["evaluation_summary"]["grade"]}** đ **Overall Accuracy: {results["overall_accuracy"]:.2f}%** đ **Similarity Score: {results["similarity_score"]:.2f}%** đ **Character Metrics:** - Total Characters: {results["character_metrics"]["total_chars"]} - Correct Characters: {results["character_metrics"]["correct_chars"]} - Character Accuracy: {results["character_metrics"]["accuracy"]:.2f}% đ **Word Metrics:** - Total Words: {results["word_metrics"]["total_words"]} - Correct Words: {results["word_metrics"]["correct_words"]} - Word Accuracy: {results["word_metrics"]["accuracy"]:.2f}% đ **Line Metrics:** - Total Lines: {results["line_metrics"]["total_lines"]} - Correct Lines: {results["line_metrics"]["correct_lines"]} - Line Accuracy: {results["line_metrics"]["accuracy"]:.2f}% đ **Language-Specific Accuracy:** - English: {results["language_specific"]["english"]["accuracy"]:.2f}% - Bangla: {results["language_specific"]["bangla"]["accuracy"]:.2f}% - Mathematical: {results["language_specific"]["math"]["accuracy"]:.2f}% đĄ **Recommendations:** {chr(10).join(f"âĸ {rec}" for rec in results["evaluation_summary"]["recommendations"])} """ # Detailed results detailed = json.dumps(results, indent=2, ensure_ascii=False) return summary, detailed except Exception as e: error_msg = f"â Error during evaluation: {str(e)}" return error_msg, "" # Create Gradio Interface def create_interface(): """Create the main Gradio interface.""" with gr.Blocks( title="đ Advanced Multi-Language OCR System", theme=gr.themes.Soft(), css=""" .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; } .header { text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem; } """, ) as app: # Header gr.HTML("""
Extract text from PDFs containing English, Bangla, and Mathematical expressions
Powered by Tesseract, Pix2Text, and Advanced Language Detection