File size: 17,978 Bytes
a30e196
 
 
 
 
0302b73
a30e196
 
 
 
 
 
 
 
0302b73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a30e196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0302b73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a30e196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0302b73
a30e196
0302b73
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
# app.py - Gradio Interface for Hugging Face Spaces
import gradio as gr
import os
import json
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Tuple

# Import our OCR functionality
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
from eval import evaluate_ocr_accuracy, clean_control_characters

def check_system_dependencies():
    """Print a diagnostic report about the external tools the OCR pipeline uses.

    Checks, in order:
      * Tesseract, by running ``tesseract --version``;
      * the Poppler command-line tools ``pdftoppm`` and ``pdfinfo``, by
        looking them up on ``PATH``;
      * the ``pdf2image`` Python package, by looking up its import spec
        (the package itself is not imported);
    and finally prints the current ``PATH``.

    The function only prints; it returns ``None`` and is written so that a
    missing or broken tool never raises.
    """
    print("๐Ÿ” Checking system dependencies...")

    # Check Tesseract
    try:
        result = subprocess.run(
            ["tesseract", "--version"], capture_output=True, text=True
        )
    except FileNotFoundError:
        print("โŒ Tesseract not found in PATH")
    except OSError:
        # e.g. PermissionError on a non-executable binary: report it instead
        # of letting a diagnostics helper abort module import.
        print("โŒ Tesseract check failed")
    else:
        if result.returncode == 0:
            print("โœ… Tesseract is available")
        else:
            print("โŒ Tesseract check failed")

    # Check Poppler
    # shutil.which performs the PATH lookup in-process, so the result does not
    # depend on an external `which` binary being installed.
    for tool in ("pdftoppm", "pdfinfo"):
        if shutil.which(tool) is not None:
            print(f"โœ… {tool} is available")
        else:
            print(f"โŒ {tool} not found")

    # Check pdf2image
    try:
        import importlib.util

        if importlib.util.find_spec("pdf2image") is not None:
            print("โœ… pdf2image is available")
        else:
            print("โŒ pdf2image module not found")
    except Exception as e:
        # Deliberately broad: this is best-effort diagnostics only.
        print(f"โŒ pdf2image check failed: {e}")

    print(f"๐Ÿ“ PATH: {os.environ.get('PATH', 'NOT SET')}")

# Run dependency check on startup.
# NOTE: this is a module-level call, so the report is printed every time the
# module is imported, not only when it is executed as a script.
check_system_dependencies()


# Initialize directories
def create_directories():
    """Ensure the working folders used for file storage are present.

    Creates ``documents``, ``extracted`` and ``temp`` relative to the current
    working directory; folders that already exist are left untouched.
    """
    for folder in map(Path, ("documents", "extracted", "temp")):
        folder.mkdir(exist_ok=True)


# Module-level call: the storage folders are created at import time, relative
# to the current working directory.
create_directories()


def _describe_ocr_failure(error_msg: str) -> str:
    """Map a raw exception message to the user-facing troubleshooting text."""
    lowered = error_msg.lower()

    if "poppler" in lowered or "unable to get page count" in lowered:
        # Must be an f-string: otherwise the user sees a literal
        # "{error_msg}" and the underlying error is lost.
        return f"""โŒ PDF Processing Error: Poppler not found
            
๐Ÿ”ง This error occurs because Poppler (PDF utilities) is not properly installed.

๐Ÿ“‹ For Hugging Face Spaces:
1. Ensure your setup.sh script runs during deployment
2. Check that poppler-utils is installed in the container
3. Verify the setup logs show successful poppler installation

๐Ÿ’ก The setup.sh script should install these packages:
   - poppler-utils
   - libpoppler-cpp-dev
   - pkg-config

๐Ÿšจ Original error: {error_msg}

๐Ÿ”„ Try restarting the space if this persists."""

    if "tesseract" in lowered:
        return f"""โŒ OCR Engine Error: Tesseract issue
            
๐Ÿ”ง This error is related to Tesseract OCR engine.

๐Ÿ“‹ Possible solutions:
1. Check Tesseract installation in setup.sh
2. Verify language data files are available
3. Ensure proper permissions on tessdata directory

๐Ÿšจ Original error: {error_msg}"""

    return f"โŒ Error processing PDF: {error_msg}"


def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
    """
    Process uploaded PDF file and extract text using advanced OCR.

    The upload is copied into ``temp/`` under a timestamped name, handed to
    ``extract_all_text_advanced_pix2text`` (which writes a text file and two
    JSON files into ``extracted/``), and the three result files are read back
    for display. The temporary PDF copy is removed afterwards whether or not
    processing succeeded.

    Args:
        pdf_file: Gradio file input; its ``.name`` attribute is used as the
            path of the uploaded file. ``None`` means nothing was uploaded.

    Returns:
        Tuple of (extracted_text, json_results, analysis_results). On any
        failure the first element is a human-readable error message and the
        other two are empty strings; exceptions are not propagated.
    """
    if pdf_file is None:
        return "โŒ No file uploaded", "", ""

    pdf_path = None  # set once the temp location is known, for cleanup
    try:
        # Generate timestamp for unique naming
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Get original filename
        original_name = os.path.basename(pdf_file.name)
        base_name = os.path.splitext(original_name)[0]

        # Create unique filenames
        pdf_filename = f"{base_name}_{timestamp}.pdf"
        text_filename = f"{base_name}_{timestamp}_extract.txt"
        json_filename = f"{base_name}_{timestamp}_extract.json"
        analysis_filename = f"{base_name}_{timestamp}_analysis.json"

        # Create paths
        pdf_path = Path("temp") / pdf_filename
        text_path = Path("extracted") / text_filename
        json_path = Path("extracted") / json_filename
        analysis_path = Path("extracted") / analysis_filename

        # Copy uploaded file to our temp directory
        shutil.copy2(pdf_file.name, pdf_path)

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(pdf_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read results
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        with open(json_path, "r", encoding="utf-8") as f:
            json_results = json.load(f)

        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_results = json.load(f)

        # Format results for display
        json_display = json.dumps(json_results, indent=2, ensure_ascii=False)
        analysis_display = json.dumps(analysis_results, indent=2, ensure_ascii=False)

        return extracted_text, json_display, analysis_display

    except Exception as e:
        # UI boundary: turn every failure into a message for the text box.
        return _describe_ocr_failure(str(e)), "", ""

    finally:
        # Clean up temp file on success AND failure so failed runs do not
        # leave PDFs behind in temp/.
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                # Best effort: the copy may never have been created.
                pass


def evaluate_ocr_files(extracted_file, baseline_file) -> Tuple[str, str]:
    """
    Score an OCR output file against a ground-truth file.

    Both files are read as UTF-8, passed through ``clean_control_characters``
    and compared with ``evaluate_ocr_accuracy``. The resulting metrics are
    rendered as a Markdown-style summary plus a pretty-printed JSON dump.

    Args:
        extracted_file: Gradio file input (extracted text file); read via its
            ``.name`` path.
        baseline_file: Gradio file input (baseline/ground truth text file);
            read via its ``.name`` path.

    Returns:
        Tuple of (evaluation_summary, detailed_results). When an upload is
        missing, the evaluator reports an error, or anything raises, the
        first element carries the message and the second is an empty string.
    """
    if extracted_file is None or baseline_file is None:
        return "โŒ Please upload both files", ""

    def _read_utf8(upload):
        # The on-disk location of an upload is taken from its ``.name``.
        with open(upload.name, "r", encoding="utf-8") as handle:
            return handle.read()

    try:
        ocr_raw = _read_utf8(extracted_file)
        truth_raw = _read_utf8(baseline_file)

        # Strip control characters from both sides, then score them.
        report = evaluate_ocr_accuracy(
            extracted_text=clean_control_characters(ocr_raw),
            baseline_text=clean_control_characters(truth_raw),
        )

        if "error" in report:
            return f"โŒ Evaluation error: {report['error']}", ""

        # One entry per output line, in display order; the leading and
        # trailing empty entries give the text its surrounding newlines.
        report_lines = [
            "",
            "๐Ÿ“Š **OCR Evaluation Results**",
            "",
            f"๐ŸŽฏ **Overall Grade: {report['evaluation_summary']['grade']}**",
            f"๐Ÿ“ˆ **Overall Accuracy: {report['overall_accuracy']:.2f}%**",
            f"๐Ÿ” **Similarity Score: {report['similarity_score']:.2f}%**",
            "",
            "๐Ÿ“ **Character Metrics:**",
            f"- Total Characters: {report['character_metrics']['total_chars']}",
            f"- Correct Characters: {report['character_metrics']['correct_chars']}",
            f"- Character Accuracy: {report['character_metrics']['accuracy']:.2f}%",
            "",
            "๐Ÿ“„ **Word Metrics:**",
            f"- Total Words: {report['word_metrics']['total_words']}",
            f"- Correct Words: {report['word_metrics']['correct_words']}",
            f"- Word Accuracy: {report['word_metrics']['accuracy']:.2f}%",
            "",
            "๐Ÿ“‹ **Line Metrics:**",
            f"- Total Lines: {report['line_metrics']['total_lines']}",
            f"- Correct Lines: {report['line_metrics']['correct_lines']}",
            f"- Line Accuracy: {report['line_metrics']['accuracy']:.2f}%",
            "",
            "๐ŸŒ **Language-Specific Accuracy:**",
            f"- English: {report['language_specific']['english']['accuracy']:.2f}%",
            f"- Bangla: {report['language_specific']['bangla']['accuracy']:.2f}%",
            f"- Mathematical: {report['language_specific']['math']['accuracy']:.2f}%",
            "",
            "๐Ÿ’ก **Recommendations:**",
            "\n".join(
                f"โ€ข {rec}"
                for rec in report["evaluation_summary"]["recommendations"]
            ),
            "",
        ]
        summary = "\n".join(report_lines)

        # Full metrics as indented JSON for the detail pane.
        return summary, json.dumps(report, indent=2, ensure_ascii=False)

    except Exception as exc:
        return f"โŒ Error during evaluation: {exc!s}", ""


# Create Gradio Interface
def create_interface() -> gr.Blocks:
    """Create the main Gradio interface.

    Builds a ``gr.Blocks`` app consisting of a header banner and three tabs:

    * "OCR Processing": a PDF upload plus a button that calls
      ``process_pdf_ocr`` and fills the extracted-text, JSON and analysis
      text boxes.
    * "OCR Evaluation": two ``.txt`` uploads plus a button that calls
      ``evaluate_ocr_files`` and fills the summary and detailed-results
      text boxes.
    * "About": static Markdown describing the system.

    Returns:
        The constructed Blocks app. It is not launched here.
    """

    # Components are attached to whichever `with` context is active when they
    # are created, so the nesting and order below define the page layout.
    with gr.Blocks(
        title="๐Ÿ” Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        .header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 2rem;
            border-radius: 10px;
            margin-bottom: 2rem;
        }
        """,
    ) as app:
        # Header (styled by the `.header` rule in the css passed above)
        gr.HTML("""
        <div class="header">
            <h1>๐Ÿ” Advanced Multi-Language OCR System</h1>
            <p>Extract text from PDFs containing English, Bangla, and Mathematical expressions</p>
            <p>Powered by Tesseract, Pix2Text, and Advanced Language Detection</p>
        </div>
        """)

        with gr.Tabs():
            # OCR Processing Tab
            with gr.Tab("๐Ÿ“„ OCR Processing", id="ocr"):
                gr.Markdown("""
                ## ๐Ÿ“„ PDF Text Extraction
                
                Upload a PDF file to extract text using advanced multi-language OCR technology.
                
                **Features:**
                - ๐ŸŒ Multi-language support (English, Bangla, Mathematical expressions)
                - ๐Ÿงฎ Advanced mathematical formula recognition with Pix2Text
                - ๐Ÿ“Š Detailed character-by-character analysis
                - ๐Ÿท๏ธ Automatic content classification
                """)

                # Input row: the upload widget and the trigger button
                with gr.Row():
                    with gr.Column():
                        pdf_input = gr.File(
                            label="๐Ÿ“Ž Upload PDF File",
                            file_types=[".pdf"],
                            file_count="single",
                        )

                        process_btn = gr.Button(
                            "๐Ÿš€ Extract Text", variant="primary", size="lg"
                        )

                # Output row: extracted text on the left, JSON + analysis on the right
                with gr.Row():
                    with gr.Column():
                        extracted_output = gr.Textbox(
                            label="๐Ÿ“ Extracted Text",
                            lines=15,
                            max_lines=20,
                            placeholder="Extracted text will appear here...",
                        )

                    with gr.Column():
                        json_output = gr.Textbox(
                            label="๐Ÿ“‹ Detailed JSON Results",
                            lines=8,
                            max_lines=15,
                            placeholder="JSON results will appear here...",
                        )

                        analysis_output = gr.Textbox(
                            label="๐Ÿ“Š Analysis Report",
                            lines=7,
                            max_lines=10,
                            placeholder="Analysis report will appear here...",
                        )

                # Connect OCR processing: the three values returned by
                # process_pdf_ocr map onto the three output boxes in order.
                process_btn.click(
                    fn=process_pdf_ocr,
                    inputs=[pdf_input],
                    outputs=[extracted_output, json_output, analysis_output],
                )

            # Evaluation Tab
            with gr.Tab("๐Ÿ“Š OCR Evaluation", id="eval"):
                gr.Markdown("""
                ## ๐Ÿ“Š OCR Accuracy Evaluation
                
                Compare extracted text with ground truth baseline to measure OCR accuracy.
                
                **Features:**
                - ๐ŸŽฏ Character, word, and line-level accuracy metrics
                - ๐ŸŒ Language-specific accuracy analysis
                - ๐Ÿ“ˆ Overall grading system (A+ to F)
                - ๐Ÿ’ก Improvement recommendations
                """)

                # Input row: the OCR output file and the ground-truth file side by side
                with gr.Row():
                    with gr.Column():
                        extracted_file = gr.File(
                            label="๐Ÿ“„ Extracted Text File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )

                    with gr.Column():
                        baseline_file = gr.File(
                            label="๐Ÿ“‹ Baseline/Ground Truth File (.txt)",
                            file_types=[".txt"],
                            file_count="single",
                        )

                evaluate_btn = gr.Button(
                    "๐Ÿ” Evaluate Accuracy", variant="primary", size="lg"
                )

                # Output row: readable summary on the left, raw JSON on the right
                with gr.Row():
                    with gr.Column():
                        eval_summary = gr.Textbox(
                            label="๐Ÿ“Š Evaluation Summary",
                            lines=20,
                            max_lines=25,
                            placeholder="Evaluation summary will appear here...",
                        )

                    with gr.Column():
                        eval_detailed = gr.Textbox(
                            label="๐Ÿ“‹ Detailed Results (JSON)",
                            lines=20,
                            max_lines=25,
                            placeholder="Detailed evaluation results will appear here...",
                        )

                # Connect evaluation: the two values returned by
                # evaluate_ocr_files map onto the two output boxes in order.
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_file, baseline_file],
                    outputs=[eval_summary, eval_detailed],
                )

            # About Tab (static content only; no callbacks are wired here)
            with gr.Tab("โ„น๏ธ About", id="about"):
                gr.Markdown("""
                ## ๐Ÿ” Advanced Multi-Language OCR System
                
                ### ๐ŸŒŸ Overview
                This system provides state-of-the-art OCR capabilities for documents containing mixed languages and mathematical expressions.
                
                ### ๐Ÿš€ Key Features
                
                #### ๐Ÿ“„ Multi-Language OCR
                - **English**: Advanced text recognition with high accuracy
                - **Bangla**: Native Bengali script support with proper Unicode handling
                - **Mathematical**: LaTeX and formula recognition using Pix2Text
                
                #### ๐Ÿงฎ Advanced Math Processing
                - Integration with **Pix2Text** for superior mathematical expression recognition
                - LaTeX output for mathematical formulas
                - Support for complex equations and symbols
                
                #### ๐Ÿ“Š Comprehensive Analysis
                - Character-by-character classification and confidence scoring
                - Language detection and content categorization
                - Detailed extraction statistics and reports
                
                #### ๐ŸŽฏ Accuracy Evaluation
                - Compare extracted text with ground truth baseline
                - Character, word, and line-level accuracy metrics
                - Language-specific performance analysis
                - Grading system with improvement recommendations
                
                ### ๐Ÿ› ๏ธ Technology Stack
                - **OCR Engine**: Tesseract with custom language models
                - **Math Recognition**: Pix2Text for advanced mathematical expressions
                - **Language Detection**: Custom algorithms for multi-language content
                - **Backend**: FastAPI with async processing
                - **Frontend**: Gradio for interactive web interface
                
                ### ๐Ÿ“ Usage Tips
                
                #### For Best OCR Results:
                1. **File Quality**: Use high-resolution PDF files (300 DPI or higher)
                2. **Text Clarity**: Ensure text is clear and not blurry or distorted
                3. **Language**: The system works best with properly formatted text
                4. **Mathematical Content**: Complex formulas are processed using specialized Pix2Text models
                
                #### For Accurate Evaluation:
                1. **File Format**: Upload plain text files (.txt) in UTF-8 encoding
                2. **Content Matching**: Ensure baseline file corresponds to the same source document
                3. **Text Cleaning**: The system automatically cleans control characters
                
                ### ๐Ÿ”— Links
                - **GitHub Repository**: [aaladin-ocr](https://github.com/ashfaqbracu/aaladin-ocr)
                - **Documentation**: Available in the repository
                - **Issues/Support**: Report issues on GitHub
                
                ### ๐Ÿ“ง Contact
                For questions or support, please visit our GitHub repository or create an issue.
                
                ---
                
                **Developed with โค๏ธ for advanced document processing and OCR accuracy.**
                """)

        return app


# Initialize Pix2Text on startup (module level, so this runs on import).
# Any failure is reported on stdout and never stops the module from loading.
print("๐Ÿš€ Initializing Pix2Text model...")
try:
    pix2text_model = initialize_pix2text()
    # A falsy return value is reported as a failed initialization.
    print(
        "โœ… Pix2Text initialized successfully"
        if pix2text_model
        else "โš ๏ธ Pix2Text initialization failed - math extraction may be limited"
    )
except Exception as init_error:
    print(f"โš ๏ธ Pix2Text initialization error: {init_error}")

# Create and launch the interface.
# Only runs when this file is executed as a script; importing the module
# builds nothing and starts no server.
if __name__ == "__main__":
    app = create_interface()

    # Launch with proper configuration for Hugging Face Spaces:
    # bind to all interfaces on port 7860, with share disabled and
    # show_error enabled.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)