Ash2749 committed on
Commit
c139f95
·
verified ·
1 Parent(s): 1a3e965

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +656 -566
app.py CHANGED
@@ -1,566 +1,656 @@
1
- # app.py - Gradio Interface for Advanced Multi-Language OCR System
2
- # Hugging Face Spaces compatible application
3
-
4
- import os
5
- import json
6
- import shutil
7
- from datetime import datetime
8
- from pathlib import Path
9
- from typing import Tuple
10
- import gradio as gr
11
-
12
- # Import our OCR functionality
13
- from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
14
- from eval import evaluate_ocr_accuracy, clean_control_characters
15
-
16
- # Set up logging
17
- import logging
18
-
19
- logging.basicConfig(level=logging.INFO)
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- # Create necessary directories
24
- def create_directories():
25
- """Create necessary directories for file storage."""
26
- directories = ["documents", "extracted", "temp"]
27
- for directory in directories:
28
- Path(directory).mkdir(exist_ok=True)
29
- logger.info(f"✅ Created/verified directory: {directory}")
30
-
31
-
32
- # Initialize directories
33
- create_directories()
34
-
35
- # Initialize Pix2Text model at startup
36
- logger.info("🚀 Initializing Pix2Text model...")
37
- PIX2TEXT_MODEL = initialize_pix2text()
38
- if PIX2TEXT_MODEL:
39
- logger.info("✅ Pix2Text model loaded successfully")
40
- else:
41
- logger.warning("⚠️ Pix2Text model not available, using fallback OCR")
42
-
43
-
44
- def get_safe_filename(filename: str) -> str:
45
- """Generate a safe filename with timestamp."""
46
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
47
- name, ext = os.path.splitext(filename)
48
- # Remove special characters and replace spaces
49
- safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_")).rstrip()
50
- return f"{safe_name}_{timestamp}{ext}"
51
-
52
-
53
- def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
54
- """Generate extraction filename with convention: [pdf_filename]_extract.[extension]"""
55
- base_name = os.path.splitext(pdf_filename)[0]
56
- extensions = {"txt": "txt", "json": "json", "analysis": "json"}
57
- return f"{base_name}_extract.{extensions.get(file_type, 'txt')}"
58
-
59
-
60
- def extract_text_from_pdf(pdf_file) -> Tuple[str, str, str, str]:
61
- """
62
- Extract text from uploaded PDF file using advanced OCR.
63
-
64
- Returns:
65
- - extracted_text: The full extracted text
66
- - summary_text: A summary of the extraction process
67
- - text_file_path: Path to the text file (for download)
68
- - json_file_path: Path to the JSON file (for download)
69
- """
70
- if pdf_file is None:
71
- return " No file uploaded", "Please upload a PDF file", "", ""
72
-
73
- try:
74
- start_time = datetime.now()
75
-
76
- # Get the uploaded file path
77
- pdf_path = pdf_file.name
78
- filename = os.path.basename(pdf_path)
79
-
80
- logger.info(f"📄 Processing uploaded file: {filename}")
81
-
82
- # Generate safe filename
83
- safe_filename = get_safe_filename(filename)
84
-
85
- # Copy uploaded file to documents directory
86
- documents_path = Path("documents") / safe_filename
87
- shutil.copy2(pdf_path, documents_path)
88
-
89
- # Generate output filenames
90
- text_filename = get_extraction_filename(safe_filename, "txt")
91
- json_filename = get_extraction_filename(safe_filename, "json")
92
- analysis_filename = get_extraction_filename(safe_filename, "analysis")
93
-
94
- # Create full paths for extracted files
95
- text_path = Path("extracted") / text_filename
96
- json_path = Path("extracted") / json_filename
97
- analysis_path = Path("extracted") / analysis_filename
98
-
99
- logger.info("🔄 Starting OCR processing...")
100
-
101
- # Process the PDF using our advanced OCR system
102
- extract_all_text_advanced_pix2text(
103
- pdf_path=str(documents_path),
104
- output_text_file=str(text_path),
105
- output_json_file=str(json_path),
106
- output_analysis_file=str(analysis_path),
107
- )
108
-
109
- # Read the extracted text
110
- with open(text_path, "r", encoding="utf-8") as f:
111
- extracted_text = f.read()
112
-
113
- # Read the analysis for summary
114
- with open(analysis_path, "r", encoding="utf-8") as f:
115
- analysis_data = json.load(f)
116
-
117
- # Calculate processing time
118
- end_time = datetime.now()
119
- processing_time = (end_time - start_time).total_seconds()
120
-
121
- # Create summary
122
- summary = f"""
123
- 📊 **OCR Processing Complete!**
124
-
125
- ⏱️ **Processing Time:** {processing_time:.2f} seconds
126
- 📄 **Original File:** {filename}
127
- 📝 **Extracted Characters:** {len(extracted_text):,}
128
-
129
- 🔤 **Text Distribution:**
130
- - English regions: {analysis_data.get("type_distribution", {}).get("english", 0)}
131
- - Bangla regions: {analysis_data.get("type_distribution", {}).get("bangla", 0)}
132
- - Math regions: {analysis_data.get("type_distribution", {}).get("math", 0)}
133
- - Mixed regions: {analysis_data.get("type_distribution", {}).get("mixed", 0)}
134
-
135
- 📈 **Quality Metrics:**
136
- - Total text regions: {analysis_data.get("total_regions", 0)}
137
- - Pages processed: {analysis_data.get("total_pages", 0)}
138
- - Average confidence: {analysis_data.get("confidence_stats", {}).get("avg", 0):.1f}%
139
-
140
- 🔧 **Extraction Methods:**
141
- - Pix2Text (Math): {analysis_data.get("extraction_methods", {}).get("pix2text", 0)} regions
142
- - Tesseract (Text): {analysis_data.get("extraction_methods", {}).get("tesseract", 0)} regions
143
-
144
- **Status:** Extraction completed successfully!
145
- """
146
-
147
- logger.info(f"✅ OCR processing completed in {processing_time:.2f} seconds")
148
-
149
- return extracted_text, summary, str(text_path), str(json_path)
150
-
151
- except Exception as e:
152
- error_message = f"❌ **Error during OCR processing:**\n\n{str(e)}"
153
- logger.error(f"OCR processing failed: {e}")
154
- return error_message, error_message, "", ""
155
-
156
-
157
- def evaluate_ocr_files(
158
- extracted_file, baseline_file, evaluation_name: str = ""
159
- ) -> Tuple[str, str]:
160
- """
161
- Evaluate OCR accuracy by comparing extracted text with baseline.
162
-
163
- Returns:
164
- - results_text: Formatted evaluation results
165
- - summary_text: Summary of the evaluation
166
- """
167
- if extracted_file is None or baseline_file is None:
168
- return "❌ Please upload both files for evaluation", "Missing files"
169
-
170
- try:
171
- start_time = datetime.now()
172
-
173
- # Read file contents
174
- with open(extracted_file.name, "r", encoding="utf-8") as f:
175
- extracted_text = f.read()
176
-
177
- with open(baseline_file.name, "r", encoding="utf-8") as f:
178
- baseline_text = f.read()
179
-
180
- logger.info(f"📊 Starting evaluation: {evaluation_name or 'Unnamed'}")
181
- logger.info(f"Extracted text length: {len(extracted_text)} characters")
182
- logger.info(f"Baseline text length: {len(baseline_text)} characters")
183
-
184
- # Clean input texts
185
- extracted_text_clean = clean_control_characters(extracted_text)
186
- baseline_text_clean = clean_control_characters(baseline_text)
187
-
188
- # Perform evaluation
189
- evaluation_results = evaluate_ocr_accuracy(
190
- extracted_text=extracted_text_clean,
191
- baseline_text=baseline_text_clean,
192
- )
193
-
194
- # Check for evaluation errors
195
- if "error" in evaluation_results:
196
- return (
197
- f"❌ **Evaluation Error:** {evaluation_results['error']}",
198
- "Error occurred",
199
- )
200
-
201
- # Calculate processing time
202
- end_time = datetime.now()
203
- processing_time = (end_time - start_time).total_seconds()
204
-
205
- # Format results
206
- results_text = f"""
207
- 📊 **OCR Evaluation Results**
208
- {f"📝 **Evaluation Name:** {evaluation_name}" if evaluation_name else ""}
209
-
210
- 🎯 **Overall Performance**
211
- - **Overall Accuracy:** {evaluation_results["overall_accuracy"]:.2f}%
212
- - **Similarity Score:** {evaluation_results["similarity_score"]:.2f}%
213
- - **Grade:** {evaluation_results["evaluation_summary"]["grade"]}
214
-
215
- 📝 **Character-Level Analysis**
216
- - **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
217
- - **Character Error Rate:** {evaluation_results["character_metrics"]["character_error_rate"]:.2f}%
218
- - **Edit Distance:** {evaluation_results["character_metrics"]["edit_distance"]}
219
- - **Total Characters:** {evaluation_results["character_metrics"]["total_characters"]:,}
220
-
221
- 📚 **Word-Level Analysis**
222
- - **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%
223
- - **Word Error Rate:** {evaluation_results["word_metrics"]["word_error_rate"]:.2f}%
224
- - **Correct Words:** {evaluation_results["word_metrics"]["correct_words"]} / {evaluation_results["word_metrics"]["total_words"]}
225
- - **Missing Words:** {evaluation_results["word_metrics"]["missing_words"]}
226
- - **Extra Words:** {evaluation_results["word_metrics"]["extra_words"]}
227
-
228
- 📄 **Line-Level Analysis**
229
- - **Line Accuracy:** {evaluation_results["line_metrics"]["line_accuracy"]:.2f}%
230
- - **Average Line Similarity:** {evaluation_results["line_metrics"]["average_line_similarity"]:.2f}%
231
- - **Lines Matched:** {evaluation_results["line_metrics"]["lines_matched"]} / {evaluation_results["line_metrics"]["total_lines"]}
232
-
233
- 🌐 **Language-Specific Accuracy**
234
- - **English:** {evaluation_results["language_specific"].get("english_accuracy", "N/A")}%
235
- - **Bangla:** {evaluation_results["language_specific"].get("bangla_accuracy", "N/A")}%
236
- - **Mathematics:** {evaluation_results["language_specific"].get("math_accuracy", "N/A")}%
237
- - **Numbers:** {evaluation_results["language_specific"].get("number_accuracy", "N/A")}%
238
-
239
- 📈 **Text Statistics**
240
- - **Extracted Length:** {evaluation_results["text_statistics"]["extracted_length"]:,} characters
241
- - **Baseline Length:** {evaluation_results["text_statistics"]["baseline_length"]:,} characters
242
- - **Extracted Words:** {evaluation_results["text_statistics"]["extracted_words"]:,}
243
- - **Baseline Words:** {evaluation_results["text_statistics"]["baseline_words"]:,}
244
-
245
- 💡 **Recommendations**
246
- """
247
-
248
- for i, rec in enumerate(
249
- evaluation_results["evaluation_summary"]["recommendations"], 1
250
- ):
251
- results_text += f"{i}. {rec}\n"
252
-
253
- # Create summary
254
- summary = f"""
255
- 🎯 **Evaluation Summary**
256
-
257
- ⏱️ **Processing Time:** {processing_time:.3f} seconds
258
- 📊 **Overall Score:** {evaluation_results["overall_accuracy"]:.2f}%
259
- 🏆 **Grade:** {evaluation_results["evaluation_summary"]["grade"]}
260
- 📝 **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
261
- 📚 **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%
262
-
263
- **Evaluation completed successfully!**
264
- """
265
-
266
- logger.info(f"✅ Evaluation completed in {processing_time:.3f} seconds")
267
- logger.info(
268
- f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%"
269
- )
270
-
271
- return results_text, summary
272
-
273
- except Exception as e:
274
- error_message = f"❌ **Error during evaluation:**\n\n{str(e)}"
275
- logger.error(f"Evaluation failed: {e}")
276
- return error_message, error_message
277
-
278
-
279
- # Create Gradio interface
280
- def create_gradio_interface():
281
- """Create and configure the Gradio interface."""
282
-
283
- # Custom CSS for better styling
284
- css = """
285
- .gradio-container {
286
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
287
- }
288
- .output-text {
289
- font-family: 'Courier New', monospace;
290
- font-size: 14px;
291
- }
292
- .summary-box {
293
- background-color: #f0f8ff;
294
- border: 1px solid #d0e7ff;
295
- border-radius: 8px;
296
- padding: 16px;
297
- margin: 8px 0;
298
- }
299
- """
300
-
301
- with gr.Blocks(
302
- css=css, title="Advanced Multi-Language OCR System", theme=gr.themes.Soft()
303
- ) as app:
304
- # Header
305
- gr.Markdown("""
306
- # 🔍 Advanced Multi-Language OCR System
307
-
308
- **Powered by Pix2Text, Tesseract, and FastAPI**
309
-
310
- Extract text from PDFs containing **English**, **Bangla**, and **Mathematical expressions** with high accuracy.
311
- Evaluate OCR performance with comprehensive metrics and detailed analysis.
312
- """)
313
-
314
- with gr.Tabs():
315
- # Tab 1: OCR Extraction
316
- with gr.Tab("📄 PDF Text Extraction"):
317
- gr.Markdown("""
318
- ### Upload a PDF and extract text using advanced multi-language OCR
319
-
320
- **Features:**
321
- - 🌐 **Multi-language support**: English, Bangla (Bengali), and Mathematical expressions
322
- - 🧮 **Advanced Math Recognition**: Pix2Text integration for LaTeX and mathematical formulas
323
- - 📊 **Detailed Analysis**: Character-level classification and confidence scores
324
- - 💾 **Download Results**: Get extracted text and detailed JSON analysis
325
- """)
326
-
327
- with gr.Row():
328
- with gr.Column(scale=1):
329
- pdf_input = gr.File(
330
- label="📄 Upload PDF File",
331
- file_types=[".pdf"],
332
- type="filepath",
333
- )
334
- extract_btn = gr.Button(
335
- "🚀 Extract Text", variant="primary", size="lg"
336
- )
337
-
338
- with gr.Column(scale=2):
339
- extraction_summary = gr.Textbox(
340
- label="📊 Extraction Summary",
341
- lines=15,
342
- elem_classes=["summary-box"],
343
- )
344
-
345
- with gr.Row():
346
- extracted_text_output = gr.Textbox(
347
- label="📝 Extracted Text",
348
- lines=20,
349
- elem_classes=["output-text"],
350
- show_copy_button=True,
351
- )
352
-
353
- with gr.Row():
354
- text_file_download = gr.File(
355
- label="📥 Download Text File", visible=False
356
- )
357
- json_file_download = gr.File(
358
- label="📥 Download JSON Analysis", visible=False
359
- )
360
-
361
- # Connect extraction functionality
362
- extract_btn.click(
363
- fn=extract_text_from_pdf,
364
- inputs=[pdf_input],
365
- outputs=[
366
- extracted_text_output,
367
- extraction_summary,
368
- text_file_download,
369
- json_file_download,
370
- ],
371
- ).then(
372
- lambda text_path, json_path: (
373
- gr.update(
374
- visible=bool(text_path),
375
- value=text_path if text_path else None,
376
- ),
377
- gr.update(
378
- visible=bool(json_path),
379
- value=json_path if json_path else None,
380
- ),
381
- ),
382
- inputs=[text_file_download, json_file_download],
383
- outputs=[text_file_download, json_file_download],
384
- )
385
-
386
- # Tab 2: OCR Evaluation
387
- with gr.Tab("📊 OCR Accuracy Evaluation"):
388
- gr.Markdown("""
389
- ### Compare OCR extracted text with ground truth baseline for accuracy analysis
390
-
391
- **Evaluation Features:**
392
- - 🎯 **Character-level accuracy**: Precise character matching and edit distance
393
- - 📚 **Word-level accuracy**: Word matching and error rates
394
- - 📄 **Line-level accuracy**: Line comparison and similarity scores
395
- - 🌐 **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
396
- - 🏆 **Grading system**: Letter grades from A+ to F with recommendations
397
- """)
398
-
399
- with gr.Row():
400
- with gr.Column():
401
- extracted_file_input = gr.File(
402
- label="📄 OCR Extracted Text File (.txt)",
403
- file_types=[".txt"],
404
- type="filepath",
405
- )
406
- baseline_file_input = gr.File(
407
- label="📑 Ground Truth Baseline File (.txt)",
408
- file_types=[".txt"],
409
- type="filepath",
410
- )
411
- evaluation_name_input = gr.Textbox(
412
- label="📝 Evaluation Name (Optional)",
413
- placeholder="e.g., Math Document Test #1",
414
- )
415
- evaluate_btn = gr.Button(
416
- "📊 Evaluate Accuracy", variant="primary", size="lg"
417
- )
418
-
419
- with gr.Column():
420
- evaluation_summary = gr.Textbox(
421
- label="🎯 Evaluation Summary",
422
- lines=10,
423
- elem_classes=["summary-box"],
424
- )
425
-
426
- with gr.Row():
427
- evaluation_results = gr.Textbox(
428
- label="📈 Detailed Evaluation Results",
429
- lines=25,
430
- elem_classes=["output-text"],
431
- show_copy_button=True,
432
- )
433
-
434
- # Connect evaluation functionality
435
- evaluate_btn.click(
436
- fn=evaluate_ocr_files,
437
- inputs=[
438
- extracted_file_input,
439
- baseline_file_input,
440
- evaluation_name_input,
441
- ],
442
- outputs=[evaluation_results, evaluation_summary],
443
- )
444
-
445
- # Tab 3: About & Help
446
- with gr.Tab("ℹ️ About & Help"):
447
- gr.Markdown("""
448
- ## 🔍 Advanced Multi-Language OCR System
449
-
450
- This application provides state-of-the-art Optical Character Recognition (OCR) for documents containing mixed languages and mathematical expressions.
451
-
452
- ### 🌟 Key Features
453
-
454
- #### 📄 **PDF Text Extraction**
455
- - **Multi-language Support**: Simultaneously process English and Bangla (Bengali) text
456
- - **Mathematical Recognition**: Advanced extraction of mathematical formulas and equations using Pix2Text
457
- - **Intelligent Classification**: Automatic detection and classification of text regions by language/content type
458
- - **High Accuracy**: Optimized preprocessing and multiple OCR engines for best results
459
- - **Detailed Analysis**: Character-by-character analysis with confidence scores and language distribution
460
-
461
- #### 📊 **OCR Accuracy Evaluation**
462
- - **Comprehensive Metrics**: Character, word, and line-level accuracy measurements
463
- - **Language-Specific Analysis**: Separate accuracy scores for different languages and mathematical content
464
- - **Edit Distance Calculation**: Precise measurement of text differences using Levenshtein distance
465
- - **Grading System**: Letter grades (A+ to F) with improvement recommendations
466
- - **Detailed Comparison**: Side-by-side diff analysis showing insertions, deletions, and matches
467
-
468
- ### 🛠️ **Technology Stack**
469
-
470
- - **Pix2Text**: Advanced mathematical expression recognition
471
- - **Tesseract OCR**: Multi-language text recognition with Bengali support
472
- - **OpenCV**: Image preprocessing and enhancement
473
- - **PDF2Image**: High-quality PDF to image conversion
474
- - **FastAPI**: RESTful API backend
475
- - **Gradio**: Interactive web interface
476
-
477
- ### 📝 **Usage Instructions**
478
-
479
- #### **For PDF Text Extraction:**
480
- 1. Upload a PDF file using the file picker
481
- 2. Click "🚀 Extract Text" to start processing
482
- 3. Review the extraction summary for statistics
483
- 4. Copy the extracted text or download the files
484
- 5. Download the JSON file for detailed analysis data
485
-
486
- #### **For OCR Evaluation:**
487
- 1. Upload the OCR-extracted text file (what you want to evaluate)
488
- 2. Upload the ground truth baseline file (the correct text)
489
- 3. Optionally provide an evaluation name for identification
490
- 4. Click "📊 Evaluate Accuracy" to run the comparison
491
- 5. Review the detailed metrics and recommendations
492
-
493
- ### 🎯 **Accuracy Grading System**
494
-
495
- - **A+ (95-100%)**: Excellent - Professional-grade accuracy
496
- - **A (90-94%)**: Very Good - High-quality results with minor errors
497
- - **B (80-89%)**: Good - Acceptable for most applications
498
- - **C (70-79%)**: Fair - May require manual review
499
- - **D (60-69%)**: Poor - Significant improvements needed
500
- - **F (<60%)**: Very Poor - Major issues requiring attention
501
-
502
- ### 📚 **Supported Languages & Content**
503
-
504
- - **English**: Full Latin alphabet with punctuation and symbols
505
- - **Bangla (Bengali)**: Complete Bengali Unicode range (U+0980-U+09FF)
506
- - **Mathematical Expressions**:
507
- - Basic arithmetic operators (+, -, ×, ÷, =)
508
- - Greek letters (α, β, γ, δ, π, θ, λ, μ, Ω, etc.)
509
- - Mathematical symbols (∑, ∫, √, ∞, ∂, →, ≤, ≥, etc.)
510
- - Subscripts and superscripts
511
- - Functions and equations
512
- - LaTeX-style expressions
513
-
514
- ### 🔧 **Tips for Best Results**
515
-
516
- 1. **PDF Quality**: Use high-resolution PDFs (300+ DPI) for better accuracy
517
- 2. **Text Clarity**: Ensure text is not blurry, skewed, or low contrast
518
- 3. **Language Consistency**: Mixed-language documents work best when languages are clearly separated
519
- 4. **Mathematical Content**: Complex equations may require manual verification
520
- 5. **File Size**: Larger documents may take longer to process
521
-
522
- ### 🐛 **Troubleshooting**
523
-
524
- - **Empty Results**: Check if the PDF contains selectable text or if images need OCR
525
- - **Low Accuracy**: Try preprocessing the PDF to improve image quality
526
- - **Mixed Languages**: Ensure the document has clear language boundaries
527
- - **Mathematical Errors**: Complex formulas may need manual correction
528
-
529
- ### 📞 **Support & Feedback**
530
-
531
- For issues, suggestions, or contributions, please visit our [GitHub repository](https://github.com/ashfaqbracu/aaladinai).
532
-
533
- ---
534
-
535
- **Made with ❤️ for advancing multilingual text recognition**
536
- """)
537
-
538
- # Footer
539
- gr.Markdown("""
540
- ---
541
-
542
- **🔗 Links:** [GitHub Repository](https://github.com/ashfaqbracu/aaladinai) | [Documentation](https://github.com/ashfaqbracu/aaladinai#readme)
543
-
544
- **⚡ Powered by:** Pix2Text • Tesseract OCR • OpenCV • FastAPI • Gradio
545
- """)
546
-
547
- return app
548
-
549
-
550
- # Main execution
551
- if __name__ == "__main__":
552
- logger.info("🚀 Starting Advanced Multi-Language OCR Gradio Interface...")
553
-
554
- # Create and launch the interface
555
- app = create_gradio_interface()
556
-
557
- # Launch configuration
558
- app.launch(
559
- server_name="0.0.0.0", # Allow external access for Hugging Face Spaces
560
- server_port=7860, # Standard port for Hugging Face Spaces
561
- share=False, # Don't create gradio.live link
562
- show_error=True, # Show detailed error messages
563
- show_tips=True, # Show helpful tips
564
- enable_queue=True, # Enable request queuing for better performance
565
- max_threads=4, # Limit concurrent requests
566
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Gradio Interface for Advanced Multi-Language OCR System
2
+ # Hugging Face Spaces compatible application
3
+
4
+ import os
5
+ import json
6
+ import shutil
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Tuple
10
+ import gradio as gr
11
+
12
+ # Set up logging first
13
+ import logging
14
+
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # Try to import our OCR functionality with error handling
19
+ try:
20
+ from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
21
+ from eval import evaluate_ocr_accuracy, clean_control_characters
22
+
23
+ OCR_AVAILABLE = True
24
+ logger.info("✅ OCR modules imported successfully")
25
+ except ImportError as e:
26
+ logger.error(f"❌ OCR modules not available: {e}")
27
+ OCR_AVAILABLE = False
28
+
29
+ # Create dummy functions as fallbacks
30
+ def extract_all_text_advanced_pix2text(*args, **kwargs):
31
+ raise RuntimeError(
32
+ "OCR functionality not available due to missing dependencies"
33
+ )
34
+
35
+ def initialize_pix2text():
36
+ return None
37
+
38
+ def evaluate_ocr_accuracy(*args, **kwargs):
39
+ raise RuntimeError(
40
+ "Evaluation functionality not available due to missing dependencies"
41
+ )
42
+
43
+ def clean_control_characters(text):
44
+ return text
45
+
46
+
47
+ logging.basicConfig(level=logging.INFO)
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ # Create necessary directories
52
+ def create_directories():
53
+ """Create necessary directories for file storage."""
54
+ directories = ["documents", "extracted", "temp"]
55
+ for directory in directories:
56
+ Path(directory).mkdir(exist_ok=True)
57
+ logger.info(f"✅ Created/verified directory: {directory}")
58
+
59
+
60
+ # Initialize directories
61
+ create_directories()
62
+
63
+ # Initialize Pix2Text model at startup with error handling
64
+ logger.info("🚀 Initializing Pix2Text model...")
65
+ if OCR_AVAILABLE:
66
+ try:
67
+ PIX2TEXT_MODEL = initialize_pix2text()
68
+ if PIX2TEXT_MODEL:
69
+ logger.info("✅ Pix2Text model loaded successfully")
70
+ else:
71
+ logger.warning("⚠️ Pix2Text model not available, using fallback OCR")
72
+ except Exception as e:
73
+ logger.error(f"❌ Failed to initialize Pix2Text: {e}")
74
+ PIX2TEXT_MODEL = None
75
+ else:
76
+ logger.warning("⚠️ OCR modules not available - running in demo mode")
77
+ PIX2TEXT_MODEL = None
78
+
79
+
80
+ def get_safe_filename(filename: str) -> str:
81
+ """Generate a safe filename with timestamp."""
82
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
83
+ name, ext = os.path.splitext(filename)
84
+ # Remove special characters and replace spaces
85
+ safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_")).rstrip()
86
+ return f"{safe_name}_{timestamp}{ext}"
87
+
88
+
89
+ def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
90
+ """Generate extraction filename with convention: [pdf_filename]_extract.[extension]"""
91
+ base_name = os.path.splitext(pdf_filename)[0]
92
+ extensions = {"txt": "txt", "json": "json", "analysis": "json"}
93
+ return f"{base_name}_extract.{extensions.get(file_type, 'txt')}"
94
+
95
+
96
+ def extract_text_from_pdf(pdf_file) -> Tuple[str, str, str, str]:
97
+ """
98
+ Extract text from uploaded PDF file using advanced OCR.
99
+
100
+ Returns:
101
+ - extracted_text: The full extracted text
102
+ - summary_text: A summary of the extraction process
103
+ - text_file_path: Path to the text file (for download)
104
+ - json_file_path: Path to the JSON file (for download)
105
+ """
106
+ if pdf_file is None:
107
+ return "❌ No file uploaded", "Please upload a PDF file", "", ""
108
+
109
+ try:
110
+ start_time = datetime.now()
111
+
112
+ # Get the uploaded file path
113
+ pdf_path = pdf_file.name
114
+ filename = os.path.basename(pdf_path)
115
+
116
+ logger.info(f"📄 Processing uploaded file: {filename}")
117
+
118
+ # Generate safe filename
119
+ safe_filename = get_safe_filename(filename)
120
+
121
+ # Copy uploaded file to documents directory
122
+ documents_path = Path("documents") / safe_filename
123
+ shutil.copy2(pdf_path, documents_path)
124
+
125
+ # Generate output filenames
126
+ text_filename = get_extraction_filename(safe_filename, "txt")
127
+ json_filename = get_extraction_filename(safe_filename, "json")
128
+ analysis_filename = get_extraction_filename(safe_filename, "analysis")
129
+
130
+ # Create full paths for extracted files
131
+ text_path = Path("extracted") / text_filename
132
+ json_path = Path("extracted") / json_filename
133
+ analysis_path = Path("extracted") / analysis_filename
134
+
135
+ logger.info("🔄 Starting OCR processing...")
136
+
137
+ # Check if OCR functionality is available
138
+ if not OCR_AVAILABLE:
139
+ return (
140
+ """❌ **OCR functionality not available**
141
+
142
+ This appears to be a demo environment where the OCR dependencies are not fully installed.
143
+
144
+ **Missing components:**
145
+ - OpenCV (cv2) for image processing
146
+ - Tesseract OCR for text recognition
147
+ - Pix2Text for mathematical expression extraction
148
+
149
+ **To use this system:**
150
+ 1. Deploy to Hugging Face Spaces with proper dependencies
151
+ 2. Or install missing packages locally:
152
+ ```bash
153
+ pip install opencv-python pytesseract pix2text
154
+ apt-get install tesseract-ocr tesseract-ocr-ben poppler-utils
155
+ ```
156
+
157
+ **Demo Features Available:**
158
+ - Interface navigation and design preview
159
+ - File upload testing (files are validated but not processed)
160
+ - System architecture demonstration
161
+ """,
162
+ "OCR dependencies not available in this environment",
163
+ "",
164
+ "",
165
+ )
166
+
167
+ # Process the PDF using our advanced OCR system
168
+ extract_all_text_advanced_pix2text(
169
+ pdf_path=str(documents_path),
170
+ output_text_file=str(text_path),
171
+ output_json_file=str(json_path),
172
+ output_analysis_file=str(analysis_path),
173
+ )
174
+
175
+ # Read the extracted text
176
+ with open(text_path, "r", encoding="utf-8") as f:
177
+ extracted_text = f.read()
178
+
179
+ # Read the analysis for summary
180
+ with open(analysis_path, "r", encoding="utf-8") as f:
181
+ analysis_data = json.load(f)
182
+
183
+ # Calculate processing time
184
+ end_time = datetime.now()
185
+ processing_time = (end_time - start_time).total_seconds()
186
+
187
+ # Create summary
188
+ summary = f"""
189
+ 📊 **OCR Processing Complete!**
190
+
191
+ ⏱️ **Processing Time:** {processing_time:.2f} seconds
192
+ 📄 **Original File:** {filename}
193
+ 📝 **Extracted Characters:** {len(extracted_text):,}
194
+
195
+ 🔤 **Text Distribution:**
196
+ - English regions: {analysis_data.get("type_distribution", {}).get("english", 0)}
197
+ - Bangla regions: {analysis_data.get("type_distribution", {}).get("bangla", 0)}
198
+ - Math regions: {analysis_data.get("type_distribution", {}).get("math", 0)}
199
+ - Mixed regions: {analysis_data.get("type_distribution", {}).get("mixed", 0)}
200
+
201
+ 📈 **Quality Metrics:**
202
+ - Total text regions: {analysis_data.get("total_regions", 0)}
203
+ - Pages processed: {analysis_data.get("total_pages", 0)}
204
+ - Average confidence: {analysis_data.get("confidence_stats", {}).get("avg", 0):.1f}%
205
+
206
+ 🔧 **Extraction Methods:**
207
+ - Pix2Text (Math): {analysis_data.get("extraction_methods", {}).get("pix2text", 0)} regions
208
+ - Tesseract (Text): {analysis_data.get("extraction_methods", {}).get("tesseract", 0)} regions
209
+
210
+ **Status:** Extraction completed successfully!
211
+ """
212
+
213
+ logger.info(f"✅ OCR processing completed in {processing_time:.2f} seconds")
214
+
215
+ return extracted_text, summary, str(text_path), str(json_path)
216
+
217
+ except Exception as e:
218
+ error_message = f"❌ **Error during OCR processing:**\n\n{str(e)}"
219
+ logger.error(f"OCR processing failed: {e}")
220
+ return error_message, error_message, "", ""
221
+
222
+
223
# Shown when the evaluation backend could not be imported (OCR_AVAILABLE is falsy).
_EVALUATION_UNAVAILABLE_MESSAGE = """❌ **Evaluation functionality not available**

This appears to be a demo environment where the evaluation dependencies are not fully installed.

**Missing components:**
- Text processing utilities
- Evaluation algorithms
- Statistical analysis functions

**To use this system:**
1. Deploy to Hugging Face Spaces with proper dependencies
2. Or install missing packages locally

**Demo Features Available:**
- Interface navigation and design preview
- File upload testing (files are validated but not processed)
- System architecture demonstration
"""


def _resolve_upload_path(upload) -> str:
    """Return the filesystem path of an uploaded file.

    The evaluation tab declares its inputs as ``gr.File(type="filepath")``, so
    the callback receives a plain ``str`` path. Objects exposing the path via
    ``.name`` (what this function originally assumed) are still accepted.
    """
    if isinstance(upload, str):
        return upload
    return upload.name


def _read_upload_text(upload) -> str:
    """Read an uploaded file as UTF-8 text."""
    with open(_resolve_upload_path(upload), "r", encoding="utf-8") as f:
        return f.read()


def _format_percentage(value) -> str:
    """Render an optional accuracy value; missing values become ``N/A`` (no ``%``)."""
    if value is None or value == "N/A":
        return "N/A"
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"{value:.2f}%"
    return f"{value}%"


def _format_evaluation_report(results: dict, evaluation_name: str) -> str:
    """Build the detailed Markdown report from the evaluation result mapping."""
    overview = results["evaluation_summary"]
    chars = results["character_metrics"]
    words = results["word_metrics"]
    lines = results["line_metrics"]
    langs = results["language_specific"]
    stats = results["text_statistics"]

    report = f"""
📊 **OCR Evaluation Results**
{f"📝 **Evaluation Name:** {evaluation_name}" if evaluation_name else ""}

🎯 **Overall Performance**
- **Overall Accuracy:** {results["overall_accuracy"]:.2f}%
- **Similarity Score:** {results["similarity_score"]:.2f}%
- **Grade:** {overview["grade"]}

📝 **Character-Level Analysis**
- **Character Accuracy:** {chars["character_accuracy"]:.2f}%
- **Character Error Rate:** {chars["character_error_rate"]:.2f}%
- **Edit Distance:** {chars["edit_distance"]}
- **Total Characters:** {chars["total_characters"]:,}

📚 **Word-Level Analysis**
- **Word Accuracy:** {words["word_accuracy"]:.2f}%
- **Word Error Rate:** {words["word_error_rate"]:.2f}%
- **Correct Words:** {words["correct_words"]} / {words["total_words"]}
- **Missing Words:** {words["missing_words"]}
- **Extra Words:** {words["extra_words"]}

📄 **Line-Level Analysis**
- **Line Accuracy:** {lines["line_accuracy"]:.2f}%
- **Average Line Similarity:** {lines["average_line_similarity"]:.2f}%
- **Lines Matched:** {lines["lines_matched"]} / {lines["total_lines"]}

🌐 **Language-Specific Accuracy**
- **English:** {_format_percentage(langs.get("english_accuracy"))}
- **Bangla:** {_format_percentage(langs.get("bangla_accuracy"))}
- **Mathematics:** {_format_percentage(langs.get("math_accuracy"))}
- **Numbers:** {_format_percentage(langs.get("number_accuracy"))}

📈 **Text Statistics**
- **Extracted Length:** {stats["extracted_length"]:,} characters
- **Baseline Length:** {stats["baseline_length"]:,} characters
- **Extracted Words:** {stats["extracted_words"]:,}
- **Baseline Words:** {stats["baseline_words"]:,}

💡 **Recommendations**
"""
    # Numbered recommendation list, one per line, appended after the heading.
    report += "".join(
        f"{i}. {rec}\n" for i, rec in enumerate(overview["recommendations"], 1)
    )
    return report


def _format_evaluation_summary(results: dict, processing_time: float) -> str:
    """Build the short summary shown next to the detailed report."""
    return f"""
🎯 **Evaluation Summary**

⏱️ **Processing Time:** {processing_time:.3f} seconds
📊 **Overall Score:** {results["overall_accuracy"]:.2f}%
🏆 **Grade:** {results["evaluation_summary"]["grade"]}
📝 **Character Accuracy:** {results["character_metrics"]["character_accuracy"]:.2f}%
📚 **Word Accuracy:** {results["word_metrics"]["word_accuracy"]:.2f}%

✅ **Evaluation completed successfully!**
"""


def evaluate_ocr_files(
    extracted_file, baseline_file, evaluation_name: str = ""
) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with baseline.

    Args:
        extracted_file: Uploaded OCR output; a ``str`` path or an object whose
            ``.name`` is the path. ``None`` when nothing was uploaded.
        baseline_file: Uploaded ground-truth text, same forms as above.
        evaluation_name: Optional label included in the report and the log.

    Returns:
        - results_text: Formatted evaluation results
        - summary_text: Summary of the evaluation

    Any failure (unreadable file, missing metric key, ...) is logged and
    reported back as an error message in both return slots instead of raising.
    """
    if extracted_file is None or baseline_file is None:
        return "❌ Please upload both files for evaluation", "Missing files"

    try:
        start_time = datetime.now()

        # Read file contents
        extracted_text = _read_upload_text(extracted_file)
        baseline_text = _read_upload_text(baseline_file)

        logger.info(f"📊 Starting evaluation: {evaluation_name or 'Unnamed'}")
        logger.info(f"Extracted text length: {len(extracted_text)} characters")
        logger.info(f"Baseline text length: {len(baseline_text)} characters")

        # Check if evaluation functionality is available
        if not OCR_AVAILABLE:
            return (
                _EVALUATION_UNAVAILABLE_MESSAGE,
                "Evaluation dependencies not available in this environment",
            )

        # Clean input texts, then compare them
        evaluation_results = evaluate_ocr_accuracy(
            extracted_text=clean_control_characters(extracted_text),
            baseline_text=clean_control_characters(baseline_text),
        )

        # The evaluator reports failures through an "error" key
        if "error" in evaluation_results:
            return (
                f"❌ **Evaluation Error:** {evaluation_results['error']}",
                "Error occurred",
            )

        processing_time = (datetime.now() - start_time).total_seconds()

        results_text = _format_evaluation_report(evaluation_results, evaluation_name)
        summary = _format_evaluation_summary(evaluation_results, processing_time)

        logger.info(f"✅ Evaluation completed in {processing_time:.3f} seconds")
        logger.info(
            f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%"
        )

        return results_text, summary

    except Exception as e:
        # UI boundary: surface the problem to the user, keep the traceback in logs.
        error_message = f"❌ **Error during evaluation:**\n\n{str(e)}"
        logger.exception(f"Evaluation failed: {e}")
        return error_message, error_message
367
+
368
+
369
+ # Create Gradio interface
370
def create_gradio_interface():
    """Assemble the three-tab Gradio Blocks UI and return it (not yet launched).

    Tabs: PDF text extraction (wired to ``extract_text_from_pdf``), OCR accuracy
    evaluation (wired to ``evaluate_ocr_files``), and a static About & Help page.
    """

    # Stylesheet applied to the whole Blocks app
    custom_css = """
    .gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
    font-family: 'Courier New', monospace;
    font-size: 14px;
    }
    .summary-box {
    background-color: #f0f8ff;
    border: 1px solid #d0e7ff;
    border-radius: 8px;
    padding: 16px;
    margin: 8px 0;
    }
    """

    # Static Markdown copy, kept apart from the layout code below
    header_md = """
    # 🔍 Advanced Multi-Language OCR System

    **Powered by Pix2Text, Tesseract, and FastAPI**

    Extract text from PDFs containing **English**, **Bangla**, and **Mathematical expressions** with high accuracy.
    Evaluate OCR performance with comprehensive metrics and detailed analysis.
    """

    extraction_intro_md = """
    ### Upload a PDF and extract text using advanced multi-language OCR

    **Features:**
    - 🌐 **Multi-language support**: English, Bangla (Bengali), and Mathematical expressions
    - 🧮 **Advanced Math Recognition**: Pix2Text integration for LaTeX and mathematical formulas
    - 📊 **Detailed Analysis**: Character-level classification and confidence scores
    - 💾 **Download Results**: Get extracted text and detailed JSON analysis
    """

    evaluation_intro_md = """
    ### Compare OCR extracted text with ground truth baseline for accuracy analysis

    **Evaluation Features:**
    - 🎯 **Character-level accuracy**: Precise character matching and edit distance
    - 📚 **Word-level accuracy**: Word matching and error rates
    - 📄 **Line-level accuracy**: Line comparison and similarity scores
    - 🌐 **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
    - 🏆 **Grading system**: Letter grades from A+ to F with recommendations
    """

    about_md = """
    ## 🔍 Advanced Multi-Language OCR System

    This application provides state-of-the-art Optical Character Recognition (OCR) for documents containing mixed languages and mathematical expressions.

    ### 🌟 Key Features

    #### 📄 **PDF Text Extraction**
    - **Multi-language Support**: Simultaneously process English and Bangla (Bengali) text
    - **Mathematical Recognition**: Advanced extraction of mathematical formulas and equations using Pix2Text
    - **Intelligent Classification**: Automatic detection and classification of text regions by language/content type
    - **High Accuracy**: Optimized preprocessing and multiple OCR engines for best results
    - **Detailed Analysis**: Character-by-character analysis with confidence scores and language distribution

    #### 📊 **OCR Accuracy Evaluation**
    - **Comprehensive Metrics**: Character, word, and line-level accuracy measurements
    - **Language-Specific Analysis**: Separate accuracy scores for different languages and mathematical content
    - **Edit Distance Calculation**: Precise measurement of text differences using Levenshtein distance
    - **Grading System**: Letter grades (A+ to F) with improvement recommendations
    - **Detailed Comparison**: Side-by-side diff analysis showing insertions, deletions, and matches

    ### 🛠️ **Technology Stack**

    - **Pix2Text**: Advanced mathematical expression recognition
    - **Tesseract OCR**: Multi-language text recognition with Bengali support
    - **OpenCV**: Image preprocessing and enhancement
    - **PDF2Image**: High-quality PDF to image conversion
    - **FastAPI**: RESTful API backend
    - **Gradio**: Interactive web interface

    ### 📝 **Usage Instructions**

    #### **For PDF Text Extraction:**
    1. Upload a PDF file using the file picker
    2. Click "🚀 Extract Text" to start processing
    3. Review the extraction summary for statistics
    4. Copy the extracted text or download the files
    5. Download the JSON file for detailed analysis data

    #### **For OCR Evaluation:**
    1. Upload the OCR-extracted text file (what you want to evaluate)
    2. Upload the ground truth baseline file (the correct text)
    3. Optionally provide an evaluation name for identification
    4. Click "📊 Evaluate Accuracy" to run the comparison
    5. Review the detailed metrics and recommendations

    ### 🎯 **Accuracy Grading System**

    - **A+ (95-100%)**: Excellent - Professional-grade accuracy
    - **A (90-94%)**: Very Good - High-quality results with minor errors
    - **B (80-89%)**: Good - Acceptable for most applications
    - **C (70-79%)**: Fair - May require manual review
    - **D (60-69%)**: Poor - Significant improvements needed
    - **F (<60%)**: Very Poor - Major issues requiring attention

    ### 📚 **Supported Languages & Content**

    - **English**: Full Latin alphabet with punctuation and symbols
    - **Bangla (Bengali)**: Complete Bengali Unicode range (U+0980-U+09FF)
    - **Mathematical Expressions**:
    - Basic arithmetic operators (+, -, ×, ÷, =)
    - Greek letters (α, β, γ, δ, π, θ, λ, μ, Ω, etc.)
    - Mathematical symbols (∑, ∫, √, ∞, ∂, →, ≤, ≥, etc.)
    - Subscripts and superscripts
    - Functions and equations
    - LaTeX-style expressions

    ### 🔧 **Tips for Best Results**

    1. **PDF Quality**: Use high-resolution PDFs (300+ DPI) for better accuracy
    2. **Text Clarity**: Ensure text is not blurry, skewed, or low contrast
    3. **Language Consistency**: Mixed-language documents work best when languages are clearly separated
    4. **Mathematical Content**: Complex equations may require manual verification
    5. **File Size**: Larger documents may take longer to process

    ### 🐛 **Troubleshooting**

    - **Empty Results**: Check if the PDF contains selectable text or if images need OCR
    - **Low Accuracy**: Try preprocessing the PDF to improve image quality
    - **Mixed Languages**: Ensure the document has clear language boundaries
    - **Mathematical Errors**: Complex formulas may need manual correction

    ### 📞 **Support & Feedback**

    For issues, suggestions, or contributions, please visit our [GitHub repository](https://github.com/ashfaqbracu/aaladinai).

    ---

    **Made with ❤️ for advancing multilingual text recognition**
    """

    footer_md = """
    ---

    **🔗 Links:** [GitHub Repository](https://github.com/ashfaqbracu/aaladinai) | [Documentation](https://github.com/ashfaqbracu/aaladinai#readme)

    **⚡ Powered by:** Pix2Text • Tesseract OCR • OpenCV • FastAPI • Gradio
    """

    def _reveal_downloads(text_path, json_path):
        """Show each download widget only when its file path is non-empty."""
        text_update = gr.update(visible=bool(text_path), value=text_path or None)
        json_update = gr.update(visible=bool(json_path), value=json_path or None)
        return text_update, json_update

    with gr.Blocks(
        css=custom_css,
        title="Advanced Multi-Language OCR System",
        theme=gr.themes.Soft(),
    ) as demo:
        # Header
        gr.Markdown(header_md)

        with gr.Tabs():
            # Tab 1: OCR Extraction
            with gr.Tab("📄 PDF Text Extraction"):
                gr.Markdown(extraction_intro_md)

                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_upload = gr.File(
                            label="📄 Upload PDF File",
                            file_types=[".pdf"],
                            type="filepath",
                        )
                        run_extraction = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )

                    with gr.Column(scale=2):
                        extraction_summary_box = gr.Textbox(
                            label="📊 Extraction Summary",
                            lines=15,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    extracted_text_box = gr.Textbox(
                        label="📝 Extracted Text",
                        lines=20,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                with gr.Row():
                    text_download = gr.File(
                        label="📥 Download Text File", visible=False
                    )
                    json_download = gr.File(
                        label="📥 Download JSON Analysis", visible=False
                    )

                # Run the extraction first, then un-hide whichever download
                # widgets actually received a file.
                extraction_event = run_extraction.click(
                    fn=extract_text_from_pdf,
                    inputs=[pdf_upload],
                    outputs=[
                        extracted_text_box,
                        extraction_summary_box,
                        text_download,
                        json_download,
                    ],
                )
                extraction_event.then(
                    _reveal_downloads,
                    inputs=[text_download, json_download],
                    outputs=[text_download, json_download],
                )

            # Tab 2: OCR Evaluation
            with gr.Tab("📊 OCR Accuracy Evaluation"):
                gr.Markdown(evaluation_intro_md)

                with gr.Row():
                    with gr.Column():
                        extracted_upload = gr.File(
                            label="📄 OCR Extracted Text File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        baseline_upload = gr.File(
                            label="📑 Ground Truth Baseline File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        evaluation_label = gr.Textbox(
                            label="📝 Evaluation Name (Optional)",
                            placeholder="e.g., Math Document Test #1",
                        )
                        run_evaluation = gr.Button(
                            "📊 Evaluate Accuracy", variant="primary", size="lg"
                        )

                    with gr.Column():
                        evaluation_summary_box = gr.Textbox(
                            label="🎯 Evaluation Summary",
                            lines=10,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    evaluation_results_box = gr.Textbox(
                        label="📈 Detailed Evaluation Results",
                        lines=25,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Connect evaluation functionality
                run_evaluation.click(
                    fn=evaluate_ocr_files,
                    inputs=[extracted_upload, baseline_upload, evaluation_label],
                    outputs=[evaluation_results_box, evaluation_summary_box],
                )

            # Tab 3: About & Help
            with gr.Tab("ℹ️ About & Help"):
                gr.Markdown(about_md)

        # Footer
        gr.Markdown(footer_md)

    return demo
638
+
639
+
640
+ # Main execution
641
if __name__ == "__main__":
    logger.info("🚀 Starting Advanced Multi-Language OCR Gradio Interface...")

    # Create the interface
    app = create_gradio_interface()

    # Request queuing is configured on the Blocks object itself. The file
    # relies on the Gradio 4 API (gr.File(type="filepath")), where launch()
    # no longer accepts `enable_queue` or `show_tips`; passing them raised
    # TypeError before the server could start.
    app.queue()

    # Launch configuration
    app.launch(
        server_name="0.0.0.0",  # Allow external access for Hugging Face Spaces
        server_port=7860,  # Standard port for Hugging Face Spaces
        share=False,  # Don't create gradio.live link
        show_error=True,  # Show detailed error messages
        max_threads=4,  # Limit concurrent requests
    )