Ash2749 commited on
Commit
1a3e965
·
verified ·
1 Parent(s): 6389f50

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +566 -0
  2. eval.py +428 -0
  3. main6_pix2text.py +838 -0
  4. packages.txt +10 -0
  5. requirements.txt +39 -0
app.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Gradio Interface for Advanced Multi-Language OCR System
2
+ # Hugging Face Spaces compatible application
3
+
4
+ import os
5
+ import json
6
+ import shutil
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Tuple
10
+ import gradio as gr
11
+
12
+ # Import our OCR functionality
13
+ from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
14
+ from eval import evaluate_ocr_accuracy, clean_control_characters
15
+
16
+ # Set up logging
17
+ import logging
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
# Create necessary directories
def create_directories():
    """Create (or verify) the working directories used for file storage."""
    # Each request writes into these; mkdir is idempotent with exist_ok.
    for directory in ("documents", "extracted", "temp"):
        Path(directory).mkdir(exist_ok=True)
        logger.info(f"✅ Created/verified directory: {directory}")
30
+
31
+
32
# Make sure the working directories exist before any request arrives.
create_directories()

# Load the Pix2Text model once at import time so every request reuses it.
logger.info("🚀 Initializing Pix2Text model...")
PIX2TEXT_MODEL = initialize_pix2text()
if PIX2TEXT_MODEL:
    logger.info("✅ Pix2Text model loaded successfully")
else:
    logger.warning("⚠️ Pix2Text model not available, using fallback OCR")
42
+
43
+
44
def get_safe_filename(filename: str) -> str:
    """Return a sanitized filename with a timestamp suffix.

    Keeps only alphanumerics, '-' and '_' from the base name and appends a
    YYYYmmdd_HHMMSS timestamp before the original extension.

    Args:
        filename: The original (possibly unsafe) filename.

    Returns:
        A filesystem-safe name such as ``report_20240101_120000.pdf``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    name, ext = os.path.splitext(filename)
    # Remove special characters and replace spaces
    safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_")).rstrip()
    if not safe_name:
        # An all-symbol name would otherwise produce "_<timestamp><ext>".
        safe_name = "file"
    return f"{safe_name}_{timestamp}{ext}"
51
+
52
+
53
def get_extraction_filename(pdf_filename: str, file_type: str) -> str:
    """Generate extraction filename with convention: [pdf_filename]_extract.[extension]"""
    base_name = os.path.splitext(pdf_filename)[0]
    # Both "json" and "analysis" artifacts are JSON; anything unknown is txt.
    extension = {"txt": "txt", "json": "json", "analysis": "json"}.get(file_type, "txt")
    return f"{base_name}_extract.{extension}"
58
+
59
+
60
def extract_text_from_pdf(pdf_file) -> Tuple[str, str, str, str]:
    """
    Extract text from uploaded PDF file using advanced OCR.

    Args:
        pdf_file: Gradio file input. With ``gr.File(type="filepath")`` this is
            a plain path string; older Gradio versions passed a tempfile-like
            object exposing ``.name``. Both are accepted.

    Returns:
        - extracted_text: The full extracted text
        - summary_text: A summary of the extraction process
        - text_file_path: Path to the text file (for download)
        - json_file_path: Path to the JSON file (for download)
    """
    if pdf_file is None:
        return "❌ No file uploaded", "Please upload a PDF file", "", ""

    try:
        start_time = datetime.now()

        # Get the uploaded file path. With type="filepath" the component
        # delivers a str; `pdf_file.name` would raise AttributeError on it.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        filename = os.path.basename(pdf_path)

        logger.info(f"📄 Processing uploaded file: {filename}")

        # Generate safe filename
        safe_filename = get_safe_filename(filename)

        # Copy uploaded file to documents directory
        documents_path = Path("documents") / safe_filename
        shutil.copy2(pdf_path, documents_path)

        # Generate output filenames
        text_filename = get_extraction_filename(safe_filename, "txt")
        json_filename = get_extraction_filename(safe_filename, "json")
        analysis_filename = get_extraction_filename(safe_filename, "analysis")

        # Create full paths for extracted files
        text_path = Path("extracted") / text_filename
        json_path = Path("extracted") / json_filename
        analysis_path = Path("extracted") / analysis_filename

        logger.info("🔄 Starting OCR processing...")

        # Process the PDF using our advanced OCR system
        extract_all_text_advanced_pix2text(
            pdf_path=str(documents_path),
            output_text_file=str(text_path),
            output_json_file=str(json_path),
            output_analysis_file=str(analysis_path),
        )

        # Read the extracted text
        with open(text_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        # Read the analysis for summary
        with open(analysis_path, "r", encoding="utf-8") as f:
            analysis_data = json.load(f)

        # Calculate processing time
        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        # Create summary. NOTE(review): the "{filename}" placeholder below was
        # blanked out in the scraped source; restored — confirm against repo.
        summary = f"""
📊 **OCR Processing Complete!**

⏱️ **Processing Time:** {processing_time:.2f} seconds
📄 **Original File:** {filename}
📝 **Extracted Characters:** {len(extracted_text):,}

🔤 **Text Distribution:**
- English regions: {analysis_data.get("type_distribution", {}).get("english", 0)}
- Bangla regions: {analysis_data.get("type_distribution", {}).get("bangla", 0)}
- Math regions: {analysis_data.get("type_distribution", {}).get("math", 0)}
- Mixed regions: {analysis_data.get("type_distribution", {}).get("mixed", 0)}

📈 **Quality Metrics:**
- Total text regions: {analysis_data.get("total_regions", 0)}
- Pages processed: {analysis_data.get("total_pages", 0)}
- Average confidence: {analysis_data.get("confidence_stats", {}).get("avg", 0):.1f}%

🔧 **Extraction Methods:**
- Pix2Text (Math): {analysis_data.get("extraction_methods", {}).get("pix2text", 0)} regions
- Tesseract (Text): {analysis_data.get("extraction_methods", {}).get("tesseract", 0)} regions

✅ **Status:** Extraction completed successfully!
"""

        logger.info(f"✅ OCR processing completed in {processing_time:.2f} seconds")

        return extracted_text, summary, str(text_path), str(json_path)

    except Exception as e:
        error_message = f"❌ **Error during OCR processing:**\n\n{str(e)}"
        logger.error(f"OCR processing failed: {e}")
        return error_message, error_message, "", ""
155
+
156
+
157
def evaluate_ocr_files(
    extracted_file, baseline_file, evaluation_name: str = ""
) -> Tuple[str, str]:
    """
    Evaluate OCR accuracy by comparing extracted text with baseline.

    Args:
        extracted_file: Gradio file input for the OCR output (.txt). A path
            string with ``type="filepath"`` or an object exposing ``.name``.
        baseline_file: Gradio file input for the ground-truth text (.txt).
        evaluation_name: Optional label included in the report.

    Returns:
        - results_text: Formatted evaluation results
        - summary_text: Summary of the evaluation
    """
    if extracted_file is None or baseline_file is None:
        return "❌ Please upload both files for evaluation", "Missing files"

    try:
        start_time = datetime.now()

        # gr.File(type="filepath") passes plain strings; support the older
        # tempfile-wrapper objects (with .name) as well.
        extracted_path = (
            extracted_file if isinstance(extracted_file, str) else extracted_file.name
        )
        baseline_path = (
            baseline_file if isinstance(baseline_file, str) else baseline_file.name
        )

        # Read file contents
        with open(extracted_path, "r", encoding="utf-8") as f:
            extracted_text = f.read()

        with open(baseline_path, "r", encoding="utf-8") as f:
            baseline_text = f.read()

        logger.info(f"📊 Starting evaluation: {evaluation_name or 'Unnamed'}")
        logger.info(f"Extracted text length: {len(extracted_text)} characters")
        logger.info(f"Baseline text length: {len(baseline_text)} characters")

        # Clean input texts
        extracted_text_clean = clean_control_characters(extracted_text)
        baseline_text_clean = clean_control_characters(baseline_text)

        # Perform evaluation
        evaluation_results = evaluate_ocr_accuracy(
            extracted_text=extracted_text_clean,
            baseline_text=baseline_text_clean,
        )

        # Check for evaluation errors
        if "error" in evaluation_results:
            return (
                f"❌ **Evaluation Error:** {evaluation_results['error']}",
                "Error occurred",
            )

        # Calculate processing time
        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        # Format results
        results_text = f"""
📊 **OCR Evaluation Results**
{f"📝 **Evaluation Name:** {evaluation_name}" if evaluation_name else ""}

🎯 **Overall Performance**
- **Overall Accuracy:** {evaluation_results["overall_accuracy"]:.2f}%
- **Similarity Score:** {evaluation_results["similarity_score"]:.2f}%
- **Grade:** {evaluation_results["evaluation_summary"]["grade"]}

📝 **Character-Level Analysis**
- **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
- **Character Error Rate:** {evaluation_results["character_metrics"]["character_error_rate"]:.2f}%
- **Edit Distance:** {evaluation_results["character_metrics"]["edit_distance"]}
- **Total Characters:** {evaluation_results["character_metrics"]["total_characters"]:,}

📚 **Word-Level Analysis**
- **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%
- **Word Error Rate:** {evaluation_results["word_metrics"]["word_error_rate"]:.2f}%
- **Correct Words:** {evaluation_results["word_metrics"]["correct_words"]} / {evaluation_results["word_metrics"]["total_words"]}
- **Missing Words:** {evaluation_results["word_metrics"]["missing_words"]}
- **Extra Words:** {evaluation_results["word_metrics"]["extra_words"]}

📄 **Line-Level Analysis**
- **Line Accuracy:** {evaluation_results["line_metrics"]["line_accuracy"]:.2f}%
- **Average Line Similarity:** {evaluation_results["line_metrics"]["average_line_similarity"]:.2f}%
- **Lines Matched:** {evaluation_results["line_metrics"]["lines_matched"]} / {evaluation_results["line_metrics"]["total_lines"]}

🌐 **Language-Specific Accuracy**
- **English:** {evaluation_results["language_specific"].get("english_accuracy", "N/A")}%
- **Bangla:** {evaluation_results["language_specific"].get("bangla_accuracy", "N/A")}%
- **Mathematics:** {evaluation_results["language_specific"].get("math_accuracy", "N/A")}%
- **Numbers:** {evaluation_results["language_specific"].get("number_accuracy", "N/A")}%

📈 **Text Statistics**
- **Extracted Length:** {evaluation_results["text_statistics"]["extracted_length"]:,} characters
- **Baseline Length:** {evaluation_results["text_statistics"]["baseline_length"]:,} characters
- **Extracted Words:** {evaluation_results["text_statistics"]["extracted_words"]:,}
- **Baseline Words:** {evaluation_results["text_statistics"]["baseline_words"]:,}

💡 **Recommendations**
"""

        for i, rec in enumerate(
            evaluation_results["evaluation_summary"]["recommendations"], 1
        ):
            results_text += f"{i}. {rec}\n"

        # Create summary
        summary = f"""
🎯 **Evaluation Summary**

⏱️ **Processing Time:** {processing_time:.3f} seconds
📊 **Overall Score:** {evaluation_results["overall_accuracy"]:.2f}%
🏆 **Grade:** {evaluation_results["evaluation_summary"]["grade"]}
📝 **Character Accuracy:** {evaluation_results["character_metrics"]["character_accuracy"]:.2f}%
📚 **Word Accuracy:** {evaluation_results["word_metrics"]["word_accuracy"]:.2f}%

✅ **Evaluation completed successfully!**
"""

        logger.info(f"✅ Evaluation completed in {processing_time:.3f} seconds")
        logger.info(
            f"📊 Overall accuracy: {evaluation_results['overall_accuracy']:.2f}%"
        )

        return results_text, summary

    except Exception as e:
        error_message = f"❌ **Error during evaluation:**\n\n{str(e)}"
        logger.error(f"Evaluation failed: {e}")
        return error_message, error_message
277
+
278
+
279
# Create Gradio interface
def create_gradio_interface():
    """Create and configure the Gradio interface.

    Builds a three-tab Blocks app — PDF extraction, OCR evaluation, and an
    About/Help page — and wires the buttons to extract_text_from_pdf and
    evaluate_ocr_files. Returns the Blocks object (not yet launched).
    """

    # Custom CSS for better styling
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .output-text {
        font-family: 'Courier New', monospace;
        font-size: 14px;
    }
    .summary-box {
        background-color: #f0f8ff;
        border: 1px solid #d0e7ff;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    """

    with gr.Blocks(
        css=css, title="Advanced Multi-Language OCR System", theme=gr.themes.Soft()
    ) as app:
        # Header
        gr.Markdown("""
        # 🔍 Advanced Multi-Language OCR System

        **Powered by Pix2Text, Tesseract, and FastAPI**

        Extract text from PDFs containing **English**, **Bangla**, and **Mathematical expressions** with high accuracy.
        Evaluate OCR performance with comprehensive metrics and detailed analysis.
        """)

        with gr.Tabs():
            # Tab 1: OCR Extraction
            with gr.Tab("📄 PDF Text Extraction"):
                gr.Markdown("""
                ### Upload a PDF and extract text using advanced multi-language OCR

                **Features:**
                - 🌐 **Multi-language support**: English, Bangla (Bengali), and Mathematical expressions
                - 🧮 **Advanced Math Recognition**: Pix2Text integration for LaTeX and mathematical formulas
                - 📊 **Detailed Analysis**: Character-level classification and confidence scores
                - 💾 **Download Results**: Get extracted text and detailed JSON analysis
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="📄 Upload PDF File",
                            file_types=[".pdf"],
                            type="filepath",
                        )
                        extract_btn = gr.Button(
                            "🚀 Extract Text", variant="primary", size="lg"
                        )

                    with gr.Column(scale=2):
                        extraction_summary = gr.Textbox(
                            label="📊 Extraction Summary",
                            lines=15,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    extracted_text_output = gr.Textbox(
                        label="📝 Extracted Text",
                        lines=20,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                with gr.Row():
                    # Hidden until the .then() step below reveals them once
                    # the extraction handler has produced file paths.
                    text_file_download = gr.File(
                        label="📥 Download Text File", visible=False
                    )
                    json_file_download = gr.File(
                        label="📥 Download JSON Analysis", visible=False
                    )

                # Connect extraction functionality.
                # The chained .then() re-reads the two File components (which
                # now hold the paths returned by extract_text_from_pdf) and
                # toggles their visibility based on whether a path was set.
                extract_btn.click(
                    fn=extract_text_from_pdf,
                    inputs=[pdf_input],
                    outputs=[
                        extracted_text_output,
                        extraction_summary,
                        text_file_download,
                        json_file_download,
                    ],
                ).then(
                    lambda text_path, json_path: (
                        gr.update(
                            visible=bool(text_path),
                            value=text_path if text_path else None,
                        ),
                        gr.update(
                            visible=bool(json_path),
                            value=json_path if json_path else None,
                        ),
                    ),
                    inputs=[text_file_download, json_file_download],
                    outputs=[text_file_download, json_file_download],
                )

            # Tab 2: OCR Evaluation
            with gr.Tab("📊 OCR Accuracy Evaluation"):
                gr.Markdown("""
                ### Compare OCR extracted text with ground truth baseline for accuracy analysis

                **Evaluation Features:**
                - 🎯 **Character-level accuracy**: Precise character matching and edit distance
                - 📚 **Word-level accuracy**: Word matching and error rates
                - 📄 **Line-level accuracy**: Line comparison and similarity scores
                - 🌐 **Language-specific metrics**: Separate accuracy for English, Bangla, and Math
                - 🏆 **Grading system**: Letter grades from A+ to F with recommendations
                """)

                with gr.Row():
                    with gr.Column():
                        extracted_file_input = gr.File(
                            label="📄 OCR Extracted Text File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        baseline_file_input = gr.File(
                            label="📑 Ground Truth Baseline File (.txt)",
                            file_types=[".txt"],
                            type="filepath",
                        )
                        evaluation_name_input = gr.Textbox(
                            label="📝 Evaluation Name (Optional)",
                            placeholder="e.g., Math Document Test #1",
                        )
                        evaluate_btn = gr.Button(
                            "📊 Evaluate Accuracy", variant="primary", size="lg"
                        )

                    with gr.Column():
                        evaluation_summary = gr.Textbox(
                            label="🎯 Evaluation Summary",
                            lines=10,
                            elem_classes=["summary-box"],
                        )

                with gr.Row():
                    evaluation_results = gr.Textbox(
                        label="📈 Detailed Evaluation Results",
                        lines=25,
                        elem_classes=["output-text"],
                        show_copy_button=True,
                    )

                # Connect evaluation functionality
                evaluate_btn.click(
                    fn=evaluate_ocr_files,
                    inputs=[
                        extracted_file_input,
                        baseline_file_input,
                        evaluation_name_input,
                    ],
                    outputs=[evaluation_results, evaluation_summary],
                )

            # Tab 3: About & Help (static markdown only, no event handlers)
            with gr.Tab("ℹ️ About & Help"):
                gr.Markdown("""
                ## 🔍 Advanced Multi-Language OCR System

                This application provides state-of-the-art Optical Character Recognition (OCR) for documents containing mixed languages and mathematical expressions.

                ### 🌟 Key Features

                #### 📄 **PDF Text Extraction**
                - **Multi-language Support**: Simultaneously process English and Bangla (Bengali) text
                - **Mathematical Recognition**: Advanced extraction of mathematical formulas and equations using Pix2Text
                - **Intelligent Classification**: Automatic detection and classification of text regions by language/content type
                - **High Accuracy**: Optimized preprocessing and multiple OCR engines for best results
                - **Detailed Analysis**: Character-by-character analysis with confidence scores and language distribution

                #### 📊 **OCR Accuracy Evaluation**
                - **Comprehensive Metrics**: Character, word, and line-level accuracy measurements
                - **Language-Specific Analysis**: Separate accuracy scores for different languages and mathematical content
                - **Edit Distance Calculation**: Precise measurement of text differences using Levenshtein distance
                - **Grading System**: Letter grades (A+ to F) with improvement recommendations
                - **Detailed Comparison**: Side-by-side diff analysis showing insertions, deletions, and matches

                ### 🛠️ **Technology Stack**

                - **Pix2Text**: Advanced mathematical expression recognition
                - **Tesseract OCR**: Multi-language text recognition with Bengali support
                - **OpenCV**: Image preprocessing and enhancement
                - **PDF2Image**: High-quality PDF to image conversion
                - **FastAPI**: RESTful API backend
                - **Gradio**: Interactive web interface

                ### 📝 **Usage Instructions**

                #### **For PDF Text Extraction:**
                1. Upload a PDF file using the file picker
                2. Click "🚀 Extract Text" to start processing
                3. Review the extraction summary for statistics
                4. Copy the extracted text or download the files
                5. Download the JSON file for detailed analysis data

                #### **For OCR Evaluation:**
                1. Upload the OCR-extracted text file (what you want to evaluate)
                2. Upload the ground truth baseline file (the correct text)
                3. Optionally provide an evaluation name for identification
                4. Click "📊 Evaluate Accuracy" to run the comparison
                5. Review the detailed metrics and recommendations

                ### 🎯 **Accuracy Grading System**

                - **A+ (95-100%)**: Excellent - Professional-grade accuracy
                - **A (90-94%)**: Very Good - High-quality results with minor errors
                - **B (80-89%)**: Good - Acceptable for most applications
                - **C (70-79%)**: Fair - May require manual review
                - **D (60-69%)**: Poor - Significant improvements needed
                - **F (<60%)**: Very Poor - Major issues requiring attention

                ### 📚 **Supported Languages & Content**

                - **English**: Full Latin alphabet with punctuation and symbols
                - **Bangla (Bengali)**: Complete Bengali Unicode range (U+0980-U+09FF)
                - **Mathematical Expressions**:
                  - Basic arithmetic operators (+, -, ×, ÷, =)
                  - Greek letters (α, β, γ, δ, π, θ, λ, μ, Ω, etc.)
                  - Mathematical symbols (∑, ∫, √, ∞, ∂, →, ≤, ≥, etc.)
                  - Subscripts and superscripts
                  - Functions and equations
                  - LaTeX-style expressions

                ### 🔧 **Tips for Best Results**

                1. **PDF Quality**: Use high-resolution PDFs (300+ DPI) for better accuracy
                2. **Text Clarity**: Ensure text is not blurry, skewed, or low contrast
                3. **Language Consistency**: Mixed-language documents work best when languages are clearly separated
                4. **Mathematical Content**: Complex equations may require manual verification
                5. **File Size**: Larger documents may take longer to process

                ### 🐛 **Troubleshooting**

                - **Empty Results**: Check if the PDF contains selectable text or if images need OCR
                - **Low Accuracy**: Try preprocessing the PDF to improve image quality
                - **Mixed Languages**: Ensure the document has clear language boundaries
                - **Mathematical Errors**: Complex formulas may need manual correction

                ### 📞 **Support & Feedback**

                For issues, suggestions, or contributions, please visit our [GitHub repository](https://github.com/ashfaqbracu/aaladinai).

                ---

                **Made with ❤️ for advancing multilingual text recognition**
                """)

        # Footer
        gr.Markdown("""
        ---

        **🔗 Links:** [GitHub Repository](https://github.com/ashfaqbracu/aaladinai) | [Documentation](https://github.com/ashfaqbracu/aaladinai#readme)

        **⚡ Powered by:** Pix2Text • Tesseract OCR • OpenCV • FastAPI • Gradio
        """)

    return app
548
+
549
+
550
# Main execution
if __name__ == "__main__":
    logger.info("🚀 Starting Advanced Multi-Language OCR Gradio Interface...")

    # Create and launch the interface
    app = create_gradio_interface()

    # Request queuing: the `enable_queue` and `show_tips` launch kwargs were
    # removed in Gradio 4.x and raise TypeError; queue() is the supported API.
    app.queue()

    # Launch configuration
    app.launch(
        server_name="0.0.0.0",  # Allow external access for Hugging Face Spaces
        server_port=7860,  # Standard port for Hugging Face Spaces
        share=False,  # Don't create gradio.live link
        show_error=True,  # Show detailed error messages
        max_threads=4,  # Limit concurrent requests
    )
eval.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # eval.py - OCR Evaluation Methods
2
+ # Comprehensive accuracy evaluation for OCR text extraction
3
+
4
+ import re
5
+ import difflib
6
+ from typing import Dict, List, Any
7
+ from collections import defaultdict
8
+ import unicodedata
9
+
10
+
11
def clean_control_characters(text: str) -> str:
    """
    Remove or replace control characters that can cause JSON encoding issues.
    Properly handles Bangla and other Unicode characters.

    Control characters (Unicode category C*) other than tab/newline/CR are
    replaced with spaces; other unprintable characters are also spaced out.
    Runs of whitespace are then collapsed and the result is stripped.
    """
    if not text:
        return text

    # Defensive: callers may pass raw bytes despite the annotation.
    if isinstance(text, bytes):
        try:
            text = text.decode("utf-8", errors="replace")
        except Exception:
            text = str(text)

    # Build in a list and join once — per-character `str +=` is quadratic.
    parts = []
    for char in text:
        category = unicodedata.category(char)

        # Remove control characters except for common whitespace
        if category.startswith("C") and char not in "\t\n\r":
            parts.append(" ")
        # Keep printable characters including Bangla unicode range
        elif (
            char.isprintable()
            or char in "\t\n\r"
            or "\u0980" <= char <= "\u09ff"  # Bangla
            or "\u0900" <= char <= "\u097f"  # Devanagari
            or "\u0600" <= char <= "\u06ff"  # Arabic
        ):
            parts.append(char)
        else:
            # Replace unprintable characters with space
            parts.append(" ")

    # Clean up multiple spaces and normalize
    cleaned = re.sub(r"\s+", " ", "".join(parts))
    return cleaned.strip()
51
+
52
+
53
def safe_json_serialize(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Ensure all string values in the dictionary are safe for JSON serialization.
    Handles Unicode characters properly for JSON encoding.

    Recurses through dicts and lists; strings are cleaned of control
    characters and, if still not serializable, degraded to ASCII.
    """
    if isinstance(data, dict):
        return {key: safe_json_serialize(value) for key, value in data.items()}
    if isinstance(data, list):
        return [safe_json_serialize(item) for item in data]
    if not isinstance(data, str):
        # Numbers, booleans, None, etc. pass through unchanged.
        return data

    # Strip control characters, then prove the result round-trips via JSON.
    cleaned = clean_control_characters(data)
    try:
        import json

        json.dumps(cleaned, ensure_ascii=False)
    except Exception:
        # Last resort: force ASCII with replacement characters.
        return cleaned.encode("ascii", errors="replace").decode("ascii")
    return cleaned
77
+
78
+
79
def edit_distance(s1: str, s2: str) -> int:
    """
    Calculate edit distance (Levenshtein distance) between two strings.

    Iterative two-row dynamic programming; memory is O(min(len(s1), len(s2))).
    """
    # Keep the shorter string as s2 so the DP row stays small.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if not s2:
        return len(s1)

    row = list(range(len(s2) + 1))
    for i, ch_a in enumerate(s1, start=1):
        next_row = [i]
        for j, ch_b in enumerate(s2):
            cost_insert = row[j + 1] + 1
            cost_delete = next_row[j] + 1
            cost_substitute = row[j] + (ch_a != ch_b)
            next_row.append(min(cost_insert, cost_delete, cost_substitute))
        row = next_row

    return row[-1]
100
+
101
+
102
def normalize_text(text: str) -> str:
    """
    Normalize text for better comparison by:
    - Normalizing Unicode characters (NFKD)
    - Converting to lowercase
    - Collapsing all whitespace runs to single spaces and trimming
    """
    normalized = unicodedata.normalize("NFKD", text).lower()
    return re.sub(r"\s+", " ", normalized).strip()
120
+
121
+
122
def calculate_character_accuracy(extracted: str, baseline: str) -> Dict[str, float]:
    """
    Calculate character-level accuracy metrics.

    Returns a dict with character_accuracy, character_error_rate,
    edit_distance and total_characters — the same key set on every path, so
    downstream formatters that index these keys never KeyError.
    """
    extracted_norm = normalize_text(extracted)
    baseline_norm = normalize_text(baseline)

    total_chars = len(baseline_norm)
    if total_chars == 0:
        # Empty baseline: return the full key set (the original returned only
        # two keys here, which broke callers indexing edit_distance).
        return {
            "character_accuracy": 0.0,
            "character_error_rate": 100.0,
            "edit_distance": len(extracted_norm),
            "total_characters": 0,
        }

    # Calculate edit distance (Levenshtein distance)
    edit_dist = edit_distance(extracted_norm, baseline_norm)

    # Character accuracy = (total_chars - edit_distance) / total_chars,
    # clamped at 0 since edit distance can exceed the baseline length.
    char_accuracy = max(0, (total_chars - edit_dist) / total_chars) * 100
    char_error_rate = (edit_dist / total_chars) * 100

    return {
        "character_accuracy": round(char_accuracy, 2),
        "character_error_rate": round(char_error_rate, 2),
        "edit_distance": edit_dist,
        "total_characters": total_chars,
    }
147
+
148
+
149
def _word_level_edit_distance(a: list, b: list) -> int:
    """Levenshtein distance over whole tokens (words), not characters."""
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    row = list(range(len(b) + 1))
    for i, tok_a in enumerate(a, start=1):
        nxt = [i]
        for j, tok_b in enumerate(b):
            nxt.append(min(row[j + 1] + 1, nxt[j] + 1, row[j] + (tok_a != tok_b)))
        row = nxt
    return row[-1]


def calculate_word_accuracy(extracted: str, baseline: str) -> Dict[str, float]:
    """
    Calculate word-level accuracy metrics.

    Fix: the original computed a *character* edit distance on the joined
    strings and divided it by the word count, so the reported WER was not a
    word error rate at all. WER now uses token-level Levenshtein distance.
    """
    extracted_words = normalize_text(extracted).split()
    baseline_words = normalize_text(baseline).split()

    total_words = len(baseline_words)
    if total_words == 0:
        # Full key set so downstream formatters never KeyError.
        return {
            "word_accuracy": 0.0,
            "word_error_rate": 100.0,
            "correct_words": 0,
            "total_words": 0,
            "missing_words": 0,
            "extra_words": len(set(extracted_words)),
        }

    # Word Error Rate (WER): edit distance counted in whole words.
    word_edit_dist = _word_level_edit_distance(extracted_words, baseline_words)

    # Count exact word matches (set-based: ignores duplicates and order).
    extracted_set = set(extracted_words)
    baseline_set = set(baseline_words)

    correct_words = len(extracted_set.intersection(baseline_set))
    word_accuracy = (correct_words / total_words) * 100

    word_error_rate = (word_edit_dist / total_words) * 100

    return {
        "word_accuracy": round(word_accuracy, 2),
        "word_error_rate": round(word_error_rate, 2),
        "correct_words": correct_words,
        "total_words": total_words,
        "missing_words": len(baseline_set - extracted_set),
        "extra_words": len(extracted_set - baseline_set),
    }
181
+
182
+
183
def calculate_line_accuracy(extracted: str, baseline: str) -> Dict[str, float]:
    """
    Calculate line-level accuracy metrics.

    Each baseline line is matched against its best-scoring extracted line
    (SequenceMatcher ratio); a line counts as matched above 80% similarity.
    Returns the same key set on every path so callers never KeyError.
    """
    extracted_lines = [line.strip() for line in extracted.split("\n") if line.strip()]
    baseline_lines = [line.strip() for line in baseline.split("\n") if line.strip()]

    total_lines = len(baseline_lines)
    if total_lines == 0:
        # Full key set (the original returned only two keys here).
        return {
            "line_accuracy": 0.0,
            "average_line_similarity": 0.0,
            "lines_matched": 0,
            "total_lines": 0,
        }

    # For each baseline line, find its best match among extracted lines.
    matched_lines = 0
    line_similarities = []

    for baseline_line in baseline_lines:
        best_similarity = 0
        for extracted_line in extracted_lines:
            similarity = difflib.SequenceMatcher(
                None, normalize_text(baseline_line), normalize_text(extracted_line)
            ).ratio()
            best_similarity = max(best_similarity, similarity)

        line_similarities.append(best_similarity)
        if best_similarity > 0.8:  # 80% similarity threshold
            matched_lines += 1

    line_accuracy = (matched_lines / total_lines) * 100
    avg_line_similarity = (sum(line_similarities) / len(line_similarities)) * 100

    return {
        "line_accuracy": round(line_accuracy, 2),
        "average_line_similarity": round(avg_line_similarity, 2),
        "lines_matched": matched_lines,
        "total_lines": total_lines,
    }
219
+
220
+
221
def calculate_language_specific_accuracy(
    extracted: str, baseline: str
) -> Dict[str, Any]:
    """
    Calculate accuracy for different language components (English, Bangla, Math).

    Characters are bucketed by type; each bucket's text is scored with the
    character-accuracy metric against the corresponding baseline bucket.
    """

    def classify_char(char):
        # Order matters: Bangla range first (it also contains Bangla digits).
        if "\u0980" <= char <= "\u09ff":  # Bangla unicode range
            return "bangla"
        if char.isascii() and char.isalpha():
            return "english"
        if char.isdigit():
            return "number"
        if char in "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬αβγδεζηθικλμνξοπρστυφχψω":
            return "math"
        return "other"

    def bucket_by_type(text):
        # Group characters by classification, joined back into strings.
        grouped = defaultdict(list)
        for ch in text:
            grouped[classify_char(ch)].append(ch)
        return {kind: "".join(chars) for kind, chars in grouped.items()}

    extracted_by_type = bucket_by_type(extracted)
    baseline_by_type = bucket_by_type(baseline)

    language_accuracy = {}
    for lang_type in ("english", "bangla", "math", "number"):
        extracted_text = extracted_by_type.get(lang_type, "")
        baseline_text = baseline_by_type.get(lang_type, "")

        if baseline_text:
            char_metrics = calculate_character_accuracy(extracted_text, baseline_text)
            language_accuracy[f"{lang_type}_accuracy"] = char_metrics[
                "character_accuracy"
            ]
        else:
            # No baseline content of this type: perfect if we also found none.
            language_accuracy[f"{lang_type}_accuracy"] = (
                100.0 if not extracted_text else 0.0
            )

    return language_accuracy
269
+
270
+
271
def calculate_similarity_score(extracted: str, baseline: str) -> float:
    """
    Calculate overall similarity score using sequence matcher.
    """
    matcher = difflib.SequenceMatcher(
        None, normalize_text(extracted), normalize_text(baseline)
    )
    return round(matcher.ratio() * 100, 2)
279
+
280
+
281
def generate_detailed_diff(extracted: str, baseline: str) -> List[Dict[str, str]]:
    """
    Generate a detailed diff showing insertions, deletions, and matches.
    """
    diff_lines = difflib.unified_diff(
        normalize_text(baseline).splitlines(keepends=True),
        normalize_text(extracted).splitlines(keepends=True),
        fromfile="baseline",
        tofile="extracted",
        lineterm="",
    )

    diff_result = []
    for line in diff_lines:
        # Drop unified-diff file headers and hunk markers.
        if line.startswith(("---", "+++", "@@")):
            continue
        if line.startswith("-"):
            entry = {"type": "deletion", "content": clean_control_characters(line[1:])}
        elif line.startswith("+"):
            entry = {"type": "insertion", "content": clean_control_characters(line[1:])}
        else:
            entry = {"type": "match", "content": clean_control_characters(line)}
        diff_result.append(entry)

    return diff_result
311
+
312
+
313
def evaluate_ocr_accuracy(extracted_text: str, baseline_text: str) -> Dict[str, Any]:
    """
    Comprehensive OCR accuracy evaluation.

    Args:
        extracted_text: The text extracted by OCR
        baseline_text: The ground truth text

    Returns:
        Dictionary containing various accuracy metrics, or an ``{"error": ...}``
        dict for degenerate inputs (never raises for empty texts).
    """
    # Degenerate inputs are reported as error dicts rather than raising.
    if not extracted_text and not baseline_text:
        return {"error": "Both texts are empty"}

    if not baseline_text:
        return {"error": "Baseline text is empty"}

    # Clean control characters from input texts
    extracted_text = clean_control_characters(extracted_text)
    baseline_text = clean_control_characters(baseline_text)

    # Calculate all metrics
    char_metrics = calculate_character_accuracy(extracted_text, baseline_text)
    word_metrics = calculate_word_accuracy(extracted_text, baseline_text)
    line_metrics = calculate_line_accuracy(extracted_text, baseline_text)
    lang_metrics = calculate_language_specific_accuracy(extracted_text, baseline_text)
    similarity_score = calculate_similarity_score(extracted_text, baseline_text)
    detailed_diff = generate_detailed_diff(extracted_text, baseline_text)

    # Calculate overall score (weighted average): characters weigh most,
    # then words, lines, and raw sequence similarity.
    overall_score = (
        char_metrics["character_accuracy"] * 0.4
        + word_metrics["word_accuracy"] * 0.3
        + line_metrics["line_accuracy"] * 0.2
        + similarity_score * 0.1
    )

    result = {
        "overall_accuracy": round(overall_score, 2),
        "similarity_score": similarity_score,
        "character_metrics": char_metrics,
        "word_metrics": word_metrics,
        "line_metrics": line_metrics,
        "language_specific": lang_metrics,
        "text_statistics": {
            "extracted_length": len(extracted_text),
            "baseline_length": len(baseline_text),
            "extracted_words": len(extracted_text.split()),
            "baseline_words": len(baseline_text.split()),
            "extracted_lines": len(extracted_text.split("\n")),
            "baseline_lines": len(baseline_text.split("\n")),
        },
        "detailed_diff": detailed_diff[:50],  # Limit to first 50 diff items
        "evaluation_summary": {
            "grade": get_accuracy_grade(overall_score),
            "recommendations": get_recommendations(
                char_metrics, word_metrics, lang_metrics
            ),
        },
    }

    # Clean all string values to ensure JSON safety
    return safe_json_serialize(result)
376
+
377
+
378
def get_accuracy_grade(score: float) -> str:
    """Convert accuracy score to letter grade."""
    # Highest threshold wins; anything below 60 falls through to F.
    grade_scale = (
        (95, "A+ (Excellent)"),
        (90, "A (Very Good)"),
        (80, "B (Good)"),
        (70, "C (Fair)"),
        (60, "D (Poor)"),
    )
    for threshold, grade in grade_scale:
        if score >= threshold:
            return grade
    return "F (Very Poor)"
392
+
393
+
394
def get_recommendations(
    char_metrics: Dict, word_metrics: Dict, lang_metrics: Dict
) -> List[str]:
    """Generate recommendations based on accuracy metrics."""
    # Each rule pairs a failing condition with advice. A default of 100 for
    # the language-specific scores means "no data" and raises no complaint.
    checks = [
        (
            char_metrics["character_accuracy"] < 80,
            "Consider improving image preprocessing (noise reduction, contrast adjustment)",
        ),
        (
            word_metrics["word_accuracy"] < 70,
            "Word-level accuracy is low - check language model configuration",
        ),
        (
            lang_metrics.get("bangla_accuracy", 100) < 80,
            "Bangla text accuracy is low - ensure Bengali language pack is installed",
        ),
        (
            lang_metrics.get("math_accuracy", 100) < 70,
            "Mathematical expression accuracy is low - consider tuning Pix2Text parameters",
        ),
        (
            lang_metrics.get("english_accuracy", 100) < 85,
            "English text accuracy could be improved - check OCR engine settings",
        ),
    ]

    recommendations = [message for failed, message in checks if failed]
    if not recommendations:
        recommendations.append("Excellent accuracy! No specific improvements needed.")
    return recommendations
main6_pix2text.py ADDED
@@ -0,0 +1,838 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import pytesseract
3
+ from pytesseract import Output
4
+ from pdf2image import convert_from_path
5
+ import numpy as np
6
+ import json
7
+ from tqdm import tqdm
8
+ import unicodedata
9
+ from collections import defaultdict
10
+ from PIL import Image
11
+ import logging
12
+
13
+
14
+ try:
15
+ from pix2text import Pix2Text
16
+
17
+ PIX2TEXT_AVAILABLE = True
18
+ print("Pix2Text imported successfully for advanced math extraction")
19
+ except ImportError:
20
+ PIX2TEXT_AVAILABLE = False
21
+ print("Pix2Text not available. Install with: pip install pix2text")
22
+ print(" Falling back to traditional OCR for math expressions")
23
+
24
+
25
+ logging.basicConfig(level=logging.INFO)
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ----------------------------
30
+ # STEP 1: Enhanced Character Classification
31
+ # ----------------------------
32
def classify_character(char):
    """
    Classify a single character as English, Bangla, Math, or Other.
    Enhanced for better math detection.
    """
    if not char or char.isspace():
        return "space"

    # Bangla Unicode block.
    if "\u0980" <= char <= "\u09ff":
        return "bangla"

    # Explicit mathematical symbols, operators, and Greek letters.
    math_chars = set(
        "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬"
        "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
        "±≈≠≡⇒⇔∘∗⊕⊗⊙⊥∥∦∝∞"
    )
    if char in math_chars:
        return "math"

    # Extended Unicode blocks that hold mathematical notation.
    math_ranges = (
        ("\u2200", "\u22ff"),  # Mathematical Operators
        ("\u2190", "\u21ff"),  # Arrows
        ("\u0370", "\u03ff"),  # Greek and Coptic
        ("\u2070", "\u209f"),  # Superscripts and Subscripts
        ("\u27c0", "\u27ef"),  # Miscellaneous Mathematical Symbols-A
        ("\u2980", "\u29ff"),  # Miscellaneous Mathematical Symbols-B
    )
    if any(start <= char <= end for start, end in math_ranges):
        return "math"

    # Numbers (also often mathematical).
    if char.isdigit():
        return "number"

    # English letters.
    if char.isascii() and char.isalpha():
        return "english"

    # Mathematical punctuation.
    if char in ".,;:!?()[]{}\"'-_/\\^":
        return "punctuation"

    return "other"
81
+
82
+
83
def classify_text_region(text):
    """
    Enhanced text region classification with better math detection.
    """
    if not text.strip():
        return "empty"

    # Tally character classes for the whole region.
    char_counts = defaultdict(int)
    for char in text:
        char_counts[classify_character(char)] += 1

    # Whitespace does not count toward the classification.
    significant_chars = {k: v for k, v in char_counts.items() if k not in ["space"]}
    if not significant_chars:
        return "empty"

    total_significant = sum(significant_chars.values())
    percentages = {k: v / total_significant for k, v in significant_chars.items()}

    # Digits only half-count toward "math" since they also occur in prose.
    math_indicators = percentages.get("math", 0) + percentages.get("number", 0) * 0.5

    if percentages.get("bangla", 0) > 0.5:
        return "bangla"
    if math_indicators > 0.3 or has_math_patterns(text):
        return "math"
    if percentages.get("english", 0) > 0.5:
        return "english"
    return "mixed"
115
+
116
+
117
def has_math_patterns(text):
    """
    Detect mathematical patterns in text using regex and heuristics.
    """
    import re

    # Matching any single pattern marks the text as mathematical.
    math_patterns = (
        r"\d+[\+\-\*/=]\d+",  # Simple arithmetic
        r"[xy]\^?\d+",  # Variables with powers
        r"\\[a-zA-Z]+",  # LaTeX commands
        r"\$.*?\$",  # LaTeX inline math
        r"[a-zA-Z]\([a-zA-Z,\d\s]+\)",  # Functions like f(x)
        r"\b(sin|cos|tan|log|ln|exp|sqrt|int|sum|lim)\b",  # Math functions
        r"[≤≥≠≈∫∑∂∞]",  # Math symbols
    )
    return any(
        re.search(pattern, text, re.IGNORECASE) for pattern in math_patterns
    )
139
+
140
+
141
+ # ----------------------------
142
+ # STEP 2: Initialize Pix2Text
143
+ # ----------------------------
144
def initialize_pix2text():
    """Initialize Pix2Text model for mathematical expression extraction.

    Tries three construction strategies in order (default config, bare
    constructor, CPU-forced config) and returns the first model that
    initializes, or None when Pix2Text is unavailable or all attempts fail.
    Callers treat None as "fall back to Tesseract".
    """
    # Module-level flag set by the import guard at the top of the file.
    if not PIX2TEXT_AVAILABLE:
        return None

    try:
        # Initialize Pix2Text with specific configuration for math
        # Try different initialization methods
        logger.info("Initializing Pix2Text...")

        # Method 1: Default initialization
        try:
            p2t = Pix2Text.from_config()
            logger.info("✅ Pix2Text initialized with default config")
            return p2t
        except Exception as e1:
            logger.warning(f"Default Pix2Text init failed: {e1}")

        # Method 2: Try with specific config
        try:
            p2t = Pix2Text()
            logger.info("✅ Pix2Text initialized with basic constructor")
            return p2t
        except Exception as e2:
            logger.warning(f"Basic Pix2Text init failed: {e2}")

        # Method 3: Try with minimal config
        # NOTE(review): passing a plain dict positionally to from_config may
        # not match every Pix2Text release's signature — confirm against the
        # installed version; any mismatch is swallowed by the except below.
        try:
            config = {"device": "cpu"}  # Force CPU to avoid CUDA issues
            p2t = Pix2Text.from_config(config)
            logger.info("✅ Pix2Text initialized with CPU config")
            return p2t
        except Exception as e3:
            logger.error(f"All Pix2Text initialization methods failed: {e3}")

        return None

    except Exception as e:
        logger.error(f"❌ Failed to initialize Pix2Text: {e}")
        return None
184
+
185
+
186
+ # ----------------------------
187
+ # STEP 3: Enhanced Image Preprocessing
188
+ # ----------------------------
189
def preprocess_image_advanced(pil_image):
    """Enhanced image preprocessing with multiple techniques."""
    # PIL (RGB) -> OpenCV (BGR) -> grayscale.
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # Remove noise before thresholding.
    denoised = cv2.fastNlMeansDenoising(gray, h=15)

    # Adaptive thresholding copes with uneven page illumination.
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 5
    )

    # Boost contrast, then upscale 2x for better OCR accuracy.
    enhanced = cv2.convertScaleAbs(binary, alpha=1.2, beta=10)
    rows, cols = enhanced.shape
    return cv2.resize(enhanced, (cols * 2, rows * 2), interpolation=cv2.INTER_CUBIC)
212
+
213
+
214
def preprocess_for_pix2text(pil_image, region):
    """
    Special preprocessing for Pix2Text mathematical expression extraction.

    Crops the region (with padding) out of the page image, validates every
    bound, and upscales tiny crops to a minimum size. Returns a PIL image,
    or None when any validation fails — callers fall back to traditional OCR.
    """
    # Convert PIL to numpy array
    img = np.array(pil_image)

    # Crop the specific region
    x, y, w, h = region["left"], region["top"], region["width"], region["height"]

    # Validate region dimensions
    if w <= 0 or h <= 0:
        logger.warning(f"Invalid region dimensions: w={w}, h={h}. Skipping Pix2Text.")
        return None

    # Add padding around the math region for better recognition
    padding = 10
    x_start = max(0, x - padding)
    y_start = max(0, y - padding)
    x_end = min(img.shape[1], x + w + padding)
    y_end = min(img.shape[0], y + h + padding)

    # Validate cropping bounds (a region entirely outside the image would
    # produce an inverted or empty slice).
    if x_end <= x_start or y_end <= y_start:
        logger.warning(
            f"Invalid crop bounds: x({x_start}:{x_end}), y({y_start}:{y_end}). Skipping Pix2Text."
        )
        return None

    cropped = img[y_start:y_end, x_start:x_end]

    # Check if crop resulted in empty image
    if cropped.size == 0:
        logger.warning("Cropped image is empty. Skipping Pix2Text.")
        return None

    # Convert back to PIL Image
    try:
        cropped_pil = Image.fromarray(cropped)
    except Exception as e:
        logger.error(f"Failed to create PIL image from cropped array: {e}")
        return None

    # Ensure minimum size for Pix2Text
    min_size = 32
    if cropped_pil.width <= 0 or cropped_pil.height <= 0:
        logger.warning(
            f"Invalid PIL image dimensions: {cropped_pil.width}x{cropped_pil.height}"
        )
        return None

    if cropped_pil.width < min_size or cropped_pil.height < min_size:
        # Resize maintaining aspect ratio so the smaller side reaches min_size.
        try:
            ratio = max(min_size / cropped_pil.width, min_size / cropped_pil.height)
            new_width = int(cropped_pil.width * ratio)
            new_height = int(cropped_pil.height * ratio)

            # Ensure new dimensions are valid
            if new_width <= 0 or new_height <= 0:
                logger.warning(f"Invalid resized dimensions: {new_width}x{new_height}")
                return None

            cropped_pil = cropped_pil.resize((new_width, new_height), Image.LANCZOS)
        except Exception as e:
            logger.error(f"Failed to resize image: {e}")
            return None

    return cropped_pil
283
+
284
+
285
+ # ----------------------------
286
+ # STEP 4: Text Detection and Line Segmentation
287
+ # ----------------------------
288
def detect_text_regions(image):
    """Detect text regions and classify them by line and character type."""
    data = pytesseract.image_to_data(image, output_type=Output.DICT, lang="eng+ben")

    text_regions = []
    for idx, raw_text in enumerate(data["text"]):
        text = raw_text.strip()
        # Confidence threshold lowered to 25 so faint math is not dropped.
        if not text or int(data["conf"][idx]) <= 25:
            continue

        left = int(data["left"][idx])
        top = int(data["top"][idx])
        width = int(data["width"][idx])
        height = int(data["height"][idx])

        # Skip regions with invalid dimensions.
        if width <= 0 or height <= 0:
            logger.debug(
                f"Skipping region with invalid dimensions: {width}x{height}"
            )
            continue

        # Skip regions that are too small to be meaningful.
        if width < 3 or height < 3:
            logger.debug(f"Skipping tiny region: {width}x{height}")
            continue

        text_regions.append(
            {
                "text": text,
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "confidence": int(data["conf"][idx]),
                "type": classify_text_region(text),
            }
        )

    logger.info(f"Detected {len(text_regions)} valid text regions")
    return text_regions
327
+
328
+
329
def group_regions_by_line(regions, line_tolerance=15):
    """Group text regions into lines with better tolerance for math expressions."""
    if not regions:
        return []

    ordered = sorted(regions, key=lambda r: r["top"])

    lines = []
    line_regions = [ordered[0]]
    # Anchor: the top of the first region that started the current line.
    line_top = ordered[0]["top"]

    for region in ordered[1:]:
        # Tolerance scales with the average height of the line anchor and
        # the candidate region; heights are clamped to >= 1 so a reported
        # zero height cannot break the arithmetic.
        anchor_height = max(1, line_regions[0]["height"])
        candidate_height = max(1, region["height"])
        tolerance = max(line_tolerance, (anchor_height + candidate_height) / 2 * 0.3)

        if abs(region["top"] - line_top) <= tolerance:
            line_regions.append(region)
        else:
            # Close out the current line left-to-right and start a new one.
            line_regions.sort(key=lambda r: r["left"])
            lines.append(line_regions)
            line_regions = [region]
            line_top = region["top"]

    if line_regions:
        line_regions.sort(key=lambda r: r["left"])
        lines.append(line_regions)

    return lines
361
+
362
+
363
+ # ----------------------------
364
+ # STEP 5: Advanced OCR Extractors
365
+ # ----------------------------
366
def extract_english_region(image, region):
    """Extract English text from a specific region with optimized settings."""
    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]

    roi = image[y : y + h, x : x + w]
    if roi.size == 0:
        # Nothing to re-OCR; keep the text the full-page pass produced.
        return region["text"]

    try:
        # PSM 8: treat the crop as a single word; English model only.
        rerun = pytesseract.image_to_string(
            roi, config=r"--oem 3 --psm 8 -l eng"
        ).strip()
    except Exception:
        return region["text"]
    return rerun or region["text"]
380
+
381
+
382
def extract_bangla_region(image, region):
    """Extract Bangla text from a specific region with optimized settings."""
    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]

    roi = image[y : y + h, x : x + w]
    if roi.size == 0:
        # Nothing to re-OCR; keep the text the full-page pass produced.
        return region["text"]

    try:
        # PSM 8: treat the crop as a single word; Bengali model only.
        rerun = pytesseract.image_to_string(
            roi, config=r"--oem 3 --psm 8 -l ben"
        ).strip()
    except Exception:
        return region["text"]
    return rerun or region["text"]
396
+
397
+
398
def extract_math_region_pix2text(pil_image, region, p2t_model):
    """
    Extract mathematical expressions using Pix2Text with fallback to traditional OCR.
    """
    # Without a loaded model only the Tesseract fallback is available.
    if not p2t_model:
        return extract_math_region_traditional(pil_image, region)

    try:
        math_image = preprocess_for_pix2text(pil_image, region)
        if math_image is None:
            logger.warning(
                "Pix2Text preprocessing failed, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        # Run Pix2Text and normalize its (variously shaped) response.
        extracted_text = parse_pix2text_result(p2t_model(math_image))

        if not (extracted_text and extracted_text.strip()):
            logger.warning(
                "⚠️ Pix2Text returned empty result, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        # Reject debug dumps / error text masquerading as content.
        if not is_valid_pix2text_result(extracted_text):
            logger.warning(f"Invalid Pix2Text result: {extracted_text[:100]}...")
            return extract_math_region_traditional(pil_image, region)

        logger.info(f"✅ Pix2Text extracted: {extracted_text[:50]}...")
        return extracted_text.strip()

    except Exception as e:
        logger.error(f"❌ Pix2Text extraction failed: {e}")
        return extract_math_region_traditional(pil_image, region)
439
+
440
+
441
def parse_pix2text_result(result):
    """
    Parse Pix2Text result handling various response formats.
    """
    try:
        if isinstance(result, dict):
            # Look for the usual keys carrying mathematical content first.
            for key in ["text", "formula", "latex", "content", "output"]:
                if key in result and result[key]:
                    return str(result[key])

            # No known key: stringify the whole dict, but drop anything too
            # long to be real content (likely debug info).
            result_str = str(result)
            return "" if len(result_str) > 1000 else result_str

        if isinstance(result, list):
            if not result:
                return ""
            # Keep list items that don't look like logging noise.
            valid_items = [
                str(item).strip()
                for item in result
                if str(item).strip() and not is_debug_content(str(item).strip())
            ]
            return " ".join(valid_items)

        if isinstance(result, str):
            return result
        return str(result)

    except Exception as e:
        logger.error(f"Error parsing Pix2Text result: {e}")
        return ""
482
+
483
+
484
def is_valid_pix2text_result(text):
    """
    Check if the Pix2Text result is valid mathematical content.
    """
    if not text or not text.strip():
        return False

    text = text.strip()

    # Reject obvious debug/error dumps that Pix2Text sometimes emits.
    invalid_patterns = (
        "Page(id=",
        "elements=[]",
        "number=0",
        "Error:",
        "Exception:",
        "Traceback:",
        "DEBUG:",
        "INFO:",
        "WARNING:",
        "ERROR:",
    )
    if any(marker in text for marker in invalid_patterns):
        return False

    # Must have some reasonable length for math content.
    if len(text) < 1:
        return False

    # Accept anything containing at least one mathematical or textual
    # character (letters, digits, operators, common math symbols).
    import re

    return bool(re.search(r"[a-zA-Z0-9=+\-*/(){}[\]^_√∫∑∂πθαβγδλμΩ]", text))
523
+
524
+
525
def is_debug_content(text):
    """
    Check if text appears to be debug/logging content rather than actual content.
    """
    debug_indicators = (
        "Page(",
        "id=",
        "number=",
        "elements=",
        "[])",
        "DEBUG",
        "INFO",
        "WARNING",
        "ERROR",
        "Exception",
        "Traceback",
        'File "',
        "line ",
        " at 0x",
    )
    # A single marker is enough to classify the text as debug noise.
    return any(marker in text for marker in debug_indicators)
551
+
552
+
553
def extract_math_region_traditional(pil_image, region):
    """
    Fallback traditional OCR for mathematical expressions.
    """
    # Work on a grayscale OpenCV copy of the full page image.
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]
    roi = gray[y : y + h, x : x + w]
    if roi.size == 0:
        return region["text"]

    # Whitelist restricted to characters expected in formulas.
    # NOTE(review): Tesseract whitelists are historically unreliable for
    # non-ASCII symbols — confirm the installed Tesseract honors these.
    math_chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇()[]{}.,;:^_αβγδλμθΩ±≈≠≡⇒⇔"
    config = f"--oem 3 --psm 6 -c tessedit_char_whitelist={math_chars}"

    try:
        text = pytesseract.image_to_string(roi, config=config).strip()
    except Exception:
        return region["text"]
    return text or region["text"]
576
+
577
+
578
def extract_mixed_region(pil_image, region, p2t_model):
    """Extract mixed content using multiple approaches."""
    # Grayscale copy for the Tesseract-based extractors.
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    eng_result = extract_english_region(gray, region)
    bangla_result = extract_bangla_region(gray, region)

    # When the region looks mathematical, also consult Pix2Text and keep
    # whichever non-empty candidate produced the most text.
    if has_math_patterns(region["text"]):
        math_result = extract_math_region_pix2text(pil_image, region, p2t_model)
        candidates = [r for r in (eng_result, bangla_result, math_result) if r.strip()]
        return max(candidates, key=len) if candidates else region["text"]

    # Otherwise prefer the longer of the two script-specific readings.
    return bangla_result if len(bangla_result) > len(eng_result) else eng_result
596
+
597
+
598
+ # ----------------------------
599
+ # STEP 6: Character Analysis (unchanged)
600
+ # ----------------------------
601
def analyze_character_by_character(text):
    """Analyze text character by character to identify language patterns."""
    analysis = {
        "characters": [],
        "language_segments": [],
        "total_chars": len(text),
        "language_distribution": defaultdict(int),
    }

    # Record each glyph with its position, class, and Unicode name.
    for position, char in enumerate(text):
        char_type = classify_character(char)
        analysis["characters"].append(
            {
                "char": char,
                "position": position,
                "type": char_type,
                "unicode_name": unicodedata.name(char, "UNKNOWN"),
            }
        )
        analysis["language_distribution"][char_type] += 1

    # Merge consecutive same-type characters into segments; whitespace and
    # punctuation are skipped so segments reflect uninterrupted script runs.
    current_segment = None
    for char_info in analysis["characters"]:
        if char_info["type"] in ["space", "punctuation"]:
            continue

        if current_segment is None or current_segment["type"] != char_info["type"]:
            if current_segment:
                analysis["language_segments"].append(current_segment)
            current_segment = {
                "type": char_info["type"],
                "start": char_info["position"],
                "end": char_info["position"],
                "text": char_info["char"],
            }
        else:
            current_segment["end"] = char_info["position"]
            current_segment["text"] += char_info["char"]

    if current_segment:
        analysis["language_segments"].append(current_segment)

    return analysis
645
+
646
+
647
+ # ----------------------------
648
+ # STEP 7: Main Processing Pipeline
649
+ # ----------------------------
650
def process_page_advanced(page_image, page_num, p2t_model):
    """
    Advanced page processing with Pix2Text integration.

    Args:
        page_image: PIL image of the page (full resolution, un-preprocessed).
        page_num: Zero-based page index recorded in every region result.
        p2t_model: Initialized Pix2Text model, or None for Tesseract-only.

    Returns:
        List of per-region result dicts (text, position, confidence, detected
        type, extraction method, character analysis).
    """
    print(f"Processing page {page_num + 1}...")

    # Preprocess image (denoise, threshold, 2x upscale) for region detection.
    processed_image = preprocess_image_advanced(page_image)

    # Detect text regions
    regions = detect_text_regions(processed_image)

    # Group regions by lines
    lines = group_regions_by_line(regions)

    page_results = []

    for line_num, line in enumerate(lines):
        line_text_parts = []

        for region in line:
            # Choose appropriate extractor based on region type.
            # NOTE: math/mixed extractors receive the ORIGINAL page image
            # (they do their own cropping/preprocessing); english/bangla
            # work on the already-preprocessed image.
            if region["type"] == "english":
                extracted_text = extract_english_region(processed_image, region)
            elif region["type"] == "bangla":
                extracted_text = extract_bangla_region(processed_image, region)
            elif region["type"] == "math":
                extracted_text = extract_math_region_pix2text(
                    page_image, region, p2t_model
                )
            elif region["type"] == "mixed":
                extracted_text = extract_mixed_region(page_image, region, p2t_model)
            else:
                extracted_text = region["text"]

            # Character-by-character analysis
            char_analysis = analyze_character_by_character(extracted_text)

            region_result = {
                "page": page_num,
                "line": line_num,
                "text": extracted_text,
                "original_text": region["text"],
                "position": {
                    "left": region["left"],
                    "top": region["top"],
                    "width": region["width"],
                    "height": region["height"],
                },
                "confidence": region["confidence"],
                "detected_type": region["type"],
                # NOTE(review): labeled "pix2text" whenever the type is math
                # and a model exists, even if extraction silently fell back
                # to Tesseract internally — confirm if accurate labeling is
                # needed downstream.
                "extraction_method": "pix2text"
                if region["type"] == "math" and p2t_model
                else "tesseract",
                "character_analysis": char_analysis,
            }

            page_results.append(region_result)
            line_text_parts.append(extracted_text)

        # Log line information
        if line_text_parts:
            line_text = " ".join(line_text_parts)
            print(f"  Line {line_num + 1}: {line_text[:100]}...")

    return page_results
716
+
717
+
718
def extract_all_text_advanced_pix2text(
    pdf_path, output_text_file, output_json_file, output_analysis_file
):
    """
    Advanced text extraction with Pix2Text integration.

    Converts the PDF to 300-DPI images, runs per-page region extraction, and
    writes three artifacts: a plain-text file, a per-region JSON dump, and a
    summary analysis JSON.

    Args:
        pdf_path: Path to the input PDF.
        output_text_file: Destination for the combined extracted text.
        output_json_file: Destination for the detailed per-region results.
        output_analysis_file: Destination for the summary analysis.
    """
    print("[INFO] Initializing Pix2Text for mathematical expression extraction...")
    p2t_model = initialize_pix2text()

    if p2t_model:
        print("✅ Pix2Text ready for advanced math extraction")
    else:
        print("⚠️ Using traditional OCR for math expressions")

    print("[INFO] Converting PDF to images...")
    pages = convert_from_path(pdf_path, dpi=300)

    all_results = []
    combined_text_parts = []

    for page_num, page_image in enumerate(tqdm(pages, desc="Processing pages")):
        page_results = process_page_advanced(page_image, page_num, p2t_model)
        all_results.extend(page_results)

        # Build page text
        page_text_parts = [result["text"] for result in page_results]
        page_text = " ".join(page_text_parts)
        combined_text_parts.append(page_text)

    # Combine all text (blank line between pages)
    final_text = "\n\n".join(combined_text_parts)

    # Save text file
    with open(output_text_file, "w", encoding="utf-8") as f:
        f.write(final_text)

    # Save detailed JSON results
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    # Create summary analysis
    summary_analysis = create_extraction_summary(all_results)
    with open(output_analysis_file, "w", encoding="utf-8") as f:
        json.dump(summary_analysis, f, ensure_ascii=False, indent=2)

    print("\n[✅] Advanced Pix2Text extraction complete!")
    print(f"→ Text file saved to: {output_text_file}")
    print(f"→ Detailed JSON saved to: {output_json_file}")
    print(f"→ Analysis report saved to: {output_analysis_file}")

    # Print summary (type_distribution is a defaultdict, so missing types
    # simply read as 0 here).
    print("\n📊 Extraction Summary:")
    print(f"  Total text regions: {len(all_results)}")
    print(f"  English regions: {summary_analysis['type_distribution']['english']}")
    print(f"  Bangla regions: {summary_analysis['type_distribution']['bangla']}")
    print(f"  Math regions: {summary_analysis['type_distribution']['math']}")
    print(f"  Mixed regions: {summary_analysis['type_distribution']['mixed']}")

    # Show extraction method statistics
    method_stats = defaultdict(int)
    for result in all_results:
        method_stats[result.get("extraction_method", "unknown")] += 1

    print("\n🔧 Extraction Methods Used:")
    for method, count in method_stats.items():
        print(f"  {method}: {count} regions")
785
+
786
def create_extraction_summary(results):
    """Create a comprehensive summary of the extraction results."""
    summary = {
        "total_regions": len(results),
        "total_pages": len({r["page"] for r in results}),
        "type_distribution": defaultdict(int),
        "character_distribution": defaultdict(int),
        "confidence_stats": {"min": 100, "max": 0, "avg": 0},
        "language_segments_summary": defaultdict(int),
        "extraction_methods": defaultdict(int),
    }

    stats = summary["confidence_stats"]
    confidence_total = 0

    for result in results:
        summary["type_distribution"][result["detected_type"]] += 1
        summary["extraction_methods"][result.get("extraction_method", "unknown")] += 1

        conf = result["confidence"]
        confidence_total += conf
        stats["min"] = min(stats["min"], conf)
        stats["max"] = max(stats["max"], conf)

        # Fold per-region character stats into the global distribution.
        analysis = result["character_analysis"]
        for char_type, count in analysis["language_distribution"].items():
            summary["character_distribution"][char_type] += count
        for segment in analysis["language_segments"]:
            summary["language_segments_summary"][segment["type"]] += 1

    if results:
        stats["avg"] = confidence_total / len(results)

    return summary
825
+
826
+
827
+ # ----------------------
828
+ # MAIN EXECUTION SECTION
829
+ # ----------------------
830
if __name__ == "__main__":
    # Default input/output locations for a standalone command-line run;
    # the PDF path is relative to the working directory.
    pdf_path = r"math102.pdf"
    output_text_file = "math102_pix2text.txt"
    output_json_file = "math102_pix2text.json"
    output_analysis_file = "math102_pix2text_analysis.json"

    extract_all_text_advanced_pix2text(
        pdf_path, output_text_file, output_json_file, output_analysis_file
    )
packages.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ tesseract-ocr
2
+ tesseract-ocr-ben
3
+ tesseract-ocr-eng
4
+ poppler-utils
5
+ libgl1-mesa-glx
6
+ libglib2.0-0
7
+ libsm6
8
+ libxext6
9
+ libfontconfig1
10
+ libxrender1
requirements.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Advanced Multi-Language OCR System
2
+ # Compatible with Hugging Face Spaces
3
+
4
+ # Gradio Web Interface for HuggingFace Spaces
5
+ gradio>=4.0.0
6
+
7
+ # FastAPI Web Service Dependencies (for backend compatibility)
8
+ fastapi>=0.104.0
9
+ uvicorn[standard]>=0.23.0
10
+ python-multipart>=0.0.6
11
+
12
+ # Core OCR Dependencies
13
+ opencv-python>=4.8.0
14
+ pytesseract>=0.3.10
15
+ pdf2image>=1.16.0
16
+ pillow>=9.0.0
17
+ numpy>=1.24.0
18
+ tqdm>=4.65.0
19
+
20
+ # Pix2Text for advanced mathematical expression extraction
21
+ pix2text>=1.0.0
22
+
23
+ # AI/ML Dependencies for Math Extraction
24
+ torch>=2.0.0
25
+ torchvision>=0.15.0
26
+ transformers>=4.20.0
27
+
28
+ # Additional utilities
29
+ unicodedata2>=15.0.0
30
+
31
+ # System dependencies that may be needed for Hugging Face Spaces
32
+ # These are usually pre-installed in HF Spaces but listed for completeness
33
+ # poppler-utils # For pdf2image (system package)
34
+ # tesseract-ocr # Tesseract binary (system package)
35
+ # tesseract-ocr-ben # Bengali language pack (system package)
36
+
37
# Optional: audio support for PyTorch
# torchaudio  # Uncomment if audio features are needed (package name is "torchaudio", not "torch-audio")
# Note: Install a CUDA-compatible PyTorch build for GPU acceleration