Chirapath commited on
Commit
6063ce4
·
verified ·
1 Parent(s): 0133ada

Upload 7 files

Browse files
Files changed (7) hide show
  1. .env +57 -0
  2. app.py +613 -0
  3. backend.py +368 -0
  4. ocr_service.py +531 -0
  5. readme.md +231 -0
  6. requirements.txt +40 -0
  7. test_setup.py +183 -0
.env ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF OCR Service Environment Configuration
2
+
3
+ # ======================
4
+ # Azure Document Intelligence Configuration
5
+ # ======================
6
+ # Get these from Azure Portal > Document Intelligence resource
7
+ # SECURITY: never commit real credentials. The endpoint and key previously
+ # committed on these lines must be treated as compromised — rotate the key
+ # in the Azure Portal ("Keys and Endpoint" > Regenerate) immediately.
+ AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource-name.cognitiveservices.azure.com/
+ AZURE_DOCUMENT_INTELLIGENCE_KEY=your-azure-document-intelligence-key
9
+
10
+ # ======================
11
+ # Server Configuration
12
+ # ======================
13
+ # Gradio server settings
14
+ SERVER_HOST=127.0.0.1
15
+ SERVER_PORT=7860
16
+ SHARE_GRADIO=false
17
+
18
+ # ======================
19
+ # File Processing Limits
20
+ # ======================
21
+ # Maximum file size in MB
22
+ MAX_FILE_SIZE_MB=50
23
+
24
+ # Maximum number of history entries to keep
25
+ MAX_HISTORY_SIZE=100
26
+
27
+ # ======================
28
+ # Logging Configuration
29
+ # ======================
30
+ # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
31
+ LOG_LEVEL=INFO
32
+
33
+ # ======================
34
+ # Instructions for Setup
35
+ # ======================
36
+ # 1. Keep a sanitized copy as .env.example; copy it to .env for local use
37
+ # 2. Fill in your Azure Document Intelligence credentials above
38
+ # 3. Adjust other settings as needed for your environment
39
+ # 4. Never commit the .env file to version control
40
+ # 5. Add .env to your .gitignore file
41
+
42
+ # ======================
43
+ # How to get Azure credentials:
44
+ # ======================
45
+ # 1. Go to Azure Portal (portal.azure.com)
46
+ # 2. Create a new "Document Intelligence" resource
47
+ # 3. Choose a subscription, resource group, and region
48
+ # 4. Select pricing tier (F0 for free tier, S0 for standard)
49
+ # 5. After creation, go to "Keys and Endpoint" section
50
+ # 6. Copy the endpoint URL and one of the keys
51
+ # 7. Replace the values above with your actual credentials
52
+
53
+ # ======================
54
+ # Example values:
55
+ # ======================
56
+ # AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://myservice.cognitiveservices.azure.com/
57
+ # AZURE_DOCUMENT_INTELLIGENCE_KEY=1234567890abcdef1234567890abcdef
app.py ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio UI for PDF OCR Service
3
+ User interface for PDF to text conversion with multiple OCR providers
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ import tempfile
9
+ import logging
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+
13
+ # Load environment variables
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
+
17
+ from backend import BackendManager
18
+
19
+ # Configure logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
# Initialize backend manager
backend_manager = BackendManager()

# Check if python-docx is available.
# BUGFIX: ALL docx imports must live inside the try block. Previously
# `from docx.shared import Pt` and `from docx.enum.table import
# WD_TABLE_ALIGNMENT` ran unconditionally before the try, so a missing
# python-docx crashed the whole module at import time instead of
# degrading gracefully to HAS_DOCX_SUPPORT = False.
# Pt / WD_TABLE_ALIGNMENT are only referenced by the DOCX helpers, which
# are guarded by HAS_DOCX_SUPPORT, so deferring them is safe.
try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.table import WD_TABLE_ALIGNMENT
    HAS_DOCX_SUPPORT = True
    logger.info("DOCX export available")
except ImportError:
    HAS_DOCX_SUPPORT = False
    logger.info("DOCX export not available - install python-docx to enable")
37
+
38
+
39
def process_pdf_file(pdf_file, ocr_method, progress=gr.Progress()):
    """
    Run OCR on an uploaded PDF and return the results for the UI.

    Args:
        pdf_file: Gradio file object for the uploaded PDF (or None).
        ocr_method: Name of the OCR method to apply.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (extracted_text, metadata_info, processing_status).
    """
    if pdf_file is None:
        return "No file uploaded.", "", "❌ Error: No file selected"

    temp_file_path = None
    try:
        progress(0.1, desc="Initializing...")

        # Gradio file objects expose the on-disk path via .name
        temp_file_path = pdf_file.name

        progress(0.3, desc="Processing PDF...")
        result = backend_manager.process_pdf(temp_file_path, ocr_method)

        progress(0.9, desc="Finalizing...")
        progress(1.0, desc="Complete!")

        if not result['success']:
            error_msg = result.get('error', 'Unknown error occurred')
            return f"Error: {error_msg}", "", f"❌ Processing failed: {error_msg}"

        metadata_info = format_metadata(result['metadata'], result['method_used'])
        status = f"✅ Success: Processed using {result['method_used']}"
        return result['text'], metadata_info, status

    except Exception as e:
        logger.error(f"UI processing error: {e}")
        return f"Error: {str(e)}", "", f"❌ Unexpected error: {str(e)}"
82
+
83
def format_metadata(metadata, method_used):
    """Build a human-readable, line-per-fact summary of processing metadata."""
    lines = [f"Method used: {method_used}"]
    if not metadata:
        return lines[0]

    if 'pages' in metadata:
        lines.append(f"Pages processed: {metadata['pages']}")

    if 'tables' in metadata:
        lines.append(f"Tables detected: {metadata['tables']}")

    if 'has_handwritten' in metadata:
        # Render the boolean flag as Yes/No for end users
        flag = "Yes" if metadata['has_handwritten'] else "No"
        lines.append(f"Handwritten content: {flag}")

    if 'processing_time_seconds' in metadata:
        lines.append(
            f"Processing time: {metadata['processing_time_seconds']:.2f} seconds"
        )

    return "\n".join(lines)
105
+
106
def create_txt_file(text_content, metadata_info=""):
    """
    Write extracted text (with an informational header) to a temp .txt file.

    Args:
        text_content: The extracted text to save.
        metadata_info: Optional preformatted processing-information string.

    Returns:
        Path of the created temporary file (caller owns the file).

    Raises:
        Re-raises any write error after removing the partially written file.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_file = tempfile.NamedTemporaryFile(
        suffix=f'_extracted_text_{timestamp}.txt',
        delete=False,
        mode='w',
        encoding='utf-8'
    )

    try:
        # Add header
        temp_file.write("PDF OCR Extraction Results\n")
        temp_file.write("=" * 50 + "\n\n")

        # Add metadata
        if metadata_info:
            temp_file.write("Processing Information:\n")
            temp_file.write("-" * 25 + "\n")
            temp_file.write(metadata_info + "\n\n")

        # Add timestamp
        temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        temp_file.write("=" * 50 + "\n\n")

        # Add main content
        temp_file.write("Extracted Text:\n")
        temp_file.write("-" * 15 + "\n\n")
        temp_file.write(text_content)

        temp_file.close()
        return temp_file.name

    except Exception as e:
        logger.error(f"Error creating TXT file: {e}")
        temp_file.close()
        # BUGFIX: previously a failed write leaked the partially written
        # temp file on disk; remove it before re-raising.
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass
        raise
144
+
145
def create_docx_file(text_content, metadata_info=""):
    """
    Create a DOCX file with enhanced formatting and table preservation.

    Args:
        text_content: Extracted text, optionally containing '=== PAGE N ==='
            page markers and '--- TABLE N ---' table markers.
        metadata_info: Optional preformatted processing-information string.

    Returns:
        Path of the created temporary .docx file.

    Raises:
        ImportError: when python-docx is not installed.
        Exception: re-raises document-build errors after removing the temp file.
    """
    if not HAS_DOCX_SUPPORT:
        raise ImportError("python-docx not installed. Cannot create DOCX files.")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Reserve a unique path, then let python-docx write to it
    temp_file = tempfile.NamedTemporaryFile(
        suffix=f'_extracted_text_{timestamp}.docx',
        delete=False
    )
    temp_file.close()

    try:
        from docx import Document
        from docx.shared import Inches, Pt
        from docx.enum.text import WD_ALIGN_PARAGRAPH
        from docx.enum.table import WD_TABLE_ALIGNMENT

        # Create new Document with better styling
        doc = Document()

        # Set 1-inch document margins all around
        for section in doc.sections:
            section.top_margin = Inches(1)
            section.bottom_margin = Inches(1)
            section.left_margin = Inches(1)
            section.right_margin = Inches(1)

        # Add title
        title = doc.add_heading('PDF OCR Extraction Results', 0)
        title.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Add metadata section (on its own page) if available
        if metadata_info:
            doc.add_heading('Processing Information', level=1)
            metadata_para = doc.add_paragraph(metadata_info)
            metadata_para.style = 'Intense Quote'
            doc.add_page_break()

        if '=== PAGE ' in text_content:
            # Page-aware rendering: one heading per source page
            pages = text_content.split('=== PAGE ')

            for i, page_content in enumerate(pages):
                if i == 0 and not page_content.strip():
                    continue

                if i > 0:
                    # Page marker looks like 'N ===\n<content>'
                    page_num = page_content.split(' ===')[0] if ' ===' in page_content else str(i)
                    page_heading = doc.add_heading(f'Page {page_num}', level=1)
                    page_heading.alignment = WD_ALIGN_PARAGRAPH.LEFT

                    # Content after the page-marker line
                    content = page_content.split('===\n', 1)[-1] if '===\n' in page_content else page_content
                else:
                    content = page_content

                # Process content with enhanced table handling
                _process_page_content_enhanced(doc, content)
        else:
            # No page markers: treat as one continuous body
            _process_page_content_enhanced(doc, text_content)

        # Add a footer with the generation timestamp
        footer_para = doc.sections[0].footer.paragraphs[0]
        footer_para.text = f"Generated by PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        doc.save(temp_file.name)
        logger.info(f"Enhanced DOCX file created: {temp_file.name}")
        return temp_file.name

    except Exception as e:
        logger.error(f"Error creating enhanced DOCX file: {e}")
        # BUGFIX: was a bare `except:` around the cleanup, which swallowed
        # everything (incl. KeyboardInterrupt); only OS errors from the
        # unlink itself should be ignored.
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass
        raise
232
def _process_page_content_enhanced(doc, content):
    """Process page content with enhanced table and formatting handling.

    Splits *content* on the '--- TABLE N ---' markers (presumably emitted
    by the OCR layer — TODO confirm against ocr_service output format),
    rendering the text between markers as paragraphs/headings and the
    table bodies as Word tables.
    """
    if not content.strip():
        return

    # Split content by tables and regular text
    sections = content.split('--- TABLE ')

    # Process first section (before any tables)
    if sections[0].strip():
        _add_formatted_text(doc, sections[0])

    # Process tables and subsequent content
    for i in range(1, len(sections)):
        table_section = sections[i]

        # Extract table content
        if '---' in table_section:
            # parts[0] is the table number, parts[1] everything after the
            # first closing '---' marker
            parts = table_section.split('---', 1)
            table_header = f"TABLE {parts[0].strip()}"
            table_content = parts[1].strip() if len(parts) > 1 else ""

            # Add table header
            table_heading = doc.add_heading(table_header, level=3)

            # Process table content
            if table_content:
                table_lines = [line for line in table_content.split('\n') if line.strip()]
                if table_lines:
                    _create_formatted_table(doc, table_lines)

        # Add any remaining content after this table.
        # NOTE(review): this only fires when the section contains at least
        # two further '---' markers; otherwise post-table text is already
        # part of table_content above — confirm against the marker format.
        remaining_parts = table_section.split('---')[2:] if '---' in table_section else []
        if remaining_parts:
            remaining_text = '---'.join(remaining_parts)
            if remaining_text.strip():
                _add_formatted_text(doc, remaining_text)
271
def _add_formatted_text(doc, text):
    """
    Add plain text to *doc*, mapping markdown-style '# ', '## ', '### '
    prefixes to Word heading levels 1-3 and everything else to normal
    paragraphs. Empty lines are dropped.
    """
    if not text.strip():
        return

    # One docx paragraph per non-empty source line
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]

    for para_text in paragraphs:
        if para_text.startswith('# '):
            # Main heading
            doc.add_heading(para_text[2:], level=1)
        elif para_text.startswith('## '):
            # Sub heading
            doc.add_heading(para_text[3:], level=2)
        elif para_text.startswith('### '):
            # Sub-sub heading
            doc.add_heading(para_text[4:], level=3)
        else:
            # Regular paragraph
            para = doc.add_paragraph(para_text)
            # Add spacing after short lines for readability.
            # BUGFIX: spacing must be set via paragraph_format;
            # `para.space_after = Pt(6)` just created an unused attribute
            # on the Paragraph object and had no effect on the document.
            if len(para_text) < 80:
                para.paragraph_format.space_after = Pt(6)
296
def _create_formatted_table(doc, table_lines):
    """Create a properly formatted Word table from pipe-delimited text lines.

    Lines containing '|' are split into cells; other non-empty,
    non-dash lines become single-cell rows. If no tabular structure is
    found, the lines are emitted verbatim in a monospace paragraph.
    """
    if not table_lines:
        return

    # Parse table structure
    rows = []
    for line in table_lines:
        if '|' in line:
            # Split by | and clean up cells
            cells = [cell.strip() for cell in line.split('|')]
            # Remove empty cells at start/end (artifacts of leading/trailing '|')
            while cells and not cells[0]:
                cells.pop(0)
            while cells and not cells[-1]:
                cells.pop()
            if cells:
                rows.append(cells)
        elif line.strip() and not line.startswith('-'):
            # Single column row or header ('-' lines are treated as rules)
            rows.append([line.strip()])

    if not rows:
        # If no table structure found, add as preformatted text
        para = doc.add_paragraph(style='Normal')
        run = para.add_run('\n'.join(table_lines))
        run.font.name = 'Courier New'
        run.font.size = Pt(10)
        return

    # Determine number of columns (widest row wins; short rows are padded)
    max_cols = max(len(row) for row in rows) if rows else 1

    # Create table
    table = doc.add_table(rows=len(rows), cols=max_cols)
    table.style = 'Table Grid'
    table.alignment = WD_TABLE_ALIGNMENT.LEFT

    # Fill table cells
    for row_idx, row_data in enumerate(rows):
        table_row = table.rows[row_idx]

        for col_idx in range(max_cols):
            cell = table_row.cells[col_idx]

            if col_idx < len(row_data):
                cell.text = row_data[col_idx]
            else:
                # Pad missing trailing cells with empty text
                cell.text = ""

            # Format header row differently (first row is bolded)
            if row_idx == 0:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        run.bold = True

            # Set font and alignment
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(10)
                    run.font.name = 'Calibri'

    # Add spacing after table
    doc.add_paragraph("")
360
+
361
+
362
def get_method_info(method):
    """Return the markdown description for the given OCR method key."""
    descriptions = {
        "auto": "🤖 **Auto Selection**: Automatically chooses the best available method. Prefers Azure → Tesseract → PyMuPDF in order.",
        "azure": "☁️ **Azure Document Intelligence**: Advanced cloud-based OCR with excellent layout preservation, table detection, and handwriting recognition. Best quality but requires API credentials.",
        "tesseract": "🔍 **Tesseract OCR**: Open-source OCR engine with image preprocessing. Good for scanned documents and images. Works offline.",
        "pymupdf": "📄 **PyMuPDF**: Fast text extraction for PDFs with embedded text. Best for digital PDFs but limited OCR capabilities for scanned documents.",
    }
    # Unknown keys fall back to a generic prompt
    return descriptions.get(method, "Select a method to see details.")
374
def check_service_status():
    """Build a markdown summary of which OCR backends and exports are usable."""
    available = backend_manager.get_available_methods()

    # (method key, display label, suffix shown when unavailable)
    backends = [
        ("azure", "Azure Document Intelligence", "Not configured"),
        ("tesseract", "Tesseract OCR", "Not available"),
        ("pymupdf", "PyMuPDF", "Not available"),
    ]

    lines = ["**Available OCR Methods:**"]
    for key, label, missing in backends:
        if key in available:
            lines.append(f"✅ {label} - Ready")
        else:
            lines.append(f"❌ {label} - {missing}")

    # DOCX export depends on python-docx, not on an OCR backend
    if HAS_DOCX_SUPPORT:
        lines.append("✅ DOCX Export - Available")
    else:
        lines.append("❌ DOCX Export - Install python-docx to enable")

    return "\n".join(lines)
404
def process_and_prepare_downloads(pdf_file, method):
    """
    Process a PDF and, on success, prepare TXT (and optionally DOCX)
    download files for the UI.

    Returns:
        Tuple of (text, metadata, status, txt_button_update, docx_button_update).
    """
    text, metadata, status = process_pdf_file(pdf_file, method)

    hidden = gr.update(visible=False)

    # Only offer downloads when processing produced real text
    if not text or text.startswith("Error:") or text.startswith("No file"):
        return text, metadata, status, hidden, hidden

    try:
        txt_path = create_txt_file(text, metadata)
    except Exception as file_error:
        logger.error(f"File creation error: {file_error}")
        return text, metadata, status, hidden, hidden

    txt_ready = gr.update(visible=True, value=txt_path)

    if not HAS_DOCX_SUPPORT:
        return text, metadata, status, txt_ready, hidden

    # DOCX is best-effort: a failure still leaves the TXT download usable
    try:
        docx_path = create_docx_file(text, metadata)
    except Exception as docx_error:
        logger.warning(f"DOCX creation failed: {docx_error}")
        return text, metadata, status, txt_ready, hidden

    return text, metadata, status, txt_ready, gr.update(visible=True, value=docx_path)
442
def create_interface():
    """Create and configure the Gradio interface.

    Returns the assembled (not yet launched) gr.Blocks app: a two-column
    layout with upload/configuration on the left and results plus
    download buttons on the right.
    """

    with gr.Blocks(
        title="PDF OCR Service",
        theme=gr.themes.Soft(),
        css="""
        .main-header { text-align: center; margin-bottom: 2rem; }
        .method-info { background-color: #f8f9fa; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0; }
        .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>📄 PDF OCR Service</h1>
            <p>Convert PDF documents to text using advanced OCR technologies</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>📁 Upload & Configure</h3>")

                # File upload
                pdf_input = gr.File(
                    label="Upload PDF File",
                    file_types=[".pdf"],
                    file_count="single"
                )

                # OCR method selection
                method_choice = gr.Dropdown(
                    choices=["auto", "azure", "tesseract", "pymupdf"],
                    value="auto",
                    label="OCR Method",
                    info="Choose OCR method or use auto-selection"
                )

                # Method information display (kept in sync by the change handler below)
                method_info = gr.Markdown(
                    value=get_method_info("auto"),
                    elem_classes=["method-info"]
                )

                # Process button
                process_btn = gr.Button(
                    "🚀 Process PDF",
                    variant="primary",
                    size="lg"
                )

                # Service status
                gr.HTML("<h4>🔧 Service Status</h4>")
                service_status = gr.Markdown(
                    value=check_service_status(),
                    elem_classes=["status-box"]
                )

                # Refresh status button
                refresh_btn = gr.Button("🔄 Refresh Status", size="sm")

            with gr.Column(scale=2):
                gr.HTML("<h3>📝 Results</h3>")

                # Processing status
                processing_status = gr.Textbox(
                    label="Processing Status",
                    interactive=False,
                    lines=1
                )

                # Extracted text output
                text_output = gr.Textbox(
                    label="Extracted Text",
                    placeholder="Processed text will appear here...",
                    lines=20,
                    max_lines=30,
                    interactive=False,
                    show_copy_button=True
                )

                # Metadata information
                metadata_output = gr.Textbox(
                    label="Processing Information",
                    interactive=False,
                    lines=4
                )

                # Download buttons (hidden until a run succeeds;
                # process_and_prepare_downloads toggles their visibility)
                with gr.Row():
                    download_txt_btn = gr.DownloadButton(
                        "📄 Download TXT",
                        visible=False,
                        variant="secondary"
                    )
                    download_docx_btn = gr.DownloadButton(
                        "📝 Download DOCX",
                        visible=False,
                        variant="secondary"
                    )

        # Add tips section
        gr.HTML("<h3>💡 Tips & Features</h3>")

        # Create tips content based on available features
        download_info = "Get results as formatted TXT files"
        if HAS_DOCX_SUPPORT:
            download_info += " and structured DOCX files"
        else:
            download_info += " (install python-docx for DOCX export)"

        tips_html = f"""
        <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0;">
            <ul>
                <li><strong>Auto method</strong> is recommended for most users - intelligently selects the best OCR method</li>
                <li><strong>Azure Document Intelligence</strong> provides the best quality for complex documents with tables and formatting</li>
                <li><strong>Tesseract</strong> works well for scanned documents and images with preprocessing</li>
                <li><strong>PyMuPDF</strong> is fastest for digital PDFs with embedded text</li>
                <li><strong>Download Options:</strong> {download_info}</li>
                <li><strong>Format Preservation:</strong> Original spacing and layout are maintained where possible</li>
                <li>Larger files may take longer to process - the progress bar shows current status</li>
                <li>Supported file types: PDF documents (up to 50MB by default)</li>
            </ul>
        </div>
        """

        gr.HTML(tips_html)

        # Event handlers
        # Changing the dropdown refreshes the method description text
        method_choice.change(
            fn=get_method_info,
            inputs=[method_choice],
            outputs=[method_info]
        )

        # Re-query backend availability on demand
        refresh_btn.click(
            fn=check_service_status,
            outputs=[service_status]
        )

        # Main pipeline: run OCR, then reveal download buttons on success
        process_btn.click(
            fn=process_and_prepare_downloads,
            inputs=[pdf_input, method_choice],
            outputs=[text_output, metadata_output, processing_status, download_txt_btn, download_docx_btn]
        )

    return interface
592
def launch_ui():
    """Build the Gradio interface and start the web server."""
    app = create_interface()

    # Server settings come from the environment (.env)
    host = os.getenv('SERVER_HOST', '127.0.0.1')
    port = int(os.getenv('SERVER_PORT', 7860))
    share = os.getenv('SHARE_GRADIO', 'false').lower() == 'true'

    logger.info(f"Starting Gradio UI on {host}:{port}")

    app.launch(
        server_name=host,
        server_port=port,
        share=share,
        inbrowser=True,
        show_error=True
    )
610
+
611
+
612
if __name__ == "__main__":
    # Script entry point: start the Gradio web UI
    launch_ui()
backend.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Backend Management Module
3
+ Coordinates between UI and OCR services, handles file management and processing logic
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import tempfile
9
+ from typing import Dict, Any, List
10
+ from pathlib import Path
11
+ import hashlib
12
+ import json
13
+ from datetime import datetime
14
+
15
+ # Load environment variables
16
+ from dotenv import load_dotenv
17
+ load_dotenv()
18
+
19
+ from ocr_service import OCRService
20
+
21
+ # Configure logging
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class BackendManager:
27
+ """Backend manager for PDF OCR processing"""
28
+
29
    def __init__(self):
        """Set up the OCR service, history buffer and temp-file directory."""
        self.ocr_service = OCRService()
        # Rolling list of per-file processing records (see _add_to_history)
        self.processing_history = []
        # History cap, configurable via the MAX_HISTORY_SIZE env var
        self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))

        # Create directories for temporary files and logs
        self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
        self.temp_dir.mkdir(exist_ok=True)

        logger.info("Backend manager initialized successfully")
+
40
    def process_pdf(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
        """
        Process PDF file and return results.

        Args:
            pdf_path: Path to the PDF file.
            method: OCR method to use ("auto" lets the service pick).

        Returns:
            Dict with keys 'success', 'text', 'method_used', 'metadata'
            and, on failure, 'error'. Every call (success or failure) is
            also recorded in the in-memory processing history.
        """
        start_time = datetime.now()

        # Validate input
        if not os.path.exists(pdf_path):
            return {
                'success': False,
                'error': f"File not found: {pdf_path}",
                'text': '',
                'method_used': '',
                'metadata': {}
            }

        # Check file size (limit to 50MB by default)
        max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
        file_size = os.path.getsize(pdf_path)

        if file_size > max_file_size:
            return {
                'success': False,
                'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
                'text': '',
                'method_used': '',
                'metadata': {}
            }

        # Generate file hash for caching/tracking
        file_hash = self._calculate_file_hash(pdf_path)

        logger.info(f"Processing PDF: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
        logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")

        try:
            # Delegate the actual OCR work to the service layer
            result = self.ocr_service.convert_pdf_to_text(pdf_path, method)

            # Add processing metadata (assumes the service always returns a
            # 'metadata' dict — TODO confirm against the OCRService contract)
            processing_time = (datetime.now() - start_time).total_seconds()

            result['metadata'].update({
                'file_hash': file_hash,
                'file_size_mb': round(file_size / (1024*1024), 2),
                'processing_time_seconds': round(processing_time, 2),
                'timestamp': start_time.isoformat()
            })

            # Log results
            if result['success']:
                text_length = len(result['text'])
                logger.info(f"Processing completed successfully in {processing_time:.2f}s")
                logger.info(f"Method used: {result['method_used']}")
                logger.info(f"Text extracted: {text_length} characters")

                # Add to processing history
                self._add_to_history({
                    'timestamp': start_time.isoformat(),
                    'file_hash': file_hash,
                    'method_used': result['method_used'],
                    'success': True,
                    'text_length': text_length,
                    'processing_time': processing_time
                })
            else:
                logger.error(f"Processing failed: {result.get('error', 'Unknown error')}")

                # Add to processing history (failed runs record the method
                # that was *requested* rather than the one used)
                self._add_to_history({
                    'timestamp': start_time.isoformat(),
                    'file_hash': file_hash,
                    'method_requested': method,
                    'success': False,
                    'error': result.get('error', 'Unknown error'),
                    'processing_time': processing_time
                })

            return result

        except Exception as e:
            logger.error(f"Unexpected error during processing: {e}")

            # Record the unexpected failure in history as well
            processing_time = (datetime.now() - start_time).total_seconds()
            self._add_to_history({
                'timestamp': start_time.isoformat(),
                'file_hash': file_hash,
                'method_requested': method,
                'success': False,
                'error': str(e),
                'processing_time': processing_time
            })

            return {
                'success': False,
                'error': f"Processing error: {str(e)}",
                'text': '',
                'method_used': '',
                'metadata': {
                    'file_hash': file_hash,
                    'processing_time_seconds': round(processing_time, 2),
                    'timestamp': start_time.isoformat()
                }
            }
152
+
153
+ def get_available_methods(self) -> List[str]:
154
+ """Get list of available OCR methods"""
155
+ methods = self.ocr_service.get_available_methods()
156
+ logger.info(f"Available OCR methods: {methods}")
157
+ return methods
158
+
159
+ def get_service_status(self) -> Dict[str, Any]:
160
+ """Get comprehensive service status"""
161
+ available_methods = self.get_available_methods()
162
+
163
+ status = {
164
+ 'service_healthy': True,
165
+ 'available_methods': available_methods,
166
+ 'azure_configured': 'azure' in available_methods,
167
+ 'tesseract_available': 'tesseract' in available_methods,
168
+ 'pymupdf_available': 'pymupdf' in available_methods,
169
+ 'total_processed': len(self.processing_history),
170
+ 'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
171
+ 'temp_dir': str(self.temp_dir),
172
+ 'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50))
173
+ }
174
+
175
+ return status
176
+
177
+ def get_processing_history(self, limit: int = 10) -> List[Dict[str, Any]]:
178
+ """Get recent processing history"""
179
+ return self.processing_history[-limit:]
180
+
181
+ def cleanup_temp_files(self):
182
+ """Clean up temporary files"""
183
+ try:
184
+ temp_files = list(self.temp_dir.glob('*'))
185
+ cleaned_count = 0
186
+
187
+ for temp_file in temp_files:
188
+ try:
189
+ # Remove files older than 1 hour
190
+ if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600):
191
+ temp_file.unlink()
192
+ cleaned_count += 1
193
+ except Exception as e:
194
+ logger.warning(f"Could not remove temp file {temp_file}: {e}")
195
+
196
+ if cleaned_count > 0:
197
+ logger.info(f"Cleaned up {cleaned_count} temporary files")
198
+
199
+ except Exception as e:
200
+ logger.error(f"Error during cleanup: {e}")
201
+
202
def validate_pdf_file(self, file_path: str) -> Dict[str, Any]:
    """Validate a PDF file before processing.

    Checks existence, extension, size limits (``MAX_FILE_SIZE_MB`` env
    var, default 50), and that PyMuPDF can actually open the document.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Dict with keys ``valid`` (bool), ``error`` (str | None),
        ``warnings`` (list of str) and ``file_info`` (size/pages).
    """
    outcome: Dict[str, Any] = {
        'valid': False,
        'error': None,
        'warnings': [],
        'file_info': {}
    }

    try:
        # Guard clauses: hard failures set 'error' and return early.
        if not os.path.exists(file_path):
            outcome['error'] = "File does not exist"
            return outcome

        if not file_path.lower().endswith('.pdf'):
            # Wrong extension is only a warning; content check below decides.
            outcome['warnings'].append("File does not have .pdf extension")

        size_bytes = os.path.getsize(file_path)
        limit_bytes = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024

        if size_bytes > limit_bytes:
            outcome['error'] = f"File too large ({size_bytes/(1024*1024):.1f}MB > {limit_bytes/(1024*1024)}MB)"
            return outcome

        if size_bytes == 0:
            outcome['error'] = "File is empty"
            return outcome

        # Structural check: a PDF that PyMuPDF cannot open is rejected.
        try:
            import fitz
            doc = fitz.open(file_path)
            page_total = len(doc)
            doc.close()
        except Exception as pdf_error:
            outcome['error'] = f"Invalid PDF file: {str(pdf_error)}"
            return outcome

        if page_total == 0:
            outcome['warnings'].append("PDF contains no pages")

        outcome['file_info'] = {
            'size_mb': round(size_bytes / (1024*1024), 2),
            'pages': page_total
        }

        outcome['valid'] = True

    except Exception as e:
        outcome['error'] = f"Validation error: {str(e)}"

    return outcome
266
+
267
def _calculate_file_hash(self, file_path: str) -> str:
    """Return the SHA-256 hex digest of the file at ``file_path``.

    On read failure the error is logged and a unique timestamped
    placeholder string is returned instead of raising, so history
    entries always get some identifier.
    """
    digest = hashlib.sha256()
    try:
        with open(file_path, "rb") as handle:
            # Stream in 4 KiB chunks so large files never load whole.
            while chunk := handle.read(4096):
                digest.update(chunk)
        return digest.hexdigest()
    except Exception as e:
        logger.error(f"Error calculating file hash: {e}")
        return f"error_{datetime.now().timestamp()}"
279
+
280
def _add_to_history(self, entry: Dict[str, Any]):
    """Append ``entry`` to the processing history, trimming to the cap.

    Only the newest ``self.max_history_size`` entries are retained;
    older ones are dropped from the front.
    """
    history = self.processing_history
    history.append(entry)
    if len(history) > self.max_history_size:
        # Rebind to the trailing slice so only the newest entries survive.
        self.processing_history = history[-self.max_history_size:]
287
+
288
def export_processing_history(self, file_path: str = None) -> str:
    """Export the processing history (plus service status) to a JSON file.

    Args:
        file_path: Destination path; when None, a timestamped file is
            created inside ``self.temp_dir``.

    Returns:
        The path of the written JSON file as a string.

    Raises:
        Exception: Any I/O or serialization error is logged and re-raised.
    """
    if file_path is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_path = self.temp_dir / f"processing_history_{timestamp}.json"

    try:
        history_data = {
            'exported_at': datetime.now().isoformat(),
            'total_entries': len(self.processing_history),
            'service_status': self.get_service_status(),
            'history': self.processing_history
        }

        # History entries can carry non-JSON-serializable values (e.g. the
        # raw Azure analysis object stored under metadata['azure_analysis']
        # by the OCR layer); default=str keeps the export from crashing on
        # them by stringifying anything json doesn't know.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(history_data, f, indent=2, default=str)

        logger.info(f"Processing history exported to: {file_path}")
        return str(file_path)

    except Exception as e:
        logger.error(f"Error exporting history: {e}")
        raise
311
+
312
def get_statistics(self) -> Dict[str, Any]:
    """Aggregate processing statistics from the in-memory history.

    Returns:
        Dict with totals, success rate (percent), mean processing time,
        the most frequently used successful method, and text volume.
        All-zero placeholder values are returned when no history exists.
    """
    history = self.processing_history
    if not history:
        return {
            'total_processed': 0,
            'success_rate': 0,
            'average_processing_time': 0,
            'most_used_method': 'N/A',
            'total_text_extracted': 0
        }

    total = len(history)
    ok_entries = [entry for entry in history if entry.get('success', False)]

    # Mean processing time over entries that recorded one.
    times = [entry['processing_time'] for entry in history if 'processing_time' in entry]
    mean_time = sum(times) / len(times) if times else 0

    # Mode of the method names among successful runs.
    methods = [entry.get('method_used', 'unknown') for entry in ok_entries]
    top_method = max(set(methods), key=methods.count) if methods else 'N/A'

    return {
        'total_processed': total,
        'success_rate': round((len(ok_entries) / total) * 100, 2),
        'average_processing_time': round(mean_time, 2),
        'most_used_method': top_method,
        'total_text_extracted': sum(entry.get('text_length', 0) for entry in ok_entries),
        'successful_processes': len(ok_entries),
        'failed_processes': total - len(ok_entries)
    }
347
+
348
+
349
+ # Initialize global backend manager instance
350
+ _backend_manager = None
351
+
352
def get_backend_manager() -> BackendManager:
    """Lazily create and return the shared BackendManager singleton."""
    global _backend_manager
    if _backend_manager is None:
        # First access: construct the process-wide instance.
        _backend_manager = BackendManager()
    return _backend_manager
358
+
359
+
360
+ if __name__ == "__main__":
361
+ # Test the backend manager
362
+ manager = BackendManager()
363
+
364
+ print("Backend Manager Test")
365
+ print("===================")
366
+ print(f"Available methods: {manager.get_available_methods()}")
367
+ print(f"Service status: {manager.get_service_status()}")
368
+ print(f"Statistics: {manager.get_statistics()}")
ocr_service.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OCR Service Module - FIXED VERSION
3
+ Handles PDF to text conversion using Azure Document Intelligence with fallback methods
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import Optional, Dict, Any, Tuple
9
+ import tempfile
10
+ from pathlib import Path
11
+
12
+ # Load environment variables
13
+ from dotenv import load_dotenv
14
+ load_dotenv()
15
+
16
+ # Azure Document Intelligence
17
+ from azure.core.credentials import AzureKeyCredential
18
+ from azure.ai.documentintelligence import DocumentIntelligenceClient
19
+ from azure.core.exceptions import AzureError
20
+
21
+ # Fallback OCR libraries
22
+ try:
23
+ import pytesseract
24
+ from PIL import Image
25
+ import cv2
26
+ import numpy as np
27
+ TESSERACT_AVAILABLE = True
28
+ except ImportError:
29
+ TESSERACT_AVAILABLE = False
30
+
31
+ import fitz # PyMuPDF
32
+
33
+ # Configure logging
34
+ logging.basicConfig(level=logging.INFO)
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class OCRService:
39
+ """Main OCR service with multiple providers and fallback mechanisms"""
40
+
41
def __init__(self):
    """Read Azure credentials from the environment and set up the client.

    Azure OCR is optional: when the endpoint/key env vars are missing or
    client construction fails, ``self.azure_client`` stays None and
    callers fall back to Tesseract/PyMuPDF.
    """
    self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
    self.azure_key = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')

    self.azure_client = None
    if not (self.azure_endpoint and self.azure_key):
        logger.warning("Azure credentials not found. Azure OCR will be unavailable.")
        return

    try:
        self.azure_client = DocumentIntelligenceClient(
            endpoint=self.azure_endpoint,
            credential=AzureKeyCredential(self.azure_key)
        )
        logger.info("Azure Document Intelligence client initialized successfully")
    except Exception as e:
        # Leave azure_client as None; the service still works degraded.
        logger.error(f"Failed to initialize Azure client: {e}")
58
+
59
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
    """Convert a PDF to text using the requested OCR method.

    Args:
        pdf_path: Path to the PDF file.
        method: One of 'azure', 'tesseract', 'pymupdf', or 'auto'
            (picks the best available backend).

    Returns:
        Dict with keys ``success``, ``text``, ``method_used``,
        ``metadata`` and ``error``. When the chosen method fails,
        the remaining backends are tried automatically.
    """
    outcome: Dict[str, Any] = {
        'success': False,
        'text': '',
        'method_used': '',
        'metadata': {},
        'error': None
    }

    if not os.path.exists(pdf_path):
        outcome['error'] = f"PDF file not found: {pdf_path}"
        return outcome

    # Resolve "auto" to the best backend actually available here.
    if method == "auto":
        if self.azure_client:
            method = "azure"
        elif self._check_tesseract_available():
            method = "tesseract"
        else:
            method = "pymupdf"

    # Primary attempt; any exception is captured into the result dict.
    try:
        if method == "azure" and self.azure_client:
            outcome = self._azure_ocr(pdf_path)
        elif method == "tesseract":
            outcome = self._tesseract_ocr(pdf_path)
        elif method == "pymupdf":
            outcome = self._pymupdf_extract(pdf_path)
        else:
            outcome['error'] = f"Method '{method}' not available or not configured"
    except Exception as e:
        logger.error(f"Primary method '{method}' failed: {e}")
        outcome['error'] = str(e)

    # Any failure triggers the fallback chain, skipping the method
    # that already ran.
    if not outcome['success']:
        logger.info("Primary method failed, trying fallback methods...")
        outcome = self._try_fallback_methods(pdf_path, exclude_method=method)

    return outcome
112
+
113
+ def _azure_ocr(self, pdf_path: str) -> Dict[str, Any]:
114
+ """Azure Document Intelligence OCR with enhanced layout preservation"""
115
+ result = {
116
+ 'success': False,
117
+ 'text': '',
118
+ 'method_used': 'azure_document_intelligence',
119
+ 'metadata': {},
120
+ 'error': None
121
+ }
122
+
123
+ try:
124
+ with open(pdf_path, 'rb') as pdf_file:
125
+ file_content = pdf_file.read()
126
+
127
+ # Try different API call patterns for different SDK versions
128
+ try:
129
+ # Pattern 1: body + content_type (most common for current SDK)
130
+ poller = self.azure_client.begin_analyze_document(
131
+ "prebuilt-layout",
132
+ body=file_content,
133
+ content_type="application/pdf"
134
+ )
135
+ except TypeError:
136
+ try:
137
+ # Pattern 2: model_id + body
138
+ poller = self.azure_client.begin_analyze_document(
139
+ model_id="prebuilt-layout",
140
+ body=file_content
141
+ )
142
+ except TypeError:
143
+ # Pattern 3: document parameter (older SDK)
144
+ pdf_file.seek(0)
145
+ poller = self.azure_client.begin_analyze_document(
146
+ "prebuilt-layout",
147
+ document=pdf_file
148
+ )
149
+
150
+ analysis_result = poller.result()
151
+
152
+ # Enhanced format preservation with better structure
153
+ formatted_text = self._format_azure_result_enhanced(analysis_result)
154
+
155
+ result.update({
156
+ 'success': True,
157
+ 'text': formatted_text,
158
+ 'metadata': {
159
+ 'pages': len(analysis_result.pages) if analysis_result.pages else 0,
160
+ 'tables': len(analysis_result.tables) if analysis_result.tables else 0,
161
+ 'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
162
+ 'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
163
+ 'azure_analysis': analysis_result # Pass full result for DOCX formatting
164
+ }
165
+ })
166
+
167
+ logger.info("Azure OCR completed successfully with enhanced formatting")
168
+
169
+ except Exception as e:
170
+ logger.error(f"Azure OCR error: {e}")
171
+ result['error'] = f"Azure OCR error: {e}"
172
+
173
+ return result
174
+
175
+ def _format_azure_result_enhanced(self, analysis_result) -> str:
176
+ """Enhanced formatting that preserves more layout structure"""
177
+ formatted_parts = []
178
+
179
+ if not analysis_result.pages:
180
+ return ""
181
+
182
+ for page_num, page in enumerate(analysis_result.pages, 1):
183
+ formatted_parts.append(f"\n=== PAGE {page_num} ===\n")
184
+
185
+ # Collect all content with positions for better ordering
186
+ content_items = []
187
+
188
+ # Add paragraphs if available (better than individual lines)
189
+ if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
190
+ page_paragraphs = [p for p in analysis_result.paragraphs if
191
+ p.bounding_regions and
192
+ p.bounding_regions[0].page_number == page_num]
193
+
194
+ for para in page_paragraphs:
195
+ if para.content.strip():
196
+ y_pos = para.bounding_regions[0].polygon[1] if para.bounding_regions[0].polygon else 0
197
+ content_items.append({
198
+ 'type': 'paragraph',
199
+ 'content': para.content.strip(),
200
+ 'y_pos': y_pos,
201
+ 'role': getattr(para, 'role', 'paragraph')
202
+ })
203
+
204
+ # Add lines if paragraphs not available
205
+ elif page.lines:
206
+ for line in page.lines:
207
+ if line.content.strip():
208
+ y_pos = line.polygon[1] if line.polygon else 0
209
+ content_items.append({
210
+ 'type': 'line',
211
+ 'content': line.content.strip(),
212
+ 'y_pos': y_pos,
213
+ 'role': 'text'
214
+ })
215
+
216
+ # Sort content by vertical position (top to bottom)
217
+ content_items.sort(key=lambda x: x['y_pos'])
218
+
219
+ # Add formatted content
220
+ for item in content_items:
221
+ if item['role'] == 'title':
222
+ formatted_parts.append(f"\n# {item['content']}\n")
223
+ elif item['role'] == 'sectionHeading':
224
+ formatted_parts.append(f"\n## {item['content']}\n")
225
+ else:
226
+ formatted_parts.append(item['content'])
227
+ formatted_parts.append("") # Add line break
228
+
229
+ # Add tables for this page
230
+ if analysis_result.tables:
231
+ page_tables = [t for t in analysis_result.tables if any(
232
+ cell.bounding_regions and
233
+ cell.bounding_regions[0].page_number == page_num
234
+ for cell in t.cells
235
+ )]
236
+
237
+ for table_idx, table in enumerate(page_tables):
238
+ formatted_parts.append(f"\n--- TABLE {table_idx + 1} ---")
239
+ table_text = self._format_table_enhanced(table)
240
+ formatted_parts.append(table_text)
241
+ formatted_parts.append("")
242
+
243
+ return '\n'.join(formatted_parts)
244
+
245
def _format_table_enhanced(self, table) -> str:
    """Render an Azure table as a column-aligned plain-text table.

    Cells are left-justified to the widest entry in their column
    (minimum width 3) and a dashed separator row is inserted after
    the header row when the table has more than one row.
    """
    if not table.cells:
        return ""

    # Grid dimensions come from the largest row/column index present.
    n_rows = 1 + max(c.row_index for c in table.cells)
    n_cols = 1 + max(c.column_index for c in table.cells)

    grid = [["" for _ in range(n_cols)] for _ in range(n_rows)]
    for c in table.cells:
        grid[c.row_index][c.column_index] = (c.content or "").strip()

    # Column width = longest cell in that column.
    widths = [max(len(row[i]) for row in grid) for i in range(n_cols)]

    lines = []
    for r, row in enumerate(grid):
        rendered = " | ".join(
            cell.ljust(max(widths[i], 3)) for i, cell in enumerate(row)
        )
        lines.append(rendered)
        # Markdown-style divider between header and body rows.
        if r == 0 and n_rows > 1:
            lines.append(" | ".join("-" * max(widths[i], 3) for i in range(n_cols)))

    return "\n".join(lines)
284
+
285
def _format_azure_result(self, analysis_result) -> str:
    """Format an Azure Document Intelligence result preserving layout.

    Lines on each page are emitted in reading order (top-to-bottom,
    then left-to-right), followed by any tables detected on that page.
    """
    parts = []

    if analysis_result.pages:
        for page_no, page in enumerate(analysis_result.pages, 1):
            parts.append(f"\n--- Page {page_no} ---\n")

            if page.lines:
                def reading_order(ln):
                    # (y, x) of the first polygon point approximates
                    # natural reading order; lines without geometry
                    # sort first.
                    if ln.polygon:
                        return (ln.polygon[1], ln.polygon[0])
                    return (0, 0)

                for ln in sorted(page.lines, key=reading_order):
                    parts.append(ln.content)

            if analysis_result.tables:
                # A table belongs to this page when any of its cells
                # reports this page number.
                on_this_page = [
                    tbl for tbl in analysis_result.tables
                    if any(
                        cell.bounding_regions
                        and cell.bounding_regions[0].page_number == page_no
                        for cell in tbl.cells
                    )
                ]
                for i, tbl in enumerate(on_this_page):
                    parts.append(f"\n--- Table {i + 1} ---")
                    parts.append(self._format_table(tbl))

    return '\n'.join(parts)
316
+
317
def _format_table(self, table) -> str:
    """Render an Azure table as plain text, cells joined by ' | '."""
    if not table.cells:
        return ""

    # Grid dimensions from the largest row/column index present.
    n_rows = 1 + max(c.row_index for c in table.cells)
    n_cols = 1 + max(c.column_index for c in table.cells)

    grid = [[""] * n_cols for _ in range(n_rows)]
    for c in table.cells:
        grid[c.row_index][c.column_index] = c.content or ""

    return "\n".join(" | ".join(row) for row in grid)
337
+
338
+ def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
339
+ """Tesseract OCR with image preprocessing - FIXED VERSION"""
340
+ result = {
341
+ 'success': False,
342
+ 'text': '',
343
+ 'method_used': 'tesseract',
344
+ 'metadata': {},
345
+ 'error': None
346
+ }
347
+
348
+ if not TESSERACT_AVAILABLE:
349
+ result['error'] = "Tesseract not available"
350
+ return result
351
+
352
+ pdf_document = None
353
+ try:
354
+ # Convert PDF to images
355
+ pdf_document = fitz.open(pdf_path)
356
+ page_count = len(pdf_document) # Get count before processing
357
+ all_text = []
358
+
359
+ for page_num in range(page_count):
360
+ page = pdf_document.load_page(page_num)
361
+
362
+ # Render page to image
363
+ mat = fitz.Matrix(2.0, 2.0) # High resolution
364
+ pix = page.get_pixmap(matrix=mat)
365
+ img_data = pix.tobytes("png")
366
+
367
+ # Convert to PIL Image
368
+ temp_img_path = None
369
+ try:
370
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
371
+ temp_img.write(img_data)
372
+ temp_img_path = temp_img.name
373
+
374
+ # Preprocess image for better OCR
375
+ processed_img = self._preprocess_image(temp_img_path)
376
+
377
+ # OCR with custom config
378
+ custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
379
+ text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
380
+
381
+ all_text.append(f"\n--- Page {page_num + 1} ---\n")
382
+ all_text.append(text)
383
+
384
+ finally:
385
+ # Clean up temp image file
386
+ if temp_img_path and os.path.exists(temp_img_path):
387
+ try:
388
+ os.unlink(temp_img_path)
389
+ except:
390
+ pass
391
+
392
+ result.update({
393
+ 'success': True,
394
+ 'text': '\n'.join(all_text),
395
+ 'metadata': {'pages': page_count}
396
+ })
397
+
398
+ logger.info("Tesseract OCR completed successfully")
399
+
400
+ except Exception as e:
401
+ logger.error(f"Tesseract OCR error: {e}")
402
+ result['error'] = f"Tesseract OCR error: {e}"
403
+ finally:
404
+ # FIXED: Ensure document is properly closed
405
+ if pdf_document is not None:
406
+ try:
407
+ pdf_document.close()
408
+ except:
409
+ pass
410
+
411
+ return result
412
+
413
def _preprocess_image(self, image_path: str) -> np.ndarray:
    """Preprocess an image for better OCR accuracy.

    Pipeline: grayscale -> median-blur denoise -> Otsu binarization.

    Args:
        image_path: Path to the image file to load.

    Returns:
        Binary (black/white) image as a numpy array.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None (it does not raise) on unreadable
        # files; fail loudly here instead of crashing in cvtColor with
        # a cryptic OpenCV assertion.
        raise ValueError(f"Could not read image: {image_path}")

    # Convert to grayscale for thresholding.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Median blur removes salt-and-pepper scan noise without smearing text.
    denoised = cv2.medianBlur(gray, 3)

    # Otsu's method selects the global binarization threshold automatically.
    _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return binary
428
+
429
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
    """Extract embedded text from a digital PDF via PyMuPDF.

    Fast path for PDFs that already contain a text layer; performs no
    OCR. The document handle is always closed, even on failure.
    """
    outcome = {
        'success': False,
        'text': '',
        'method_used': 'pymupdf',
        'metadata': {},
        'error': None
    }

    doc = None
    try:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)  # capture count before iterating

        chunks = []
        for idx in range(total_pages):
            chunks.append(f"\n--- Page {idx + 1} ---\n")
            chunks.append(doc.load_page(idx).get_text())

        outcome['success'] = True
        outcome['text'] = '\n'.join(chunks)
        outcome['metadata'] = {'pages': total_pages}

        logger.info("PyMuPDF extraction completed successfully")

    except Exception as e:
        logger.error(f"PyMuPDF error: {e}")
        outcome['error'] = f"PyMuPDF error: {e}"
    finally:
        # Always release the document handle.
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass

    return outcome
472
+
473
def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
    """Try the remaining OCR backends in preference order.

    Args:
        pdf_path: Path to the PDF to process.
        exclude_method: Method that already ran (and failed) and
            should therefore be skipped.

    Returns:
        The first successful result, with '_fallback' appended to its
        ``method_used``; a failure dict when every backend fails.
    """
    # Map each candidate name to its implementation.
    runners = {
        "azure": self._azure_ocr,
        "tesseract": self._tesseract_ocr,
        "pymupdf": self._pymupdf_extract,
    }

    # Preference order; azure/tesseract only when actually available.
    candidates = []
    if exclude_method != "azure" and self.azure_client:
        candidates.append("azure")
    if exclude_method != "tesseract" and self._check_tesseract_available():
        candidates.append("tesseract")
    if exclude_method != "pymupdf":
        candidates.append("pymupdf")

    for name in candidates:
        logger.info(f"Trying fallback method: {name}")
        try:
            attempt = runners[name](pdf_path)
        except Exception as e:
            logger.error(f"Fallback method {name} failed: {e}")
            continue
        if attempt['success']:
            attempt['method_used'] += '_fallback'
            return attempt

    return {
        'success': False,
        'text': '',
        'method_used': 'all_methods_failed',
        'metadata': {},
        'error': 'All OCR methods failed'
    }
510
+
511
def _check_tesseract_available(self) -> bool:
    """Return True when the Tesseract binary and bindings are usable.

    Requires both that ``pytesseract`` imported successfully at module
    load (TESSERACT_AVAILABLE) and that the ``tesseract`` executable is
    reachable on PATH.
    """
    if not TESSERACT_AVAILABLE:
        return False
    try:
        pytesseract.get_tesseract_version()
        return True
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit. Exception still covers
        # TesseractNotFoundError and any probe failure.
        return False
520
+
521
def get_available_methods(self) -> list:
    """Return the OCR method names usable in this environment.

    Order reflects preference: Azure (if configured), Tesseract (if
    installed), then the always-available PyMuPDF extractor.
    """
    available = [
        name
        for name, usable in (
            ("azure", bool(self.azure_client)),
            ("tesseract", self._check_tesseract_available()),
            ("pymupdf", True),  # bundled with the service
        )
        if usable
    ]
    return available
readme.md ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF OCR Service
2
+
3
+ A comprehensive PDF to text conversion service with multiple OCR providers and a user-friendly web interface.
4
+
5
+ ## Features
6
+
7
+ - 🔄 **Multiple OCR Methods**: Azure Document Intelligence, Tesseract OCR, and PyMuPDF
8
+ - 📄 **Format Preservation**: Maintains original spacing and layout from PDFs
9
+ - 🛡️ **Fallback Mechanisms**: Automatically tries alternative methods if primary fails
10
+ - 🌐 **Web Interface**: Clean, intuitive Gradio-based UI
11
+ - 📊 **Processing Analytics**: Track processing history and statistics
12
+ - ⚡ **High Performance**: Optimized for speed and accuracy
13
+
14
+ ## Architecture
15
+
16
+ The service consists of three main components:
17
+
18
+ 1. **`ocr_service.py`** - Core OCR processing with Azure, Tesseract, and PyMuPDF
19
+ 2. **`backend.py`** - Backend management, file handling, and coordination
20
+ 3. **`app.py`** - Gradio web interface for user interaction
21
+
22
+ ## Quick Start
23
+
24
+ ### 1. Install Dependencies
25
+
26
+ ```bash
27
+ # Install Python dependencies
28
+ pip install -r requirements.txt
29
+
30
+ # Install system dependencies (Ubuntu/Debian)
31
+ sudo apt-get update
32
+ sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
33
+ sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
34
+
35
+ # For macOS
36
+ brew install tesseract
37
+
38
+ # For Windows
39
+ # Download Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
40
+ # Add to PATH environment variable
41
+ ```
42
+
43
+ ### 2. Configure Environment
44
+
45
+ ```bash
46
+ # Copy environment template
47
+ cp .env.example .env
48
+
49
+ # Edit .env file with your settings
50
+ nano .env
51
+ ```
52
+
53
+ **Required Configuration:**
54
+ - Set Azure Document Intelligence endpoint and key (for best quality)
55
+ - Adjust file size limits and server settings as needed
56
+
57
+ ### 3. Run the Service
58
+
59
+ ```bash
60
+ # Start the web interface
61
+ python app.py
62
+
63
+ # Or run individual components
64
+ python backend.py # Test backend functionality
65
+ python ocr_service.py # Test OCR service
66
+ ```
67
+
68
+ The service will be available at `http://localhost:7860`
69
+
70
+ ## Azure Document Intelligence Setup
71
+
72
+ 1. **Create Azure Resource**
73
+ - Go to [Azure Portal](https://portal.azure.com)
74
+ - Create new "Document Intelligence" resource
75
+ - Choose subscription, resource group, and region
76
+ - Select pricing tier (F0 for free, S0 for standard)
77
+
78
+ 2. **Get Credentials**
79
+ - Navigate to "Keys and Endpoint" section
80
+ - Copy the endpoint URL and API key
81
+ - Add to your `.env` file:
82
+ ```bash
83
+ AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
84
+ AZURE_DOCUMENT_INTELLIGENCE_KEY=your-api-key-here
85
+ ```
86
+
87
+ ## OCR Methods
88
+
89
+ ### Azure Document Intelligence (Recommended)
90
+ - **Best Quality**: Advanced layout analysis and text extraction
91
+ - **Features**: Table detection, handwriting recognition, form understanding
92
+ - **Use Case**: Complex documents, forms, tables, mixed content
93
+ - **Requirements**: Azure subscription and API key
94
+
95
+ ### Tesseract OCR
96
+ - **Good Quality**: Open-source OCR with preprocessing
97
+ - **Features**: Multiple language support, image enhancement
98
+ - **Use Case**: Scanned documents, images, simple PDFs
99
+ - **Requirements**: Tesseract installation
100
+
101
+ ### PyMuPDF
102
+ - **Fast Processing**: Direct text extraction from digital PDFs
103
+ - **Features**: Fastest processing, embedded text extraction
104
+ - **Use Case**: Digital PDFs with embedded text
105
+ - **Requirements**: No additional setup needed
106
+
107
+ ## Usage Examples
108
+
109
+ ### Web Interface
110
+ 1. Open `http://localhost:7860` in your browser
111
+ 2. Upload a PDF file
112
+ 3. Select OCR method (or use "auto")
113
+ 4. Click "Process PDF"
114
+ 5. Download extracted text
115
+
116
+ ### Python API
117
+ ```python
118
+ from backend import BackendManager
119
+
120
+ # Initialize backend
121
+ manager = BackendManager()
122
+
123
+ # Process PDF
124
+ result = manager.process_pdf('document.pdf', method='auto')
125
+
126
+ if result['success']:
127
+ print("Extracted Text:")
128
+ print(result['text'])
129
+ print(f"Method used: {result['method_used']}")
130
+ print(f"Pages: {result['metadata']['pages']}")
131
+ else:
132
+ print(f"Error: {result['error']}")
133
+ ```
134
+
135
+ ## Configuration Options
136
+
137
+ ### File Processing
138
+ - `MAX_FILE_SIZE_MB`: Maximum file size (default: 50MB)
139
+ - `PROCESSING_TIMEOUT`: Processing timeout in seconds
140
+ - `MAX_CONCURRENT_TASKS`: Concurrent processing limit
141
+
142
+ ### OCR Settings
143
+ - `DEFAULT_OCR_METHOD`: Default method (auto/azure/tesseract/pymupdf)
144
+ - `AZURE_OCR_MODEL`: Azure model (prebuilt-layout/prebuilt-read)
145
+ - `TESSERACT_LANGUAGES`: Tesseract language packs
146
+
147
+ ### Server Settings
148
+ - `SERVER_HOST`: Web server host (default: 127.0.0.1)
149
+ - `SERVER_PORT`: Web server port (default: 7860)
150
+ - `SHARE_GRADIO`: Enable public sharing
151
+
152
+ ## Troubleshooting
153
+
154
+ ### Common Issues
155
+
156
+ 1. **Azure OCR not working**
157
+ - Verify endpoint URL and API key
158
+ - Check Azure subscription status
159
+ - Ensure resource region matches endpoint
160
+
161
+ 2. **Tesseract not found**
162
+ - Install Tesseract OCR system package
163
+ - Verify installation: `tesseract --version`
164
+ - Check PATH environment variable
165
+
166
+ 3. **Large file processing fails**
167
+ - Increase `MAX_FILE_SIZE_MB` in .env
168
+ - Check available memory and disk space
169
+ - Consider splitting large PDFs
170
+
171
+ 4. **Poor OCR quality**
172
+ - Try different OCR methods
173
+ - Use Azure for best quality
174
+ - Ensure good PDF scan quality
175
+
176
+ ### Performance Optimization
177
+
178
+ - **Use Azure Document Intelligence** for best accuracy
179
+ - **Enable image preprocessing** for scanned documents
180
+ - **Increase DPI settings** for better image quality
181
+ - **Configure memory limits** based on available resources
182
+
183
+ ## File Structure
184
+
185
+ ```
186
+ pdf-ocr-service/
187
+ ├── ocr_service.py # Core OCR processing
188
+ ├── backend.py # Backend management
189
+ ├── ui.py # Gradio web interface
190
+ ├── requirements.txt # Python dependencies
191
+ ├── .env # Environment configuration
192
+ ├── README.md # This file
193
+ ├── logs/ # Log files (created automatically)
194
+ ├── temp/ # Temporary files (created automatically)
195
+ └── cache/ # Cache directory (optional)
196
+ ```
197
+
198
+ ## Security Considerations
199
+
200
+ - Never commit `.env` file to version control
201
+ - Use secure methods to store API keys in production
202
+ - Enable file validation to prevent malicious uploads
203
+ - Consider rate limiting for public deployments
204
+ - Regular cleanup of temporary files
205
+
206
+ ## Contributing
207
+
208
+ 1. Fork the repository
209
+ 2. Create a feature branch
210
+ 3. Make your changes
211
+ 4. Add tests if applicable
212
+ 5. Submit a pull request
213
+
214
+ ## License
215
+
216
+ This project is licensed under the MIT License. See LICENSE file for details.
217
+
218
+ ## Support
219
+
220
+ - Check the troubleshooting section above
221
+ - Review Azure Document Intelligence documentation
222
+ - Open an issue for bug reports or feature requests
223
+
224
+ ## Changelog
225
+
226
+ ### Version 1.0.0
227
+ - Initial release
228
+ - Azure Document Intelligence integration
229
+ - Multiple OCR fallback methods
230
+ - Gradio web interface
231
+ - Processing history and analytics
requirements.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF OCR Service Requirements
2
+
3
+ # Core web framework and UI
4
+ gradio>=4.0.0
5
+
6
+ # Environment configuration
7
+ python-dotenv>=1.0.0
8
+
9
+ # Azure Document Intelligence
10
+ azure-ai-documentintelligence>=1.0.0b1
11
+ azure-core>=1.28.0
12
+
13
+ # OCR and image processing
14
+ pytesseract>=0.3.10
15
+ Pillow>=10.0.0
16
+ opencv-python>=4.8.0
17
+ numpy>=1.24.0
18
+
19
+ # PDF processing
20
+ PyMuPDF>=1.23.0
21
+
22
+ # Document export formats
23
+ python-docx>=0.8.11
24
+
25
+ # System dependencies information (install separately):
26
+ #
27
+ # For Ubuntu/Debian:
28
+ # sudo apt-get update
29
+ # sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
30
+ # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
31
+ #
32
+ # For CentOS/RHEL:
33
+ # sudo yum install -y tesseract tesseract-langpack-eng
34
+ #
35
+ # For macOS:
36
+ # brew install tesseract
37
+ #
38
+ # For Windows:
39
+ # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
40
+ # Add Tesseract to PATH environment variable
test_setup.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple test script to verify the PDF OCR Service setup
4
+ Run this to check if everything is working properly
5
+ """
6
+
7
+ import sys
8
+ import os
9
+ from pathlib import Path
10
+
11
def test_imports():
    """Probe every dependency by importing it, reporting pass/fail per package.

    Returns True only when all *required* packages import cleanly; optional
    packages merely print a warning when absent.
    """
    print("🧪 Testing imports...")

    # (module name to import, pip package name to report)
    required_imports = [
        ('dotenv', 'python-dotenv'),
        ('gradio', 'gradio'),
        ('azure.ai.documentintelligence', 'azure-ai-documentintelligence'),
        ('azure.core', 'azure-core'),
        ('fitz', 'PyMuPDF'),
        ('PIL', 'Pillow'),
        ('cv2', 'opencv-python'),
        ('numpy', 'numpy'),
    ]

    optional_imports = [
        ('pytesseract', 'pytesseract'),
        ('docx', 'python-docx'),
    ]

    all_good = True

    # Required packages: any failure flips the overall result.
    for module, package in required_imports:
        try:
            __import__(module)
        except ImportError:
            print(f"❌ {package} - Run: pip install {package}")
            all_good = False
        else:
            print(f"✅ {package}")

    # Optional packages: report only, never affect the result.
    for module, package in optional_imports:
        try:
            __import__(module)
        except ImportError:
            print(f"⚠️ {package} (optional) - Run: pip install {package}")
        else:
            print(f"✅ {package} (optional)")

    return all_good
51
+
52
def test_files():
    """Verify that every file the service needs exists in the working directory.

    Returns True only when all required files are present and a UI entry point
    (either ui.py or app.py) is found.
    """
    print("\n📁 Testing files...")

    required_files = ['ocr_service.py', 'backend.py', 'requirements.txt', '.env']

    all_good = True
    for file in required_files:
        if Path(file).exists():
            print(f"✅ {file}")
        else:
            print(f"❌ {file} missing")
            all_good = False

    # The UI may live in either ui.py or app.py; accept the first that exists.
    ui_file = next((c for c in ('ui.py', 'app.py') if Path(c).exists()), None)
    if ui_file:
        print(f"✅ {ui_file} (UI file)")
    else:
        print("❌ UI file missing (need either ui.py or app.py)")
        all_good = False

    return all_good
81
+
82
def test_env_config():
    """Check that the .env file supplies Azure Document Intelligence credentials.

    Returns True when both the endpoint and key variables are set (format
    problems only produce warnings); False when credentials are missing or
    python-dotenv is not installed.
    """
    print("\n🔧 Testing environment...")

    try:
        from dotenv import load_dotenv
    except ImportError:
        print("❌ python-dotenv not available")
        return False

    load_dotenv()

    endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
    key = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')

    if not (endpoint and key):
        print("⚠️ Azure credentials not configured")
        print(" Update your .env file with valid credentials")
        return False

    # Shape checks are advisory only — the test still passes with warnings.
    if endpoint.startswith('https://') and endpoint.endswith('/'):
        print("✅ Azure endpoint configured properly")
    else:
        print("⚠️ Azure endpoint format may be incorrect")

    if len(key) > 20:
        print("✅ Azure key configured")
    else:
        print("⚠️ Azure key may be incorrect")

    return True
113
+
114
def test_service():
    """Try to import and initialize the backend, reporting available OCR methods.

    Returns True on successful initialization, False on any failure
    (missing module, bad credentials, etc.).
    """
    print("\n🚀 Testing service initialization...")

    try:
        from backend import BackendManager

        manager = BackendManager()
        methods = manager.get_available_methods()

        print("✅ Service initialized successfully")
        print(f" Available methods: {methods}")

        # Azure availability is informational; it does not fail this check.
        if 'azure' in methods:
            print("✅ Azure OCR ready")
        else:
            print("⚠️ Azure OCR not available (check credentials)")

        return True

    except Exception as e:
        print(f"❌ Service initialization failed: {e}")
        return False
136
+
137
def main():
    """Run every setup check in order and print a pass/fail summary.

    Returns:
        bool: True when all checks passed, False otherwise.
    """
    print("🧪 PDF OCR Service Setup Test")
    print("=" * 40)

    tests = [
        ("Import test", test_imports),
        ("File test", test_files),
        ("Environment test", test_env_config),
        ("Service test", test_service),
    ]

    results = {}
    for test_name, test_func in tests:
        print(f"\n{'='*40}")
        print(f"{test_name.upper()}")
        print('='*40)
        results[test_name] = test_func()

    # Summary
    print(f"\n{'='*40}")
    print("TEST SUMMARY")
    print('='*40)

    all_passed = True
    for test_name, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"{status} {test_name}")
        if not passed:
            all_passed = False

    print('='*40)
    if all_passed:
        # Fixed: the original success message contained mojibake ("????")
        # where the party emoji belonged, and told users to run ui.py even
        # though the repo ships app.py (test_files accepts either name).
        print("🎉 All tests passed! You can run the service with:")
        print("   python app.py   (or: python ui.py)")
    else:
        print("⚠️ Some tests failed. Please fix the issues above.")
        print("\nQuick fixes:")
        print("1. Install missing packages: pip install -r requirements.txt")
        print("2. Configure your .env file with Azure credentials")
        print("3. Ensure all files are present")

    return all_passed
180
+
181
if __name__ == "__main__":
    # Exit code 0 on success, 1 on any failing check (useful for CI scripts).
    sys.exit(0 if main() else 1)