Chirapath commited on
Commit
5ba08f1
·
verified ·
1 Parent(s): 22b4814

Upload 10 files

Browse files
Files changed (6) hide show
  1. .gitattributes +35 -35
  2. app.py +635 -783
  3. backend.py +556 -413
  4. ocr_service.py +790 -324
  5. readme.md +196 -157
  6. requirements.txt +24 -8
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,18 +1,16 @@
1
- """
2
- Gradio UI for PDF OCR Service - Enhanced with Header/Footer Removal
3
- User interface for PDF to text conversion with multiple OCR providers and preprocessing options
4
- """
5
  import re
6
  import gradio as gr
7
  import os
8
  import tempfile
9
  import logging
 
10
  from pathlib import Path
11
  from datetime import datetime
12
  import cv2
13
  import numpy as np
14
  from PIL import Image
15
  import fitz # PyMuPDF
 
16
 
17
  # Load environment variables
18
  from dotenv import load_dotenv
@@ -28,179 +26,330 @@ logger = logging.getLogger(__name__)
28
  backend_manager = BackendManager()
29
 
30
  # Check if python-docx is available
31
- from docx.shared import Pt
32
- from docx.enum.table import WD_TABLE_ALIGNMENT
33
  try:
34
  from docx import Document
35
- from docx.shared import Inches
 
36
  HAS_DOCX_SUPPORT = True
37
  logger.info("DOCX export available")
38
  except ImportError:
39
  HAS_DOCX_SUPPORT = False
40
  logger.info("DOCX export not available - install python-docx to enable")
41
 
42
- # Global variables for crop preview
43
- current_crop_settings = {
44
- 'top': 0,
45
- 'bottom': 0,
46
- 'left': 0,
47
- 'right': 0
 
48
  }
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- def generate_preview_image(pdf_file, page_num=0):
52
- """Generate preview image from PDF first page for cropping"""
53
  if pdf_file is None:
54
- return None
55
 
56
  try:
57
- pdf_path = pdf_file.name
58
- doc = fitz.open(pdf_path)
59
-
60
- if page_num >= len(doc):
61
- page_num = 0
62
-
63
- page = doc.load_page(page_num)
64
 
65
- # Render page to image with good resolution
66
- mat = fitz.Matrix(2.0, 2.0)
67
- pix = page.get_pixmap(matrix=mat)
68
- img_data = pix.tobytes("png")
69
-
70
- # Convert to PIL Image and then to numpy array
71
- import io
72
- pil_image = Image.open(io.BytesIO(img_data))
73
- img_array = np.array(pil_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- doc.close()
 
 
76
 
77
- return img_array
78
  except Exception as e:
79
- logger.error(f"Error generating preview: {e}")
80
  return None
81
 
82
-
83
- def update_crop_preview(pdf_file, top_crop, bottom_crop, left_crop, right_crop):
84
- """Update preview image with crop areas highlighted"""
85
- if pdf_file is None:
86
  return None
87
 
88
  try:
89
- img_array = generate_preview_image(pdf_file)
90
- if img_array is None:
91
- return None
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- # Convert to BGR for OpenCV
94
- if len(img_array.shape) == 3 and img_array.shape[2] == 4:
95
- # RGBA to RGB
96
- img_array = img_array[:, :, :3]
97
-
98
- img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
99
- height, width = img_bgr.shape[:2]
100
-
101
- # Calculate crop areas
102
- top_px = int(height * top_crop / 100)
103
- bottom_px = int(height * bottom_crop / 100)
104
- left_px = int(width * left_crop / 100)
105
- right_px = int(width * right_crop / 100)
106
-
107
- # Store current settings
108
- current_crop_settings.update({
109
- 'top': top_px,
110
- 'bottom': bottom_px,
111
- 'left': left_px,
112
- 'right': right_px
113
- })
114
-
115
- # Create overlay
116
- overlay = img_bgr.copy()
117
-
118
- # Draw crop areas in red (areas to be removed)
119
- if top_px > 0:
120
- cv2.rectangle(overlay, (0, 0), (width, top_px), (0, 0, 255), -1)
121
- if bottom_px > 0:
122
- cv2.rectangle(overlay, (0, height - bottom_px), (width, height), (0, 0, 255), -1)
123
- if left_px > 0:
124
- cv2.rectangle(overlay, (0, 0), (left_px, height), (0, 0, 255), -1)
125
- if right_px > 0:
126
- cv2.rectangle(overlay, (width - right_px, 0), (width, height), (0, 0, 255), -1)
127
-
128
- # Draw content area outline in green
129
- content_top = top_px
130
- content_bottom = height - bottom_px
131
- content_left = left_px
132
- content_right = width - right_px
133
-
134
- if content_right > content_left and content_bottom > content_top:
135
- cv2.rectangle(overlay, (content_left, content_top), (content_right, content_bottom), (0, 255, 0), 3)
136
-
137
- # Blend overlay with original
138
- result = cv2.addWeighted(img_bgr, 0.7, overlay, 0.3, 0)
139
-
140
- # Add text annotations
141
- cv2.putText(result, "RED: Areas to remove", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
142
- cv2.putText(result, "GREEN: Content area", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
143
-
144
- # Convert back to RGB for display
145
- result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
146
-
147
- return result_rgb
148
 
149
  except Exception as e:
150
  logger.error(f"Error updating crop preview: {e}")
151
  return None
152
 
153
-
154
- def process_pdf_file(pdf_file, ocr_method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right,
155
- progress=gr.Progress()):
156
- """
157
- Process uploaded PDF file with optional header/footer removal
158
- """
159
  if pdf_file is None:
160
- return "No file uploaded.", "", " Error: No file selected"
161
 
162
- temp_file_path = None
163
  try:
164
- progress(0.1, desc="Initializing...")
165
-
166
- # Handle Gradio file object
167
- temp_file_path = pdf_file.name
168
 
169
- # Prepare preprocessing options
170
  preprocessing_options = {
171
  'enable_header_footer_removal': enable_header_footer_removal,
172
- 'crop_settings': {
173
- 'top': crop_top,
174
- 'bottom': crop_bottom,
175
- 'left': crop_left,
176
- 'right': crop_right
177
- }
178
  }
179
 
180
- progress(0.3, desc="Processing PDF...")
181
 
182
- # Process the PDF with preprocessing options
183
- result = backend_manager.process_pdf(temp_file_path, ocr_method, preprocessing_options)
 
 
184
 
185
- progress(0.9, desc="Finalizing...")
186
  progress(1.0, desc="Complete!")
187
 
188
  if result['success']:
189
- # Format metadata for display
190
- metadata_info = format_metadata(result['metadata'], result['method_used'])
191
- status = f"✅ Success: Processed using {result['method_used']}"
192
- return result['text'], metadata_info, status
 
 
 
 
193
  else:
194
  error_msg = result.get('error', 'Unknown error occurred')
195
- return f"Error: {error_msg}", "", f" Processing failed: {error_msg}"
196
 
197
  except Exception as e:
198
- logger.error(f"UI processing error: {e}")
199
- return f"Error: {str(e)}", "", f" Unexpected error: {str(e)}"
200
-
201
 
202
- def format_metadata(metadata, method_used):
203
- """Format metadata for display"""
204
  if not metadata:
205
  return f"Method used: {method_used}"
206
 
@@ -209,750 +358,453 @@ def format_metadata(metadata, method_used):
209
  if 'pages' in metadata:
210
  info_lines.append(f"Pages processed: {metadata['pages']}")
211
 
212
- if 'tables' in metadata:
213
- info_lines.append(f"Tables detected: {metadata['tables']}")
 
 
 
214
 
215
- if 'has_handwritten' in metadata:
216
- handwritten_status = "Yes" if metadata['has_handwritten'] else "No"
217
- info_lines.append(f"Handwritten content: {handwritten_status}")
218
 
219
- if 'header_footer_removed' in metadata:
220
- removal_status = "Yes" if metadata['header_footer_removed'] else "No"
221
- info_lines.append(f"Header/Footer removed: {removal_status}")
 
 
222
 
223
  if 'processing_time_seconds' in metadata:
224
  info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
225
 
226
  return "\n".join(info_lines)
227
 
228
-
229
- def create_txt_file(text_content, metadata_info=""):
230
- """Create a TXT file from extracted text with clean table handling"""
231
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
232
- temp_file = tempfile.NamedTemporaryFile(
233
- suffix=f'_extracted_text_{timestamp}.txt',
234
- delete=False,
235
- mode='w',
236
- encoding='utf-8'
237
  )
238
 
239
- try:
240
- # Add header
241
- temp_file.write("PDF OCR Extraction Results\n")
242
- temp_file.write("=" * 50 + "\n\n")
243
-
244
- # Add metadata
245
- if metadata_info:
246
- temp_file.write("Processing Information:\n")
247
- temp_file.write("-" * 25 + "\n")
248
- temp_file.write(metadata_info + "\n\n")
249
-
250
- # Add timestamp
251
- temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
252
- temp_file.write("=" * 50 + "\n\n")
253
-
254
- # Add main content with clean table processing
255
- temp_file.write("Extracted Text:\n")
256
- temp_file.write("-" * 15 + "\n\n")
257
-
258
- # Process content to clean up table duplications
259
- cleaned_content = _clean_text_content_for_txt(text_content)
260
- temp_file.write(cleaned_content)
261
-
262
- temp_file.close()
263
- return temp_file.name
264
-
265
- except Exception as e:
266
- logger.error(f"Error creating TXT file: {e}")
267
- temp_file.close()
268
- raise
269
-
270
-
271
- def _clean_text_content_for_txt(content):
272
- """Clean text content for TXT export, removing table duplications"""
273
- if not content.strip():
274
- return content
275
-
276
- # Split by pages first
277
- if '=== PAGE ' in content:
278
- pages = content.split('=== PAGE ')
279
- cleaned_pages = []
280
-
281
- for i, page_content in enumerate(pages):
282
- if i == 0 and not page_content.strip():
283
- continue
284
-
285
- if i > 0:
286
- # Add page header
287
- page_num = page_content.split(' ===')[0] if ' ===' in page_content else str(i)
288
- cleaned_pages.append(f"\n--- Page {page_num} ---\n")
289
-
290
- # Get content after page header
291
- content_part = page_content.split('===\n', 1)[-1] if '===\n' in page_content else page_content
292
- else:
293
- content_part = page_content
294
-
295
- # Clean this page's content
296
- cleaned_page = _clean_page_content_for_txt(content_part)
297
- if cleaned_page.strip():
298
- cleaned_pages.append(cleaned_page)
299
-
300
- return '\n'.join(cleaned_pages)
301
- else:
302
- # No page structure, clean as single content
303
- return _clean_page_content_for_txt(content)
304
-
305
-
306
- def _clean_page_content_for_txt(content):
307
- """Clean a single page's content for TXT export"""
308
- if not content.strip():
309
- return ""
310
-
311
- import re
312
-
313
- # Split content by table markers
314
- parts = re.split(r'\n?--- TABLE \d+ ---\n?', content)
315
-
316
- cleaned_parts = []
317
- table_count = 0
318
-
319
- # Find all table sections
320
- table_matches = re.finditer(r'\n?--- TABLE (\d+) ---\n?(.*?)(?=\n?--- TABLE \d+ ---|$)', content, re.DOTALL)
321
- table_contents = {}
322
-
323
- for match in table_matches:
324
- table_num = match.group(1)
325
- table_content = match.group(2).strip()
326
- table_contents[int(table_num)] = table_content
327
-
328
- # Process each part
329
- for i, part in enumerate(parts):
330
- if part.strip():
331
- # Clean the text part
332
- cleaned_part = _clean_text_part(part)
333
- if cleaned_part.strip():
334
- cleaned_parts.append(cleaned_part)
335
-
336
- # Add table if this part was followed by one
337
- if i < len(parts) - 1: # Not the last part
338
- table_count += 1
339
- if table_count in table_contents:
340
- table_header = f"\n--- TABLE {table_count} ---\n"
341
- table_text = _format_table_for_txt(table_contents[table_count])
342
- cleaned_parts.append(table_header + table_text)
343
-
344
- return '\n'.join(cleaned_parts)
345
-
346
-
347
- def _clean_text_part(text_part):
348
- """Clean a text part of any remaining table content"""
349
- if not text_part.strip():
350
- return ""
351
-
352
- import re
353
-
354
- # Remove any stray table markers
355
- cleaned = re.sub(r'\n?--- TABLE \d+ ---\n?', '', text_part)
356
- cleaned = re.sub(r'\n?--- Table \d+ ---\n?', '', cleaned)
357
-
358
- # Split into lines and filter out table-like content
359
- lines = cleaned.split('\n')
360
- filtered_lines = []
361
-
362
- for line in lines:
363
- line = line.strip()
364
- if not line:
365
- filtered_lines.append('') # Keep empty lines for spacing
366
- continue
367
-
368
- # Skip lines that look like table content (multiple | separators)
369
- if line.count('|') >= 2:
370
- continue
371
-
372
- # Skip separator lines
373
- if line.replace('-', '').replace(' ', '').replace('|', '') == '':
374
- continue
375
-
376
- filtered_lines.append(line)
377
-
378
- # Remove excessive empty lines
379
- result_lines = []
380
- prev_empty = False
381
-
382
- for line in filtered_lines:
383
- if line == '':
384
- if not prev_empty:
385
- result_lines.append(line)
386
- prev_empty = True
387
- else:
388
- result_lines.append(line)
389
- prev_empty = False
390
-
391
- return '\n'.join(result_lines)
392
-
393
-
394
- def _format_table_for_txt(table_content):
395
- """Format table content nicely for TXT output"""
396
- if not table_content.strip():
397
- return ""
398
-
399
- lines = [line.strip() for line in table_content.split('\n') if line.strip()]
400
-
401
- # Look for table structure
402
- table_lines = []
403
- for line in lines:
404
- if '|' in line:
405
- # Clean up the table line
406
- cells = [cell.strip() for cell in line.split('|')]
407
- # Remove empty cells at start/end
408
- while cells and not cells[0]:
409
- cells.pop(0)
410
- while cells and not cells[-1]:
411
- cells.pop()
412
- if cells:
413
- table_lines.append(cells)
414
-
415
- if not table_lines:
416
- return table_content # Return as is if no table structure found
417
-
418
- # Calculate column widths
419
- if table_lines:
420
- max_cols = max(len(row) for row in table_lines)
421
- col_widths = [0] * max_cols
422
-
423
- for row in table_lines:
424
- for i in range(min(len(row), max_cols)):
425
- col_widths[i] = max(col_widths[i], len(row[i]) if i < len(row) else 0)
426
-
427
- # Format table with proper alignment
428
- formatted_lines = []
429
- for i, row in enumerate(table_lines):
430
- formatted_row = []
431
- for j in range(max_cols):
432
- cell_content = row[j] if j < len(row) else ""
433
- width = max(col_widths[j], 3)
434
- formatted_row.append(cell_content.ljust(width))
435
-
436
- formatted_lines.append(" | ".join(formatted_row))
437
-
438
- # Add separator after header row
439
- if i == 0 and len(table_lines) > 1:
440
- separator = " | ".join(["-" * max(col_widths[k], 3) for k in range(max_cols)])
441
- formatted_lines.append(separator)
442
-
443
- return '\n'.join(formatted_lines)
444
-
445
- return table_content
446
-
447
-
448
- def create_docx_file(text_content, metadata_info=""):
449
- """Create DOCX file with enhanced table handling - NO separator rows"""
450
- if not HAS_DOCX_SUPPORT:
451
- raise ImportError("python-docx not installed. Cannot create DOCX files.")
452
-
453
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
454
- temp_file = tempfile.NamedTemporaryFile(
455
- suffix=f'_extracted_text_{timestamp}.docx',
456
- delete=False
457
- )
458
- temp_file.close()
459
-
460
- try:
461
- from docx import Document
462
- from docx.shared import Inches, Pt
463
- from docx.enum.text import WD_ALIGN_PARAGRAPH
464
- from docx.enum.table import WD_TABLE_ALIGNMENT
465
-
466
- doc = Document()
467
-
468
- # Set margins
469
- sections = doc.sections
470
- for section in sections:
471
- section.top_margin = Inches(1)
472
- section.bottom_margin = Inches(1)
473
- section.left_margin = Inches(1)
474
- section.right_margin = Inches(1)
475
-
476
- # Title
477
- title = doc.add_heading('PDF OCR Extraction Results', 0)
478
- title.alignment = WD_ALIGN_PARAGRAPH.CENTER
479
-
480
- # Metadata
481
- if metadata_info:
482
- doc.add_heading('Processing Information', level=1)
483
- metadata_para = doc.add_paragraph(metadata_info)
484
- metadata_para.style = 'Intense Quote'
485
- doc.add_page_break()
486
-
487
- # Enhanced content processing
488
- _add_enhanced_content_to_docx(doc, text_content)
489
-
490
- # Footer
491
- footer_section = doc.sections[0]
492
- footer = footer_section.footer
493
- footer_para = footer.paragraphs[0]
494
- footer_para.text = f"Generated by PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
495
-
496
- doc.save(temp_file.name)
497
- logger.info(f"Enhanced DOCX file created: {temp_file.name}")
498
- return temp_file.name
499
-
500
- except Exception as e:
501
- logger.error(f"Error creating DOCX file: {e}")
502
  try:
503
- os.unlink(temp_file.name)
504
- except:
505
- pass
506
- raise
507
-
508
-
509
- def _add_enhanced_content_to_docx(doc, text_content):
510
- """Enhanced content addition with NO separator rows in tables"""
511
- import re
512
-
513
- # Split by lines and process sequentially
514
- lines = text_content.split('\n')
515
- current_table_content = []
516
- in_table = False
517
-
518
- for line in lines:
519
- line = line.strip()
520
-
521
- # Handle page markers
522
- if line.startswith('=== PAGE '):
523
- if current_table_content:
524
- _add_enhanced_table(doc, current_table_content)
525
- current_table_content = []
526
- in_table = False
527
-
528
- page_num = line.replace('=== PAGE ', '').replace(' ===', '')
529
- doc.add_heading(f'Page {page_num}', level=1)
530
- continue
531
-
532
- # Handle table start
533
- if line.startswith('--- TABLE '):
534
- if current_table_content:
535
- _add_enhanced_table(doc, current_table_content)
536
-
537
- current_table_content = []
538
- in_table = True
539
- table_num = line.replace('--- TABLE ', '').replace(' ---', '')
540
- current_table_content.append(f"Table {table_num}")
541
- continue
542
-
543
- # Handle content
544
- if in_table:
545
- if line and not line.startswith('==='):
546
- current_table_content.append(line)
547
- else:
548
- # Regular text
549
- if line:
550
- if line.startswith('# '):
551
- doc.add_heading(line[2:], level=1)
552
- elif line.startswith('## '):
553
- doc.add_heading(line[3:], level=2)
554
- elif line.startswith('### '):
555
- doc.add_heading(line[4:], level=3)
556
- else:
557
- doc.add_paragraph(line)
558
- else:
559
- # Empty line - add small space
560
- doc.add_paragraph("")
561
-
562
- # Handle any remaining table
563
- if current_table_content:
564
- _add_enhanced_table(doc, current_table_content)
565
-
566
-
567
- def _add_enhanced_table(doc, table_content):
568
- """Add table with enhanced processing - REMOVES separator rows"""
569
- if not table_content:
570
- return
571
-
572
- # First line should be table title
573
- if table_content:
574
- doc.add_heading(table_content[0], level=3)
575
- table_lines = table_content[1:]
576
  else:
577
- table_lines = table_content
578
-
579
- if not table_lines:
580
- return
581
-
582
- # Find lines that contain pipes (table rows) and FILTER OUT separator rows
583
- table_rows = []
584
- for line in table_lines:
585
- if '|' in line and line.strip():
586
- # CRITICAL: Skip separator rows (lines that are mostly dashes)
587
- line_content = line.replace('|', '').replace(' ', '')
588
- if line_content.replace('-', '') == '':
589
- continue # Skip this separator row
590
-
591
- # Split and clean
592
- cells = [cell.strip() for cell in line.split('|')]
593
- # Remove empty cells at edges
594
- while cells and not cells[0]:
595
- cells.pop(0)
596
- while cells and not cells[-1]:
597
- cells.pop()
598
- if cells:
599
- table_rows.append(cells)
600
-
601
- if not table_rows:
602
- # No table structure, add as text
603
- for line in table_lines:
604
- if line.strip():
605
- doc.add_paragraph(line)
606
- return
607
-
608
- # Create table
609
- max_cols = max(len(row) for row in table_rows)
610
- table = doc.add_table(rows=len(table_rows), cols=max_cols)
611
- table.style = 'Table Grid'
612
-
613
- # Fill table
614
- for row_idx, row_data in enumerate(table_rows):
615
- table_row = table.rows[row_idx]
616
- for col_idx in range(max_cols):
617
- cell = table_row.cells[col_idx]
618
- if col_idx < len(row_data):
619
- cell.text = row_data[col_idx]
620
-
621
- # Bold first row
622
- if row_idx == 0:
623
- for paragraph in cell.paragraphs:
624
- for run in paragraph.runs:
625
- run.bold = True
626
-
627
- doc.add_paragraph("") # Space after table
628
-
629
 
630
- def get_method_info(method):
631
- """Get information about selected OCR method"""
632
  method_descriptions = {
633
- "auto": "🤖 **Auto Selection**: Automatically chooses the best available method. Prefers Azure Tesseract PyMuPDF in order.",
634
- "azure": "☁️ **Azure Document Intelligence**: Advanced cloud-based OCR with excellent layout preservation, table detection, and handwriting recognition. Best quality but requires API credentials.",
635
- "tesseract": "🔍 **Tesseract OCR**: Open-source OCR engine with image preprocessing. Good for scanned documents and images. Works offline.",
636
- "pymupdf": "📄 **PyMuPDF**: Fast text extraction for PDFs with embedded text. Best for digital PDFs but limited OCR capabilities for scanned documents."
637
  }
638
 
639
  return method_descriptions.get(method, "Select a method to see details.")
640
 
641
-
642
- def check_service_status():
643
- """Check and display service status"""
644
  available_methods = backend_manager.get_available_methods()
645
 
646
- status_lines = ["**Available OCR Methods:**"]
647
 
648
  if "azure" in available_methods:
649
- status_lines.append(" Azure Document Intelligence - Ready")
650
  else:
651
- status_lines.append(" Azure Document Intelligence - Not configured")
652
 
653
  if "tesseract" in available_methods:
654
- status_lines.append(" Tesseract OCR - Ready")
655
  else:
656
- status_lines.append(" Tesseract OCR - Not available")
657
 
658
  if "pymupdf" in available_methods:
659
- status_lines.append(" PyMuPDF - Ready")
660
  else:
661
- status_lines.append(" PyMuPDF - Not available")
 
 
 
 
 
 
 
662
 
663
- # Add DOCX support status
664
  if HAS_DOCX_SUPPORT:
665
- status_lines.append(" DOCX Export - Available")
666
  else:
667
- status_lines.append(" DOCX Export - Install python-docx to enable")
668
 
669
- return "\n".join(status_lines)
670
-
671
- def process_and_prepare_downloads(pdf_file, method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right):
672
- """Process PDF and prepare both TXT and DOCX downloads if successful"""
673
- text, metadata, status = process_pdf_file(pdf_file, method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right)
674
 
675
- # Prepare downloads if processing was successful
676
- if text and not text.startswith("Error:") and not text.startswith("No file"):
677
- try:
678
- # Create TXT file
679
- txt_path = create_txt_file(text, metadata)
680
-
681
- # Create DOCX file if support is available
682
- if HAS_DOCX_SUPPORT:
683
- try:
684
- docx_path = create_docx_file(text, metadata)
685
- return (text, metadata, status,
686
- gr.update(visible=True, value=txt_path),
687
- gr.update(visible=True, value=docx_path))
688
- except Exception as docx_error:
689
- logger.warning(f"DOCX creation failed: {docx_error}")
690
- return (text, metadata, status,
691
- gr.update(visible=True, value=txt_path),
692
- gr.update(visible=False))
693
- else:
694
- return (text, metadata, status,
695
- gr.update(visible=True, value=txt_path),
696
- gr.update(visible=False))
697
-
698
- except Exception as file_error:
699
- logger.error(f"File creation error: {file_error}")
700
- return (text, metadata, status,
701
- gr.update(visible=False),
702
- gr.update(visible=False))
703
- else:
704
- return (text, metadata, status,
705
- gr.update(visible=False),
706
- gr.update(visible=False))
707
-
708
 
709
- def create_interface():
710
- """Create and configure the Gradio interface"""
711
 
712
  with gr.Blocks(
713
- title="PDF OCR Service - Enhanced",
714
  theme=gr.themes.Soft(),
715
  css="""
716
  .main-header { text-align: center; margin-bottom: 2rem; }
717
- .method-info { background-color: #f8f9fa; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0; }
 
 
 
 
718
  .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
719
- .preprocessing-box { border: 2px solid #28a745; padding: 1rem; border-radius: 0.5rem; background-color: #f8fff8; }
720
  """
721
  ) as interface:
722
 
723
  gr.HTML("""
724
  <div class="main-header">
725
- <h1>📄 PDF OCR Service - Enhanced</h1>
726
- <p>Convert PDF documents to text using advanced OCR technologies with preprocessing options</p>
727
  </div>
728
  """)
729
 
730
- with gr.Row():
731
- with gr.Column(scale=1):
732
- gr.HTML("<h3>📁 Upload & Configure</h3>")
733
-
734
- # File upload
735
- pdf_input = gr.File(
736
- label="Upload PDF File",
737
- file_types=[".pdf"],
738
- file_count="single"
739
- )
 
 
 
740
 
741
- # OCR method selection
742
- method_choice = gr.Dropdown(
743
- choices=["auto", "azure", "tesseract", "pymupdf"],
744
- value="auto",
745
- label="OCR Method",
746
- info="Choose OCR method or use auto-selection"
747
- )
748
-
749
- # Method information display
750
- method_info = gr.Markdown(
751
- value=get_method_info("auto"),
752
- elem_classes=["method-info"]
753
- )
754
-
755
- # Header/Footer Removal Section
756
- gr.HTML('<div class="preprocessing-box">')
757
- gr.HTML("<h4>🔧 Header/Footer Removal</h4>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
758
 
759
  enable_header_footer_removal = gr.Checkbox(
760
- label="Enable Header/Footer Removal",
761
  value=False,
762
- info="Remove headers and footers from all pages"
763
  )
764
 
765
- # Crop controls with preview
766
  with gr.Group(visible=False) as crop_controls:
767
- gr.HTML("<h5>📏 Crop Areas (% of page)</h5>")
768
 
769
- crop_top = gr.Slider(
770
- minimum=0,
771
- maximum=30,
772
- value=5,
773
- step=0.5,
774
- label="Top Crop %",
775
- info="Percentage of page height to remove from top"
776
- )
 
 
 
 
 
 
 
 
777
 
778
- crop_bottom = gr.Slider(
779
- minimum=0,
780
- maximum=30,
781
- value=5,
782
- step=0.5,
783
- label="Bottom Crop %",
784
- info="Percentage of page height to remove from bottom"
785
- )
786
 
787
- crop_left = gr.Slider(
788
- minimum=0,
789
- maximum=20,
790
- value=2,
791
- step=0.5,
792
- label="Left Crop %",
793
- info="Percentage of page width to remove from left"
794
- )
 
 
 
 
 
 
 
 
795
 
796
- crop_right = gr.Slider(
797
- minimum=0,
798
- maximum=20,
799
- value=2,
800
- step=0.5,
801
- label="Right Crop %",
802
- info="Percentage of page width to remove from right"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  )
804
-
805
- gr.HTML('</div>')
806
-
807
- # Process button
808
- process_btn = gr.Button(
809
- "🚀 Process PDF",
810
- variant="primary",
811
- size="lg"
812
- )
813
-
814
- # Service status
815
- gr.HTML("<h4>🔧 Service Status</h4>")
816
- service_status = gr.Markdown(
817
- value=check_service_status(),
818
- elem_classes=["status-box"]
819
- )
820
-
821
- # Refresh status button
822
- refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
823
 
824
  with gr.Column(scale=2):
825
- gr.HTML("<h3>📋 Results</h3>")
826
-
827
- # Crop preview (visible only when crop method is selected)
828
- crop_preview = gr.Image(
829
- label="Crop Preview",
830
- visible=False,
831
- interactive=False,
832
- height=400
833
- )
834
-
835
- # Processing status
836
- processing_status = gr.Textbox(
837
- label="Processing Status",
838
- interactive=False,
839
- lines=1
840
- )
841
-
842
- # Extracted text output
843
- text_output = gr.Textbox(
844
- label="Extracted Text",
845
- placeholder="Processed text will appear here...",
846
- lines=20,
847
- max_lines=30,
848
- interactive=False,
849
- show_copy_button=True
850
- )
851
-
852
- # Metadata information
853
- metadata_output = gr.Textbox(
854
- label="Processing Information",
855
- interactive=False,
856
- lines=4
857
- )
858
-
859
- # Download buttons
860
- with gr.Row():
861
- download_txt_btn = gr.DownloadButton(
862
- "📄 Download TXT",
863
- visible=False,
864
- variant="secondary"
865
  )
866
- download_docx_btn = gr.DownloadButton(
867
- "📊 Download DOCX",
868
- visible=False,
869
- variant="secondary"
 
 
870
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
871
 
872
- # Add tips section
873
- gr.HTML("<h3>💡 Tips & Features</h3>")
874
-
875
- # Create tips content based on available features
876
- download_info = "Get results as formatted TXT files"
877
- if HAS_DOCX_SUPPORT:
878
- download_info += " and structured DOCX files with clean table formatting"
879
- else:
880
- download_info += " (install python-docx for DOCX export)"
881
-
882
- tips_html = f"""
883
- <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0;">
884
- <ul>
885
- <li><strong>Auto method</strong> is recommended for most users - intelligently selects the best OCR method</li>
886
- <li><strong>Header/Footer Removal:</strong> Clean up scanned documents by removing headers and footers</li>
887
- <li><strong>Fixed Removal:</strong> Remove specific pixel amounts from top/bottom of each page</li>
888
- <li><strong>Smart Crop:</strong> Use visual preview to set exact crop areas</li>
889
- <li><strong>Table Processing:</strong> Enhanced table detection with clean formatting (no separator lines)</li>
890
- <li><strong>Download Options:</strong> {download_info}</li>
891
- <li><strong>Azure Document Intelligence</strong> provides the best quality for complex documents</li>
892
- <li>Larger files may take longer to process - progress bar shows current status</li>
893
- <li>Supported file types: PDF documents (up to 50MB by default)</li>
894
- </ul>
895
- </div>
896
- """
897
 
898
- gr.HTML(tips_html)
 
 
 
 
 
899
 
900
- # Event handlers
901
  method_choice.change(
902
- fn=get_method_info,
903
  inputs=[method_choice],
904
  outputs=[method_info]
905
  )
906
 
 
907
  enable_header_footer_removal.change(
908
- fn=lambda enabled: (
909
  gr.update(visible=enabled),
910
- gr.update(visible=enabled and "fixed"),
911
- gr.update(visible=enabled and "crop"),
912
- gr.update(visible=enabled and "crop")
913
- ),
914
  inputs=[enable_header_footer_removal],
915
- outputs=[crop_controls, crop_preview]
916
  )
917
-
918
- # Update crop preview when parameters change
919
- for crop_input in [crop_top, crop_bottom, crop_left, crop_right]:
 
 
 
 
 
 
 
920
  crop_input.change(
921
- fn=update_crop_preview,
922
- inputs=[pdf_input, crop_top, crop_bottom, crop_left, crop_right],
923
  outputs=[crop_preview]
924
  )
925
 
926
- # Update preview when PDF is uploaded
927
- pdf_input.change(
928
- fn=update_crop_preview,
929
- inputs=[pdf_input, crop_top, crop_bottom, crop_left, crop_right],
930
- outputs=[crop_preview]
 
 
 
 
 
 
 
 
 
 
 
 
931
  )
932
 
 
 
 
 
 
 
933
  refresh_btn.click(
934
- fn=check_service_status,
935
  outputs=[service_status]
936
  )
937
 
 
938
  process_btn.click(
939
- fn=process_and_prepare_downloads,
940
- inputs=[pdf_input, method_choice, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right],
941
- outputs=[text_output, metadata_output, processing_status, download_txt_btn, download_docx_btn]
 
 
 
942
  )
943
 
944
  return interface
945
 
946
-
947
- def launch_ui():
948
- """Launch the Gradio interface"""
949
- interface = create_interface()
950
- interface.launch(
951
- server_name="0.0.0.0",
952
- server_port=7860,
953
- share=False,
954
- show_error=True
955
- )
 
 
 
956
 
957
  if __name__ == "__main__":
958
- launch_ui()
 
 
 
 
 
1
  import re
2
  import gradio as gr
3
  import os
4
  import tempfile
5
  import logging
6
+ import json
7
  from pathlib import Path
8
  from datetime import datetime
9
  import cv2
10
  import numpy as np
11
  from PIL import Image
12
  import fitz # PyMuPDF
13
+ from typing import Dict, List, Tuple, Optional
14
 
15
  # Load environment variables
16
  from dotenv import load_dotenv
 
26
  backend_manager = BackendManager()
27
 
28
  # Check if python-docx is available
 
 
29
  try:
30
  from docx import Document
31
+ from docx.shared import Inches, Pt
32
+ from docx.enum.table import WD_TABLE_ALIGNMENT
33
  HAS_DOCX_SUPPORT = True
34
  logger.info("DOCX export available")
35
  except ImportError:
36
  HAS_DOCX_SUPPORT = False
37
  logger.info("DOCX export not available - install python-docx to enable")
38
 
39
# Global variables for enhanced crop management.
# Starts out describing "nothing loaded": no path, zero pages, and empty
# per-page image / crop-setting maps.
current_pdf_data = dict(
    path=None,
    page_count=0,
    page_images={},
    crop_settings={},
    default_crop_all=True,
)
47
 
48
class PDFPageManager:
    """Render PDF pages as preview images and keep per-page crop settings.

    Crop values are percentages of the page size. They are stored per
    zero-based page index as a dict with the keys ``top``, ``bottom``,
    ``left``, ``right`` and ``custom``. ``custom`` marks pages that were set
    individually; such pages are skipped by :meth:`set_crop_for_all_pages`.
    """

    def __init__(self):
        self.pdf_doc = None        # open fitz document, or None when nothing is loaded
        self.page_images = {}      # page index -> preview array (None if rendering failed)
        self.crop_settings = {}    # page index -> crop dict (see class docstring)
        self.current_page = 0
        self.high_res_scale = 2.0  # Reduced from 3.0 for better performance

    def load_pdf(self, pdf_path: str) -> Dict:
        """Open *pdf_path*, render a preview of every page and reset crop settings.

        Returns:
            ``{'success': True, 'page_count': n, 'pages': [0..n-1]}`` on
            success, otherwise ``{'success': False, 'error': message}``.
        """
        try:
            # Release the previous document and its previews/settings first, so
            # a failed open cannot leave a closed handle or stale page data.
            self.close()

            self.pdf_doc = fitz.open(pdf_path)
            page_count = len(self.pdf_doc)

            # Generate high-resolution previews for all pages
            self.page_images = {
                page_num: self._generate_high_res_preview(page_num)
                for page_num in range(page_count)
            }

            # Initialize default crop settings for all pages
            self.crop_settings = {
                i: {'top': 0, 'bottom': 0, 'left': 0, 'right': 0, 'custom': False}
                for i in range(page_count)
            }

            logger.info(f"PDF loaded successfully: {page_count} pages")

            return {
                'success': True,
                'page_count': page_count,
                'pages': list(range(page_count))
            }

        except Exception as e:
            logger.error(f"Error loading PDF: {e}")
            # Do not keep a half-loaded document around.
            self.close()
            return {'success': False, 'error': str(e)}

    def _generate_high_res_preview(self, page_num: int) -> Optional[np.ndarray]:
        """Render page *page_num* at ``high_res_scale`` and return it as an array.

        Returns ``None`` when no document is loaded or rendering fails.
        """
        try:
            if self.pdf_doc is None:
                return None

            page = self.pdf_doc.load_page(page_num)

            # Use high resolution matrix for better quality
            mat = fitz.Matrix(self.high_res_scale, self.high_res_scale)
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")

            # Convert to PIL Image and then to numpy array
            import io
            pil_image = Image.open(io.BytesIO(img_data))
            img_array = np.array(pil_image)

            # Convert RGBA to RGB if needed
            if len(img_array.shape) == 3 and img_array.shape[2] == 4:
                img_array = img_array[:, :, :3]

            return img_array

        except Exception as e:
            logger.error(f"Error generating preview for page {page_num}: {e}")
            return None

    def update_crop_visualization(self, page_num: int, crop_coords: Dict) -> Optional[np.ndarray]:
        """Return the preview of *page_num* with the crop drawn on top.

        Areas that would be removed are tinted red, the remaining content area
        is outlined in green, and a short legend is written in the top-left
        corner. *crop_coords* holds percentages under ``top``, ``bottom``,
        ``left`` and ``right``; missing or ``None`` entries count as 0.

        Returns ``None`` when no preview exists for the page. If drawing
        fails, the unannotated preview is returned instead.
        """
        base_image = self.page_images.get(page_num)
        if base_image is None:
            logger.warning(f"No image available for page {page_num}")
            return None

        try:
            img_array = base_image.copy()
            height, width = img_array.shape[:2]

            # A cleared UI control can deliver None; treat that as "no crop".
            top = crop_coords.get('top') or 0
            bottom = crop_coords.get('bottom') or 0
            left = crop_coords.get('left') or 0
            right = crop_coords.get('right') or 0

            # Convert coordinates from percentages to pixels
            x1 = int(left * width / 100)
            y1 = int(top * height / 100)
            x2 = width - int(right * width / 100)
            y2 = height - int(bottom * height / 100)

            # Ensure coordinates are valid
            x1 = max(0, min(x1, width))
            x2 = max(0, min(x2, width))
            y1 = max(0, min(y1, height))
            y2 = max(0, min(y2, height))

            # Create overlay
            overlay = img_array.copy()

            # Draw crop areas in semi-transparent red (areas to be removed)
            alpha = 0.3
            if top > 0 and y1 > 0:
                cv2.rectangle(overlay, (0, 0), (width, y1), (255, 0, 0), -1)
            if bottom > 0 and y2 < height:
                cv2.rectangle(overlay, (0, y2), (width, height), (255, 0, 0), -1)
            if left > 0 and x1 > 0:
                cv2.rectangle(overlay, (0, 0), (x1, height), (255, 0, 0), -1)
            if right > 0 and x2 < width:
                cv2.rectangle(overlay, (x2, 0), (width, height), (255, 0, 0), -1)

            # Draw content area outline in green (skipped when the crops overlap)
            if x2 > x1 and y2 > y1:
                outline_thickness = max(2, int(self.high_res_scale * 2))
                cv2.rectangle(overlay, (x1, y1), (x2, y2), (0, 255, 0), outline_thickness)

            # Blend overlay with original
            result = cv2.addWeighted(img_array, 1 - alpha, overlay, alpha, 0)

            # Add informative text with better scaling
            font_scale = max(0.8, self.high_res_scale / 3)
            text_thickness = max(1, int(self.high_res_scale))
            text_color = (255, 255, 255)
            background_color = (0, 0, 0)

            texts = [
                f"Page {page_num + 1}",
                "RED: Remove areas",
                "GREEN: Content area",
                f"Crop: T{top:.1f}% B{bottom:.1f}% L{left:.1f}% R{right:.1f}%"
            ]

            # Add text with background for better visibility
            y_offset = 30
            for i, text in enumerate(texts):
                y_pos = y_offset + (i * 30)
                (text_width, text_height), _ = cv2.getTextSize(
                    text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_thickness)
                cv2.rectangle(result, (10, y_pos - text_height - 5),
                              (text_width + 20, y_pos + 5), background_color, -1)
                cv2.putText(result, text, (15, y_pos), cv2.FONT_HERSHEY_SIMPLEX,
                            font_scale, text_color, text_thickness)

            return result

        except Exception as e:
            logger.error(f"Error updating crop visualization: {e}")
            return self.page_images.get(page_num)

    def set_crop_for_page(self, page_num: int, crop_coords: Dict):
        """Store *crop_coords* for one page and mark that page as custom.

        Unknown page indices are ignored.
        """
        if page_num in self.crop_settings:
            self.crop_settings[page_num].update(crop_coords)
            self.crop_settings[page_num]['custom'] = True
            logger.info(f"Set crop for page {page_num}: {crop_coords}")

    def set_crop_for_all_pages(self, crop_coords: Dict):
        """Apply *crop_coords* to every page that is not marked custom."""
        for settings in self.crop_settings.values():
            if not settings.get('custom', False):
                settings.update(crop_coords)
        logger.info(f"Applied crop to all non-custom pages: {crop_coords}")

    def get_crop_settings_for_processing(self) -> Dict:
        """Bundle the crop settings for the processing call.

        ``per_page_crops`` is the live settings dict, not a copy.
        """
        return {
            'per_page_crops': self.crop_settings,
            'has_custom_crops': any(page.get('custom', False) for page in self.crop_settings.values()),
            'enhanced_resolution': True,
            'resolution_scale': self.high_res_scale
        }

    def close(self):
        """Close the document (if any) and drop all previews and crop settings."""
        # Detach the handle first so the manager ends up in the "nothing
        # loaded" state even if closing the document itself fails.
        doc, self.pdf_doc = self.pdf_doc, None
        self.page_images.clear()
        self.crop_settings.clear()
        if doc is not None:
            try:
                doc.close()
            except Exception as e:
                logger.warning(f"Error closing PDF document: {e}")
220
+
221
# Global page manager instance, shared by the preview, crop and processing
# callbacks defined below.
pdf_manager = PDFPageManager()
223
 
224
def load_pdf_for_preview(pdf_file):
    """Load the uploaded PDF into ``pdf_manager`` and prepare the preview widgets.

    Args:
        pdf_file: Uploaded file object exposing the path as ``.name``, or
            ``None`` when the upload was cleared.

    Returns:
        A 4-tuple: first-page preview image (or ``None``), an update for the
        page dropdown, an update for the crop-controls group, and a status
        message.
    """

    def _no_preview(message):
        # Every "nothing to show" outcome hides the page dropdown and the
        # crop controls in the same way.
        return (None,
                gr.update(choices=[], value=None, visible=False),
                gr.update(visible=False),
                message)

    if pdf_file is None:
        # The upload was cleared: release the previous document so its
        # previews and crop settings do not linger for the next run.
        try:
            pdf_manager.close()
        except Exception as e:
            logger.error(f"Error releasing previous PDF: {e}")
        return _no_preview("No PDF loaded")

    try:
        result = pdf_manager.load_pdf(pdf_file.name)

        if not result['success']:
            return _no_preview(f"Error: {result['error']}")

        # Create page choices for dropdown
        page_choices = [f"Page {i+1}" for i in range(result['page_count'])]

        # Get first page preview with default crop
        first_page_preview = None
        if 0 in pdf_manager.page_images:
            first_page_preview = pdf_manager.update_crop_visualization(
                0, {'top': 0, 'bottom': 0, 'left': 0, 'right': 0})

        status = f"PDF loaded successfully: {result['page_count']} pages"

        return (first_page_preview,
                gr.update(choices=page_choices,
                          value=page_choices[0] if page_choices else None,
                          visible=True),
                gr.update(visible=True),
                status)

    except Exception as e:
        logger.error(f"Error in load_pdf_for_preview: {e}")
        return _no_preview(f"Error loading PDF: {str(e)}")
253
+
254
def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_right):
    """Render the crop preview for the page chosen in the dropdown.

    Args:
        page_selection: Dropdown label of the form ``"Page N"`` (1-based).
        crop_top, crop_bottom, crop_left, crop_right: Crop percentages to
            draw on the preview.

    Returns:
        The annotated preview image, or ``None`` when nothing is selected or
        an error occurs.
    """
    if not page_selection:
        return None

    try:
        # The label is 1-based; the page manager indexes pages from zero.
        page_index = int(page_selection.split()[1]) - 1
        requested_crop = dict(
            top=crop_top,
            bottom=crop_bottom,
            left=crop_left,
            right=crop_right,
        )
        return pdf_manager.update_crop_visualization(page_index, requested_crop)
    except Exception as e:
        logger.error(f"Error changing preview page: {e}")
        return None
277
 
278
def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_left, crop_right, apply_to_all):
    """Store the current crop values and return the refreshed preview.

    Args:
        page_selection: Dropdown label of the form ``"Page N"`` (1-based).
        crop_top, crop_bottom, crop_left, crop_right: Crop percentages.
        apply_to_all: When true the values go to every page not marked
            custom; otherwise only to the selected page.

    Returns:
        The annotated preview of the selected page, or ``None`` when no page
        is selected, no PDF is loaded, or an error occurs.
    """
    # Test for "no document" explicitly. Truthiness of the document object
    # would depend on how the PDF library defines it (presumably its page
    # count) rather than on whether a document was loaded.
    if not page_selection or pdf_manager.pdf_doc is None:
        return None

    try:
        page_num = int(page_selection.split()[1]) - 1

        crop_coords = {
            'top': crop_top,
            'bottom': crop_bottom,
            'left': crop_left,
            'right': crop_right
        }

        # Apply to current page or all pages based on setting
        if apply_to_all:
            pdf_manager.set_crop_for_all_pages(crop_coords)
        else:
            pdf_manager.set_crop_for_page(page_num, crop_coords)

        # Return updated preview
        return pdf_manager.update_crop_visualization(page_num, crop_coords)

    except Exception as e:
        logger.error(f"Error updating crop preview: {e}")
        return None
305
 
306
def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer_removal,
                                      crop_top, crop_bottom, crop_left, crop_right,
                                      apply_to_all_pages, current_page_selection,
                                      progress=gr.Progress()):
    """Run OCR on the uploaded PDF and return text, HTML, metadata and status.

    Args:
        pdf_file: Uploaded file object exposing the path as ``.name``.
        ocr_method: OCR method name passed through to the backend.
        enable_header_footer_removal: Whether crop settings are sent along.
        crop_top, crop_bottom, crop_left, crop_right: Crop percentages
            currently shown in the UI.
        apply_to_all_pages: Apply the crop values to every non-custom page
            instead of only the selected one.
        current_page_selection: Dropdown label of the form ``"Page N"``.
        progress: Gradio progress reporter.

    Returns:
        ``(text, html, metadata_text, status)``. On failure ``text`` starts
        with ``"Error:"`` (or is ``"No file uploaded."``) and the HTML and
        metadata entries are empty strings.
    """
    if pdf_file is None:
        return "No file uploaded.", "", "", "Error: No file selected"

    try:
        progress(0.1, desc="Initializing HTML-enhanced processing...")

        crop_settings = None
        if enable_header_footer_removal:
            # Stored per-page crops start at 0 and only change when a crop
            # control fires. Push the values currently shown in the UI into
            # the manager so the crop that is applied matches what is displayed.
            displayed_crop = {
                'top': crop_top,
                'bottom': crop_bottom,
                'left': crop_left,
                'right': crop_right
            }
            if apply_to_all_pages:
                pdf_manager.set_crop_for_all_pages(displayed_crop)
            else:
                try:
                    page_index = int(current_page_selection.split()[1]) - 1
                except (AttributeError, IndexError, ValueError):
                    page_index = None  # nothing usable selected; keep stored crops
                if page_index is not None:
                    pdf_manager.set_crop_for_page(page_index, displayed_crop)

            crop_settings = pdf_manager.get_crop_settings_for_processing()

        # Prepare enhanced preprocessing options
        preprocessing_options = {
            'enable_header_footer_removal': enable_header_footer_removal,
            'enhanced_crop_processing': True,
            'crop_settings': crop_settings
        }

        progress(0.3, desc="Processing with HTML enhancement...")

        # Process the PDF with enhanced preprocessing
        result = backend_manager.process_pdf_with_enhanced_resolution(
            pdf_file.name, ocr_method, preprocessing_options
        )

        progress(0.9, desc="Finalizing HTML processing...")
        progress(1.0, desc="Complete!")

        if not result['success']:
            error_msg = result.get('error', 'Unknown error occurred')
            return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"

        method_used = result['method_used']
        metadata_info = format_enhanced_metadata(result.get('metadata'), method_used)
        status = f"Success: Processed using {method_used} with HTML enhancement"

        # Return text, HTML, metadata, and status
        return (result['text'],
                result.get('html', ''),
                metadata_info,
                status)

    except Exception as e:
        logger.error(f"HTML-enhanced processing error: {e}")
        return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
 
350
 
351
+ def format_enhanced_metadata(metadata, method_used):
352
+ """Enhanced metadata formatting with HTML processing info"""
353
  if not metadata:
354
  return f"Method used: {method_used}"
355
 
 
358
  if 'pages' in metadata:
359
  info_lines.append(f"Pages processed: {metadata['pages']}")
360
 
361
+ if metadata.get('enhanced_processing', False):
362
+ info_lines.append("Enhanced processing: Enabled")
363
+
364
+ if metadata.get('html_processing', False):
365
+ info_lines.append("HTML generation: Enabled")
366
 
367
+ if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
368
+ info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
 
369
 
370
+ if 'custom_crops_applied' in metadata:
371
+ info_lines.append(f"Custom crops per page: {metadata['custom_crops_applied']}")
372
+
373
+ if 'tables' in metadata:
374
+ info_lines.append(f"Tables detected: {metadata['tables']}")
375
 
376
  if 'processing_time_seconds' in metadata:
377
  info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
378
 
379
  return "\n".join(info_lines)
380
 
381
def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
                               crop_top, crop_bottom, crop_left, crop_right,
                               apply_to_all_pages, current_page_selection):
    """Process the PDF and prepare the download buttons.

    Takes the same arguments as ``process_pdf_with_html_enhancement`` (minus
    ``progress``) and forwards them unchanged.

    Returns:
        A 6-tuple: extracted text, metadata text, status message, then one
        update each for the TXT, DOCX and HTML download buttons. A button is
        shown only when processing succeeded and a file was produced for it.
    """

    def _hidden_buttons():
        # Fresh update objects per button; nothing is shared between outputs.
        return tuple(gr.update(visible=False) for _ in range(3))

    text, html, metadata, status = process_pdf_with_html_enhancement(
        pdf_file, method, enable_header_footer_removal,
        crop_top, crop_bottom, crop_left, crop_right,
        apply_to_all_pages, current_page_selection
    )

    # Decide on the status message, not on the extracted text: a document
    # whose content happens to begin with "Error:" is still a success.
    succeeded = bool(text) and status.startswith("Success")
    if not succeeded:
        return (text, metadata, status) + _hidden_buttons()

    try:
        # Create enhanced download files
        download_files = backend_manager.create_enhanced_downloads(text, html, metadata)

        def _button_for(kind):
            file_path = download_files.get(kind)
            if file_path:
                return gr.update(visible=True, value=file_path)
            return gr.update(visible=False)

        return (text, metadata, status,
                _button_for('txt'), _button_for('docx'), _button_for('html'))

    except Exception as file_error:
        logger.error(f"Enhanced file creation error: {file_error}")
        return (text, metadata, status) + _hidden_buttons()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
 
419
def get_enhanced_method_info(method):
    """Return the Markdown description for *method*.

    Unknown methods yield a generic prompt to select a method.
    """
    descriptions = dict(
        auto="**Auto Selection**: Automatically chooses the best available method with HTML processing and enhanced table handling.",
        azure="**Azure Document Intelligence**: Advanced cloud-based OCR with HTML generation, layout preservation, and smart table detection.",
        tesseract="**Tesseract OCR**: Open-source OCR with HTML output, enhanced image preprocessing, and resolution scaling.",
        pymupdf="**PyMuPDF**: Fast extraction enhanced with HTML processing and improved formatting preservation.",
    )

    try:
        return descriptions[method]
    except KeyError:
        return "Select a method to see details."
429
 
430
def check_enhanced_service_status():
    """Build the Markdown service-status report.

    Lists each OCR method as ready or unavailable according to
    ``backend_manager.get_available_methods()``, followed by the feature
    list and the DOCX export state (from ``HAS_DOCX_SUPPORT``).

    Returns:
        Markdown text with one status entry per rendered line.
    """
    available_methods = backend_manager.get_available_methods()

    status_lines = ["**Available OCR Methods (Enhanced with HTML Processing):**"]

    # (method key, line when available, line when missing)
    ocr_engines = (
        ("azure",
         " Azure Document Intelligence - Ready (HTML + Tables)",
         " Azure Document Intelligence - Not configured"),
        ("tesseract",
         " Tesseract OCR - Ready (HTML Enhanced)",
         " Tesseract OCR - Not available"),
        ("pymupdf",
         " PyMuPDF - Ready (HTML Enhanced)",
         " PyMuPDF - Not available"),
    )
    for key, ready_line, missing_line in ocr_engines:
        status_lines.append(ready_line if key in available_methods else missing_line)

    # Add enhanced features status
    status_lines.extend([
        "✓ HTML Processing - Available",
        "✓ Enhanced Table Handling - Available",
        "✓ Smart Text Preservation - Available",
        "✓ Multi-Page Crop Preview - Available",
        "✓ Per-Page Crop Customization - Available",
    ])

    if HAS_DOCX_SUPPORT:
        status_lines.append(" Enhanced DOCX Export - Available")
    else:
        status_lines.append(" Enhanced DOCX Export - Install python-docx to enable")

    status_lines.extend([
        "✓ HTML File Export - Available",
        "✓ Enhanced Text Export - Available",
    ])

    # The report is rendered as Markdown, where a lone newline does not break
    # the line. Two trailing spaces make each entry a hard line break so the
    # entries do not run together into one paragraph.
    return "  \n".join(status_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
468
def create_enhanced_interface():
    """Build the Gradio Blocks UI and wire up its event handlers.

    Layout, top to bottom: instructions, configuration panel (upload, OCR
    method, crop controls, process button), preview/results row, service
    status.

    Returns:
        The assembled ``gr.Blocks`` interface (not yet launched).
    """

    def _crop_ui_visibility(enabled):
        # The crop controls and the preview group are shown and hidden
        # together, driven by the header/footer-removal checkbox.
        return [
            gr.update(visible=enabled),
            gr.update(visible=enabled)
        ]

    with gr.Blocks(
        title="PDF OCR Service - Enhanced with HTML Processing",
        theme=gr.themes.Soft(),
        css="""
        .main-header { text-align: center; margin-bottom: 2rem; }
        .config-panel { border: 2px solid #007bff; padding: 1.5rem; border-radius: 0.8rem; background-color: #f8f9fa; margin-bottom: 1rem; }
        .instructions-panel { border: 2px solid #28a745; padding: 1.5rem; border-radius: 0.8rem; background-color: #f0fff0; margin-bottom: 1rem; }
        .crop-controls { border: 2px solid #ffc107; padding: 1rem; border-radius: 0.5rem; background-color: #fffef7; }
        .page-preview { border: 2px solid #17a2b8; padding: 1rem; border-radius: 0.5rem; background-color: #f0f8ff; }
        .results-panel { border: 2px solid #6f42c1; padding: 1rem; border-radius: 0.5rem; background-color: #f8f5ff; }
        .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">
        <h1>PDF OCR Service - Enhanced with HTML Processing</h1>
        <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, and format preservation</p>
        </div>
        """)

        # Instructions at the top
        with gr.Group(elem_classes=["instructions-panel"]):
            gr.HTML("<h3>Instructions & Features</h3>")
            gr.HTML("""
            <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
            <h4>How to Use:</h4>
            <ol>
            <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
            <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
            <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
            <li><strong>Process:</strong> Click the process button to extract text with HTML enhancement</li>
            <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format</li>
            </ol>

            <h4>Enhanced Features:</h4>
            <ul>
            <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
            <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
            <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads</li>
            <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
            <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
            <li><strong>Page Numbers:</strong> Automatic page numbering in extracted content</li>
            <li><strong>Proper Indentation:</strong> Preserved spacing and formatting</li>
            </ul>
            </div>
            """)

        # Configuration Panel - Top Left
        with gr.Group(elem_classes=["config-panel"]):
            gr.HTML("<h3>Configuration Panel</h3>")

            with gr.Row():
                with gr.Column(scale=1):
                    # File upload
                    pdf_input = gr.File(
                        label="Upload PDF File",
                        file_types=[".pdf"],
                        file_count="single"
                    )

                    # PDF loading status
                    pdf_load_status = gr.Textbox(
                        label="PDF Status",
                        interactive=False,
                        lines=1,
                        value="No PDF loaded"
                    )

                with gr.Column(scale=1):
                    # OCR method selection
                    method_choice = gr.Dropdown(
                        choices=["auto", "azure", "tesseract", "pymupdf"],
                        value="auto",
                        label="OCR Method",
                        info="Choose OCR method (all enhanced with HTML processing)"
                    )

                    # Method information display
                    method_info = gr.Markdown(
                        value=get_enhanced_method_info("auto"),
                        elem_classes=["method-info"]
                    )

            # Enhanced Header/Footer Removal Section
            with gr.Group(elem_classes=["crop-controls"]):
                gr.HTML("<h4>Header/Footer Removal & Crop Settings</h4>")

                enable_header_footer_removal = gr.Checkbox(
                    label="Enable Enhanced Header/Footer Removal",
                    value=False,
                    info="Remove headers and footers with high-resolution processing"
                )

                # Multi-page controls
                with gr.Group(visible=False) as crop_controls:
                    gr.HTML("<h5>Multi-Page Crop Control</h5>")

                    with gr.Row():
                        # Page selection
                        page_selector = gr.Dropdown(
                            label="Select Page for Preview",
                            choices=[],
                            value=None,
                            info="Choose page to preview and customize crop settings",
                            visible=False
                        )

                        # Apply to all pages toggle
                        apply_to_all_pages = gr.Checkbox(
                            label="Apply crop settings to all pages",
                            value=True,
                            info="When enabled, changes apply to all pages"
                        )

                    gr.HTML("<h5>Crop Areas (% of page)</h5>")

                    with gr.Row():
                        crop_top = gr.Slider(
                            minimum=0,
                            maximum=40,
                            value=8,
                            step=0.5,
                            label="Top Crop %"
                        )

                        crop_bottom = gr.Slider(
                            minimum=0,
                            maximum=40,
                            value=8,
                            step=0.5,
                            label="Bottom Crop %"
                        )

                    with gr.Row():
                        crop_left = gr.Slider(
                            minimum=0,
                            maximum=30,
                            value=3,
                            step=0.5,
                            label="Left Crop %"
                        )

                        crop_right = gr.Slider(
                            minimum=0,
                            maximum=30,
                            value=3,
                            step=0.5,
                            label="Right Crop %"
                        )

                    # Quick preset buttons
                    with gr.Row():
                        preset_light = gr.Button("Light Crop (5%)", size="sm")
                        preset_medium = gr.Button("Medium Crop (10%)", size="sm")
                        preset_heavy = gr.Button("Heavy Crop (15%)", size="sm")
                        preset_reset = gr.Button("Reset", size="sm")

            # Process button
            process_btn = gr.Button(
                "Process PDF with HTML Enhancement",
                variant="primary",
                size="lg"
            )

        # Results and Preview Section
        with gr.Row():
            with gr.Column(scale=1):
                # Enhanced crop preview with multi-page support
                with gr.Group(visible=False, elem_classes=["page-preview"]) as preview_group:
                    gr.HTML("<h4>Page Preview with Crop Visualization</h4>")
                    crop_preview = gr.Image(
                        label="High-Resolution Page Preview",
                        interactive=False,
                        height=500,
                        show_label=False
                    )

                    gr.HTML("""
                    <p style="font-size: 0.9em; color: #666; text-align: center;">
                    <strong>Red areas:</strong> Will be removed | <strong>Green outline:</strong> Content area |
                    <strong>Enhanced:</strong> 2x resolution processing
                    </p>
                    """)

            with gr.Column(scale=2):
                with gr.Group(elem_classes=["results-panel"]):
                    gr.HTML("<h3>Results & Downloads</h3>")

                    # Processing status
                    processing_status = gr.Textbox(
                        label="Processing Status",
                        interactive=False,
                        lines=1
                    )

                    # Extracted text output
                    text_output = gr.Textbox(
                        label="Extracted Text (Enhanced with Proper Formatting and Page Numbers)",
                        placeholder="Processed text with HTML enhancement and preserved formatting will appear here...",
                        lines=20,
                        max_lines=30,
                        interactive=False,
                        show_copy_button=True
                    )

                    # Metadata information
                    metadata_output = gr.Textbox(
                        label="Processing Information",
                        interactive=False,
                        lines=4
                    )

                    # Enhanced download buttons
                    with gr.Row():
                        download_txt_btn = gr.DownloadButton(
                            "Download Enhanced TXT",
                            visible=False,
                            variant="secondary"
                        )
                        download_docx_btn = gr.DownloadButton(
                            "Download Enhanced DOCX",
                            visible=False,
                            variant="secondary"
                        )
                        download_html_btn = gr.DownloadButton(
                            "Download HTML File",
                            visible=False,
                            variant="secondary"
                        )

        # Service Status at the bottom
        with gr.Group(elem_classes=["status-box"]):
            gr.HTML("<h4>Service Status</h4>")
            service_status = gr.Markdown(
                value=check_enhanced_service_status()
            )

            # Refresh status button
            refresh_btn = gr.Button("Refresh Status", size="sm")

        # Event handlers with enhanced functionality

        # PDF upload handler. The load callback also sets the visibility of
        # crop_controls, so re-apply the checkbox state afterwards; otherwise
        # the crop controls and the preview group can disagree with the
        # checkbox (it is off by default).
        pdf_input.change(
            fn=load_pdf_for_preview,
            inputs=[pdf_input],
            outputs=[crop_preview, page_selector, crop_controls, pdf_load_status]
        ).then(
            fn=_crop_ui_visibility,
            inputs=[enable_header_footer_removal],
            outputs=[crop_controls, preview_group]
        )

        # Method info handler
        method_choice.change(
            fn=get_enhanced_method_info,
            inputs=[method_choice],
            outputs=[method_info]
        )

        # Header/footer removal handler
        enable_header_footer_removal.change(
            fn=_crop_ui_visibility,
            inputs=[enable_header_footer_removal],
            outputs=[crop_controls, preview_group]
        )

        # Page selection handler
        page_selector.change(
            fn=change_preview_page,
            inputs=[page_selector, crop_top, crop_bottom, crop_left, crop_right],
            outputs=[crop_preview]
        )

        # Crop parameter handlers - update preview in real-time
        for crop_input in [crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages]:
            crop_input.change(
                fn=update_crop_preview_interactive,
                inputs=[page_selector, crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages],
                outputs=[crop_preview]
            )

        # Preset button handlers: each returns (top, bottom, left, right)
        crop_sliders = [crop_top, crop_bottom, crop_left, crop_right]

        preset_light.click(
            fn=lambda: (5, 5, 2, 2),
            outputs=crop_sliders
        )

        preset_medium.click(
            fn=lambda: (10, 10, 5, 5),
            outputs=crop_sliders
        )

        preset_heavy.click(
            fn=lambda: (15, 15, 8, 8),
            outputs=crop_sliders
        )

        preset_reset.click(
            fn=lambda: (0, 0, 0, 0),
            outputs=crop_sliders
        )

        # Status refresh handler
        refresh_btn.click(
            fn=check_enhanced_service_status,
            outputs=[service_status]
        )

        # Main processing handler with enhanced downloads
        process_btn.click(
            fn=prepare_enhanced_downloads,
            inputs=[pdf_input, method_choice, enable_header_footer_removal,
                    crop_top, crop_bottom, crop_left, crop_right,
                    apply_to_all_pages, page_selector],
            outputs=[text_output, metadata_output, processing_status,
                     download_txt_btn, download_docx_btn, download_html_btn]
        )

    return interface
794
 
795
def launch_enhanced_ui():
    """Build the enhanced interface and serve it on 0.0.0.0:7860.

    The shared ``pdf_manager`` is closed when the launch call returns or
    raises.
    """
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
    try:
        app = create_enhanced_interface()
        app.launch(**launch_options)
    finally:
        # Clean up resources
        pdf_manager.close()
808
 
809
  if __name__ == "__main__":
810
+ launch_enhanced_ui()
backend.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Backend Management Module - Enhanced with Header/Footer Removal
3
  Coordinates between UI and OCR services, handles file management and preprocessing
4
  """
5
  import re
@@ -26,8 +26,385 @@ logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  class BackendManager:
30
- """Backend manager for PDF OCR processing with enhanced preprocessing and table handling"""
31
 
32
  def __init__(self):
33
  self.ocr_service = OCRService()
@@ -38,12 +415,12 @@ class BackendManager:
38
  self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
39
  self.temp_dir.mkdir(exist_ok=True)
40
 
41
- logger.info("Enhanced backend manager initialized successfully")
42
 
43
- def process_pdf(self, pdf_path: str, method: str = "auto",
44
- preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
45
  """
46
- Process PDF file with optional preprocessing and return results
47
 
48
  Args:
49
  pdf_path: Path to the PDF file
@@ -51,7 +428,7 @@ class BackendManager:
51
  preprocessing_options: Dictionary containing preprocessing settings
52
 
53
  Returns:
54
- Dict containing processing results
55
  """
56
  start_time = datetime.now()
57
 
@@ -61,11 +438,12 @@ class BackendManager:
61
  'success': False,
62
  'error': f"File not found: {pdf_path}",
63
  'text': '',
 
64
  'method_used': '',
65
  'metadata': {}
66
  }
67
 
68
- # Check file size (limit to 50MB by default)
69
  max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
70
  file_size = os.path.getsize(pdf_path)
71
 
@@ -74,14 +452,15 @@ class BackendManager:
74
  'success': False,
75
  'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
76
  'text': '',
 
77
  'method_used': '',
78
  'metadata': {}
79
  }
80
 
81
- # Generate file hash for caching/tracking
82
  file_hash = self._calculate_file_hash(pdf_path)
83
 
84
- logger.info(f"Processing PDF: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
85
  logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
86
 
87
  # Handle preprocessing if enabled
@@ -89,18 +468,17 @@ class BackendManager:
89
  preprocessing_applied = False
90
 
91
  if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
92
- logger.info("Applying header/footer removal preprocessing...")
93
  try:
94
- processed_pdf_path = self._apply_preprocessing(pdf_path, preprocessing_options)
95
  preprocessing_applied = True
96
- logger.info("Preprocessing completed successfully")
97
  except Exception as e:
98
  logger.error(f"Preprocessing failed: {e}")
99
- # Continue with original file if preprocessing fails
100
  processed_pdf_path = pdf_path
101
 
102
  try:
103
- # Process the PDF (original or preprocessed)
104
  result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
105
 
106
  # Add processing metadata
@@ -111,15 +489,12 @@ class BackendManager:
111
  'file_size_mb': round(file_size / (1024*1024), 2),
112
  'processing_time_seconds': round(processing_time, 2),
113
  'timestamp': start_time.isoformat(),
 
 
114
  'header_footer_removed': preprocessing_applied,
115
  'preprocessing_options': preprocessing_options if preprocessing_applied else None
116
  })
117
 
118
- # Post-process for better table handling if needed
119
- if result['success'] and result['text']:
120
- result['text'] = self._post_process_extracted_text(result['text'])
121
- result['metadata']['post_processed'] = True
122
-
123
  # Cleanup temporary preprocessed file
124
  if preprocessing_applied and processed_pdf_path != pdf_path:
125
  try:
@@ -130,14 +505,17 @@ class BackendManager:
130
  # Log results
131
  if result['success']:
132
  text_length = len(result['text'])
133
- table_count = result['text'].count('--- TABLE ')
134
- logger.info(f"Processing completed successfully in {processing_time:.2f}s")
 
 
135
  logger.info(f"Method used: {result['method_used']}")
136
  logger.info(f"Text extracted: {text_length} characters")
 
137
  if table_count > 0:
138
  logger.info(f"Tables detected: {table_count}")
139
  if preprocessing_applied:
140
- logger.info("Header/footer removal applied")
141
 
142
  # Add to processing history
143
  self._add_to_history({
@@ -148,10 +526,12 @@ class BackendManager:
148
  'text_length': text_length,
149
  'table_count': table_count,
150
  'processing_time': processing_time,
151
- 'preprocessing_applied': preprocessing_applied
 
 
152
  })
153
  else:
154
- logger.error(f"Processing failed: {result.get('error', 'Unknown error')}")
155
 
156
  # Add to processing history
157
  self._add_to_history({
@@ -161,15 +541,16 @@ class BackendManager:
161
  'success': False,
162
  'error': result.get('error', 'Unknown error'),
163
  'processing_time': processing_time,
164
- 'preprocessing_applied': preprocessing_applied
 
165
  })
166
 
167
  return result
168
 
169
  except Exception as e:
170
- logger.error(f"Unexpected error during processing: {e}")
171
 
172
- # Cleanup temporary preprocessed file
173
  if preprocessing_applied and processed_pdf_path != pdf_path:
174
  try:
175
  os.unlink(processed_pdf_path)
@@ -184,57 +565,36 @@ class BackendManager:
184
  'method_requested': method,
185
  'success': False,
186
  'error': str(e),
187
- 'processing_time': processing_time
 
188
  })
189
 
190
  return {
191
  'success': False,
192
- 'error': f"Processing error: {str(e)}",
193
  'text': '',
 
194
  'method_used': '',
195
  'metadata': {
196
  'file_hash': file_hash,
197
  'processing_time_seconds': round(processing_time, 2),
198
- 'timestamp': start_time.isoformat()
 
199
  }
200
  }
201
 
202
- def _apply_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
203
- """
204
- Apply header/footer removal preprocessing to PDF
205
-
206
- Args:
207
- pdf_path: Path to original PDF
208
- options: Preprocessing options
209
-
210
- Returns:
211
- Path to preprocessed PDF file
212
- """
213
- removal_method = options.get('removal_method', 'fixed')
214
 
215
  # Create temporary file for processed PDF
216
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
217
- temp_pdf_path = self.temp_dir / f"preprocessed_{timestamp}.pdf"
218
-
219
- if removal_method == 'fixed':
220
- return self._apply_fixed_removal(pdf_path, str(temp_pdf_path), options)
221
- elif removal_method == 'crop':
222
- return self._apply_crop_removal(pdf_path, str(temp_pdf_path), options)
223
- else:
224
- raise ValueError(f"Unknown removal method: {removal_method}")
225
-
226
- def _apply_crop_removal(self, input_pdf: str, output_pdf: str, options: Dict[str, Any]) -> str:
227
- """Apply percentage-based crop removal"""
228
- crop_settings = options.get('crop_settings', {})
229
- top_percent = crop_settings.get('top', 0)
230
- bottom_percent = crop_settings.get('bottom', 0)
231
- left_percent = crop_settings.get('left', 0)
232
- right_percent = crop_settings.get('right', 0)
233
 
234
- if all(v == 0 for v in [top_percent, bottom_percent, left_percent, right_percent]):
235
- return input_pdf # No processing needed
236
-
237
- doc = fitz.open(input_pdf)
238
  new_doc = fitz.open()
239
 
240
  try:
@@ -242,7 +602,17 @@ class BackendManager:
242
  page = doc.load_page(page_num)
243
  page_rect = page.rect
244
 
245
- # Calculate crop amounts in pixels
 
 
 
 
 
 
 
 
 
 
246
  width = page_rect.width
247
  height = page_rect.height
248
 
@@ -259,319 +629,109 @@ class BackendManager:
259
  page_rect.y1 - crop_bottom
260
  )
261
 
262
- # Create new page with cropped content
263
- new_page = new_doc.new_page(width=new_rect.width, height=new_rect.height)
 
 
264
 
265
- # Copy content from cropped area
266
- new_page.show_pdf_page(
267
- new_page.rect,
268
- doc,
269
- page_num,
270
- clip=new_rect
271
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
- new_doc.save(output_pdf)
274
- logger.info(f"Crop removal applied: top={top_percent}%, bottom={bottom_percent}%, left={left_percent}%, right={right_percent}%")
275
 
 
 
 
276
  finally:
277
  doc.close()
278
  new_doc.close()
279
 
280
- return output_pdf
281
 
282
- def _post_process_extracted_text(self, text: str) -> str:
283
- """
284
- Post-process extracted text with ZERO duplication - completely rewritten
285
-
286
- Args:
287
- text: Raw extracted text
288
-
289
- Returns:
290
- Cleaned and formatted text with zero duplicates
291
- """
292
- if not text or not text.strip():
293
- return text
294
-
295
- import re
296
-
297
- # Step 1: Split by page markers first to handle each page individually
298
- if '=== PAGE ' in text:
299
- pages = re.split(r'(=== PAGE \d+ ===)', text)
300
- processed_pages = []
301
-
302
- for i, page_part in enumerate(pages):
303
- if not page_part.strip():
304
- continue
305
-
306
- if page_part.startswith('=== PAGE '):
307
- # This is a page marker, keep it
308
- processed_pages.append(page_part)
309
- else:
310
- # This is page content, process it
311
- cleaned_content = self._clean_page_content(page_part)
312
- if cleaned_content.strip():
313
- processed_pages.append(cleaned_content)
314
-
315
- return '\n'.join(processed_pages)
316
- else:
317
- # Single page content
318
- return self._clean_page_content(text)
319
-
320
- def _clean_page_content(self, content: str) -> str:
321
- """Clean individual page content removing all duplicates and artifacts"""
322
- if not content.strip():
323
- return content
324
-
325
- import re
326
-
327
- # Step 1: Identify and preserve table sections
328
- table_pattern = r'(--- TABLE \d+ ---\n.*?)(?=\n--- TABLE \d+ ---|\n=== PAGE |\Z)'
329
- table_sections = {}
330
- table_positions = []
331
-
332
- for match in re.finditer(table_pattern, content, re.DOTALL):
333
- start_pos = match.start()
334
- end_pos = match.end()
335
- table_content = match.group(1)
336
- table_sections[start_pos] = table_content
337
- table_positions.append((start_pos, end_pos))
338
-
339
- # Step 2: Extract pure text content (excluding table regions)
340
- text_content = content
341
-
342
- # Remove table sections from text processing
343
- for start_pos, end_pos in sorted(table_positions, reverse=True):
344
- text_content = text_content[:start_pos] + '\n<<<TABLE_PLACEHOLDER>>>\n' + text_content[end_pos:]
345
-
346
- # Step 3: Clean the text content
347
- lines = text_content.split('\n')
348
- cleaned_lines = []
349
-
350
- for line in lines:
351
- if line.strip() == '<<<TABLE_PLACEHOLDER>>>':
352
- cleaned_lines.append(line) # Preserve placeholder
353
- continue
354
-
355
- # Remove excessive whitespace but preserve structure
356
- if line.strip():
357
- # Clean up multiple spaces but preserve indentation
358
- leading_spaces = len(line) - len(line.lstrip())
359
- content_part = re.sub(r'\s+', ' ', line.strip())
360
- cleaned_line = ' ' * leading_spaces + content_part
361
- cleaned_lines.append(cleaned_line)
362
- else:
363
- cleaned_lines.append('')
364
-
365
- # Step 4: Remove excessive empty lines
366
- result_lines = []
367
- empty_count = 0
368
-
369
- for line in cleaned_lines:
370
- if not line.strip() and line != '<<<TABLE_PLACEHOLDER>>>':
371
- empty_count += 1
372
- if empty_count <= 1: # Allow max 1 empty line between content
373
- result_lines.append('')
374
- else:
375
- empty_count = 0
376
- result_lines.append(line)
377
-
378
- # Step 5: Restore table sections with enhanced cleaning
379
- processed_text = '\n'.join(result_lines)
380
-
381
- # Replace placeholders with cleaned table content
382
- for start_pos in sorted(table_sections.keys()):
383
- table_content = table_sections[start_pos]
384
- # ENHANCED: Clean table content to remove separator rows
385
- cleaned_table_content = self._clean_table_content(table_content)
386
- processed_text = processed_text.replace('\n<<<TABLE_PLACEHOLDER>>>\n', f'\n{cleaned_table_content}\n', 1)
387
-
388
- return processed_text
389
-
390
- def _clean_table_content(self, table_content: str) -> str:
391
- """Clean table content removing separator rows and duplicates"""
392
- lines = table_content.split('\n')
393
- cleaned_lines = []
394
-
395
- for line in lines:
396
- line_stripped = line.strip()
397
-
398
- # Keep table headers
399
- if line_stripped.startswith('--- TABLE '):
400
- cleaned_lines.append(line_stripped)
401
- continue
402
-
403
- # CRITICAL: Skip separator rows (lines that are mostly dashes and pipes)
404
- if line_stripped:
405
- # Remove pipes and spaces, check if remaining content is just dashes
406
- content_check = line_stripped.replace('|', '').replace(' ', '')
407
- if content_check.replace('-', '') == '':
408
- # This is a separator row, skip it
409
- continue
410
-
411
- # Keep actual content rows
412
- cleaned_lines.append(line_stripped)
413
-
414
- return '\n'.join(cleaned_lines)
415
-
416
def extract_table_data(self, text: str) -> Dict[str, Any]:
    """
    Collect the tables marked with ``--- TABLE n ---`` in processed text.

    Each table body is stripped and parsed with
    ``self._parse_table_content``. When a table number occurs more than
    once, only its first occurrence is parsed and kept.

    Args:
        text: Processed text containing table markers.

    Returns:
        A dict with ``table_count``, ``tables`` (table number -> parsed
        table) and ``has_tables``.
    """
    import re

    # A table body ends at the next table marker, a page marker or the end.
    table_regex = r'--- TABLE (\d+) ---\n(.*?)(?=\n--- TABLE \d+ ---|$|\n=== PAGE)'

    parsed = {}
    for found in re.finditer(table_regex, text, re.DOTALL):
        number = int(found.group(1))
        if number in parsed:
            continue
        parsed[number] = self._parse_table_content(found.group(2).strip())

    total = len(parsed)
    return {
        'table_count': total,
        'tables': parsed,
        'has_tables': total > 0
    }
449
-
450
- def _parse_table_content(self, content: str) -> Dict[str, Any]:
451
- """Parse individual table content into structured data - improved with separator filtering"""
452
- lines = [line.strip() for line in content.split('\n') if line.strip()]
453
-
454
- table_data = {
455
- 'rows': [],
456
- 'columns': 0,
457
- 'has_header': False
458
- }
459
-
460
- seen_rows = set() # Track seen row content to avoid duplicates
461
-
462
- for i, line in enumerate(lines):
463
- # ENHANCED: Skip separator lines more comprehensively
464
- line_content = line.replace('|', '').replace(' ', '')
465
- if line_content.replace('-', '') == '':
466
- continue # Skip separator rows
467
-
468
- if '|' in line:
469
- # Split by | and clean up cells
470
- cells = [cell.strip() for cell in line.split('|')]
471
- # Remove empty cells at start/end
472
- while cells and not cells[0]:
473
- cells.pop(0)
474
- while cells and not cells[-1]:
475
- cells.pop()
476
-
477
- if cells:
478
- # Create a key for duplicate detection
479
- row_key = '|'.join(cells).lower().strip()
480
-
481
- # Only add if we haven't seen this exact row before
482
- if row_key not in seen_rows:
483
- table_data['rows'].append(cells)
484
- table_data['columns'] = max(table_data['columns'], len(cells))
485
- seen_rows.add(row_key)
486
-
487
- # Assume first unique row is header
488
- if len(table_data['rows']) == 1:
489
- table_data['has_header'] = True
490
-
491
- return table_data
492
-
493
def validate_pdf_file(self, file_path: str) -> Dict[str, Any]:
    """
    Validate a PDF file before processing.

    Checks, in order: the file exists, its extension (a non-``.pdf`` name
    only produces a warning), its size against ``MAX_FILE_SIZE_MB``
    (default 50) and that it is not empty, then opens it with PyMuPDF and
    reads text from the first page.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A dict with ``valid`` (bool), ``error`` (message or None),
        ``warnings`` (list of messages) and ``file_info`` (``size_mb`` and
        ``pages`` once the document was opened successfully). Failures are
        reported through ``error``; no exception is raised to the caller.
    """
    validation_result = {
        'valid': False,
        'error': None,
        'warnings': [],
        'file_info': {}
    }

    try:
        # Check if file exists
        if not os.path.exists(file_path):
            validation_result['error'] = "File does not exist"
            return validation_result

        # Check file extension
        if not file_path.lower().endswith('.pdf'):
            validation_result['warnings'].append("File does not have .pdf extension")

        # Check file size
        file_size = os.path.getsize(file_path)
        max_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024

        if file_size > max_size:
            validation_result['error'] = f"File too large ({file_size/(1024*1024):.1f}MB > {max_size/(1024*1024)}MB)"
            return validation_result

        if file_size == 0:
            validation_result['error'] = "File is empty"
            return validation_result

        # Try to open with PyMuPDF to check if it's a valid PDF
        try:
            import fitz
            doc = fitz.open(file_path)
            try:
                page_count = len(doc)

                # Quick validation - reading text from the first page
                # shows that the document is actually readable.
                if page_count > 0:
                    doc.load_page(0).get_text()
            finally:
                # Close the document even when reading the page fails, so
                # a corrupted file does not leak an open handle.
                doc.close()

            if page_count == 0:
                validation_result['warnings'].append("PDF contains no pages")

            validation_result['file_info'] = {
                'size_mb': round(file_size / (1024*1024), 2),
                'pages': page_count
            }

        except Exception as pdf_error:
            validation_result['error'] = f"Invalid or corrupted PDF file: {str(pdf_error)}"
            return validation_result

        validation_result['valid'] = True

    except Exception as e:
        validation_result['error'] = f"Validation error: {str(e)}"

    return validation_result
564
 
565
def get_available_methods(self) -> List[str]:
    """Return the methods reported by ``self.ocr_service``, logging them first."""
    available = self.ocr_service.get_available_methods()
    logger.info(f"Available OCR methods: {available}")
    return available
570
 
571
  def get_service_status(self) -> Dict[str, Any]:
572
- """Get comprehensive service status"""
573
  available_methods = self.get_available_methods()
574
 
 
 
 
 
 
 
 
575
  status = {
576
  'service_healthy': True,
577
  'available_methods': available_methods,
@@ -582,37 +742,16 @@ class BackendManager:
582
  'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
583
  'temp_dir': str(self.temp_dir),
584
  'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
585
- 'table_processing_enhanced': True,
586
- 'preprocessing_available': True
 
 
 
 
587
  }
588
 
589
  return status
590
 
591
def get_processing_history(self, limit: int = 10) -> List[Dict[str, Any]]:
    """
    Return the most recent processing history entries.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        A new list holding the last ``limit`` entries of
        ``self.processing_history`` in their stored order, or an empty list
        when ``limit`` is zero or negative.
    """
    if limit <= 0:
        # history[-0:] is the whole list and a negative limit would slice
        # from the front, so handle both explicitly.
        return []
    return self.processing_history[-limit:]
594
-
595
def cleanup_temp_files(self):
    """
    Delete stale files from ``self.temp_dir``.

    Regular files directly inside the directory whose modification time is
    more than 3600 seconds in the past are removed. A failure on one file
    is logged as a warning and the remaining files are still examined; any
    other failure is logged as an error. Nothing is raised or returned.
    """
    try:
        removed = 0

        # Snapshot the listing first; entries are deleted while looping.
        for entry in list(self.temp_dir.glob('*')):
            try:
                if not entry.is_file():
                    continue
                modified_at = entry.stat().st_mtime
                # Remove files older than 1 hour
                if modified_at < (datetime.now().timestamp() - 3600):
                    entry.unlink()
                    removed += 1
            except Exception as e:
                logger.warning(f"Could not remove temp file {entry}: {e}")

        if removed > 0:
            logger.info(f"Cleaned up {removed} temporary files")

    except Exception as e:
        logger.error(f"Error during cleanup: {e}")
615
-
616
  def _calculate_file_hash(self, file_path: str) -> str:
617
  """Calculate SHA-256 hash of file"""
618
  sha256_hash = hashlib.sha256()
@@ -634,32 +773,29 @@ class BackendManager:
634
  if len(self.processing_history) > self.max_history_size:
635
  self.processing_history = self.processing_history[-self.max_history_size:]
636
 
637
def export_processing_history(self, file_path: str = None) -> str:
    """
    Write the processing history, with a status snapshot, to a JSON file.

    Args:
        file_path: Destination path. When None, a timestamped
            ``processing_history_<timestamp>.json`` inside
            ``self.temp_dir`` is used.

    Returns:
        The destination path as a string.

    Raises:
        Exception: Any error raised while collecting or writing the data is
            logged and re-raised unchanged.
    """
    destination = file_path
    if destination is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        destination = self.temp_dir / f"processing_history_{stamp}.json"

    try:
        payload = {
            'exported_at': datetime.now().isoformat(),
            'total_entries': len(self.processing_history),
            'service_status': self.get_service_status(),
            'history': self.processing_history
        }

        with open(destination, 'w') as out_file:
            json.dump(payload, out_file, indent=2)

        logger.info(f"Processing history exported to: {destination}")
        return str(destination)

    except Exception as e:
        logger.error(f"Error exporting history: {e}")
        raise
660
 
661
- def get_statistics(self) -> Dict[str, Any]:
662
- """Get processing statistics"""
663
  if not self.processing_history:
664
  return {
665
  'total_processed': 0,
@@ -668,27 +804,31 @@ class BackendManager:
668
  'most_used_method': 'N/A',
669
  'total_text_extracted': 0,
670
  'total_tables_processed': 0,
671
- 'preprocessing_usage': 0
 
 
672
  }
673
 
674
  total_processed = len(self.processing_history)
675
  successful = [h for h in self.processing_history if h.get('success', False)]
676
  success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0
677
 
678
- # Calculate average processing time
679
  processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
680
  avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
681
 
682
- # Find most used method
683
  methods = [h.get('method_used', 'unknown') for h in successful]
684
  most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'
685
 
686
- # Calculate total text and tables extracted
687
  total_text = sum(h.get('text_length', 0) for h in successful)
688
  total_tables = sum(h.get('table_count', 0) for h in successful)
689
 
690
- # Calculate preprocessing usage
691
  preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
 
 
 
 
 
692
 
693
  return {
694
  'total_processed': total_processed,
@@ -699,15 +839,18 @@ class BackendManager:
699
  'total_tables_processed': total_tables,
700
  'successful_processes': len(successful),
701
  'failed_processes': total_processed - len(successful),
702
- 'preprocessing_usage': preprocessing_usage
 
 
 
703
  }
704
 
705
 
706
- # Initialize global backend manager instance
707
  _backend_manager = None
708
 
709
  def get_backend_manager() -> BackendManager:
710
- """Get global backend manager instance"""
711
  global _backend_manager
712
  if _backend_manager is None:
713
  _backend_manager = BackendManager()
@@ -715,11 +858,11 @@ def get_backend_manager() -> BackendManager:
715
 
716
 
717
if __name__ == "__main__":
    # Manual smoke test: build a manager and print what it reports.
    manager = BackendManager()

    for banner_line in ("Enhanced Backend Manager Test", "============================"):
        print(banner_line)
    print(f"Available methods: {manager.get_available_methods()}")
    print(f"Service status: {manager.get_service_status()}")
    print(f"Statistics: {manager.get_statistics()}")
 
1
  """
2
+ Backend Management Module - FIXED VERSION with Corrected Crop Processing
3
  Coordinates between UI and OCR services, handles file management and preprocessing
4
  """
5
  import re
 
26
  logger = logging.getLogger(__name__)
27
 
28
 
29
+ class DocumentExporter:
30
+ """Advanced document export with HTML-based formatting"""
31
+
32
+ @staticmethod
33
def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
    """
    Write the extracted text to a new UTF-8 ``.txt`` temp file.

    The file holds a title banner, an optional "Processing Information"
    section, a generation timestamp and then the extracted text.

    Args:
        text_content: Extracted text written as the main content.
        html_content: Not used by this function.
        metadata_info: Optional processing details; the section is omitted
            when this is empty.

    Returns:
        Path of the created file. The file is not deleted automatically.

    Raises:
        Exception: Any error raised while writing is logged and re-raised;
            the partially written file is removed first.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_file = tempfile.NamedTemporaryFile(
        suffix=f'_extracted_text_{timestamp}.txt',
        delete=False,
        mode='w',
        encoding='utf-8'
    )

    try:
        # The context manager closes the handle on success and on failure.
        with temp_file:
            # Add header
            temp_file.write("PDF OCR Extraction Results - Enhanced with HTML Processing\n")
            temp_file.write("=" * 70 + "\n\n")

            # Add metadata
            if metadata_info:
                temp_file.write("Processing Information:\n")
                temp_file.write("-" * 25 + "\n")
                temp_file.write(metadata_info + "\n\n")

            # Add timestamp
            temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            temp_file.write("=" * 70 + "\n\n")

            # Add main content
            temp_file.write("Extracted Text (Formatted):\n")
            temp_file.write("-" * 30 + "\n\n")
            temp_file.write(text_content)

        return temp_file.name

    except Exception as e:
        logger.error(f"Error creating enhanced TXT file: {e}")
        # delete=False means nothing else would remove the half-written file.
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass
        raise
70
+
71
+ @staticmethod
72
+ def create_enhanced_docx_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
73
+ """Create enhanced DOCX file from HTML content with proper spacing and indentation"""
74
+ try:
75
+ from docx import Document
76
+ from docx.shared import Inches, Pt, RGBColor
77
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
78
+ from docx.enum.table import WD_TABLE_ALIGNMENT
79
+ from docx.oxml.shared import OxmlElement, qn
80
+ from html.parser import HTMLParser
81
+
82
+ # Enhanced HTML to DOCX parser with spacing preservation
83
+ class EnhancedDOCXHTMLParser(HTMLParser):
84
+ def __init__(self, doc):
85
+ super().__init__()
86
+ self.doc = doc
87
+ self.current_paragraph = None
88
+ self.current_run = None
89
+ self.in_table = False
90
+ self.current_table = None
91
+ self.current_row = None
92
+ self.current_cell = None
93
+ self.table_data = []
94
+ self.current_table_row = []
95
+ self.current_indent_em = 0
96
+ self.is_bold = False
97
+ self.is_title = False
98
+ self.is_heading = False
99
+ self.is_bullet_point = False
100
+
101
+ def handle_starttag(self, tag, attrs):
102
+ attr_dict = dict(attrs)
103
+ class_attr = attr_dict.get('class', '')
104
+ style_attr = attr_dict.get('style', '')
105
+
106
+ if tag == 'div' and 'page' in class_attr:
107
+ # Add minimal page separation (just paragraph spacing, no page break)
108
+ if hasattr(self, 'has_content'):
109
+ # Add just 2 line breaks worth of spacing
110
+ self.doc.add_paragraph()
111
+ self.doc.add_paragraph()
112
+ self.has_content = True
113
+
114
+ elif tag == 'div' and 'page-header' in class_attr:
115
+ self.current_paragraph = self.doc.add_heading(level=1)
116
+ self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
117
+
118
+ elif tag == 'div' and 'title' in class_attr:
119
+ self.current_paragraph = self.doc.add_heading(level=1)
120
+ self.is_title = True
121
+ self._apply_spacing_from_style(style_attr)
122
+
123
+ elif tag == 'div' and 'section-heading' in class_attr:
124
+ self.current_paragraph = self.doc.add_heading(level=2)
125
+ self.is_heading = True
126
+ self._apply_spacing_from_style(style_attr)
127
+
128
+ elif tag == 'div' and 'paragraph' in class_attr:
129
+ self.current_paragraph = self.doc.add_paragraph()
130
+ self.is_bullet_point = 'bullet-point' in class_attr
131
+ self._apply_spacing_from_style(style_attr)
132
+
133
+ elif tag == 'table':
134
+ self.in_table = True
135
+ self.table_data = []
136
+
137
+ elif tag == 'tr':
138
+ self.current_table_row = []
139
+
140
+ elif tag == 'th' or tag == 'td':
141
+ pass # Will be handled in handle_data
142
+
143
+ elif tag == 'br':
144
+ if self.current_paragraph:
145
+ self.current_paragraph.add_run().add_break()
146
+
147
+ def _apply_spacing_from_style(self, style_attr):
148
+ """Apply spacing and indentation from HTML style to DOCX paragraph"""
149
+ if not self.current_paragraph:
150
+ return
151
+
152
+ # Extract margin-left for indentation
153
+ import re
154
+ margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
155
+ if margin_match:
156
+ em_value = float(margin_match.group(1))
157
+ # Convert em to inches (1em ≈ 12pt, 72pt = 1 inch)
158
+ indent_inches = (em_value * 12) / 72
159
+ self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
160
+
161
+ # For bullet points, add hanging indent
162
+ if self.is_bullet_point:
163
+ self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
164
+
165
+ # Set line spacing for better readability
166
+ from docx.shared import Length
167
+ self.current_paragraph.paragraph_format.line_spacing = 1.15
168
+
169
+ # Add appropriate spacing after paragraphs
170
+ self.current_paragraph.paragraph_format.space_after = Pt(6)
171
+
172
+ def handle_endtag(self, tag):
173
+ if tag == 'div' and (self.is_title or self.is_heading):
174
+ self.is_title = False
175
+ self.is_heading = False
176
+ self.current_paragraph = None
177
+
178
+ elif tag == 'div' and self.current_paragraph and not self.in_table:
179
+ self.is_bullet_point = False
180
+ self.current_paragraph = None
181
+
182
+ elif tag == 'table':
183
+ self.in_table = False
184
+ self._create_enhanced_docx_table()
185
+
186
+ elif tag == 'tr' and self.current_table_row:
187
+ self.table_data.append(self.current_table_row[:])
188
+ self.current_table_row = []
189
+
190
+ def handle_data(self, data):
191
+ if data.strip():
192
+ # Convert &nbsp; back to regular spaces
193
+ data = data.replace('&nbsp;', ' ')
194
+
195
+ if self.in_table:
196
+ self.current_table_row.append(data.strip())
197
+ elif self.current_paragraph is not None:
198
+ run = self.current_paragraph.add_run(data)
199
+ if self.is_title:
200
+ run.bold = True
201
+ run.font.size = Pt(16)
202
+ elif self.is_heading:
203
+ run.bold = True
204
+ run.font.size = Pt(14)
205
+ else:
206
+ # Regular text formatting
207
+ run.font.size = Pt(11)
208
+
209
+ def _create_enhanced_docx_table(self):
210
+ if not self.table_data:
211
+ return
212
+
213
+ # Create table with proper formatting
214
+ rows = len(self.table_data)
215
+ cols = max(len(row) for row in self.table_data) if self.table_data else 1
216
+
217
+ table = self.doc.add_table(rows=rows, cols=cols)
218
+ table.style = 'Table Grid'
219
+ table.alignment = WD_TABLE_ALIGNMENT.LEFT
220
+
221
+ # Set table margins
222
+ table.autofit = False
223
+
224
+ # Fill table data with proper formatting
225
+ for row_idx, row_data in enumerate(self.table_data):
226
+ table_row = table.rows[row_idx]
227
+ for col_idx, cell_data in enumerate(row_data):
228
+ if col_idx < len(table_row.cells):
229
+ cell = table_row.cells[col_idx]
230
+ cell.text = str(cell_data)
231
+
232
+ # Style header row
233
+ if row_idx == 0:
234
+ for paragraph in cell.paragraphs:
235
+ for run in paragraph.runs:
236
+ run.bold = True
237
+ run.font.size = Pt(10)
238
+ paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
239
+ else:
240
+ # Regular data cells
241
+ for paragraph in cell.paragraphs:
242
+ for run in paragraph.runs:
243
+ run.font.size = Pt(10)
244
+
245
+ # Set cell margins for better spacing
246
+ cell.vertical_alignment = WD_ALIGN_PARAGRAPH.LEFT
247
+
248
+ # Add spacing after table
249
+ self.doc.add_paragraph()
250
+
251
+ # Create DOCX document
252
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
253
+ temp_file = tempfile.NamedTemporaryFile(
254
+ suffix=f'_extracted_document_{timestamp}.docx',
255
+ delete=False
256
+ )
257
+ temp_file.close()
258
+
259
+ doc = Document()
260
+
261
+ # Set document margins for better spacing
262
+ sections = doc.sections
263
+ for section in sections:
264
+ section.top_margin = Inches(1)
265
+ section.bottom_margin = Inches(1)
266
+ section.left_margin = Inches(1)
267
+ section.right_margin = Inches(1)
268
+
269
+ # Title with better formatting
270
+ title = doc.add_heading('PDF OCR Extraction Results', 0)
271
+ title.alignment = WD_ALIGN_PARAGRAPH.CENTER
272
+
273
+ # Add subtitle with enhanced styling
274
+ subtitle_para = doc.add_paragraph()
275
+ subtitle_run = subtitle_para.add_run('Enhanced with HTML Processing and Preserved Formatting')
276
+ subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
277
+ subtitle_run.italic = True
278
+ subtitle_run.font.size = Pt(12)
279
+ subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
280
+
281
+ # Metadata section with better formatting
282
+ if metadata_info:
283
+ doc.add_heading('Processing Information', level=1)
284
+ meta_para = doc.add_paragraph()
285
+ meta_run = meta_para.add_run(metadata_info)
286
+ meta_run.font.size = Pt(10)
287
+ meta_para.style = 'Intense Quote'
288
+ doc.add_paragraph() # Add spacing
289
+
290
+ # Process HTML content with enhanced spacing
291
+ doc.add_heading('Extracted Content', level=1)
292
+
293
+ if html_content and '<table' in html_content:
294
+ # Parse HTML and convert to DOCX with spacing preservation
295
+ parser = EnhancedDOCXHTMLParser(doc)
296
+ parser.feed(html_content)
297
+ else:
298
+ # Fallback to text content with enhanced formatting
299
+ paragraphs = text_content.split('\n\n')
300
+ for para in paragraphs:
301
+ if para.strip():
302
+ if para.strip().startswith('==='):
303
+ # Page headers with minimal separation
304
+ page_header = doc.add_heading(para.strip(), level=1)
305
+ page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
306
+ elif para.strip().startswith('#'):
307
+ # Titles
308
+ title_text = para.strip().lstrip('#').strip()
309
+ title_para = doc.add_heading(title_text, level=1)
310
+ elif para.strip().startswith('##'):
311
+ # Section headings
312
+ heading_text = para.strip().lstrip('#').strip()
313
+ heading_para = doc.add_heading(heading_text, level=2)
314
+ else:
315
+ # Regular paragraphs with spacing preservation
316
+ lines = para.split('\n')
317
+ for line in lines:
318
+ if line.strip():
319
+ para_element = doc.add_paragraph()
320
+
321
+ # Calculate indentation from leading spaces
322
+ leading_spaces = len(line) - len(line.lstrip())
323
+ if leading_spaces > 0:
324
+ indent_level = leading_spaces // 2 # 2 spaces = 1 indent level
325
+ para_element.paragraph_format.left_indent = Inches(0.5 * indent_level)
326
+
327
+ # Add the text content
328
+ run = para_element.add_run(line.strip())
329
+ run.font.size = Pt(11)
330
+
331
+ # Set line spacing
332
+ para_element.paragraph_format.line_spacing = 1.15
333
+ para_element.paragraph_format.space_after = Pt(3)
334
+
335
+ # Enhanced footer
336
+ footer_section = doc.sections[0]
337
+ footer = footer_section.footer
338
+ footer_para = footer.paragraphs[0]
339
+ footer_para.text = f"Generated by Enhanced PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
340
+ footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
341
+ footer_run = footer_para.runs[0]
342
+ footer_run.font.size = Pt(9)
343
+ footer_run.font.color.rgb = RGBColor(128, 128, 128)
344
+
345
+ doc.save(temp_file.name)
346
+ logger.info(f"Enhanced DOCX file with proper spacing created: {temp_file.name}")
347
+ return temp_file.name
348
+
349
+ except ImportError:
350
+ raise ImportError("python-docx not installed. Cannot create DOCX files.")
351
+ except Exception as e:
352
+ logger.error(f"Error creating enhanced DOCX file: {e}")
353
+ try:
354
+ os.unlink(temp_file.name)
355
+ except:
356
+ pass
357
+ raise
358
+
359
@staticmethod
def create_html_file(html_content: str, metadata_info: str = "") -> str:
    """Write ``html_content`` to a standalone, styled temporary HTML file.

    Extra CSS rules are injected after the first ``<style>`` tag. When the
    document contains a ``<body>`` tag, the content is wrapped in a
    ``container`` div preceded by a title header and, if ``metadata_info``
    is given, a "Processing Information" block.

    Args:
        html_content: HTML document to export; ``None`` is treated as empty.
        metadata_info: Plain-text processing details. It is HTML-escaped
            before being embedded so it cannot inject markup.

    Returns:
        Path of the created ``.html`` file. The file is not deleted
        automatically; the caller owns it.

    Raises:
        Exception: Any error raised while building or writing the file is
            logged and re-raised after the partial file has been removed.
    """
    # Function-scope import: only this exporter needs HTML escaping.
    from html import escape

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_file = tempfile.NamedTemporaryFile(
        suffix=f'_extracted_document_{timestamp}.html',
        delete=False,
        mode='w',
        encoding='utf-8'
    )

    try:
        with temp_file:
            # Enhanced HTML with better styling (first <style> tag only).
            enhanced_html = (html_content or '').replace(
                '<style>',
                '''<style>
body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; margin: 20px; background-color: #f9f9f9; }
.container { max-width: 1200px; margin: 0 auto; background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
.header { text-align: center; margin-bottom: 30px; border-bottom: 3px solid #2c3e50; padding-bottom: 20px; }
.metadata { background-color: #ecf0f1; padding: 15px; border-radius: 5px; margin-bottom: 25px; border-left: 4px solid #3498db; }
''',
                1
            )

            # Wrap content in container
            if '<body>' in enhanced_html:
                metadata_block = ''
                if metadata_info:
                    metadata_block = (
                        '<div class="metadata"><h3>Processing Information</h3>'
                        f'<pre>{escape(metadata_info)}</pre></div>'
                    )
                page_header = '''<body>
<div class="container">
<div class="header">
<h1>PDF OCR Extraction Results</h1>
<p>Enhanced with HTML Processing and Format Preservation</p>
</div>''' + metadata_block

                # Inject the header exactly once so the container div opened
                # here is balanced by the single closing div added below.
                enhanced_html = enhanced_html.replace('<body>', page_header, 1)
                before, closing_tag, after = enhanced_html.rpartition('</body>')
                if closing_tag:
                    enhanced_html = f'{before}</div></body>{after}'

            temp_file.write(enhanced_html)
        return temp_file.name

    except Exception as e:
        logger.error(f"Error creating HTML file: {e}")
        temp_file.close()
        # Do not leave a partial export behind in the temp directory.
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass
        raise
404
+
405
+
406
  class BackendManager:
407
+ """Enhanced backend manager with FIXED crop processing and advanced export capabilities"""
408
 
409
  def __init__(self):
410
  self.ocr_service = OCRService()
 
415
  self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
416
  self.temp_dir.mkdir(exist_ok=True)
417
 
418
+ logger.info("Enhanced backend manager with fixed crop processing initialized successfully")
419
 
420
+ def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
421
+ preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
422
  """
423
+ Process PDF with enhanced resolution and HTML generation
424
 
425
  Args:
426
  pdf_path: Path to the PDF file
 
428
  preprocessing_options: Dictionary containing preprocessing settings
429
 
430
  Returns:
431
+ Dict containing processing results with HTML content
432
  """
433
  start_time = datetime.now()
434
 
 
438
  'success': False,
439
  'error': f"File not found: {pdf_path}",
440
  'text': '',
441
+ 'html': '',
442
  'method_used': '',
443
  'metadata': {}
444
  }
445
 
446
+ # Check file size
447
  max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
448
  file_size = os.path.getsize(pdf_path)
449
 
 
452
  'success': False,
453
  'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
454
  'text': '',
455
+ 'html': '',
456
  'method_used': '',
457
  'metadata': {}
458
  }
459
 
460
+ # Generate file hash for tracking
461
  file_hash = self._calculate_file_hash(pdf_path)
462
 
463
+ logger.info(f"Processing PDF with enhanced resolution: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
464
  logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
465
 
466
  # Handle preprocessing if enabled
 
468
  preprocessing_applied = False
469
 
470
  if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
471
+ logger.info("Applying enhanced preprocessing...")
472
  try:
473
+ processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options)
474
  preprocessing_applied = True
475
+ logger.info("Enhanced preprocessing completed successfully")
476
  except Exception as e:
477
  logger.error(f"Preprocessing failed: {e}")
 
478
  processed_pdf_path = pdf_path
479
 
480
  try:
481
+ # Process with enhanced OCR
482
  result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
483
 
484
  # Add processing metadata
 
489
  'file_size_mb': round(file_size / (1024*1024), 2),
490
  'processing_time_seconds': round(processing_time, 2),
491
  'timestamp': start_time.isoformat(),
492
+ 'enhanced_processing': True,
493
+ 'html_processing': True,
494
  'header_footer_removed': preprocessing_applied,
495
  'preprocessing_options': preprocessing_options if preprocessing_applied else None
496
  })
497
 
 
 
 
 
 
498
  # Cleanup temporary preprocessed file
499
  if preprocessing_applied and processed_pdf_path != pdf_path:
500
  try:
 
505
  # Log results
506
  if result['success']:
507
  text_length = len(result['text'])
508
+ has_html = bool(result.get('html'))
509
+ table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0
510
+
511
+ logger.info(f"Enhanced processing completed successfully in {processing_time:.2f}s")
512
  logger.info(f"Method used: {result['method_used']}")
513
  logger.info(f"Text extracted: {text_length} characters")
514
+ logger.info(f"HTML generated: {has_html}")
515
  if table_count > 0:
516
  logger.info(f"Tables detected: {table_count}")
517
  if preprocessing_applied:
518
+ logger.info("Enhanced preprocessing applied")
519
 
520
  # Add to processing history
521
  self._add_to_history({
 
526
  'text_length': text_length,
527
  'table_count': table_count,
528
  'processing_time': processing_time,
529
+ 'preprocessing_applied': preprocessing_applied,
530
+ 'html_generated': has_html,
531
+ 'enhanced_processing': True
532
  })
533
  else:
534
+ logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
535
 
536
  # Add to processing history
537
  self._add_to_history({
 
541
  'success': False,
542
  'error': result.get('error', 'Unknown error'),
543
  'processing_time': processing_time,
544
+ 'preprocessing_applied': preprocessing_applied,
545
+ 'enhanced_processing': True
546
  })
547
 
548
  return result
549
 
550
  except Exception as e:
551
+ logger.error(f"Unexpected error during enhanced processing: {e}")
552
 
553
+ # Cleanup
554
  if preprocessing_applied and processed_pdf_path != pdf_path:
555
  try:
556
  os.unlink(processed_pdf_path)
 
565
  'method_requested': method,
566
  'success': False,
567
  'error': str(e),
568
+ 'processing_time': processing_time,
569
+ 'enhanced_processing': True
570
  })
571
 
572
  return {
573
  'success': False,
574
+ 'error': f"Enhanced processing error: {str(e)}",
575
  'text': '',
576
+ 'html': '',
577
  'method_used': '',
578
  'metadata': {
579
  'file_hash': file_hash,
580
  'processing_time_seconds': round(processing_time, 2),
581
+ 'timestamp': start_time.isoformat(),
582
+ 'enhanced_processing': True
583
  }
584
  }
585
 
586
+ def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
587
+ """Apply enhanced preprocessing with high-resolution crop handling - FIXED"""
588
+ crop_settings = options.get('crop_settings', {})
589
+ per_page_crops = crop_settings.get('per_page_crops', {})
590
+ enhanced_resolution = crop_settings.get('enhanced_resolution', True)
591
+ resolution_scale = crop_settings.get('resolution_scale', 2.0)
 
 
 
 
 
 
592
 
593
  # Create temporary file for processed PDF
594
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
595
+ temp_pdf_path = self.temp_dir / f"enhanced_preprocessed_{timestamp}.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
+ doc = fitz.open(pdf_path)
 
 
 
598
  new_doc = fitz.open()
599
 
600
  try:
 
602
  page = doc.load_page(page_num)
603
  page_rect = page.rect
604
 
605
+ # Get crop settings for this page - FIXED indexing
606
+ page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
607
+ 'top': 0, 'bottom': 0, 'left': 0, 'right': 0
608
+ }))
609
+
610
+ top_percent = page_crop.get('top', 0)
611
+ bottom_percent = page_crop.get('bottom', 0)
612
+ left_percent = page_crop.get('left', 0)
613
+ right_percent = page_crop.get('right', 0)
614
+
615
+ # Calculate crop amounts
616
  width = page_rect.width
617
  height = page_rect.height
618
 
 
629
  page_rect.y1 - crop_bottom
630
  )
631
 
632
+ # Ensure the rectangle is valid
633
+ if new_rect.width <= 0 or new_rect.height <= 0:
634
+ logger.warning(f"Invalid crop rectangle for page {page_num}, using original page")
635
+ new_rect = page_rect
636
 
637
+ # Create new page with enhanced resolution if enabled
638
+ if enhanced_resolution:
639
+ # Use high resolution for better quality
640
+ new_page = new_doc.new_page(
641
+ width=new_rect.width,
642
+ height=new_rect.height
643
+ )
644
+
645
+ # Copy content with proper transformation
646
+ mat = fitz.Matrix(1, 1).prescale(resolution_scale, resolution_scale)
647
+ new_page.show_pdf_page(
648
+ new_page.rect,
649
+ doc,
650
+ page_num,
651
+ clip=new_rect
652
+ )
653
+ else:
654
+ # Standard resolution
655
+ new_page = new_doc.new_page(width=new_rect.width, height=new_rect.height)
656
+ new_page.show_pdf_page(
657
+ new_page.rect,
658
+ doc,
659
+ page_num,
660
+ clip=new_rect
661
+ )
662
+
663
+ logger.debug(f"Page {page_num}: Applied crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%")
664
 
665
+ new_doc.save(str(temp_pdf_path))
666
+ logger.info(f"Enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages")
667
 
668
+ except Exception as e:
669
+ logger.error(f"Error in enhanced preprocessing: {e}")
670
+ raise
671
  finally:
672
  doc.close()
673
  new_doc.close()
674
 
675
+ return str(temp_pdf_path)
676
 
677
def create_enhanced_downloads(self, text_content: str, html_content: str,
                              metadata_info: str = "") -> Dict[str, str]:
    """Export the extraction result as downloadable TXT, DOCX and HTML files.

    The TXT export is mandatory: if it fails the error is logged and
    re-raised. The DOCX and HTML exports are best-effort; a failure in
    either is logged and that format is simply left out of the result.

    Args:
        text_content: Extracted plain text.
        html_content: Extracted HTML.
        metadata_info: Optional processing details passed to every exporter.

    Returns:
        Mapping of format key ('txt', 'docx', 'html') to the created file
        path, containing only the formats that were produced.
    """
    exports = {}

    try:
        # Plain-text export: any failure here propagates to the caller.
        exports['txt'] = DocumentExporter.create_enhanced_txt_file(
            text_content, html_content, metadata_info
        )
        logger.info(f"Enhanced TXT file created: {exports['txt']}")

        # DOCX export: optional, depends on python-docx being installed.
        try:
            exports['docx'] = DocumentExporter.create_enhanced_docx_file(
                text_content, html_content, metadata_info
            )
        except ImportError:
            logger.warning("python-docx not available. DOCX creation skipped.")
        except Exception as docx_error:
            logger.error(f"DOCX creation failed: {docx_error}")
        else:
            logger.info(f"Enhanced DOCX file created: {exports['docx']}")

        # Standalone HTML export: optional.
        try:
            exports['html'] = DocumentExporter.create_html_file(
                html_content, metadata_info
            )
        except Exception as html_error:
            logger.error(f"HTML file creation failed: {html_error}")
        else:
            logger.info(f"HTML file created: {exports['html']}")

    except Exception as export_error:
        logger.error(f"Error creating enhanced downloads: {export_error}")
        raise

    return exports
717
 
718
def get_available_methods(self) -> List[str]:
    """Return the OCR methods reported by the underlying OCR service.

    The list is also written to the log at INFO level.
    """
    available = self.ocr_service.get_available_methods()
    logger.info(f"Available enhanced OCR methods: {available}")
    return available
723
 
724
  def get_service_status(self) -> Dict[str, Any]:
725
+ """Get comprehensive service status with enhanced features"""
726
  available_methods = self.get_available_methods()
727
 
728
+ # Check DOCX support
729
+ try:
730
+ import docx
731
+ docx_available = True
732
+ except ImportError:
733
+ docx_available = False
734
+
735
  status = {
736
  'service_healthy': True,
737
  'available_methods': available_methods,
 
742
  'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
743
  'temp_dir': str(self.temp_dir),
744
  'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
745
+ 'enhanced_processing': True,
746
+ 'html_processing': True,
747
+ 'docx_export_available': docx_available,
748
+ 'enhanced_crop_processing': True,
749
+ 'multi_resolution_support': True,
750
+ 'crop_processing_fixed': True
751
  }
752
 
753
  return status
754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
  def _calculate_file_hash(self, file_path: str) -> str:
756
  """Calculate SHA-256 hash of file"""
757
  sha256_hash = hashlib.sha256()
 
773
  if len(self.processing_history) > self.max_history_size:
774
  self.processing_history = self.processing_history[-self.max_history_size:]
775
 
776
def cleanup_temp_files(self, max_age_seconds: float = 3600):
    """Delete stale files from the service's temporary directory.

    Only regular files directly inside ``self.temp_dir`` are considered;
    sub-directories are left untouched. Cleanup is best-effort: a file that
    cannot be inspected or removed is logged and skipped, and any other
    failure is logged rather than raised.

    Args:
        max_age_seconds: Files whose modification time is older than this
            many seconds are removed. Defaults to one hour.
    """
    try:
        # Compute the cutoff once so every file is judged against the same
        # instant instead of re-reading the clock per file.
        cutoff = datetime.now().timestamp() - max_age_seconds
        cleaned_count = 0

        # Snapshot the listing first: entries are deleted while looping.
        for temp_file in list(self.temp_dir.glob('*')):
            try:
                if temp_file.is_file() and temp_file.stat().st_mtime < cutoff:
                    temp_file.unlink()
                    cleaned_count += 1
            except OSError as e:
                logger.warning(f"Could not remove temp file {temp_file}: {e}")

        if cleaned_count > 0:
            logger.info(f"Cleaned up {cleaned_count} temporary files")

    except Exception as e:
        logger.error(f"Error during cleanup: {e}")
 
796
 
797
+ def get_enhanced_statistics(self) -> Dict[str, Any]:
798
+ """Get enhanced processing statistics"""
799
  if not self.processing_history:
800
  return {
801
  'total_processed': 0,
 
804
  'most_used_method': 'N/A',
805
  'total_text_extracted': 0,
806
  'total_tables_processed': 0,
807
+ 'preprocessing_usage': 0,
808
+ 'html_generation_rate': 0,
809
+ 'enhanced_processing_usage': 0
810
  }
811
 
812
  total_processed = len(self.processing_history)
813
  successful = [h for h in self.processing_history if h.get('success', False)]
814
  success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0
815
 
816
+ # Calculate statistics
817
  processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
818
  avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
819
 
 
820
  methods = [h.get('method_used', 'unknown') for h in successful]
821
  most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'
822
 
 
823
  total_text = sum(h.get('text_length', 0) for h in successful)
824
  total_tables = sum(h.get('table_count', 0) for h in successful)
825
 
 
826
  preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
827
+ html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
828
+ enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
829
+
830
+ html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
831
+ enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
832
 
833
  return {
834
  'total_processed': total_processed,
 
839
  'total_tables_processed': total_tables,
840
  'successful_processes': len(successful),
841
  'failed_processes': total_processed - len(successful),
842
+ 'preprocessing_usage': preprocessing_usage,
843
+ 'html_generation_rate': round(html_generation_rate, 2),
844
+ 'enhanced_processing_usage': enhanced_processing,
845
+ 'enhanced_processing_rate': round(enhanced_processing_rate, 2)
846
  }
847
 
848
 
849
+ # Global backend manager instance
850
  _backend_manager = None
851
 
852
  def get_backend_manager() -> BackendManager:
853
+ """Get global enhanced backend manager instance"""
854
  global _backend_manager
855
  if _backend_manager is None:
856
  _backend_manager = BackendManager()
 
858
 
859
 
860
if __name__ == "__main__":
    # Manual smoke test: build a manager and dump what it reports.
    backend = BackendManager()

    print("Enhanced Backend Manager with Fixed Crop Processing Test", "=" * 60, sep="\n")

    # Each report is queried immediately before it is printed so that any log
    # output produced by the call stays next to its own line.
    methods_report = backend.get_available_methods()
    print(f"Available methods: {methods_report}")
    status_report = backend.get_service_status()
    print(f"Service status: {status_report}")
    stats_report = backend.get_enhanced_statistics()
    print(f"Enhanced statistics: {stats_report}")
ocr_service.py CHANGED
@@ -1,11 +1,11 @@
1
  """
2
- OCR Service Module - FIXED VERSION
3
- Handles PDF to text conversion using Azure Document Intelligence with fallback methods
4
  """
5
  import re
6
  import os
7
  import logging
8
- from typing import Optional, Dict, Any, Tuple
9
  import tempfile
10
  from pathlib import Path
11
 
@@ -35,8 +35,708 @@ logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  class OCRService:
39
- """Main OCR service with multiple providers and fallback mechanisms"""
40
 
41
  def __init__(self):
42
  self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
@@ -58,18 +758,19 @@ class OCRService:
58
 
59
  def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
60
  """
61
- Convert PDF to text using specified method
62
 
63
  Args:
64
  pdf_path: Path to the PDF file
65
  method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
66
 
67
  Returns:
68
- Dict containing text content, metadata, and processing info
69
  """
70
  result = {
71
  'success': False,
72
  'text': '',
 
73
  'method_used': '',
74
  'metadata': {},
75
  'error': None
@@ -91,7 +792,7 @@ class OCRService:
91
  # Try primary method
92
  try:
93
  if method == "azure" and self.azure_client:
94
- result = self._azure_ocr(pdf_path)
95
  elif method == "tesseract":
96
  result = self._tesseract_ocr(pdf_path)
97
  elif method == "pymupdf":
@@ -110,11 +811,12 @@ class OCRService:
110
 
111
  return result
112
 
113
- def _azure_ocr(self, pdf_path: str) -> Dict[str, Any]:
114
- """Azure Document Intelligence OCR with enhanced layout preservation"""
115
  result = {
116
  'success': False,
117
  'text': '',
 
118
  'method_used': 'azure_document_intelligence',
119
  'metadata': {},
120
  'error': None
@@ -124,9 +826,8 @@ class OCRService:
124
  with open(pdf_path, 'rb') as pdf_file:
125
  file_content = pdf_file.read()
126
 
127
- # Try different API call patterns for different SDK versions
128
  try:
129
- # Pattern 1: body + content_type (most common for current SDK)
130
  poller = self.azure_client.begin_analyze_document(
131
  "prebuilt-layout",
132
  body=file_content,
@@ -134,13 +835,11 @@ class OCRService:
134
  )
135
  except TypeError:
136
  try:
137
- # Pattern 2: model_id + body
138
  poller = self.azure_client.begin_analyze_document(
139
  model_id="prebuilt-layout",
140
  body=file_content
141
  )
142
  except TypeError:
143
- # Pattern 3: document parameter (older SDK)
144
  pdf_file.seek(0)
145
  poller = self.azure_client.begin_analyze_document(
146
  "prebuilt-layout",
@@ -149,22 +848,29 @@ class OCRService:
149
 
150
  analysis_result = poller.result()
151
 
152
- # Enhanced format preservation with better structure
153
- formatted_text = self._format_azure_result_enhanced(analysis_result)
 
 
 
154
 
155
  result.update({
156
  'success': True,
157
  'text': formatted_text,
 
158
  'metadata': {
159
  'pages': len(analysis_result.pages) if analysis_result.pages else 0,
160
  'tables': len(analysis_result.tables) if analysis_result.tables else 0,
161
  'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
162
  'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
163
- 'azure_analysis': analysis_result # Pass full result for DOCX formatting
 
 
 
164
  }
165
  })
166
 
167
- logger.info("Azure OCR completed successfully with enhanced formatting")
168
 
169
  except Exception as e:
170
  logger.error(f"Azure OCR error: {e}")
@@ -172,281 +878,12 @@ class OCRService:
172
 
173
  return result
174
 
175
- def _format_azure_result_enhanced(self, analysis_result) -> str:
176
- """FIXED: Enhanced formatting that eliminates ALL duplication at the source"""
177
- formatted_parts = []
178
-
179
- if not analysis_result.pages:
180
- return ""
181
-
182
- for page_num, page in enumerate(analysis_result.pages, 1):
183
- formatted_parts.append(f"\n=== PAGE {page_num} ===\n")
184
-
185
- # Get all tables for this page first
186
- page_tables = []
187
- table_regions = []
188
-
189
- if analysis_result.tables:
190
- for table_idx, table in enumerate(analysis_result.tables):
191
- if any(cell.bounding_regions and
192
- cell.bounding_regions[0].page_number == page_num
193
- for cell in table.cells):
194
- page_tables.append((table_idx, table))
195
-
196
- # Calculate table bounding region
197
- if table.bounding_regions:
198
- table_regions.append(table.bounding_regions[0])
199
-
200
- # CRITICAL FIX: Use ONLY paragraphs OR lines, never both
201
- content_items = []
202
-
203
- # Priority 1: Use paragraphs if available (they contain consolidated content)
204
- if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
205
- page_paragraphs = [p for p in analysis_result.paragraphs if
206
- p.bounding_regions and
207
- p.bounding_regions[0].page_number == page_num]
208
-
209
- # Use paragraph content ONLY - don't use lines at all
210
- for para in page_paragraphs:
211
- if para.content.strip() and not self._is_content_in_table(para, table_regions):
212
- y_pos = para.bounding_regions[0].polygon[1] if para.bounding_regions[0].polygon else 0
213
- content_items.append({
214
- 'type': 'paragraph',
215
- 'content': para.content.strip(),
216
- 'y_pos': y_pos,
217
- 'role': getattr(para, 'role', 'paragraph')
218
- })
219
-
220
- # Priority 2: Only if NO paragraphs available, use lines
221
- elif page.lines:
222
- # Deduplicate lines first - group by approximate position
223
- unique_lines = []
224
- seen_content = set()
225
-
226
- for line in page.lines:
227
- line_content = line.content.strip().lower()
228
- if (line_content and
229
- line_content not in seen_content and
230
- not self._is_content_in_table_by_line(line, table_regions)):
231
-
232
- seen_content.add(line_content)
233
- y_pos = line.polygon[1] if line.polygon else 0
234
- unique_lines.append({
235
- 'type': 'line',
236
- 'content': line.content.strip(),
237
- 'y_pos': y_pos,
238
- 'role': 'text'
239
- })
240
-
241
- content_items.extend(unique_lines)
242
-
243
- # Add table positions to content items
244
- for table_idx, table in page_tables:
245
- if table.bounding_regions:
246
- table_y_pos = table.bounding_regions[0].polygon[1] if table.bounding_regions[0].polygon else 9999
247
- content_items.append({
248
- 'type': 'table',
249
- 'content': table,
250
- 'y_pos': table_y_pos,
251
- 'table_idx': table_idx
252
- })
253
-
254
- # Sort all content by vertical position
255
- content_items.sort(key=lambda x: x['y_pos'])
256
-
257
- # FINAL DEDUPLICATION: Remove content that appears multiple times
258
- seen_text_content = set()
259
- final_content = []
260
-
261
- for item in content_items:
262
- if item['type'] == 'table':
263
- final_content.append(item)
264
- else:
265
- # Check for text duplication
266
- text_key = item['content'].lower().strip()
267
- if text_key not in seen_text_content:
268
- seen_text_content.add(text_key)
269
- final_content.append(item)
270
-
271
- # Add formatted content
272
- for item in final_content:
273
- if item['type'] == 'table':
274
- formatted_parts.append(f"\n--- TABLE {item['table_idx'] + 1} ---")
275
- table_text = self._format_table_enhanced(item['content'])
276
- formatted_parts.append(table_text)
277
- formatted_parts.append("")
278
- else:
279
- # Add text content
280
- if item['role'] == 'title':
281
- formatted_parts.append(f"\n# {item['content']}\n")
282
- elif item['role'] == 'sectionHeading':
283
- formatted_parts.append(f"\n## {item['content']}\n")
284
- else:
285
- formatted_parts.append(item['content'])
286
-
287
- # Clean up excessive empty lines
288
- result = '\n'.join(formatted_parts)
289
- result = re.sub(r'\n{3,}', '\n\n', result) # Max 2 consecutive newlines
290
- return result
291
-
292
- def _is_content_in_table(self, content_item, table_regions):
293
- """Check if content overlaps with any table region"""
294
- if not table_regions or not content_item.bounding_regions:
295
- return False
296
-
297
- content_region = content_item.bounding_regions[0]
298
- if not content_region.polygon:
299
- return False
300
-
301
- content_y1 = content_region.polygon[1] # Top Y
302
- content_y2 = content_region.polygon[5] # Bottom Y
303
- content_x1 = content_region.polygon[0] # Left X
304
- content_x2 = content_region.polygon[2] # Right X
305
-
306
- for table_region in table_regions:
307
- if not table_region.polygon:
308
- continue
309
-
310
- table_y1 = table_region.polygon[1] # Top Y
311
- table_y2 = table_region.polygon[5] # Bottom Y
312
- table_x1 = table_region.polygon[0] # Left X
313
- table_x2 = table_region.polygon[2] # Right X
314
-
315
- # Check for overlap with some tolerance
316
- y_overlap = not (content_y2 < table_y1 - 10 or content_y1 > table_y2 + 10)
317
- x_overlap = not (content_x2 < table_x1 - 10 or content_x1 > table_x2 + 10)
318
-
319
- if y_overlap and x_overlap:
320
- return True
321
-
322
- return False
323
-
324
- def _is_content_in_table_by_line(self, line, table_regions):
325
- """Check if line content overlaps with any table region"""
326
- if not table_regions or not line.polygon:
327
- return False
328
-
329
- line_y1 = line.polygon[1] # Top Y
330
- line_y2 = line.polygon[5] # Bottom Y
331
- line_x1 = line.polygon[0] # Left X
332
- line_x2 = line.polygon[2] # Right X
333
-
334
- for table_region in table_regions:
335
- if not table_region.polygon:
336
- continue
337
-
338
- table_y1 = table_region.polygon[1] # Top Y
339
- table_y2 = table_region.polygon[5] # Bottom Y
340
- table_x1 = table_region.polygon[0] # Left X
341
- table_x2 = table_region.polygon[2] # Right X
342
-
343
- # Check for overlap with tolerance
344
- y_overlap = not (line_y2 < table_y1 - 10 or line_y1 > table_y2 + 10)
345
- x_overlap = not (line_x2 < table_x1 - 10 or line_x1 > table_x2 + 10)
346
-
347
- if y_overlap and x_overlap:
348
- return True
349
-
350
- return False
351
-
352
- def _format_table_enhanced(self, table) -> str:
353
- """Enhanced table formatting with better structure"""
354
- if not table.cells:
355
- return ""
356
-
357
- # Create matrix
358
- max_row = max(cell.row_index for cell in table.cells) + 1
359
- max_col = max(cell.column_index for cell in table.cells) + 1
360
-
361
- table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
362
-
363
- # Fill matrix with cell content
364
- for cell in table.cells:
365
- content = (cell.content or "").strip()
366
- table_matrix[cell.row_index][cell.column_index] = content
367
-
368
- # Calculate column widths
369
- col_widths = [0] * max_col
370
- for row in table_matrix:
371
- for col_idx, cell in enumerate(row):
372
- col_widths[col_idx] = max(col_widths[col_idx], len(cell))
373
-
374
- # Format as aligned table
375
- formatted_rows = []
376
- for row_idx, row in enumerate(table_matrix):
377
- formatted_cells = []
378
- for col_idx, cell in enumerate(row):
379
- width = max(col_widths[col_idx], 3) # Minimum width
380
- formatted_cells.append(cell.ljust(width))
381
-
382
- formatted_row = " | ".join(formatted_cells)
383
- formatted_rows.append(formatted_row)
384
-
385
- # Add separator after header row
386
- if row_idx == 0 and max_row > 1:
387
- separator = " | ".join(["-" * max(col_widths[i], 3) for i in range(max_col)])
388
- formatted_rows.append(separator)
389
-
390
- return "\n".join(formatted_rows)
391
-
392
- def _format_azure_result(self, analysis_result) -> str:
393
- """Format Azure Document Intelligence result preserving layout"""
394
- formatted_text = []
395
-
396
- if analysis_result.pages:
397
- for page_num, page in enumerate(analysis_result.pages, 1):
398
- formatted_text.append(f"\n--- Page {page_num} ---\n")
399
-
400
- # Sort lines by vertical position for better reading order
401
- if page.lines:
402
- sorted_lines = sorted(page.lines, key=lambda line: (
403
- line.polygon[1] if line.polygon else 0, # Y coordinate
404
- line.polygon[0] if line.polygon else 0 # X coordinate
405
- ))
406
-
407
- for line in sorted_lines:
408
- formatted_text.append(line.content)
409
-
410
- # Add tables if present
411
- if analysis_result.tables:
412
- page_tables = [t for t in analysis_result.tables if any(
413
- cell.bounding_regions and
414
- cell.bounding_regions[0].page_number == page_num
415
- for cell in t.cells
416
- )]
417
-
418
- for table_idx, table in enumerate(page_tables):
419
- formatted_text.append(f"\n--- Table {table_idx + 1} ---")
420
- formatted_text.append(self._format_table(table))
421
-
422
- return '\n'.join(formatted_text)
423
-
424
- def _format_table(self, table) -> str:
425
- """Format table from Azure Document Intelligence"""
426
- if not table.cells:
427
- return ""
428
-
429
- # Create matrix
430
- max_row = max(cell.row_index for cell in table.cells) + 1
431
- max_col = max(cell.column_index for cell in table.cells) + 1
432
-
433
- table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
434
-
435
- for cell in table.cells:
436
- table_matrix[cell.row_index][cell.column_index] = cell.content or ""
437
-
438
- # Format as text table
439
- formatted_rows = []
440
- for row in table_matrix:
441
- formatted_rows.append(" | ".join(row))
442
-
443
- return "\n".join(formatted_rows)
444
-
445
  def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
446
- """Tesseract OCR with image preprocessing - FIXED VERSION"""
447
  result = {
448
  'success': False,
449
  'text': '',
 
450
  'method_used': 'tesseract',
451
  'metadata': {},
452
  'error': None
@@ -458,57 +895,72 @@ class OCRService:
458
 
459
  pdf_document = None
460
  try:
461
- # Convert PDF to images
462
  pdf_document = fitz.open(pdf_path)
463
- page_count = len(pdf_document) # Get count before processing
464
  all_text = []
 
 
 
 
 
465
 
466
  for page_num in range(page_count):
 
 
 
 
467
  page = pdf_document.load_page(page_num)
468
 
469
  # Render page to image
470
- mat = fitz.Matrix(2.0, 2.0) # High resolution
471
  pix = page.get_pixmap(matrix=mat)
472
  img_data = pix.tobytes("png")
473
 
474
- # Convert to PIL Image
475
  temp_img_path = None
476
  try:
477
  with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
478
  temp_img.write(img_data)
479
  temp_img_path = temp_img.name
480
 
481
- # Preprocess image for better OCR
482
  processed_img = self._preprocess_image(temp_img_path)
483
 
484
- # OCR with custom config
485
  custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
486
  text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
487
 
488
- all_text.append(f"\n--- Page {page_num + 1} ---\n")
489
  all_text.append(text)
490
 
 
 
 
 
 
491
  finally:
492
- # Clean up temp image file
493
  if temp_img_path and os.path.exists(temp_img_path):
494
  try:
495
  os.unlink(temp_img_path)
496
  except:
497
  pass
498
 
 
 
499
  result.update({
500
  'success': True,
501
  'text': '\n'.join(all_text),
502
- 'metadata': {'pages': page_count}
 
 
 
 
 
 
503
  })
504
 
505
- logger.info("Tesseract OCR completed successfully")
506
 
507
  except Exception as e:
508
  logger.error(f"Tesseract OCR error: {e}")
509
  result['error'] = f"Tesseract OCR error: {e}"
510
  finally:
511
- # FIXED: Ensure document is properly closed
512
  if pdf_document is not None:
513
  try:
514
  pdf_document.close()
@@ -517,27 +969,12 @@ class OCRService:
517
 
518
  return result
519
 
520
def _preprocess_image(self, image_path: str) -> np.ndarray:
    """Prepare a rendered page image for OCR.

    The image is converted to grayscale, median-blurred with a 3x3 kernel
    and binarised with Otsu's threshold.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The binarised single-channel image.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a readable message rather than inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image for OCR preprocessing: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised = cv2.medianBlur(gray, 3)
    _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
535
-
536
  def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
537
- """PyMuPDF text extraction - FIXED VERSION"""
538
  result = {
539
  'success': False,
540
  'text': '',
 
541
  'method_used': 'pymupdf',
542
  'metadata': {},
543
  'error': None
@@ -546,29 +983,50 @@ class OCRService:
546
  pdf_document = None
547
  try:
548
  pdf_document = fitz.open(pdf_path)
549
- page_count = len(pdf_document) # FIXED: Get count first before processing
550
  all_text = []
 
 
 
 
 
551
 
552
  for page_num in range(page_count):
 
 
 
 
553
  page = pdf_document.load_page(page_num)
554
  text = page.get_text()
555
 
556
- all_text.append(f"\n--- Page {page_num + 1} ---\n")
557
  all_text.append(text)
 
 
 
 
 
 
 
 
558
 
559
  result.update({
560
  'success': True,
561
  'text': '\n'.join(all_text),
562
- 'metadata': {'pages': page_count} # FIXED: Use stored count
 
 
 
 
 
 
563
  })
564
 
565
- logger.info("PyMuPDF extraction completed successfully")
566
 
567
  except Exception as e:
568
  logger.error(f"PyMuPDF error: {e}")
569
  result['error'] = f"PyMuPDF error: {e}"
570
  finally:
571
- # FIXED: Ensure document is properly closed
572
  if pdf_document is not None:
573
  try:
574
  pdf_document.close()
@@ -577,11 +1035,18 @@ class OCRService:
577
 
578
  return result
579
 
 
 
 
 
 
 
 
 
580
  def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
581
  """Try fallback OCR methods"""
582
  fallback_methods = []
583
 
584
- # Order of fallback preference
585
  if exclude_method != "azure" and self.azure_client:
586
  fallback_methods.append("azure")
587
  if exclude_method != "tesseract" and self._check_tesseract_available():
@@ -593,7 +1058,7 @@ class OCRService:
593
  logger.info(f"Trying fallback method: {method}")
594
  try:
595
  if method == "azure":
596
- result = self._azure_ocr(pdf_path)
597
  elif method == "tesseract":
598
  result = self._tesseract_ocr(pdf_path)
599
  elif method == "pymupdf":
@@ -610,6 +1075,7 @@ class OCRService:
610
  return {
611
  'success': False,
612
  'text': '',
 
613
  'method_used': 'all_methods_failed',
614
  'metadata': {},
615
  'error': 'All OCR methods failed'
@@ -633,6 +1099,6 @@ class OCRService:
633
  methods.append("azure")
634
  if self._check_tesseract_available():
635
  methods.append("tesseract")
636
- methods.append("pymupdf") # Always available
637
 
638
  return methods
 
1
  """
2
+ OCR Service Module - FIXED VERSION with Improved Text Formatting and Page Numbers
3
+ Handles PDF to text conversion with proper indentation, spacing, and page numbering
4
  """
5
  import re
6
  import os
7
  import logging
8
+ from typing import Optional, Dict, Any, Tuple, List
9
  import tempfile
10
  from pathlib import Path
11
 
 
35
  logger = logging.getLogger(__name__)
36
 
37
 
38
+ class HTMLProcessor:
39
+ """Process OCR results through HTML for better formatting preservation - FIXED VERSION"""
40
+
41
+ @staticmethod
42
+ def create_html_from_azure_result(analysis_result) -> str:
43
+ """Create structured HTML from Azure Document Intelligence result with proper spacing and page numbers"""
44
+ html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
45
+ html_parts.append('<style>')
46
+ html_parts.append('''
47
+ body {
48
+ font-family: 'Consolas', 'Courier New', monospace;
49
+ line-height: 1.6;
50
+ margin: 20px;
51
+ white-space: pre-wrap;
52
+ font-size: 11pt;
53
+ background-color: #fafafa;
54
+ }
55
+ .page {
56
+ margin-bottom: 30px;
57
+ border: 1px solid #ddd;
58
+ padding: 20px;
59
+ background-color: white;
60
+ border-radius: 5px;
61
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
62
+ }
63
+ .page-header {
64
+ font-weight: bold;
65
+ color: #2c3e50;
66
+ margin-bottom: 15px;
67
+ text-align: center;
68
+ border-bottom: 2px solid #3498db;
69
+ padding-bottom: 8px;
70
+ font-size: 14pt;
71
+ text-transform: uppercase;
72
+ letter-spacing: 1px;
73
+ }
74
+ .paragraph {
75
+ margin-bottom: 0.8em;
76
+ white-space: pre-wrap;
77
+ font-family: 'Consolas', 'Courier New', monospace;
78
+ line-height: 1.4;
79
+ }
80
+ .title {
81
+ font-size: 1.4em;
82
+ font-weight: bold;
83
+ margin: 15px 0 12px 0;
84
+ color: #2c3e50;
85
+ border-left: 4px solid #3498db;
86
+ padding-left: 10px;
87
+ }
88
+ .section-heading {
89
+ font-size: 1.2em;
90
+ font-weight: bold;
91
+ margin: 12px 0 8px 0;
92
+ color: #34495e;
93
+ border-left: 3px solid #95a5a6;
94
+ padding-left: 8px;
95
+ }
96
+ .table-container {
97
+ margin: 15px 0;
98
+ font-family: 'Consolas', 'Courier New', monospace;
99
+ background-color: #f8f9fa;
100
+ padding: 10px;
101
+ border-radius: 5px;
102
+ border: 1px solid #dee2e6;
103
+ }
104
+ .table {
105
+ border-collapse: collapse;
106
+ width: 100%;
107
+ margin: 8px 0;
108
+ font-family: 'Consolas', 'Courier New', monospace;
109
+ font-size: 10pt;
110
+ background-color: white;
111
+ }
112
+ .table th, .table td {
113
+ border: 1px solid #bdc3c7;
114
+ padding: 6px 10px;
115
+ text-align: left;
116
+ white-space: pre-wrap;
117
+ vertical-align: top;
118
+ }
119
+ .table th {
120
+ background-color: #ecf0f1;
121
+ font-weight: bold;
122
+ color: #2c3e50;
123
+ }
124
+ .table tr:nth-child(even) {
125
+ background-color: #f8f9fa;
126
+ }
127
+ .indented {
128
+ display: inline-block;
129
+ white-space: pre-wrap;
130
+ }
131
+ .bullet-point {
132
+ position: relative;
133
+ padding-left: 1.2em;
134
+ margin-bottom: 0.3em;
135
+ }
136
+ .bullet-point:before {
137
+ content: "•";
138
+ position: absolute;
139
+ left: 0;
140
+ color: #3498db;
141
+ font-weight: bold;
142
+ }
143
+ .spaced {
144
+ margin-top: 10px;
145
+ }
146
+ .page-number {
147
+ position: relative;
148
+ float: right;
149
+ background-color: #3498db;
150
+ color: white;
151
+ padding: 2px 8px;
152
+ border-radius: 3px;
153
+ font-size: 9pt;
154
+ margin-top: -5px;
155
+ }
156
+ ''')
157
+ html_parts.append('</style></head><body>')
158
+
159
+ if not analysis_result.pages:
160
+ html_parts.append('<p>No content found</p></body></html>')
161
+ return '\n'.join(html_parts)
162
+
163
+ for page_num, page in enumerate(analysis_result.pages, 1):
164
+ html_parts.append(f'<div class="page">')
165
+ html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
166
+
167
+ # Process content with proper ordering and spacing preservation
168
+ content_items = HTMLProcessor._extract_page_content(page, analysis_result, page_num)
169
+ content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
170
+
171
+ # Generate HTML for each content item with preserved spacing
172
+ for item in content_items:
173
+ if item['type'] == 'table':
174
+ html_parts.append(HTMLProcessor._table_to_html(item['content'], item['table_idx']))
175
+ else:
176
+ html_parts.append(HTMLProcessor._text_to_html(item))
177
+
178
+ html_parts.append('</div>')
179
+
180
+ html_parts.append('</body></html>')
181
+ return '\n'.join(html_parts)
182
+
183
+ @staticmethod
184
+ def _extract_page_content(page, analysis_result, page_num):
185
+ """Extract and organize page content without losing text with proper spacing"""
186
+ content_items = []
187
+
188
+ # First, collect all tables for this page
189
+ page_tables = []
190
+ table_regions = []
191
+
192
+ if analysis_result.tables:
193
+ for table_idx, table in enumerate(analysis_result.tables):
194
+ if HTMLProcessor._is_table_on_page(table, page_num):
195
+ page_tables.append((table_idx, table))
196
+ # Store table regions for overlap detection
197
+ if table.bounding_regions:
198
+ table_regions.append({
199
+ 'polygon': table.bounding_regions[0].polygon,
200
+ 'table_idx': table_idx
201
+ })
202
+
203
+ # Add table items to content
204
+ for table_idx, table in page_tables:
205
+ if table.bounding_regions and table.bounding_regions[0].polygon:
206
+ polygon = table.bounding_regions[0].polygon
207
+ y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) # Top Y
208
+ x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) # Left X
209
+
210
+ content_items.append({
211
+ 'type': 'table',
212
+ 'content': table,
213
+ 'table_idx': table_idx,
214
+ 'y_pos': y_pos,
215
+ 'x_pos': x_pos
216
+ })
217
+
218
+ # Calculate page margins for proper indentation detection
219
+ page_left_margin = HTMLProcessor._calculate_page_margins(page, analysis_result, page_num)
220
+
221
+ # Process text content - use paragraphs if available, otherwise lines
222
+ if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
223
+ # Use paragraphs (better content grouping)
224
+ page_paragraphs = [p for p in analysis_result.paragraphs if
225
+ p.bounding_regions and
226
+ p.bounding_regions[0].page_number == page_num]
227
+
228
+ for para in page_paragraphs:
229
+ if para.content.strip():
230
+ # Check if this paragraph overlaps significantly with any table
231
+ overlap_ratio = HTMLProcessor._calculate_table_overlap(para, table_regions)
232
+
233
+ # Only exclude if heavily overlapping (>70%) with a table
234
+ if overlap_ratio < 0.7:
235
+ polygon = para.bounding_regions[0].polygon
236
+ y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
237
+ x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
238
+
239
+ # Calculate proper indentation based on page margins
240
+ indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, para.content)
241
+
242
+ content_items.append({
243
+ 'type': 'paragraph',
244
+ 'content': para.content.strip(),
245
+ 'role': getattr(para, 'role', 'paragraph'),
246
+ 'y_pos': y_pos,
247
+ 'x_pos': x_pos,
248
+ 'indent_level': indent_info['level'],
249
+ 'indent_pixels': indent_info['pixels'],
250
+ 'is_bullet': indent_info['is_bullet'],
251
+ 'preserve_spacing': True
252
+ })
253
+
254
+ elif page.lines:
255
+ # Use lines as fallback with enhanced spacing preservation
256
+ processed_lines = HTMLProcessor._process_lines_content_with_spacing(page.lines, table_regions, page_left_margin)
257
+ content_items.extend(processed_lines)
258
+
259
+ return content_items
260
+
261
+ @staticmethod
262
+ def _is_table_on_page(table, page_num):
263
+ """Check if table belongs to the specified page"""
264
+ if not table.cells:
265
+ return False
266
+
267
+ for cell in table.cells:
268
+ if (cell.bounding_regions and
269
+ cell.bounding_regions[0].page_number == page_num):
270
+ return True
271
+ return False
272
+
273
+ @staticmethod
274
+ def _calculate_table_overlap(content_item, table_regions):
275
+ """Calculate overlap ratio between content and tables (FIXED)"""
276
+ if not table_regions or not content_item.bounding_regions:
277
+ return 0.0
278
+
279
+ content_polygon = content_item.bounding_regions[0].polygon
280
+ if not content_polygon or len(content_polygon) < 8:
281
+ return 0.0
282
+
283
+ # Content bounding box
284
+ content_x1 = min(content_polygon[0], content_polygon[2], content_polygon[4], content_polygon[6])
285
+ content_x2 = max(content_polygon[0], content_polygon[2], content_polygon[4], content_polygon[6])
286
+ content_y1 = min(content_polygon[1], content_polygon[3], content_polygon[5], content_polygon[7])
287
+ content_y2 = max(content_polygon[1], content_polygon[3], content_polygon[5], content_polygon[7])
288
+
289
+ content_area = (content_x2 - content_x1) * (content_y2 - content_y1)
290
+ if content_area <= 0:
291
+ return 0.0
292
+
293
+ max_overlap_ratio = 0.0
294
+
295
+ for table_region in table_regions:
296
+ table_polygon = table_region['polygon']
297
+ if not table_polygon or len(table_polygon) < 8:
298
+ continue
299
+
300
+ # Table bounding box
301
+ table_x1 = min(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
302
+ table_x2 = max(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
303
+ table_y1 = min(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
304
+ table_y2 = max(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
305
+
306
+ # Calculate intersection
307
+ intersect_x1 = max(content_x1, table_x1)
308
+ intersect_x2 = min(content_x2, table_x2)
309
+ intersect_y1 = max(content_y1, table_y1)
310
+ intersect_y2 = min(content_y2, table_y2)
311
+
312
+ if intersect_x2 > intersect_x1 and intersect_y2 > intersect_y1:
313
+ intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
314
+ overlap_ratio = intersect_area / content_area
315
+ max_overlap_ratio = max(max_overlap_ratio, overlap_ratio)
316
+
317
+ return max_overlap_ratio
318
+
319
+ @staticmethod
320
+ def _calculate_page_margins(page, analysis_result, page_num):
321
+ """Calculate page margins to determine proper indentation baseline"""
322
+ left_positions = []
323
+
324
+ # Collect x positions from paragraphs if available
325
+ if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
326
+ page_paragraphs = [p for p in analysis_result.paragraphs if
327
+ p.bounding_regions and
328
+ p.bounding_regions[0].page_number == page_num]
329
+
330
+ for para in page_paragraphs:
331
+ if para.bounding_regions and para.bounding_regions[0].polygon:
332
+ polygon = para.bounding_regions[0].polygon
333
+ x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
334
+ left_positions.append(x_pos)
335
+
336
+ # Fallback to lines if no paragraphs
337
+ elif page.lines:
338
+ for line in page.lines:
339
+ if line.polygon:
340
+ x_pos = min(line.polygon[0], line.polygon[2], line.polygon[4], line.polygon[6])
341
+ left_positions.append(x_pos)
342
+
343
+ # Find the most common left margin (baseline)
344
+ if left_positions:
345
+ left_positions.sort()
346
+ # Take the most frequent left position as the main margin
347
+ from collections import Counter
348
+ position_counts = Counter([round(pos, -1) for pos in left_positions]) # Round to nearest 10
349
+ base_margin = position_counts.most_common(1)[0][0]
350
+ return base_margin
351
+
352
+ return 50 # Default margin if no content found
353
+
354
+ @staticmethod
355
+ def _calculate_precise_indentation(x_pos, base_margin, content):
356
+ """Calculate precise indentation based on x position and content analysis"""
357
+ # Calculate indent distance from base margin
358
+ indent_distance = max(0, x_pos - base_margin)
359
+
360
+ # Define indentation levels based on distance
361
+ # Each level represents approximately 0.5 inch or 36 points
362
+ level_threshold = 30 # Reduced threshold for better sensitivity
363
+ indent_level = int(indent_distance / level_threshold)
364
+
365
+ # Detect bullet points or numbered lists
366
+ is_bullet = False
367
+ content_stripped = content.strip()
368
+
369
+ # Common bullet point patterns
370
+ bullet_patterns = [
371
+ r'^\s*[•·▪▫◦‣⁃]\s+', # Bullet symbols
372
+ r'^\s*[\-\*\+]\s+', # Dash, asterisk, plus
373
+ r'^\s*\d+[\.\)]\s+', # Numbered lists (1. or 1))
374
+ r'^\s*[a-zA-Z][\.\)]\s+', # Lettered lists (a. or a))
375
+ r'^\s*[ivxlcdm]+[\.\)]\s+', # Roman numerals
376
+ ]
377
+
378
+ for pattern in bullet_patterns:
379
+ if re.match(pattern, content_stripped, re.IGNORECASE):
380
+ is_bullet = True
381
+ break
382
+
383
+ return {
384
+ 'level': min(indent_level, 6), # Cap at level 6
385
+ 'pixels': indent_distance,
386
+ 'is_bullet': is_bullet
387
+ }
388
+
389
+ @staticmethod
390
+ def _process_lines_content_with_spacing(lines, table_regions, page_left_margin):
391
+ """Process lines content with enhanced spacing preservation"""
392
+ content_items = []
393
+ processed_content = set()
394
+
395
+ for line in lines:
396
+ if not line.content.strip():
397
+ continue
398
+
399
+ # Avoid duplicates
400
+ content_key = line.content.strip().lower()
401
+ if content_key in processed_content:
402
+ continue
403
+ processed_content.add(content_key)
404
+
405
+ # Check table overlap
406
+ overlap_ratio = HTMLProcessor._calculate_line_table_overlap(line, table_regions)
407
+
408
+ # Only exclude if heavily overlapping with table
409
+ if overlap_ratio < 0.7:
410
+ polygon = line.polygon
411
+ y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
412
+ x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
413
+
414
+ # Calculate precise indentation for lines
415
+ indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, line.content)
416
+
417
+ content_items.append({
418
+ 'type': 'line',
419
+ 'content': line.content.strip(),
420
+ 'role': 'text',
421
+ 'y_pos': y_pos,
422
+ 'x_pos': x_pos,
423
+ 'indent_level': indent_info['level'],
424
+ 'indent_pixels': indent_info['pixels'],
425
+ 'is_bullet': indent_info['is_bullet'],
426
+ 'preserve_spacing': True
427
+ })
428
+
429
+ return content_items
430
+
431
+ @staticmethod
432
+ def _calculate_line_table_overlap(line, table_regions):
433
+ """Calculate overlap between line and tables"""
434
+ if not table_regions or not line.polygon:
435
+ return 0.0
436
+
437
+ line_polygon = line.polygon
438
+ if len(line_polygon) < 8:
439
+ return 0.0
440
+
441
+ # Line bounding box
442
+ line_x1 = min(line_polygon[0], line_polygon[2], line_polygon[4], line_polygon[6])
443
+ line_x2 = max(line_polygon[0], line_polygon[2], line_polygon[4], line_polygon[6])
444
+ line_y1 = min(line_polygon[1], line_polygon[3], line_polygon[5], line_polygon[7])
445
+ line_y2 = max(line_polygon[1], line_polygon[3], line_polygon[5], line_polygon[7])
446
+
447
+ line_area = (line_x2 - line_x1) * (line_y2 - line_y1)
448
+ if line_area <= 0:
449
+ return 0.0
450
+
451
+ max_overlap = 0.0
452
+
453
+ for table_region in table_regions:
454
+ table_polygon = table_region['polygon']
455
+ if not table_polygon or len(table_polygon) < 8:
456
+ continue
457
+
458
+ table_x1 = min(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
459
+ table_x2 = max(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
460
+ table_y1 = min(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
461
+ table_y2 = max(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
462
+
463
+ # Calculate intersection
464
+ intersect_x1 = max(line_x1, table_x1)
465
+ intersect_x2 = min(line_x2, table_x2)
466
+ intersect_y1 = max(line_y1, table_y1)
467
+ intersect_y2 = min(line_y2, table_y2)
468
+
469
+ if intersect_x2 > intersect_x1 and intersect_y2 > intersect_y1:
470
+ intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
471
+ overlap_ratio = intersect_area / line_area
472
+ max_overlap = max(max_overlap, overlap_ratio)
473
+
474
+ return max_overlap
475
+
476
+ @staticmethod
477
+ def _text_to_html(item):
478
+ """Convert text item to HTML with proper formatting and preserved spacing"""
479
+ content = item['content']
480
+ role = item.get('role', 'paragraph')
481
+ indent_level = item.get('indent_level', 0)
482
+ indent_pixels = item.get('indent_pixels', 0)
483
+ is_bullet = item.get('is_bullet', False)
484
+ preserve_spacing = item.get('preserve_spacing', False)
485
+
486
+ # Calculate CSS indentation
487
+ css_indent = max(0, indent_level)
488
+
489
+ # Build CSS classes and inline styles
490
+ css_classes = []
491
+ inline_styles = []
492
+
493
+ if css_indent > 0:
494
+ inline_styles.append(f"margin-left: {css_indent * 1.5}em")
495
+ css_classes.append("indented")
496
+
497
+ if is_bullet:
498
+ css_classes.append("bullet-point")
499
+
500
+ # Preserve internal spacing within content
501
+ if preserve_spacing:
502
+ # Replace multiple spaces with &nbsp; to preserve spacing
503
+ content = re.sub(r' +', lambda m: '&nbsp;' * len(m.group()), content)
504
+ # Preserve line breaks within content
505
+ content = content.replace('\n', '<br>')
506
+
507
+ # Combine CSS
508
+ class_str = f' class="{" ".join(css_classes)}"' if css_classes else ''
509
+ style_str = f' style="{"; ".join(inline_styles)}"' if inline_styles else ''
510
+
511
+ if role == 'title':
512
+ return f'<div class="title"{class_str}{style_str}>{content}</div>'
513
+ elif role == 'sectionHeading':
514
+ return f'<div class="section-heading"{class_str}{style_str}>{content}</div>'
515
+ else:
516
+ # Regular paragraphs with preserved formatting
517
+ return f'<div class="paragraph"{class_str}{style_str}>{content}</div>'
518
+
519
+ @staticmethod
520
+ def _table_to_html(table, table_idx):
521
+ """Convert table to HTML with proper structure"""
522
+ if not table.cells:
523
+ return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
524
+
525
+ # Create table matrix
526
+ max_row = max(cell.row_index for cell in table.cells) + 1
527
+ max_col = max(cell.column_index for cell in table.cells) + 1
528
+
529
+ table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
530
+
531
+ # Fill matrix
532
+ for cell in table.cells:
533
+ content = (cell.content or "").strip()
534
+ table_matrix[cell.row_index][cell.column_index] = content
535
+
536
+ # Generate HTML
537
+ html_parts = [f'<div class="table-container">']
538
+ html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
539
+ html_parts.append('<table class="table">')
540
+
541
+ for row_idx, row in enumerate(table_matrix):
542
+ if row_idx == 0 and any(cell.strip() for cell in row):
543
+ # Header row
544
+ html_parts.append('<tr>')
545
+ for cell in row:
546
+ html_parts.append(f'<th>{cell}</th>')
547
+ html_parts.append('</tr>')
548
+ else:
549
+ # Data row
550
+ if any(cell.strip() for cell in row): # Skip empty rows
551
+ html_parts.append('<tr>')
552
+ for cell in row:
553
+ html_parts.append(f'<td>{cell}</td>')
554
+ html_parts.append('</tr>')
555
+
556
+ html_parts.append('</table></div>')
557
+ return '\n'.join(html_parts)
558
+
559
+ @staticmethod
560
+ def html_to_formatted_text(html_content):
561
+ """Convert HTML back to formatted text preserving structure, spacing, and adding page numbers"""
562
+ from html.parser import HTMLParser
563
+
564
+ class FixedSpacingTextExtractor(HTMLParser):
565
+ def __init__(self):
566
+ super().__init__()
567
+ self.text_parts = []
568
+ self.in_title = False
569
+ self.in_section_heading = False
570
+ self.in_table = False
571
+ self.in_table_header = False
572
+ self.current_table_row = []
573
+ self.table_data = []
574
+ self.current_indent = 0
575
+ self.preserve_spacing = False
576
+ self.in_page_header = False
577
+ self.current_page_num = 0
578
+
579
+ def handle_starttag(self, tag, attrs):
580
+ attr_dict = dict(attrs)
581
+ class_attr = attr_dict.get('class', '')
582
+ style_attr = attr_dict.get('style', '')
583
+
584
+ if 'page-header' in class_attr:
585
+ self.in_page_header = True
586
+ # Add proper page separation with page number
587
+ if len(self.text_parts) > 0:
588
+ self.text_parts.append('\n\n' + '=' * 80 + '\n')
589
+
590
+ elif 'title' in class_attr:
591
+ self.in_title = True
592
+ elif 'section-heading' in class_attr:
593
+ self.in_section_heading = True
594
+ elif tag == 'table':
595
+ self.in_table = True
596
+ self.table_data = []
597
+ elif tag == 'th':
598
+ self.in_table_header = True
599
+ elif tag == 'tr':
600
+ self.current_table_row = []
601
+ elif tag == 'br':
602
+ self.text_parts.append('\n')
603
+
604
+ # Extract indentation from style
605
+ if 'margin-left' in style_attr:
606
+ import re
607
+ margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
608
+ if margin_match:
609
+ self.current_indent = int(float(margin_match.group(1)))
610
+ else:
611
+ self.current_indent = 0
612
+ else:
613
+ # Count indented classes as fallback
614
+ self.current_indent = class_attr.count('indented')
615
+
616
+ # Check if we should preserve spacing
617
+ self.preserve_spacing = 'paragraph' in class_attr or 'bullet-point' in class_attr
618
+
619
+ def handle_endtag(self, tag):
620
+ if tag == 'div' and self.in_page_header:
621
+ self.text_parts.append('\n' + '=' * 80 + '\n\n')
622
+ self.in_page_header = False
623
+ elif tag == 'div' and self.in_title:
624
+ self.text_parts.append('\n\n')
625
+ self.in_title = False
626
+ elif tag == 'div' and self.in_section_heading:
627
+ self.text_parts.append('\n\n')
628
+ self.in_section_heading = False
629
+ elif tag == 'table':
630
+ self.in_table = False
631
+ self._format_table()
632
+ elif tag == 'th':
633
+ self.in_table_header = False
634
+ elif tag == 'tr' and self.current_table_row:
635
+ self.table_data.append(self.current_table_row[:])
636
+ elif tag == 'div' and not self.in_table and not self.in_title and not self.in_section_heading and not self.in_page_header:
637
+ if not self.preserve_spacing:
638
+ self.text_parts.append('\n')
639
+
640
+ # Reset indentation when closing div
641
+ if tag == 'div':
642
+ self.current_indent = 0
643
+ self.preserve_spacing = False
644
+
645
+ def handle_data(self, data):
646
+ if data.strip():
647
+ # Convert &nbsp; back to spaces for proper spacing
648
+ data = data.replace('&nbsp;', ' ')
649
+
650
+ if self.in_page_header:
651
+ # Extract page number and format properly
652
+ page_match = re.search(r'Page (\d+)', data)
653
+ if page_match:
654
+ self.current_page_num = int(page_match.group(1))
655
+ page_header = f"PAGE {self.current_page_num}"
656
+ self.text_parts.append(page_header.center(80))
657
+ elif self.in_title:
658
+ indent_str = " " * self.current_indent
659
+ self.text_parts.append(f'\n{indent_str}## {data.strip()}')
660
+ elif self.in_section_heading:
661
+ indent_str = " " * self.current_indent
662
+ self.text_parts.append(f'\n{indent_str}### {data.strip()}')
663
+ elif self.in_table:
664
+ if self.in_table_header or self.current_table_row is not None:
665
+ self.current_table_row.append(data.strip())
666
+ else:
667
+ # Apply indentation and preserve internal spacing
668
+ indent_str = " " * self.current_indent
669
+
670
+ if self.preserve_spacing:
671
+ # Keep the exact spacing from the data
672
+ formatted_data = data
673
+ else:
674
+ # Clean up spacing for non-preserved content
675
+ formatted_data = re.sub(r'\s+', ' ', data).strip()
676
+
677
+ # Handle bullet points specially
678
+ if 'bullet-point' in getattr(self, '_last_class', ''):
679
+ # Remove the bullet symbol that CSS adds and format properly
680
+ self.text_parts.append(f'{indent_str}• {formatted_data}')
681
+ else:
682
+ self.text_parts.append(f'{indent_str}{formatted_data}')
683
+
684
+ def _format_table(self):
685
+ if not self.table_data:
686
+ return
687
+
688
+ self.text_parts.append('\n\n')
689
+
690
+ # Calculate column widths for better formatting
691
+ if self.table_data:
692
+ max_cols = max(len(row) for row in self.table_data)
693
+ col_widths = [0] * max_cols
694
+
695
+ for row in self.table_data:
696
+ for i, cell in enumerate(row):
697
+ if i < max_cols:
698
+ col_widths[i] = max(col_widths[i], len(str(cell)))
699
+
700
+ # Ensure minimum column width
701
+ col_widths = [max(width, 8) for width in col_widths]
702
+
703
+ # Format rows with proper alignment
704
+ for row_idx, row in enumerate(self.table_data):
705
+ formatted_cells = []
706
+ for i, cell in enumerate(row):
707
+ if i < max_cols:
708
+ width = col_widths[i]
709
+ formatted_cells.append(str(cell).ljust(width))
710
+
711
+ row_text = ' | '.join(formatted_cells)
712
+ self.text_parts.append(row_text)
713
+
714
+ # Add separator after header
715
+ if row_idx == 0 and len(self.table_data) > 1:
716
+ separator_cells = ['-' * col_widths[i] for i in range(max_cols)]
717
+ separator_text = ' | '.join(separator_cells)
718
+ self.text_parts.append(separator_text)
719
+
720
+ self.text_parts.append('\n')
721
+
722
+ self.text_parts.append('\n')
723
+
724
+ extractor = FixedSpacingTextExtractor()
725
+ extractor.feed(html_content)
726
+
727
+ result = ''.join(extractor.text_parts)
728
+
729
+ # Clean up excessive newlines while preserving intentional spacing
730
+ result = re.sub(r'\n{4,}', '\n\n\n', result) # Max 3 consecutive newlines
731
+
732
+ # Ensure proper spacing around page headers
733
+ result = re.sub(r'(={80})\n*([A-Z ]+)\n*(={80})', r'\1\n\2\n\3', result)
734
+
735
+ return result.strip()
736
+
737
+
738
  class OCRService:
739
+ """Main OCR service with HTML processing and improved table handling"""
740
 
741
  def __init__(self):
742
  self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
 
758
 
759
  def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
760
  """
761
+ Convert PDF to text using specified method with HTML processing
762
 
763
  Args:
764
  pdf_path: Path to the PDF file
765
  method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
766
 
767
  Returns:
768
+ Dict containing text content, HTML, metadata, and processing info
769
  """
770
  result = {
771
  'success': False,
772
  'text': '',
773
+ 'html': '',
774
  'method_used': '',
775
  'metadata': {},
776
  'error': None
 
792
  # Try primary method
793
  try:
794
  if method == "azure" and self.azure_client:
795
+ result = self._azure_ocr_with_html(pdf_path)
796
  elif method == "tesseract":
797
  result = self._tesseract_ocr(pdf_path)
798
  elif method == "pymupdf":
 
811
 
812
  return result
813
 
814
+ def _azure_ocr_with_html(self, pdf_path: str) -> Dict[str, Any]:
815
+ """Azure Document Intelligence OCR with HTML processing"""
816
  result = {
817
  'success': False,
818
  'text': '',
819
+ 'html': '',
820
  'method_used': 'azure_document_intelligence',
821
  'metadata': {},
822
  'error': None
 
826
  with open(pdf_path, 'rb') as pdf_file:
827
  file_content = pdf_file.read()
828
 
829
+ # Try different API call patterns
830
  try:
 
831
  poller = self.azure_client.begin_analyze_document(
832
  "prebuilt-layout",
833
  body=file_content,
 
835
  )
836
  except TypeError:
837
  try:
 
838
  poller = self.azure_client.begin_analyze_document(
839
  model_id="prebuilt-layout",
840
  body=file_content
841
  )
842
  except TypeError:
 
843
  pdf_file.seek(0)
844
  poller = self.azure_client.begin_analyze_document(
845
  "prebuilt-layout",
 
848
 
849
  analysis_result = poller.result()
850
 
851
+ # Generate HTML first
852
+ html_content = HTMLProcessor.create_html_from_azure_result(analysis_result)
853
+
854
+ # Convert HTML to formatted text with proper page numbers and spacing
855
+ formatted_text = HTMLProcessor.html_to_formatted_text(html_content)
856
 
857
  result.update({
858
  'success': True,
859
  'text': formatted_text,
860
+ 'html': html_content,
861
  'metadata': {
862
  'pages': len(analysis_result.pages) if analysis_result.pages else 0,
863
  'tables': len(analysis_result.tables) if analysis_result.tables else 0,
864
  'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
865
  'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
866
+ 'html_generated': True,
867
+ 'improved_formatting': True,
868
+ 'page_numbers_added': True,
869
+ 'azure_analysis': analysis_result
870
  }
871
  })
872
 
873
+ logger.info("Azure OCR with improved HTML processing completed successfully")
874
 
875
  except Exception as e:
876
  logger.error(f"Azure OCR error: {e}")
 
878
 
879
  return result
880
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
881
  def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
882
+ """Tesseract OCR with basic HTML generation and page numbers"""
883
  result = {
884
  'success': False,
885
  'text': '',
886
+ 'html': '',
887
  'method_used': 'tesseract',
888
  'metadata': {},
889
  'error': None
 
895
 
896
  pdf_document = None
897
  try:
 
898
  pdf_document = fitz.open(pdf_path)
899
+ page_count = len(pdf_document)
900
  all_text = []
901
+ html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
902
+ html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
903
+ html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
904
+ html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
905
+ html_parts.append('</style></head><body>')
906
 
907
  for page_num in range(page_count):
908
+ # Add page header to text
909
+ page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
910
+ all_text.append(page_header)
911
+
912
  page = pdf_document.load_page(page_num)
913
 
914
  # Render page to image
915
+ mat = fitz.Matrix(2.0, 2.0)
916
  pix = page.get_pixmap(matrix=mat)
917
  img_data = pix.tobytes("png")
918
 
 
919
  temp_img_path = None
920
  try:
921
  with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
922
  temp_img.write(img_data)
923
  temp_img_path = temp_img.name
924
 
 
925
  processed_img = self._preprocess_image(temp_img_path)
926
 
 
927
  custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
928
  text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
929
 
 
930
  all_text.append(text)
931
 
932
+ # Add to HTML with page number
933
+ html_parts.append(f'<div class="page">')
934
+ html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
935
+ html_parts.append(f'<pre>{text}</pre></div>')
936
+
937
  finally:
 
938
  if temp_img_path and os.path.exists(temp_img_path):
939
  try:
940
  os.unlink(temp_img_path)
941
  except:
942
  pass
943
 
944
+ html_parts.append('</body></html>')
945
+
946
  result.update({
947
  'success': True,
948
  'text': '\n'.join(all_text),
949
+ 'html': '\n'.join(html_parts),
950
+ 'metadata': {
951
+ 'pages': page_count,
952
+ 'html_generated': True,
953
+ 'page_numbers_added': True,
954
+ 'improved_formatting': True
955
+ }
956
  })
957
 
958
+ logger.info("Tesseract OCR with improved formatting completed successfully")
959
 
960
  except Exception as e:
961
  logger.error(f"Tesseract OCR error: {e}")
962
  result['error'] = f"Tesseract OCR error: {e}"
963
  finally:
 
964
  if pdf_document is not None:
965
  try:
966
  pdf_document.close()
 
969
 
970
  return result
971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
972
  def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
973
+ """PyMuPDF text extraction with HTML generation and page numbers"""
974
  result = {
975
  'success': False,
976
  'text': '',
977
+ 'html': '',
978
  'method_used': 'pymupdf',
979
  'metadata': {},
980
  'error': None
 
983
  pdf_document = None
984
  try:
985
  pdf_document = fitz.open(pdf_path)
986
+ page_count = len(pdf_document)
987
  all_text = []
988
+ html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
989
+ html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
990
+ html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
991
+ html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
992
+ html_parts.append('</style></head><body>')
993
 
994
  for page_num in range(page_count):
995
+ # Add page header to text
996
+ page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
997
+ all_text.append(page_header)
998
+
999
  page = pdf_document.load_page(page_num)
1000
  text = page.get_text()
1001
 
 
1002
  all_text.append(text)
1003
+
1004
+ # Add to HTML with better formatting and page numbers
1005
+ html_parts.append(f'<div class="page">')
1006
+ html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1007
+ formatted_text = text.replace('\n', '<br>')
1008
+ html_parts.append(f'<div>{formatted_text}</div></div>')
1009
+
1010
+ html_parts.append('</body></html>')
1011
 
1012
  result.update({
1013
  'success': True,
1014
  'text': '\n'.join(all_text),
1015
+ 'html': '\n'.join(html_parts),
1016
+ 'metadata': {
1017
+ 'pages': page_count,
1018
+ 'html_generated': True,
1019
+ 'page_numbers_added': True,
1020
+ 'improved_formatting': True
1021
+ }
1022
  })
1023
 
1024
+ logger.info("PyMuPDF extraction with improved formatting completed successfully")
1025
 
1026
  except Exception as e:
1027
  logger.error(f"PyMuPDF error: {e}")
1028
  result['error'] = f"PyMuPDF error: {e}"
1029
  finally:
 
1030
  if pdf_document is not None:
1031
  try:
1032
  pdf_document.close()
 
1035
 
1036
  return result
1037
 
1038
def _preprocess_image(self, image_path: str) -> np.ndarray:
    """Preprocess a page image to improve OCR accuracy.

    The image is loaded from disk, converted to grayscale, smoothed with a
    median filter of aperture size 3, and binarized with Otsu's threshold.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The binarized single-channel image produced by ``cv2.threshold``.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than a cryptic cvtColor error.
        raise ValueError(f"Could not read image for OCR preprocessing: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised = cv2.medianBlur(gray, 3)
    # With THRESH_OTSU the threshold is computed from the image, so the
    # explicit threshold argument (0) is only a placeholder.
    _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
1045
+
1046
  def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
1047
  """Try fallback OCR methods"""
1048
  fallback_methods = []
1049
 
 
1050
  if exclude_method != "azure" and self.azure_client:
1051
  fallback_methods.append("azure")
1052
  if exclude_method != "tesseract" and self._check_tesseract_available():
 
1058
  logger.info(f"Trying fallback method: {method}")
1059
  try:
1060
  if method == "azure":
1061
+ result = self._azure_ocr_with_html(pdf_path)
1062
  elif method == "tesseract":
1063
  result = self._tesseract_ocr(pdf_path)
1064
  elif method == "pymupdf":
 
1075
  return {
1076
  'success': False,
1077
  'text': '',
1078
+ 'html': '',
1079
  'method_used': 'all_methods_failed',
1080
  'metadata': {},
1081
  'error': 'All OCR methods failed'
 
1099
  methods.append("azure")
1100
  if self._check_tesseract_available():
1101
  methods.append("tesseract")
1102
+ methods.append("pymupdf")
1103
 
1104
  return methods
readme.md CHANGED
@@ -1,231 +1,270 @@
1
  # PDF OCR Service
2
 
3
- A comprehensive PDF to text conversion service with multiple OCR providers and a user-friendly web interface.
4
 
5
  ## Features
6
 
7
- - 🔄 **Multiple OCR Methods**: Azure Document Intelligence, Tesseract OCR, and PyMuPDF
8
- - 📄 **Format Preservation**: Maintains original spacing and layout from PDFs
9
- - 🛡️ **Fallback Mechanisms**: Automatically tries alternative methods if primary fails
10
- - 🌐 **Web Interface**: Clean, intuitive Gradio-based UI
11
- - 📊 **Processing Analytics**: Track processing history and statistics
12
- - **High Performance**: Optimized for speed and accuracy
 
 
13
 
14
- ## Architecture
15
 
16
- The service consists of three main components:
17
 
18
- 1. **`ocr_service.py`** - Core OCR processing with Azure, Tesseract, and PyMuPDF
19
- 2. **`backend.py`** - Backend management, file handling, and coordination
20
- 3. **`ui.py`** - Gradio web interface for user interaction
21
-
22
- ## Quick Start
23
-
24
- ### 1. Install Dependencies
25
 
 
26
  ```bash
27
- # Install Python dependencies
28
- pip install -r requirements.txt
29
-
30
- # Install system dependencies (Ubuntu/Debian)
31
  sudo apt-get update
32
  sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
33
  sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
 
 
34
 
35
- # For macOS
 
36
  brew install tesseract
37
-
38
- # For Windows
39
- # Download Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
40
- # Add to PATH environment variable
41
  ```
42
 
43
- ### 2. Configure Environment
 
 
 
 
44
 
45
  ```bash
46
- # Copy environment template
47
- cp .env.example .env
 
 
 
 
48
 
49
- # Edit .env file with your settings
50
- nano .env
 
 
 
 
 
 
51
  ```
52
 
53
- **Required Configuration:**
54
- - Set Azure Document Intelligence endpoint and key (for best quality)
55
- - Adjust file size limits and server settings as needed
56
 
57
- ### 3. Run the Service
58
 
59
  ```bash
60
- # Start the web interface
61
  python app.py
62
-
63
- # Or run individual components
64
- python backend.py # Test backend functionality
65
- python ocr_service.py # Test OCR service
66
  ```
67
 
68
- The service will be available at `http://localhost:7860`
69
 
70
- ## Azure Document Intelligence Setup
71
 
72
- 1. **Create Azure Resource**
73
- - Go to [Azure Portal](https://portal.azure.com)
74
- - Create new "Document Intelligence" resource
75
- - Choose subscription, resource group, and region
76
- - Select pricing tier (F0 for free, S0 for standard)
 
 
 
 
 
 
 
 
77
 
78
- 2. **Get Credentials**
79
- - Navigate to "Keys and Endpoint" section
80
- - Copy the endpoint URL and API key
81
- - Add to your `.env` file:
82
- ```bash
83
- AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
84
- AZURE_DOCUMENT_INTELLIGENCE_KEY=your-api-key-here
85
- ```
86
 
87
- ## OCR Methods
88
 
89
- ### Azure Document Intelligence (Recommended)
90
- - **Best Quality**: Advanced layout analysis and text extraction
91
- - **Features**: Table detection, handwriting recognition, form understanding
92
- - **Use Case**: Complex documents, forms, tables, mixed content
93
- - **Requirements**: Azure subscription and API key
94
 
95
- ### Tesseract OCR
96
- - **Good Quality**: Open-source OCR with preprocessing
97
- - **Features**: Multiple language support, image enhancement
98
- - **Use Case**: Scanned documents, images, simple PDFs
99
- - **Requirements**: Tesseract installation
100
 
101
- ### PyMuPDF
102
- - **Fast Processing**: Direct text extraction from digital PDFs
103
- - **Features**: Fastest processing, embedded text extraction
104
- - **Use Case**: Digital PDFs with embedded text
105
- - **Requirements**: No additional setup needed
106
 
107
- ## Usage Examples
108
 
109
- ### Web Interface
110
- 1. Open `http://localhost:7860` in your browser
111
- 2. Upload a PDF file
112
- 3. Select OCR method (or use "auto")
113
- 4. Click "Process PDF"
114
- 5. Download extracted text
115
-
116
- ### Python API
117
  ```python
118
  from backend import BackendManager
119
 
120
  # Initialize backend
121
- manager = BackendManager()
122
-
123
- # Process PDF
124
- result = manager.process_pdf('document.pdf', method='auto')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  if result['success']:
127
- print("Extracted Text:")
128
- print(result['text'])
129
- print(f"Method used: {result['method_used']}")
130
- print(f"Pages: {result['metadata']['pages']}")
131
  else:
132
- print(f"Error: {result['error']}")
133
  ```
134
 
135
- ## Configuration Options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- ### File Processing
138
- - `MAX_FILE_SIZE_MB`: Maximum file size (default: 50MB)
139
- - `PROCESSING_TIMEOUT`: Processing timeout in seconds
140
- - `MAX_CONCURRENT_TASKS`: Concurrent processing limit
141
 
142
- ### OCR Settings
143
- - `DEFAULT_OCR_METHOD`: Default method (auto/azure/tesseract/pymupdf)
144
- - `AZURE_OCR_MODEL`: Azure model (prebuilt-layout/prebuilt-read)
145
- - `TESSERACT_LANGUAGES`: Tesseract language packs
 
146
 
147
- ### Server Settings
148
- - `SERVER_HOST`: Web server host (default: 127.0.0.1)
149
- - `SERVER_PORT`: Web server port (default: 7860)
150
- - `SHARE_GRADIO`: Enable public sharing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  ## Troubleshooting
153
 
154
  ### Common Issues
155
 
156
- 1. **Azure OCR not working**
157
- - Verify endpoint URL and API key
158
- - Check Azure subscription status
159
- - Ensure resource region matches endpoint
160
 
161
- 2. **Tesseract not found**
162
- - Install Tesseract OCR system package
163
- - Verify installation: `tesseract --version`
164
- - Check PATH environment variable
165
 
166
- 3. **Large file processing fails**
167
- - Increase `MAX_FILE_SIZE_MB` in .env
168
- - Check available memory and disk space
169
- - Consider splitting large PDFs
170
 
171
- 4. **Poor OCR quality**
172
- - Try different OCR methods
173
- - Use Azure for best quality
174
- - Ensure good PDF scan quality
175
 
176
  ### Performance Optimization
177
 
178
- - **Use Azure Document Intelligence** for best accuracy
179
- - **Enable image preprocessing** for scanned documents
180
- - **Increase DPI settings** for better image quality
181
- - **Configure memory limits** based on available resources
182
 
183
- ## File Structure
184
 
185
- ```
186
- pdf-ocr-service/
187
- ├── ocr_service.py # Core OCR processing
188
- ├── backend.py # Backend management
189
- ├── ui.py # Gradio web interface
190
- ├── requirements.txt # Python dependencies
191
- ├── .env # Environment configuration
192
- ├── README.md # This file
193
- ├── logs/ # Log files (created automatically)
194
- ├── temp/ # Temporary files (created automatically)
195
- └── cache/ # Cache directory (optional)
196
- ```
197
 
198
- ## Security Considerations
 
 
 
199
 
200
- - Never commit `.env` file to version control
201
- - Use secure methods to store API keys in production
202
- - Enable file validation to prevent malicious uploads
203
- - Consider rate limiting for public deployments
204
- - Regular cleanup of temporary files
 
 
 
205
 
206
  ## Contributing
207
 
208
  1. Fork the repository
209
  2. Create a feature branch
210
  3. Make your changes
211
- 4. Add tests if applicable
212
  5. Submit a pull request
213
 
214
- ## License
215
-
216
- This project is licensed under the MIT License. See LICENSE file for details.
217
-
218
  ## Support
219
 
220
- - Check the troubleshooting section above
221
- - Review Azure Document Intelligence documentation
222
- - Open an issue for bug reports or feature requests
223
-
224
- ## Changelog
225
-
226
- ### Version 1.0.0
227
- - Initial release
228
- - Azure Document Intelligence integration
229
- - Multiple OCR fallback methods
230
- - Gradio web interface
231
- - Processing history and analytics
 
1
  # PDF OCR Service
2
 
3
+ A comprehensive PDF OCR service with HTML processing, smart table detection, and multiple export formats. Convert PDF documents to text with preserved formatting, enhanced table handling, and advanced preprocessing options.
4
 
5
  ## Features
6
 
7
+ - **Multiple OCR Engines**: Azure Document Intelligence, Tesseract OCR, and PyMuPDF
8
+ - **Smart Table Detection**: Preserves text while accurately detecting and formatting tables
9
+ - **HTML Processing**: Intermediate HTML format for better structure preservation
10
+ - **Advanced Crop Control**: Remove headers/footers with per-page customization
11
+ - **Multiple Export Formats**: TXT, DOCX, and HTML downloads
12
+ - **Real-time Preview**: Visual crop preview with live updates
13
+ - **Enhanced Resolution**: High-quality processing for better accuracy
14
+ - **Automatic Page Numbering**: Clear page separation in extracted content
15
 
16
+ ## Installation
17
 
18
+ ### Prerequisites
19
 
20
+ - Python 3.8 or higher
21
+ - System dependencies for OCR engines
 
 
 
 
 
22
 
23
+ #### Ubuntu/Debian
24
  ```bash
 
 
 
 
25
  sudo apt-get update
26
  sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
27
  sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
28
+ sudo apt-get install -y libxml2-dev libxslt1-dev
29
+ ```
30
 
31
+ #### macOS
32
+ ```bash
33
  brew install tesseract
34
+ brew install opencv
35
+ brew install libxml2
 
 
36
  ```
37
 
38
+ #### Windows
39
+ - Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
40
+ - Add Tesseract to PATH environment variable
41
+
42
+ ### Python Dependencies
43
 
44
  ```bash
45
+ pip install -r requirements.txt
46
+ ```
47
+
48
+ ### Environment Configuration
49
+
50
+ Create a `.env` file in the project root:
51
 
52
+ ```env
53
+ # Azure Document Intelligence (Optional)
54
+ AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=your_azure_endpoint
55
+ AZURE_DOCUMENT_INTELLIGENCE_KEY=your_azure_key
56
+
57
+ # File Processing Limits
58
+ MAX_FILE_SIZE_MB=50
59
+ MAX_HISTORY_SIZE=100
60
  ```
61
 
62
+ ## Usage
 
 
63
 
64
+ ### Starting the Service
65
 
66
  ```bash
 
67
  python app.py
 
 
 
 
68
  ```
69
 
70
+ The service will start on `http://localhost:7860`
71
 
72
+ ### Web Interface
73
 
74
+ 1. **Upload PDF**: Select your PDF file using the file upload button
75
+ 2. **Choose OCR Method**:
76
+ - `auto`: Automatically selects the best available method
77
+ - `azure`: Azure Document Intelligence (requires API key)
78
+ - `tesseract`: Open-source Tesseract OCR
79
+ - `pymupdf`: Fast PyMuPDF text extraction
80
+ 3. **Configure Preprocessing** (Optional):
81
+ - Enable header/footer removal
82
+ - Adjust crop percentages for each edge
83
+ - Use real-time preview to see crop effects
84
+ - Apply settings to all pages or customize per page
85
+ 4. **Process**: Click "Process PDF with HTML Enhancement"
86
+ 5. **Download**: Choose from TXT, DOCX, or HTML formats
87
 
88
+ ### Crop Control
 
 
 
 
 
 
 
89
 
90
+ The crop feature allows you to remove headers, footers, and margins:
91
 
92
+ - **Top/Bottom Crop**: Remove headers and footers (0-40% of page height)
93
+ - **Left/Right Crop**: Remove side margins (0-30% of page width)
94
+ - **Per-page Settings**: Customize crop for individual pages
95
+ - **Real-time Preview**: See crop effects with red (removed) and green (content) areas
 
96
 
97
+ #### Preset Options
98
+ - **Light Crop (5%)**: Minimal header/footer removal
99
+ - **Medium Crop (10%)**: Standard header/footer removal
100
+ - **Heavy Crop (15%)**: Aggressive header/footer removal
101
+ - **Reset**: Remove all cropping
102
 
103
+ ### API Usage
 
 
 
 
104
 
105
+ The service can be integrated programmatically:
106
 
 
 
 
 
 
 
 
 
107
  ```python
108
  from backend import BackendManager
109
 
110
  # Initialize backend
111
+ backend = BackendManager()
112
+
113
+ # Process PDF with options
114
+ preprocessing_options = {
115
+ 'enable_header_footer_removal': True,
116
+ 'crop_settings': {
117
+ 'per_page_crops': {
118
+ 0: {'top': 10, 'bottom': 10, 'left': 5, 'right': 5}
119
+ },
120
+ 'enhanced_resolution': True
121
+ }
122
+ }
123
+
124
+ result = backend.process_pdf_with_enhanced_resolution(
125
+ pdf_path='document.pdf',
126
+ method='auto',
127
+ preprocessing_options=preprocessing_options
128
+ )
129
 
130
  if result['success']:
131
+ print("Extracted text:", result['text'])
132
+ print("HTML content:", result['html'])
 
 
133
  else:
134
+ print("Error:", result['error'])
135
  ```
136
 
137
+ ## Output Formats
138
+
139
+ ### Text (TXT)
140
+ - Plain text with preserved formatting
141
+ - Page numbers and separators
142
+ - Table formatting with borders
143
+ - Proper indentation and spacing
144
+
145
+ ### Microsoft Word (DOCX)
146
+ - Structured document with headings
147
+ - Tables converted to Word tables
148
+ - Preserved formatting and layout
149
+ - Metadata and processing information
150
+
151
+ ### HTML
152
+ - Web-viewable format
153
+ - CSS styling for better readability
154
+ - Interactive tables
155
+ - Responsive design
156
 
157
+ ## OCR Method Selection
 
 
 
158
 
159
+ ### Auto (Recommended)
160
+ Automatically chooses the best available method, trying them in this order:
161
+ 1. Azure Document Intelligence (if configured)
162
+ 2. Tesseract OCR (if available)
163
+ 3. PyMuPDF (fallback)
164
 
165
+ ### Azure Document Intelligence
166
+ - **Best for**: Complex documents with tables and forms
167
+ - **Requires**: Azure API credentials
168
+ - **Features**: Advanced layout detection, handwriting recognition
169
+ - **Speed**: Medium (cloud processing)
170
+
171
+ ### Tesseract OCR
172
+ - **Best for**: Scanned documents and images
173
+ - **Requires**: Local Tesseract installation
174
+ - **Features**: Open-source, multilingual support
175
+ - **Speed**: Slow (local processing)
176
+
177
+ ### PyMuPDF
178
+ - **Best for**: Text-based PDFs
179
+ - **Requires**: No additional setup
180
+ - **Features**: Fast extraction, basic formatting
181
+ - **Speed**: Fast (direct text extraction)
182
+
183
+ ## Configuration
184
+
185
+ ### Environment Variables
186
+
187
+ | Variable | Description | Default |
188
+ |----------|-------------|---------|
189
+ | `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | Azure service endpoint | None |
190
+ | `AZURE_DOCUMENT_INTELLIGENCE_KEY` | Azure API key | None |
191
+ | `MAX_FILE_SIZE_MB` | Maximum file size limit | 50 |
192
+ | `MAX_HISTORY_SIZE` | Processing history limit | 100 |
193
+
194
+ ### Service Status
195
+
196
+ Check available OCR methods and service health at the bottom of the web interface. The status panel shows:
197
+ - Available OCR methods
198
+ - Feature availability
199
+ - Configuration status
200
+ - Export format support
201
 
202
  ## Troubleshooting
203
 
204
  ### Common Issues
205
 
206
+ **PDF Upload Fails**
207
+ - Check file size (default limit: 50MB)
208
+ - Ensure PDF is not password protected
209
+ - Verify PDF is not corrupted
210
 
211
+ **OCR Processing Errors**
212
+ - Check Azure credentials if using Azure method
213
+ - Verify Tesseract installation for Tesseract method
214
+ - Try a different OCR method, or select `auto` to let the service choose one
215
 
216
+ **Crop Preview Not Showing**
217
+ - Ensure PDF is loaded successfully
218
+ - Enable header/footer removal option
219
+ - Check browser console for JavaScript errors
220
 
221
+ **Export Downloads Not Available**
222
+ - Verify processing completed successfully
223
+ - Check python-docx installation for DOCX export
224
+ - Ensure sufficient disk space for temporary files
225
 
226
  ### Performance Optimization
227
 
228
+ - Use PyMuPDF for simple text-based PDFs
229
+ - Enable crop processing only when needed
230
+ - Reduce crop resolution scale for better performance
231
+ - Regular cleanup of temporary files
232
 
233
+ ## Dependencies
234
 
235
+ ### Core Dependencies
236
+ - `gradio>=4.0.0` - Web interface
237
+ - `python-dotenv>=1.0.0` - Environment configuration
238
+ - `PyMuPDF>=1.23.0` - PDF processing
239
+ - `opencv-python>=4.8.0` - Image processing
240
+ - `numpy>=1.24.0` - Numerical operations
 
 
 
 
 
 
241
 
242
+ ### OCR Dependencies
243
+ - `azure-ai-documentintelligence>=1.0.0b1` - Azure OCR
244
+ - `pytesseract>=0.3.10` - Tesseract integration
245
+ - `Pillow>=10.0.0` - Image processing
246
 
247
+ ### Export Dependencies
248
+ - `python-docx>=0.8.11` - DOCX generation
249
+ - `beautifulsoup4>=4.12.0` - HTML processing
250
+ - `lxml>=4.9.0` - XML processing
251
+
252
+ ## License
253
+
254
+ This project is licensed under the MIT License. See LICENSE file for details.
255
 
256
  ## Contributing
257
 
258
  1. Fork the repository
259
  2. Create a feature branch
260
  3. Make your changes
261
+ 4. Add tests for new functionality
262
  5. Submit a pull request
263
 
 
 
 
 
264
  ## Support
265
 
266
+ For issues and questions:
267
+ - Check the troubleshooting section
268
+ - Review the service status panel
269
+ - Check system dependencies
270
+ - Verify environment configuration
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- # PDF OCR Service Requirements - Enhanced Version
2
 
3
  # Core web framework and UI
4
  gradio>=4.0.0
@@ -19,13 +19,21 @@ numpy>=1.24.0
19
  # PDF processing and manipulation
20
  PyMuPDF>=1.23.0
21
 
22
- # Document export formats
23
  python-docx>=0.8.11
24
 
 
 
 
 
25
  # Additional dependencies for enhanced preprocessing
26
  matplotlib>=3.7.0 # For image visualization in development
27
  scikit-image>=0.21.0 # Advanced image processing (optional)
28
 
 
 
 
 
29
  # System dependencies information (install separately):
30
  #
31
  # For Ubuntu/Debian:
@@ -33,30 +41,38 @@ scikit-image>=0.21.0 # Advanced image processing (optional)
33
  # sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
34
  # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
35
  # sudo apt-get install -y python3-opencv # Alternative OpenCV installation
 
36
  #
37
  # For CentOS/RHEL:
38
  # sudo yum install -y tesseract tesseract-langpack-eng
39
  # sudo yum install -y opencv-python
 
40
  #
41
  # For macOS:
42
  # brew install tesseract
43
  # brew install opencv
 
44
  #
45
  # For Windows:
46
  # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
47
  # Add Tesseract to PATH environment variable
48
- # OpenCV should install automatically with pip
49
 
50
  # Development and testing (optional)
51
  pytest>=7.0.0
52
  pytest-cov>=4.0.0
 
 
53
 
54
  # Performance monitoring (optional)
55
  memory-profiler>=0.60.0
 
56
 
57
  # Note: The enhanced version includes:
58
- # - Header/footer removal preprocessing
59
- # - OpenCV-based image manipulation
60
- # - Enhanced table processing with separator row removal
61
- # - Crop preview functionality
62
- # - Advanced PDF manipulation capabilities
 
 
 
1
+ # PDF OCR Service Requirements - Enhanced Version with HTML Processing
2
 
3
  # Core web framework and UI
4
  gradio>=4.0.0
 
19
  # PDF processing and manipulation
20
  PyMuPDF>=1.23.0
21
 
22
+ # Document export formats (ENHANCED)
23
  python-docx>=0.8.11
24
 
25
+ # HTML processing and parsing (NEW)
26
+ beautifulsoup4>=4.12.0
27
+ lxml>=4.9.0
28
+
29
  # Additional dependencies for enhanced preprocessing
30
  matplotlib>=3.7.0 # For image visualization in development
31
  scikit-image>=0.21.0 # Advanced image processing (optional)
32
 
33
+ # Performance and utility libraries
34
+ tqdm>=4.65.0 # Progress bars for long operations
35
+ requests>=2.31.0 # HTTP requests for external services
36
+
37
  # System dependencies information (install separately):
38
  #
39
  # For Ubuntu/Debian:
 
41
  # sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
42
  # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
43
  # sudo apt-get install -y python3-opencv # Alternative OpenCV installation
44
+ # sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
45
  #
46
  # For CentOS/RHEL:
47
  # sudo yum install -y tesseract tesseract-langpack-eng
48
  # sudo yum install -y opencv-python
49
+ # sudo yum install -y libxml2-devel libxslt-devel
50
  #
51
  # For macOS:
52
  # brew install tesseract
53
  # brew install opencv
54
+ # brew install libxml2
55
  #
56
  # For Windows:
57
  # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
58
  # Add Tesseract to PATH environment variable
59
+ # OpenCV and other packages should install automatically with pip
60
 
61
  # Development and testing (optional)
62
  pytest>=7.0.0
63
  pytest-cov>=4.0.0
64
+ black>=23.0.0 # Code formatting
65
+ flake8>=6.0.0 # Code linting
66
 
67
  # Performance monitoring (optional)
68
  memory-profiler>=0.60.0
69
+ psutil>=5.9.0 # System monitoring
70
 
71
  # Note: The enhanced version includes:
72
+ # - Fixed table processing that prevents text loss
73
+ # - HTML intermediate processing for better formatting
74
+ # - Enhanced export capabilities (TXT, DOCX, HTML)
75
+ # - Smart overlap detection with 70% threshold
76
+ # - Improved coordinate calculations for table boundaries
77
+ # - Better document structure preservation
78
+ # - Multi-format download options