Chirapath committed on
Commit
4b6cb81
·
verified ·
1 Parent(s): 6731c83

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +325 -129
  2. backend.py +315 -125
  3. enhanced_indentation.py +498 -15
  4. ocr_service.py +426 -139
  5. requirements.txt +112 -91
app.py CHANGED
@@ -17,7 +17,7 @@ from dotenv import load_dotenv
17
  load_dotenv()
18
 
19
  from backend import BackendManager
20
- from enhanced_indentation import EnhancedIndentationDetector
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
@@ -26,8 +26,9 @@ logger = logging.getLogger(__name__)
26
  # Initialize backend manager
27
  backend_manager = BackendManager()
28
 
29
- # Initialize enhanced indentation detector
30
  indent_detector = EnhancedIndentationDetector()
 
31
 
32
  # Check if python-docx is available
33
  try:
@@ -40,27 +41,38 @@ except ImportError:
40
  HAS_DOCX_SUPPORT = False
41
  logger.info("DOCX export not available - install python-docx to enable")
42
 
43
- # Global variables for enhanced crop management
 
 
 
 
 
 
 
 
 
44
  current_pdf_data = {
45
  'path': None,
46
  'page_count': 0,
47
  'page_images': {},
48
  'crop_settings': {},
49
- 'default_crop_all': True
 
50
  }
51
 
52
  class PDFPageManager:
53
- """Manages PDF page previews and crop settings with enhanced resolution - FIXED VERSION"""
54
 
55
  def __init__(self):
56
  self.pdf_doc = None
57
  self.page_images = {}
58
  self.crop_settings = {}
59
  self.current_page = 0
60
- self.high_res_scale = 2.0 # Reduced from 3.0 for better performance
 
61
 
62
  def load_pdf(self, pdf_path: str) -> Dict:
63
- """Load PDF and generate high-resolution page previews - FIXED"""
64
  try:
65
  if self.pdf_doc:
66
  self.pdf_doc.close()
@@ -68,10 +80,27 @@ class PDFPageManager:
68
  self.pdf_doc = fitz.open(pdf_path)
69
  page_count = len(self.pdf_doc)
70
 
71
- # Generate high-resolution previews for all pages
72
  self.page_images = {}
 
 
73
  for page_num in range(page_count):
74
- self.page_images[page_num] = self._generate_high_res_preview(page_num)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  # Initialize default crop settings for all pages
77
  self.crop_settings = {
@@ -79,12 +108,14 @@ class PDFPageManager:
79
  for i in range(page_count)
80
  }
81
 
82
- logger.info(f"PDF loaded successfully: {page_count} pages")
83
 
84
  return {
85
  'success': True,
86
  'page_count': page_count,
87
- 'pages': list(range(page_count))
 
 
88
  }
89
 
90
  except Exception as e:
@@ -92,14 +123,14 @@ class PDFPageManager:
92
  return {'success': False, 'error': str(e)}
93
 
94
  def _generate_high_res_preview(self, page_num: int) -> np.ndarray:
95
- """Generate high-resolution preview for better crop visualization - FIXED"""
96
  try:
97
  if not self.pdf_doc:
98
  return None
99
 
100
  page = self.pdf_doc.load_page(page_num)
101
 
102
- # Use high resolution matrix for better quality
103
  mat = fitz.Matrix(self.high_res_scale, self.high_res_scale)
104
  pix = page.get_pixmap(matrix=mat)
105
  img_data = pix.tobytes("png")
@@ -109,10 +140,14 @@ class PDFPageManager:
109
  pil_image = Image.open(io.BytesIO(img_data))
110
  img_array = np.array(pil_image)
111
 
112
- # Convert RGBA to RGB if needed
113
  if len(img_array.shape) == 3 and img_array.shape[2] == 4:
114
  img_array = img_array[:, :, :3]
115
 
 
 
 
 
116
  return img_array
117
 
118
  except Exception as e:
@@ -120,7 +155,7 @@ class PDFPageManager:
120
  return None
121
 
122
  def update_crop_visualization(self, page_num: int, crop_coords: Dict) -> np.ndarray:
123
- """Update crop visualization with enhanced preview - FIXED"""
124
  if page_num not in self.page_images or self.page_images[page_num] is None:
125
  logger.warning(f"No image available for page {page_num}")
126
  return None
@@ -147,23 +182,50 @@ class PDFPageManager:
147
  # Draw crop areas in semi-transparent red (areas to be removed)
148
  alpha = 0.3
149
  if crop_coords.get('top', 0) > 0 and y1 > 0:
150
- cv2.rectangle(overlay, (0, 0), (width, y1), (255, 0, 0), -1)
151
  if crop_coords.get('bottom', 0) > 0 and y2 < height:
152
- cv2.rectangle(overlay, (0, y2), (width, height), (255, 0, 0), -1)
153
  if crop_coords.get('left', 0) > 0 and x1 > 0:
154
- cv2.rectangle(overlay, (0, 0), (x1, height), (255, 0, 0), -1)
155
  if crop_coords.get('right', 0) > 0 and x2 < width:
156
- cv2.rectangle(overlay, (x2, 0), (width, height), (255, 0, 0), -1)
157
 
158
  # Draw content area outline in green
159
  if x2 > x1 and y2 > y1:
160
  thickness = max(2, int(self.high_res_scale * 2))
161
  cv2.rectangle(overlay, (x1, y1), (x2, y2), (0, 255, 0), thickness)
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  # Blend overlay with original
164
  result = cv2.addWeighted(img_array, 1-alpha, overlay, alpha, 0)
165
 
166
- # Add informative text with better scaling
167
  font_scale = max(0.8, self.high_res_scale / 3)
168
  thickness = max(1, int(self.high_res_scale))
169
  text_color = (255, 255, 255)
@@ -173,10 +235,22 @@ class PDFPageManager:
173
  texts = [
174
  f"Page {page_num + 1}",
175
  "RED: Remove areas",
176
- "GREEN: Content area",
177
- f"Crop: T{crop_coords.get('top', 0):.1f}% B{crop_coords.get('bottom', 0):.1f}% L{crop_coords.get('left', 0):.1f}% R{crop_coords.get('right', 0):.1f}%"
178
  ]
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  y_offset = 30
181
  for i, text in enumerate(texts):
182
  y_pos = y_offset + (i * 30)
@@ -185,35 +259,49 @@ class PDFPageManager:
185
  cv2.rectangle(result, (10, y_pos - text_height - 5), (text_width + 20, y_pos + 5), background_color, -1)
186
  cv2.putText(result, text, (15, y_pos), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, thickness)
187
 
188
- return result
 
 
 
189
 
190
  except Exception as e:
191
  logger.error(f"Error updating crop visualization: {e}")
192
- return self.page_images[page_num] if page_num in self.page_images else None
 
 
 
193
 
194
  def set_crop_for_page(self, page_num: int, crop_coords: Dict):
195
- """Set crop coordinates for specific page - FIXED"""
196
  if page_num in self.crop_settings:
197
  self.crop_settings[page_num].update(crop_coords)
198
  self.crop_settings[page_num]['custom'] = True
199
  logger.info(f"Set crop for page {page_num}: {crop_coords}")
200
 
201
  def set_crop_for_all_pages(self, crop_coords: Dict):
202
- """Apply same crop settings to all pages - FIXED"""
203
  for page_num in self.crop_settings:
204
  if not self.crop_settings[page_num].get('custom', False):
205
  self.crop_settings[page_num].update(crop_coords)
206
  logger.info(f"Applied crop to all non-custom pages: {crop_coords}")
207
 
208
  def get_crop_settings_for_processing(self) -> Dict:
209
- """Get crop settings in format expected by backend - FIXED"""
210
  return {
211
  'per_page_crops': self.crop_settings,
212
  'has_custom_crops': any(page.get('custom', False) for page in self.crop_settings.values()),
213
  'enhanced_resolution': True,
214
- 'resolution_scale': self.high_res_scale
 
 
215
  }
216
 
 
 
 
 
 
 
217
  def close(self):
218
  """Clean up resources"""
219
  if self.pdf_doc:
@@ -221,12 +309,13 @@ class PDFPageManager:
221
  self.pdf_doc = None
222
  self.page_images.clear()
223
  self.crop_settings.clear()
 
224
 
225
  # Global page manager instance
226
  pdf_manager = PDFPageManager()
227
 
228
  def load_pdf_for_preview(pdf_file):
229
- """Load PDF and return page thumbnails for selection - FIXED"""
230
  if pdf_file is None:
231
  return None, gr.update(choices=[], value=None), gr.update(visible=False), "No PDF loaded"
232
 
@@ -237,12 +326,26 @@ def load_pdf_for_preview(pdf_file):
237
  # Create page choices for dropdown
238
  page_choices = [f"Page {i+1}" for i in range(result['page_count'])]
239
 
240
- # Get first page preview with default crop
241
  first_page_preview = pdf_manager.update_crop_visualization(0, {
242
  'top': 0, 'bottom': 0, 'left': 0, 'right': 0
243
  }) if 0 in pdf_manager.page_images else None
244
 
245
- status = f"PDF loaded successfully: {result['page_count']} pages"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  return (first_page_preview,
248
  gr.update(choices=page_choices, value=page_choices[0] if page_choices else None, visible=True),
@@ -256,7 +359,7 @@ def load_pdf_for_preview(pdf_file):
256
  return None, gr.update(choices=[], value=None, visible=False), gr.update(visible=False), f"Error loading PDF: {str(e)}"
257
 
258
  def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_right):
259
- """Change preview to selected page with current crop settings - FIXED"""
260
  if not page_selection:
261
  return None
262
 
@@ -271,7 +374,7 @@ def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_r
271
  'right': crop_right
272
  }
273
 
274
- # Update visualization
275
  preview_image = pdf_manager.update_crop_visualization(page_num, crop_coords)
276
  return preview_image
277
 
@@ -280,7 +383,7 @@ def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_r
280
  return None
281
 
282
  def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_left, crop_right, apply_to_all):
283
- """Update crop preview with interactive feedback - FIXED"""
284
  if not page_selection or not pdf_manager.pdf_doc:
285
  return None
286
 
@@ -300,44 +403,51 @@ def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_
300
  else:
301
  pdf_manager.set_crop_for_page(page_num, crop_coords)
302
 
303
- # Return updated preview
304
  return pdf_manager.update_crop_visualization(page_num, crop_coords)
305
 
306
  except Exception as e:
307
  logger.error(f"Error updating crop preview: {e}")
308
  return None
309
 
310
- def process_pdf_with_enhanced_indentation(pdf_file, ocr_method, enable_header_footer_removal,
311
- crop_top, crop_bottom, crop_left, crop_right,
312
- apply_to_all_pages, current_page_selection,
313
- progress=gr.Progress()):
314
- """Process PDF with enhanced indentation detection, text classification, and comprehensive formatting"""
315
  if pdf_file is None:
316
  return "No file uploaded.", "", "", "Error: No file selected"
317
 
318
  try:
319
- progress(0.1, desc="Initializing enhanced processing with comprehensive indentation detection and intelligent text classification...")
320
 
321
- # Prepare enhanced preprocessing options
322
  preprocessing_options = {
323
  'enable_header_footer_removal': enable_header_footer_removal,
324
  'enhanced_crop_processing': True,
 
325
  'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
326
  }
327
 
328
- progress(0.3, desc="Processing with enhanced indentation detection and text classification...")
329
 
330
- # Process the PDF with enhanced preprocessing, indentation detection, and text classification
331
  result = backend_manager.process_pdf_with_enhanced_resolution(
332
  pdf_file.name, ocr_method, preprocessing_options
333
  )
334
 
335
- progress(0.9, desc="Finalizing enhanced processing...")
336
  progress(1.0, desc="Complete!")
337
 
338
  if result['success']:
339
- metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
340
- status = f"Success: Processed using {result['method_used']} with comprehensive indentation detection and intelligent text classification"
 
 
 
 
 
 
341
 
342
  # Return text, HTML, metadata, and status
343
  return (result['text'],
@@ -349,11 +459,11 @@ def process_pdf_with_enhanced_indentation(pdf_file, ocr_method, enable_header_fo
349
  return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
350
 
351
  except Exception as e:
352
- logger.error(f"Enhanced processing error: {e}")
353
  return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
354
 
355
- def format_enhanced_metadata(metadata, method_used):
356
- """Enhanced metadata formatting with comprehensive indentation processing and text classification info"""
357
  if not metadata:
358
  return f"Method used: {method_used}"
359
 
@@ -362,6 +472,21 @@ def format_enhanced_metadata(metadata, method_used):
362
  if 'pages' in metadata:
363
  info_lines.append(f"Pages processed: {metadata['pages']}")
364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  if metadata.get('enhanced_processing', False):
366
  info_lines.append("Enhanced processing: Enabled")
367
 
@@ -386,6 +511,14 @@ def format_enhanced_metadata(metadata, method_used):
386
  if 'tables' in metadata:
387
  info_lines.append(f"Tables detected: {metadata['tables']}")
388
 
 
 
 
 
 
 
 
 
389
  # Document structure analysis information
390
  if 'document_structure_analysis' in metadata:
391
  analysis = metadata['document_structure_analysis']
@@ -410,11 +543,11 @@ def format_enhanced_metadata(metadata, method_used):
410
 
411
  return "\n".join(info_lines)
412
 
413
- def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
414
- crop_top, crop_bottom, crop_left, crop_right,
415
- apply_to_all_pages, current_page_selection):
416
- """Prepare enhanced downloads with comprehensive indentation processing and text classification"""
417
- text, html, metadata, status = process_pdf_with_enhanced_indentation(
418
  pdf_file, method, enable_header_footer_removal,
419
  crop_top, crop_bottom, crop_left, crop_right,
420
  apply_to_all_pages, current_page_selection
@@ -423,7 +556,7 @@ def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
423
  # Prepare downloads if processing was successful
424
  if text and not text.startswith("Error:") and not text.startswith("No file"):
425
  try:
426
- # Create enhanced download files
427
  download_files = backend_manager.create_enhanced_downloads(text, html, metadata)
428
 
429
  # Prepare gradio updates for download buttons
@@ -437,7 +570,7 @@ def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
437
  return tuple(updates)
438
 
439
  except Exception as file_error:
440
- logger.error(f"Enhanced file creation error: {file_error}")
441
  return (text, metadata, status,
442
  gr.update(visible=False),
443
  gr.update(visible=False),
@@ -448,38 +581,59 @@ def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
448
  gr.update(visible=False),
449
  gr.update(visible=False))
450
 
451
- def get_enhanced_method_info(method):
452
- """Get information about selected OCR method with comprehensive indentation processing and text classification"""
 
 
453
  method_descriptions = {
454
- "auto": "**Auto Selection**: Automatically chooses the best available method with comprehensive indentation detection, intelligent text classification, HTML processing, enhanced pattern recognition for hierarchical numbering (including parenthetical patterns like (1), (๑), (a)), bullets, and multi-language support.",
455
- "azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with comprehensive indentation detection, intelligent text classification, HTML generation, layout preservation, smart table detection, and support for complex document structures including hierarchical numbering and parenthetical patterns.",
456
- "tesseract": "**Tesseract OCR**: Open-source OCR enhanced with comprehensive indentation detection, intelligent text classification, HTML output, advanced image preprocessing, resolution scaling, and pattern recognition for various numbering styles including parenthetical patterns and bullet points.",
457
- "pymupdf": "**PyMuPDF**: Fast extraction enhanced with comprehensive indentation detection, intelligent text classification, HTML processing, improved formatting preservation, and pattern recognition for maintaining document structure and hierarchy including parenthetical numbering."
458
  }
459
 
460
  return method_descriptions.get(method, "Select a method to see details.")
461
 
462
- def check_enhanced_service_status():
463
- """Check and display enhanced service status with indentation detection and text classification capabilities"""
464
  available_methods = backend_manager.get_available_methods()
465
 
466
- status_lines = ["**Available OCR Methods (Enhanced with Comprehensive Indentation Detection & Text Classification):**"]
 
 
467
 
468
  if "azure" in available_methods:
469
- status_lines.append("✅ Azure Document Intelligence - Ready (HTML + Tables + Comprehensive Indentation + Text Classification)")
470
  else:
471
  status_lines.append("❌ Azure Document Intelligence - Not configured")
472
 
473
  if "tesseract" in available_methods:
474
- status_lines.append("✅ Tesseract OCR - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
475
  else:
476
  status_lines.append("❌ Tesseract OCR - Not available")
477
 
478
  if "pymupdf" in available_methods:
479
- status_lines.append("✅ PyMuPDF - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
480
  else:
481
  status_lines.append("❌ PyMuPDF - Not available")
482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  # Add enhanced features status
484
  status_lines.append("")
485
  status_lines.append("**Comprehensive Indentation Detection Features:**")
@@ -509,6 +663,9 @@ def check_enhanced_service_status():
509
  status_lines.append("✅ Context-aware Analysis (position, font size)")
510
  status_lines.append("✅ Confidence Scoring")
511
  status_lines.append("✅ Document Structure Analysis")
 
 
 
512
 
513
  status_lines.append("")
514
  status_lines.append("**Enhanced Processing Features:**")
@@ -518,9 +675,12 @@ def check_enhanced_service_status():
518
  status_lines.append("✅ Multi-Page Crop Preview - Available")
519
  status_lines.append("✅ Per-Page Crop Customization - Available")
520
  status_lines.append("✅ Document Structure Analysis - Available")
 
 
 
521
 
522
  if HAS_DOCX_SUPPORT:
523
- status_lines.append("✅ Enhanced DOCX Export - Available (with indentation formatting)")
524
  else:
525
  status_lines.append("❌ Enhanced DOCX Export - Install python-docx to enable")
526
 
@@ -533,11 +693,11 @@ def check_enhanced_service_status():
533
 
534
  return "\n".join(status_lines)
535
 
536
- def create_enhanced_interface():
537
- """Create enhanced Gradio interface with comprehensive indentation detection and text classification"""
538
 
539
  with gr.Blocks(
540
- title="PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Text Classification",
541
  theme=gr.themes.Soft(),
542
  css="""
543
  .main-header { text-align: center; margin-bottom: 2rem; }
@@ -547,32 +707,46 @@ def create_enhanced_interface():
547
  .page-preview { border: 2px solid #17a2b8; padding: 1rem; border-radius: 0.5rem; background-color: #f0f8ff; }
548
  .results-panel { border: 2px solid #6f42c1; padding: 1rem; border-radius: 0.5rem; background-color: #f8f5ff; }
549
  .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
 
550
  """
551
  ) as interface:
552
 
553
- gr.HTML("""
554
  <div class="main-header">
555
- <h1>PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</h1>
556
- <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, comprehensive indentation pattern recognition including parenthetical patterns like (1), (๑), (a), and intelligent text classification for headers, paragraphs, and list items</p>
 
557
  </div>
558
  """)
559
 
560
  # Instructions at the top
561
  with gr.Group(elem_classes=["instructions-panel"]):
562
- gr.HTML("<h3>Instructions & Enhanced Features</h3>")
563
- gr.HTML("""
564
  <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
565
  <h4>How to Use:</h4>
566
  <ol>
567
  <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
568
  <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
569
- <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
570
- <li><strong>Process:</strong> Click the process button to extract text with comprehensive indentation detection and text classification</li>
571
- <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format with preserved formatting</li>
572
  </ol>
573
 
574
- <h4>Comprehensive Indentation Detection & Text Classification Features:</h4>
575
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
 
 
 
 
 
 
 
 
 
 
 
 
576
  <div>
577
  <strong>Hierarchical Numbering:</strong>
578
  <ul>
@@ -583,6 +757,8 @@ def create_enhanced_interface():
583
  <li>Section: §1.2.3, Article 1.1.1</li>
584
  </ul>
585
  </div>
 
 
586
  <div>
587
  <strong>Parenthetical Patterns:</strong>
588
  <ul>
@@ -593,8 +769,6 @@ def create_enhanced_interface():
593
  <li>Thai Letters: (ก), (ข), (ค)</li>
594
  </ul>
595
  </div>
596
- </div>
597
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
598
  <div>
599
  <strong>Multi-Language & Symbols:</strong>
600
  <ul>
@@ -605,6 +779,8 @@ def create_enhanced_interface():
605
  <li>Checkboxes: [x], [ ], [✓]</li>
606
  </ul>
607
  </div>
 
 
608
  <div>
609
  <strong>Intelligent Text Classification:</strong>
610
  <ul>
@@ -613,21 +789,21 @@ def create_enhanced_interface():
613
  <li>List Item Identification: Patterned content</li>
614
  <li>Context Analysis: Position, font size, formatting</li>
615
  <li>Confidence Scoring: Reliability assessment</li>
 
 
 
 
 
 
 
 
 
 
 
 
616
  </ul>
617
  </div>
618
  </div>
619
-
620
- <h4>Technical Enhancements:</h4>
621
- <ul>
622
- <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
623
- <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
624
- <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads with preserved indentation</li>
625
- <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
626
- <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
627
- <li><strong>Document Analysis:</strong> Automatic structure detection and statistics</li>
628
- <li><strong>Priority Pattern Matching:</strong> Intelligent pattern detection with priority ranking</li>
629
- <li><strong>Text Classification:</strong> Automated header, paragraph, and list item detection</li>
630
- </ul>
631
  </div>
632
  """)
633
 
@@ -658,28 +834,28 @@ def create_enhanced_interface():
658
  choices=["auto", "azure", "tesseract", "pymupdf"],
659
  value="auto",
660
  label="OCR Method",
661
- info="Choose OCR method (all enhanced with comprehensive indentation detection and text classification)"
662
  )
663
 
664
  # Method information display
665
  method_info = gr.Markdown(
666
- value=get_enhanced_method_info("auto"),
667
  elem_classes=["method-info"]
668
  )
669
 
670
  # Enhanced Header/Footer Removal Section
671
  with gr.Group(elem_classes=["crop-controls"]):
672
- gr.HTML("<h4>Header/Footer Removal & Crop Settings</h4>")
673
 
674
  enable_header_footer_removal = gr.Checkbox(
675
- label="Enable Enhanced Header/Footer Removal",
676
  value=False,
677
- info="Remove headers and footers with high-resolution processing"
678
  )
679
 
680
  # Multi-page controls
681
  with gr.Group(visible=False) as crop_controls:
682
- gr.HTML("<h5>Multi-Page Crop Control</h5>")
683
 
684
  with gr.Row():
685
  # Page selection
@@ -687,7 +863,7 @@ def create_enhanced_interface():
687
  label="Select Page for Preview",
688
  choices=[],
689
  value=None,
690
- info="Choose page to preview and customize crop settings",
691
  visible=False
692
  )
693
 
@@ -698,6 +874,17 @@ def create_enhanced_interface():
698
  info="When enabled, changes apply to all pages"
699
  )
700
 
 
 
 
 
 
 
 
 
 
 
 
701
  gr.HTML("<h5>Crop Areas (% of page)</h5>")
702
 
703
  with gr.Row():
@@ -743,7 +930,7 @@ def create_enhanced_interface():
743
 
744
  # Process button
745
  process_btn = gr.Button(
746
- "Process PDF with Comprehensive Indentation Detection & Text Classification",
747
  variant="primary",
748
  size="lg"
749
  )
@@ -751,22 +938,31 @@ def create_enhanced_interface():
751
  # Results and Preview Section
752
  with gr.Row():
753
  with gr.Column(scale=1):
754
- # Enhanced crop preview with multi-page support
755
  with gr.Group(visible=False, elem_classes=["page-preview"]) as preview_group:
756
- gr.HTML("<h4>Page Preview with Crop Visualization</h4>")
757
  crop_preview = gr.Image(
758
- label="High-Resolution Page Preview",
759
  interactive=False,
760
  height=500,
761
  show_label=False
762
  )
763
 
764
- gr.HTML("""
765
- <p style="font-size: 0.9em; color: #666; text-align: center;">
766
- <strong>Red areas:</strong> Will be removed | <strong>Green outline:</strong> Content area |
767
- <strong>Enhanced:</strong> 2x resolution processing
768
- </p>
769
- """)
 
 
 
 
 
 
 
 
 
770
 
771
  with gr.Column(scale=2):
772
  with gr.Group(elem_classes=["results-panel"]):
@@ -781,8 +977,8 @@ def create_enhanced_interface():
781
 
782
  # Extracted text output
783
  text_output = gr.Textbox(
784
- label="Extracted Text (Enhanced with Comprehensive Indentation Detection & Text Classification)",
785
- placeholder="Processed text with comprehensive indentation detection, intelligent text classification, HTML enhancement, and preserved formatting will appear here...",
786
  lines=20,
787
  max_lines=30,
788
  interactive=False,
@@ -793,7 +989,7 @@ def create_enhanced_interface():
793
  metadata_output = gr.Textbox(
794
  label="Processing Information & Document Analysis",
795
  interactive=False,
796
- lines=8
797
  )
798
 
799
  # Enhanced download buttons
@@ -804,7 +1000,7 @@ def create_enhanced_interface():
804
  variant="secondary"
805
  )
806
  download_docx_btn = gr.DownloadButton(
807
- "Download Enhanced DOCX (with Indentation & Classification)",
808
  visible=False,
809
  variant="secondary"
810
  )
@@ -816,15 +1012,15 @@ def create_enhanced_interface():
816
 
817
  # Service Status at the bottom
818
  with gr.Group(elem_classes=["status-box"]):
819
- gr.HTML("<h4>Service Status & Capabilities</h4>")
820
  service_status = gr.Markdown(
821
- value=check_enhanced_service_status()
822
  )
823
 
824
  # Refresh status button
825
  refresh_btn = gr.Button("Refresh Status", size="sm")
826
 
827
- # Event handlers with enhanced functionality
828
 
829
  # PDF upload handler
830
  pdf_input.change(
@@ -835,7 +1031,7 @@ def create_enhanced_interface():
835
 
836
  # Method info handler
837
  method_choice.change(
838
- fn=get_enhanced_method_info,
839
  inputs=[method_choice],
840
  outputs=[method_info]
841
  )
@@ -857,7 +1053,7 @@ def create_enhanced_interface():
857
  outputs=[crop_preview]
858
  )
859
 
860
- # Crop parameter handlers - update preview in real-time
861
  for crop_input in [crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages]:
862
  crop_input.change(
863
  fn=update_crop_preview_interactive,
@@ -891,13 +1087,13 @@ def create_enhanced_interface():
891
 
892
  # Status refresh handler
893
  refresh_btn.click(
894
- fn=check_enhanced_service_status,
895
  outputs=[service_status]
896
  )
897
 
898
- # Main processing handler with enhanced downloads
899
  process_btn.click(
900
- fn=prepare_enhanced_downloads,
901
  inputs=[pdf_input, method_choice, enable_header_footer_removal,
902
  crop_top, crop_bottom, crop_left, crop_right,
903
  apply_to_all_pages, page_selector],
@@ -907,10 +1103,10 @@ def create_enhanced_interface():
907
 
908
  return interface
909
 
910
- def launch_enhanced_ui():
911
- """Launch the enhanced Gradio interface with comprehensive indentation detection and text classification"""
912
  try:
913
- interface = create_enhanced_interface()
914
  interface.launch(
915
  server_name="0.0.0.0",
916
  server_port=7860,
@@ -922,4 +1118,4 @@ def launch_enhanced_ui():
922
  pdf_manager.close()
923
 
924
  if __name__ == "__main__":
925
- launch_enhanced_ui()
 
17
  load_dotenv()
18
 
19
  from backend import BackendManager
20
+ from enhanced_indentation import EnhancedIndentationDetector, OpenCVTextAnalyzer
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
 
26
  # Initialize backend manager
27
  backend_manager = BackendManager()
28
 
29
+ # Initialize enhanced indentation detector with OpenCV
30
  indent_detector = EnhancedIndentationDetector()
31
+ opencv_analyzer = OpenCVTextAnalyzer()
32
 
33
  # Check if python-docx is available
34
  try:
 
41
  HAS_DOCX_SUPPORT = False
42
  logger.info("DOCX export not available - install python-docx to enable")
43
 
44
+ # Check OpenCV availability
45
+ try:
46
+ import cv2
47
+ HAS_OPENCV_SUPPORT = True
48
+ logger.info("OpenCV text block analysis and bold detection available")
49
+ except ImportError:
50
+ HAS_OPENCV_SUPPORT = False
51
+ logger.info("OpenCV not available - text block analysis and bold detection disabled")
52
+
53
+ # Global variables for enhanced crop management with OpenCV
54
  current_pdf_data = {
55
  'path': None,
56
  'page_count': 0,
57
  'page_images': {},
58
  'crop_settings': {},
59
+ 'default_crop_all': True,
60
+ 'opencv_analysis': {}
61
  }
62
 
63
  class PDFPageManager:
64
+ """Manages PDF page previews and crop settings with OpenCV-enhanced analysis"""
65
 
66
  def __init__(self):
67
  self.pdf_doc = None
68
  self.page_images = {}
69
  self.crop_settings = {}
70
  self.current_page = 0
71
+ self.high_res_scale = 2.0 # Optimized for OpenCV analysis
72
+ self.opencv_analysis = {}
73
 
74
  def load_pdf(self, pdf_path: str) -> Dict:
75
+ """Load PDF and generate high-resolution page previews with OpenCV analysis"""
76
  try:
77
  if self.pdf_doc:
78
  self.pdf_doc.close()
 
80
  self.pdf_doc = fitz.open(pdf_path)
81
  page_count = len(self.pdf_doc)
82
 
83
+ # Generate high-resolution previews and OpenCV analysis for all pages
84
  self.page_images = {}
85
+ self.opencv_analysis = {}
86
+
87
  for page_num in range(page_count):
88
+ # Generate high-resolution preview
89
+ img_array = self._generate_high_res_preview(page_num)
90
+ self.page_images[page_num] = img_array
91
+
92
+ # Perform OpenCV analysis if available
93
+ if HAS_OPENCV_SUPPORT and img_array is not None:
94
+ # Extract text lines for OpenCV correlation
95
+ page = self.pdf_doc.load_page(page_num)
96
+ text_content = page.get_text()
97
+ text_lines = text_content.split('\n')
98
+
99
+ # Perform OpenCV text block analysis
100
+ opencv_result = opencv_analyzer.analyze_text_blocks(img_array, text_lines)
101
+ self.opencv_analysis[page_num] = opencv_result
102
+
103
+ logger.info(f"OpenCV analysis for page {page_num + 1}: {opencv_result.get('block_count', 0)} text blocks, bold detected: {opencv_result.get('bold_text_detected', False)}")
104
 
105
  # Initialize default crop settings for all pages
106
  self.crop_settings = {
 
108
  for i in range(page_count)
109
  }
110
 
111
+ logger.info(f"PDF loaded successfully with OpenCV enhancement: {page_count} pages")
112
 
113
  return {
114
  'success': True,
115
  'page_count': page_count,
116
+ 'pages': list(range(page_count)),
117
+ 'opencv_enhanced': HAS_OPENCV_SUPPORT,
118
+ 'opencv_analysis_available': bool(self.opencv_analysis)
119
  }
120
 
121
  except Exception as e:
 
123
  return {'success': False, 'error': str(e)}
124
 
125
  def _generate_high_res_preview(self, page_num: int) -> np.ndarray:
126
+ """Generate high-resolution preview optimized for OpenCV analysis"""
127
  try:
128
  if not self.pdf_doc:
129
  return None
130
 
131
  page = self.pdf_doc.load_page(page_num)
132
 
133
+ # Use high resolution matrix for better OpenCV analysis
134
  mat = fitz.Matrix(self.high_res_scale, self.high_res_scale)
135
  pix = page.get_pixmap(matrix=mat)
136
  img_data = pix.tobytes("png")
 
140
  pil_image = Image.open(io.BytesIO(img_data))
141
  img_array = np.array(pil_image)
142
 
143
+ # Convert RGBA to RGB if needed, then to BGR for OpenCV
144
  if len(img_array.shape) == 3 and img_array.shape[2] == 4:
145
  img_array = img_array[:, :, :3]
146
 
147
+ # Convert RGB to BGR for OpenCV compatibility
148
+ if len(img_array.shape) == 3:
149
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
150
+
151
  return img_array
152
 
153
  except Exception as e:
 
155
  return None
156
 
157
  def update_crop_visualization(self, page_num: int, crop_coords: Dict) -> np.ndarray:
158
+ """Update crop visualization with OpenCV-enhanced preview and text block overlay"""
159
  if page_num not in self.page_images or self.page_images[page_num] is None:
160
  logger.warning(f"No image available for page {page_num}")
161
  return None
 
182
  # Draw crop areas in semi-transparent red (areas to be removed)
183
  alpha = 0.3
184
  if crop_coords.get('top', 0) > 0 and y1 > 0:
185
+ cv2.rectangle(overlay, (0, 0), (width, y1), (0, 0, 255), -1)
186
  if crop_coords.get('bottom', 0) > 0 and y2 < height:
187
+ cv2.rectangle(overlay, (0, y2), (width, height), (0, 0, 255), -1)
188
  if crop_coords.get('left', 0) > 0 and x1 > 0:
189
+ cv2.rectangle(overlay, (0, 0), (x1, height), (0, 0, 255), -1)
190
  if crop_coords.get('right', 0) > 0 and x2 < width:
191
+ cv2.rectangle(overlay, (x2, 0), (width, height), (0, 0, 255), -1)
192
 
193
  # Draw content area outline in green
194
  if x2 > x1 and y2 > y1:
195
  thickness = max(2, int(self.high_res_scale * 2))
196
  cv2.rectangle(overlay, (x1, y1), (x2, y2), (0, 255, 0), thickness)
197
 
198
+ # Add OpenCV text block visualization if available
199
+ if HAS_OPENCV_SUPPORT and page_num in self.opencv_analysis:
200
+ opencv_result = self.opencv_analysis[page_num]
201
+ if opencv_result.get('success', False):
202
+ # Draw text blocks in blue
203
+ for block in opencv_result.get('text_blocks', []):
204
+ block_x = block.get('x', 0)
205
+ block_y = block.get('y', 0)
206
+ block_w = block.get('width', 0)
207
+ block_h = block.get('height', 0)
208
+
209
+ cv2.rectangle(overlay, (block_x, block_y),
210
+ (block_x + block_w, block_y + block_h),
211
+ (255, 100, 0), 2) # Blue for text blocks
212
+
213
+ # Draw bold regions in orange
214
+ for bold_region in opencv_result.get('bold_regions', []):
215
+ if bold_region.get('is_likely_header', False):
216
+ bold_x = bold_region.get('x', 0)
217
+ bold_y = bold_region.get('y', 0)
218
+ bold_w = bold_region.get('width', 0)
219
+ bold_h = bold_region.get('height', 0)
220
+
221
+ cv2.rectangle(overlay, (bold_x, bold_y),
222
+ (bold_x + bold_w, bold_y + bold_h),
223
+ (0, 165, 255), 3) # Orange for bold headers
224
+
225
  # Blend overlay with original
226
  result = cv2.addWeighted(img_array, 1-alpha, overlay, alpha, 0)
227
 
228
+ # Add informative text with OpenCV enhancement info
229
  font_scale = max(0.8, self.high_res_scale / 3)
230
  thickness = max(1, int(self.high_res_scale))
231
  text_color = (255, 255, 255)
 
235
  texts = [
236
  f"Page {page_num + 1}",
237
  "RED: Remove areas",
238
+ "GREEN: Content area"
 
239
  ]
240
 
241
+ # Add OpenCV-specific information
242
+ if HAS_OPENCV_SUPPORT and page_num in self.opencv_analysis:
243
+ opencv_result = self.opencv_analysis[page_num]
244
+ if opencv_result.get('success', False):
245
+ texts.extend([
246
+ "BLUE: Text blocks",
247
+ "ORANGE: Bold headers",
248
+ f"Blocks: {opencv_result.get('block_count', 0)}",
249
+ f"Bold detected: {'Yes' if opencv_result.get('bold_text_detected', False) else 'No'}"
250
+ ])
251
+
252
+ texts.append(f"Crop: T{crop_coords.get('top', 0):.1f}% B{crop_coords.get('bottom', 0):.1f}% L{crop_coords.get('left', 0):.1f}% R{crop_coords.get('right', 0):.1f}%")
253
+
254
  y_offset = 30
255
  for i, text in enumerate(texts):
256
  y_pos = y_offset + (i * 30)
 
259
  cv2.rectangle(result, (10, y_pos - text_height - 5), (text_width + 20, y_pos + 5), background_color, -1)
260
  cv2.putText(result, text, (15, y_pos), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, thickness)
261
 
262
+ # Convert back to RGB for display
263
+ result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
264
+
265
+ return result_rgb
266
 
267
  except Exception as e:
268
  logger.error(f"Error updating crop visualization: {e}")
269
+ if page_num in self.page_images and self.page_images[page_num] is not None:
270
+ # Convert BGR to RGB for display
271
+ return cv2.cvtColor(self.page_images[page_num], cv2.COLOR_BGR2RGB)
272
+ return None
273
 
274
  def set_crop_for_page(self, page_num: int, crop_coords: Dict):
275
+ """Set crop coordinates for specific page"""
276
  if page_num in self.crop_settings:
277
  self.crop_settings[page_num].update(crop_coords)
278
  self.crop_settings[page_num]['custom'] = True
279
  logger.info(f"Set crop for page {page_num}: {crop_coords}")
280
 
281
  def set_crop_for_all_pages(self, crop_coords: Dict):
282
+ """Apply same crop settings to all pages"""
283
  for page_num in self.crop_settings:
284
  if not self.crop_settings[page_num].get('custom', False):
285
  self.crop_settings[page_num].update(crop_coords)
286
  logger.info(f"Applied crop to all non-custom pages: {crop_coords}")
287
 
288
  def get_crop_settings_for_processing(self) -> Dict:
289
+ """Get crop settings in format expected by backend"""
290
  return {
291
  'per_page_crops': self.crop_settings,
292
  'has_custom_crops': any(page.get('custom', False) for page in self.crop_settings.values()),
293
  'enhanced_resolution': True,
294
+ 'resolution_scale': self.high_res_scale,
295
+ 'opencv_enhanced': HAS_OPENCV_SUPPORT,
296
+ 'opencv_analysis': self.opencv_analysis
297
  }
298
 
299
+ def get_opencv_analysis(self, page_num: int = None) -> Dict:
300
+ """Get OpenCV analysis for specific page or all pages"""
301
+ if page_num is not None:
302
+ return self.opencv_analysis.get(page_num, {})
303
+ return self.opencv_analysis
304
+
305
  def close(self):
306
  """Clean up resources"""
307
  if self.pdf_doc:
 
309
  self.pdf_doc = None
310
  self.page_images.clear()
311
  self.crop_settings.clear()
312
+ self.opencv_analysis.clear()
313
 
314
  # Global page manager instance
315
  pdf_manager = PDFPageManager()
316
 
317
  def load_pdf_for_preview(pdf_file):
318
+ """Load PDF and return page thumbnails with OpenCV analysis"""
319
  if pdf_file is None:
320
  return None, gr.update(choices=[], value=None), gr.update(visible=False), "No PDF loaded"
321
 
 
326
  # Create page choices for dropdown
327
  page_choices = [f"Page {i+1}" for i in range(result['page_count'])]
328
 
329
+ # Get first page preview with default crop and OpenCV overlay
330
  first_page_preview = pdf_manager.update_crop_visualization(0, {
331
  'top': 0, 'bottom': 0, 'left': 0, 'right': 0
332
  }) if 0 in pdf_manager.page_images else None
333
 
334
+ # Create status message with OpenCV information
335
+ status_parts = [f"PDF loaded successfully: {result['page_count']} pages"]
336
+
337
+ if result.get('opencv_enhanced'):
338
+ status_parts.append("OpenCV text block analysis: Enabled")
339
+ opencv_analysis = pdf_manager.get_opencv_analysis()
340
+ if opencv_analysis:
341
+ total_blocks = sum(analysis.get('block_count', 0) for analysis in opencv_analysis.values())
342
+ bold_pages = sum(1 for analysis in opencv_analysis.values() if analysis.get('bold_text_detected', False))
343
+ status_parts.append(f"Total text blocks detected: {total_blocks}")
344
+ status_parts.append(f"Pages with bold text: {bold_pages}")
345
+ else:
346
+ status_parts.append("OpenCV analysis: Not available")
347
+
348
+ status = " | ".join(status_parts)
349
 
350
  return (first_page_preview,
351
  gr.update(choices=page_choices, value=page_choices[0] if page_choices else None, visible=True),
 
359
  return None, gr.update(choices=[], value=None, visible=False), gr.update(visible=False), f"Error loading PDF: {str(e)}"
360
 
361
  def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_right):
362
+ """Change preview to selected page with OpenCV-enhanced visualization"""
363
  if not page_selection:
364
  return None
365
 
 
374
  'right': crop_right
375
  }
376
 
377
+ # Update visualization with OpenCV enhancement
378
  preview_image = pdf_manager.update_crop_visualization(page_num, crop_coords)
379
  return preview_image
380
 
 
383
  return None
384
 
385
  def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_left, crop_right, apply_to_all):
386
+ """Update crop preview with OpenCV-enhanced interactive feedback"""
387
  if not page_selection or not pdf_manager.pdf_doc:
388
  return None
389
 
 
403
  else:
404
  pdf_manager.set_crop_for_page(page_num, crop_coords)
405
 
406
+ # Return updated preview with OpenCV enhancement
407
  return pdf_manager.update_crop_visualization(page_num, crop_coords)
408
 
409
  except Exception as e:
410
  logger.error(f"Error updating crop preview: {e}")
411
  return None
412
 
413
+ def process_pdf_with_opencv_enhancement(pdf_file, ocr_method, enable_header_footer_removal,
414
+ crop_top, crop_bottom, crop_left, crop_right,
415
+ apply_to_all_pages, current_page_selection,
416
+ progress=gr.Progress()):
417
+ """Process PDF with OpenCV-enhanced text block analysis, bold detection, and comprehensive formatting"""
418
  if pdf_file is None:
419
  return "No file uploaded.", "", "", "Error: No file selected"
420
 
421
  try:
422
+ progress(0.1, desc="Initializing OpenCV-enhanced processing with text block analysis and bold detection...")
423
 
424
+ # Prepare enhanced preprocessing options with OpenCV data
425
  preprocessing_options = {
426
  'enable_header_footer_removal': enable_header_footer_removal,
427
  'enhanced_crop_processing': True,
428
+ 'opencv_enhanced': HAS_OPENCV_SUPPORT,
429
  'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
430
  }
431
 
432
+ progress(0.3, desc="Processing with OpenCV text block analysis, bold detection, and comprehensive indentation...")
433
 
434
+ # Process the PDF with OpenCV-enhanced analysis
435
  result = backend_manager.process_pdf_with_enhanced_resolution(
436
  pdf_file.name, ocr_method, preprocessing_options
437
  )
438
 
439
+ progress(0.9, desc="Finalizing OpenCV-enhanced processing...")
440
  progress(1.0, desc="Complete!")
441
 
442
  if result['success']:
443
+ metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
444
+ status_parts = [f"Success: Processed using {result['method_used']}"]
445
+ status_parts.append("OpenCV text block analysis: Enabled")
446
+ status_parts.append("Bold text detection: Enabled")
447
+ status_parts.append("Comprehensive indentation detection: Enabled")
448
+ status_parts.append("Intelligent text classification: Enabled")
449
+
450
+ status = " | ".join(status_parts)
451
 
452
  # Return text, HTML, metadata, and status
453
  return (result['text'],
 
459
  return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
460
 
461
  except Exception as e:
462
+ logger.error(f"OpenCV-enhanced processing error: {e}")
463
  return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
464
 
465
+ def format_opencv_enhanced_metadata(metadata, method_used):
466
+ """Enhanced metadata formatting with OpenCV text block analysis and bold detection info"""
467
  if not metadata:
468
  return f"Method used: {method_used}"
469
 
 
472
  if 'pages' in metadata:
473
  info_lines.append(f"Pages processed: {metadata['pages']}")
474
 
475
+ if metadata.get('opencv_enhanced', False):
476
+ info_lines.append("OpenCV enhancement: Enabled")
477
+
478
+ if metadata.get('opencv_text_block_analysis', False):
479
+ info_lines.append("OpenCV text block analysis: Enabled")
480
+
481
+ if metadata.get('opencv_bold_detection', False):
482
+ info_lines.append("OpenCV bold text detection: Enabled")
483
+
484
+ if metadata.get('opencv_spacing_analysis', False):
485
+ info_lines.append("OpenCV spacing analysis: Enabled")
486
+
487
+ if metadata.get('header_indentation_suppression', False):
488
+ info_lines.append("Header indentation suppression: Enabled")
489
+
490
  if metadata.get('enhanced_processing', False):
491
  info_lines.append("Enhanced processing: Enabled")
492
 
 
511
  if 'tables' in metadata:
512
  info_lines.append(f"Tables detected: {metadata['tables']}")
513
 
514
+ # OpenCV-specific analysis information
515
+ if 'opencv_global_analysis' in metadata:
516
+ opencv_analysis = metadata['opencv_global_analysis']
517
+ if opencv_analysis.get('success', False):
518
+ info_lines.append(f"OpenCV text blocks detected: {opencv_analysis.get('block_count', 0)}")
519
+ info_lines.append(f"OpenCV paragraphs detected: {opencv_analysis.get('paragraph_count', 0)}")
520
+ info_lines.append(f"OpenCV bold text detected: {'Yes' if opencv_analysis.get('bold_text_detected', False) else 'No'}")
521
+
522
  # Document structure analysis information
523
  if 'document_structure_analysis' in metadata:
524
  analysis = metadata['document_structure_analysis']
 
543
 
544
  return "\n".join(info_lines)
545
 
546
+ def prepare_opencv_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
547
+ crop_top, crop_bottom, crop_left, crop_right,
548
+ apply_to_all_pages, current_page_selection):
549
+ """Prepare OpenCV-enhanced downloads with text block analysis and bold detection"""
550
+ text, html, metadata, status = process_pdf_with_opencv_enhancement(
551
  pdf_file, method, enable_header_footer_removal,
552
  crop_top, crop_bottom, crop_left, crop_right,
553
  apply_to_all_pages, current_page_selection
 
556
  # Prepare downloads if processing was successful
557
  if text and not text.startswith("Error:") and not text.startswith("No file"):
558
  try:
559
+ # Create OpenCV-enhanced download files
560
  download_files = backend_manager.create_enhanced_downloads(text, html, metadata)
561
 
562
  # Prepare gradio updates for download buttons
 
570
  return tuple(updates)
571
 
572
  except Exception as file_error:
573
+ logger.error(f"OpenCV-enhanced file creation error: {file_error}")
574
  return (text, metadata, status,
575
  gr.update(visible=False),
576
  gr.update(visible=False),
 
581
  gr.update(visible=False),
582
  gr.update(visible=False))
583
 
584
+ def get_opencv_enhanced_method_info(method):
585
+ """Get information about selected OCR method with OpenCV enhancements"""
586
+ opencv_status = "with OpenCV Text Block Analysis & Bold Detection" if HAS_OPENCV_SUPPORT else "(OpenCV not available)"
587
+
588
  method_descriptions = {
589
+ "auto": f"**Auto Selection**: Automatically chooses the best available method {opencv_status}, comprehensive indentation detection, intelligent text classification, HTML processing, enhanced pattern recognition for hierarchical numbering (including parenthetical patterns like (1), (๑), (a)), bullets, and multi-language support with header indentation suppression.",
590
+ "azure": f"**Azure Document Intelligence**: Advanced cloud-based OCR {opencv_status}, comprehensive indentation detection, intelligent text classification, HTML generation, layout preservation, smart table detection, bold text recognition, and support for complex document structures including hierarchical numbering and parenthetical patterns with header detection.",
591
+ "tesseract": f"**Tesseract OCR**: Open-source OCR enhanced {opencv_status}, comprehensive indentation detection, intelligent text classification, HTML output, advanced image preprocessing, resolution scaling, bold text detection, and pattern recognition for various numbering styles including parenthetical patterns and bullet points with header analysis.",
592
+ "pymupdf": f"**PyMuPDF**: Fast extraction enhanced {opencv_status}, comprehensive indentation detection, intelligent text classification, HTML processing, improved formatting preservation, bold text recognition, and pattern recognition for maintaining document structure and hierarchy including parenthetical numbering with header detection."
593
  }
594
 
595
  return method_descriptions.get(method, "Select a method to see details.")
596
 
597
+ def check_opencv_enhanced_service_status():
598
+ """Check and display OpenCV-enhanced service status with text block analysis and bold detection capabilities"""
599
  available_methods = backend_manager.get_available_methods()
600
 
601
+ status_lines = ["**Available OCR Methods (Enhanced with OpenCV Text Block Analysis & Bold Detection):**"]
602
+
603
+ opencv_status = " + OpenCV Enhanced" if HAS_OPENCV_SUPPORT else " (OpenCV not available)"
604
 
605
  if "azure" in available_methods:
606
+ status_lines.append(f"✅ Azure Document Intelligence - Ready (HTML + Tables + Comprehensive Indentation + Text Classification{opencv_status})")
607
  else:
608
  status_lines.append("❌ Azure Document Intelligence - Not configured")
609
 
610
  if "tesseract" in available_methods:
611
+ status_lines.append(f"✅ Tesseract OCR - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification{opencv_status})")
612
  else:
613
  status_lines.append("❌ Tesseract OCR - Not available")
614
 
615
  if "pymupdf" in available_methods:
616
+ status_lines.append(f"✅ PyMuPDF - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification{opencv_status})")
617
  else:
618
  status_lines.append("❌ PyMuPDF - Not available")
619
 
620
+ # Add OpenCV features status
621
+ status_lines.append("")
622
+ if HAS_OPENCV_SUPPORT:
623
+ status_lines.append("**OpenCV Text Block Analysis & Bold Detection Features:**")
624
+ status_lines.append("✅ Text Block Detection & Analysis")
625
+ status_lines.append("✅ Bold Text Recognition for Headers")
626
+ status_lines.append("✅ Automatic Spacing & Paragraph Detection")
627
+ status_lines.append("✅ Visual Text Element Analysis")
628
+ status_lines.append("✅ Header Indentation Suppression")
629
+ status_lines.append("✅ Real-time Crop Preview with Text Overlay")
630
+ status_lines.append("✅ Enhanced High-Resolution Processing")
631
+ else:
632
+ status_lines.append("**OpenCV Features:**")
633
+ status_lines.append("❌ OpenCV not available - install opencv-python to enable")
634
+ status_lines.append("❌ Text block analysis disabled")
635
+ status_lines.append("❌ Bold detection disabled")
636
+
637
  # Add enhanced features status
638
  status_lines.append("")
639
  status_lines.append("**Comprehensive Indentation Detection Features:**")
 
663
  status_lines.append("✅ Context-aware Analysis (position, font size)")
664
  status_lines.append("✅ Confidence Scoring")
665
  status_lines.append("✅ Document Structure Analysis")
666
+ if HAS_OPENCV_SUPPORT:
667
+ status_lines.append("✅ OpenCV-Enhanced Bold Text Detection")
668
+ status_lines.append("✅ Header Indentation Suppression")
669
 
670
  status_lines.append("")
671
  status_lines.append("**Enhanced Processing Features:**")
 
675
  status_lines.append("✅ Multi-Page Crop Preview - Available")
676
  status_lines.append("✅ Per-Page Crop Customization - Available")
677
  status_lines.append("✅ Document Structure Analysis - Available")
678
+ if HAS_OPENCV_SUPPORT:
679
+ status_lines.append("✅ OpenCV Text Block Overlay - Available")
680
+ status_lines.append("✅ Bold Text Visualization - Available")
681
 
682
  if HAS_DOCX_SUPPORT:
683
+ status_lines.append("✅ Enhanced DOCX Export - Available (with OpenCV-enhanced indentation formatting)")
684
  else:
685
  status_lines.append("❌ Enhanced DOCX Export - Install python-docx to enable")
686
 
 
693
 
694
  return "\n".join(status_lines)
695
 
696
+ def create_opencv_enhanced_interface():
697
+ """Create OpenCV-enhanced Gradio interface with text block analysis and bold detection"""
698
 
699
  with gr.Blocks(
700
+ title="PDF OCR Service - Enhanced with OpenCV Text Block Analysis & Bold Detection",
701
  theme=gr.themes.Soft(),
702
  css="""
703
  .main-header { text-align: center; margin-bottom: 2rem; }
 
707
  .page-preview { border: 2px solid #17a2b8; padding: 1rem; border-radius: 0.5rem; background-color: #f0f8ff; }
708
  .results-panel { border: 2px solid #6f42c1; padding: 1rem; border-radius: 0.5rem; background-color: #f8f5ff; }
709
  .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
710
+ .opencv-panel { border: 2px solid #e74c3c; padding: 1rem; border-radius: 0.5rem; background-color: #fdf2f2; }
711
  """
712
  ) as interface:
713
 
714
+ gr.HTML(f"""
715
  <div class="main-header">
716
+ <h1>PDF OCR Service - Enhanced with OpenCV Text Block Analysis & Bold Detection</h1>
717
+ <p>Convert PDF documents to text using OpenCV-enhanced OCR with text block detection, bold text recognition, HTML intermediate processing, smart table handling, comprehensive indentation pattern recognition including parenthetical patterns like (1), (๑), (a), and intelligent text classification for headers, paragraphs, and list items</p>
718
+ <p><strong>OpenCV Status:</strong> {'✅ Available - Text block analysis and bold detection enabled' if HAS_OPENCV_SUPPORT else '❌ Not available - Install opencv-python for enhanced features'}</p>
719
  </div>
720
  """)
721
 
722
  # Instructions at the top
723
  with gr.Group(elem_classes=["instructions-panel"]):
724
+ gr.HTML("<h3>Instructions & OpenCV-Enhanced Features</h3>")
725
+ gr.HTML(f"""
726
  <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
727
  <h4>How to Use:</h4>
728
  <ol>
729
  <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
730
  <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
731
+ <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings with OpenCV visualization</li>
732
+ <li><strong>Process:</strong> Click the process button to extract text with OpenCV text block analysis and bold detection</li>
733
+ <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format with preserved formatting and header detection</li>
734
  </ol>
735
 
736
+ <h4>OpenCV Text Block Analysis & Bold Detection Features:</h4>
737
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
738
+ <div>
739
+ <strong>OpenCV Enhancements {'✅' if HAS_OPENCV_SUPPORT else '❌'}:</strong>
740
+ <ul>
741
+ <li>Text Block Detection & Analysis</li>
742
+ <li>Entire Line Bold Text Recognition for Headers</li>
743
+ <li>Automatic Spacing & Paragraph Detection</li>
744
+ <li>Visual Text Element Analysis</li>
745
+ <li>Header Indentation Suppression</li>
746
+ <li>Real-time Text Block Overlay</li>
747
+ <li>4-Space Indentation System</li>
748
+ </ul>
749
+ </div>
750
  <div>
751
  <strong>Hierarchical Numbering:</strong>
752
  <ul>
 
757
  <li>Section: §1.2.3, Article 1.1.1</li>
758
  </ul>
759
  </div>
760
+ </div>
761
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
762
  <div>
763
  <strong>Parenthetical Patterns:</strong>
764
  <ul>
 
769
  <li>Thai Letters: (ก), (ข), (ค)</li>
770
  </ul>
771
  </div>
 
 
772
  <div>
773
  <strong>Multi-Language & Symbols:</strong>
774
  <ul>
 
779
  <li>Checkboxes: [x], [ ], [✓]</li>
780
  </ul>
781
  </div>
782
+ </div>
783
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
784
  <div>
785
  <strong>Intelligent Text Classification:</strong>
786
  <ul>
 
789
  <li>List Item Identification: Patterned content</li>
790
  <li>Context Analysis: Position, font size, formatting</li>
791
  <li>Confidence Scoring: Reliability assessment</li>
792
+ <li>OpenCV Bold Detection: {'Enabled' if HAS_OPENCV_SUPPORT else 'Disabled'}</li>
793
+ </ul>
794
+ </div>
795
+ <div>
796
+ <strong>Technical Enhancements:</strong>
797
+ <ul>
798
+ <li><strong>OpenCV Text Block Analysis:</strong> {'Enabled' if HAS_OPENCV_SUPPORT else 'Disabled'}</li>
799
+ <li><strong>Bold Text Recognition:</strong> {'Enabled' if HAS_OPENCV_SUPPORT else 'Disabled'}</li>
800
+ <li><strong>Header Indentation Suppression:</strong> {'Enabled' if HAS_OPENCV_SUPPORT else 'Disabled'}</li>
801
+ <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
802
+ <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
803
+ <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads with preserved indentation</li>
804
  </ul>
805
  </div>
806
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
807
  </div>
808
  """)
809
 
 
834
  choices=["auto", "azure", "tesseract", "pymupdf"],
835
  value="auto",
836
  label="OCR Method",
837
+ info=f"Choose OCR method (all enhanced with OpenCV {'✅' if HAS_OPENCV_SUPPORT else '❌'} + comprehensive indentation detection and text classification)"
838
  )
839
 
840
  # Method information display
841
  method_info = gr.Markdown(
842
+ value=get_opencv_enhanced_method_info("auto"),
843
  elem_classes=["method-info"]
844
  )
845
 
846
  # Enhanced Header/Footer Removal Section
847
  with gr.Group(elem_classes=["crop-controls"]):
848
+ gr.HTML("<h4>Header/Footer Removal & Crop Settings with OpenCV Enhancement</h4>")
849
 
850
  enable_header_footer_removal = gr.Checkbox(
851
+ label="Enable Enhanced Header/Footer Removal with OpenCV Analysis",
852
  value=False,
853
+ info=f"Remove headers and footers with high-resolution processing {'+ OpenCV text block analysis' if HAS_OPENCV_SUPPORT else ''}"
854
  )
855
 
856
  # Multi-page controls
857
  with gr.Group(visible=False) as crop_controls:
858
+ gr.HTML(f"<h5>Multi-Page Crop Control {'with OpenCV Text Block Overlay' if HAS_OPENCV_SUPPORT else ''}</h5>")
859
 
860
  with gr.Row():
861
  # Page selection
 
863
  label="Select Page for Preview",
864
  choices=[],
865
  value=None,
866
+ info=f"Choose page to preview and customize crop settings {'(with OpenCV overlay)' if HAS_OPENCV_SUPPORT else ''}",
867
  visible=False
868
  )
869
 
 
874
  info="When enabled, changes apply to all pages"
875
  )
876
 
877
+ if HAS_OPENCV_SUPPORT:
878
+ gr.HTML("<h5>OpenCV Visual Indicators</h5>")
879
+ gr.HTML("""
880
+ <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 0.5rem; font-size: 0.9em;">
881
+ <div style="color: #ff0000;"><strong>🔴 RED:</strong> Crop areas</div>
882
+ <div style="color: #00ff00;"><strong>🟢 GREEN:</strong> Content area</div>
883
+ <div style="color: #0066ff;"><strong>🔵 BLUE:</strong> Text blocks</div>
884
+ <div style="color: #ff6600;"><strong>🟠 ORANGE:</strong> Bold headers</div>
885
+ </div>
886
+ """)
887
+
888
  gr.HTML("<h5>Crop Areas (% of page)</h5>")
889
 
890
  with gr.Row():
 
930
 
931
  # Process button
932
  process_btn = gr.Button(
933
+ f"Process PDF with OpenCV {'✅' if HAS_OPENCV_SUPPORT else '❌'} + Comprehensive Indentation Detection & Text Classification",
934
  variant="primary",
935
  size="lg"
936
  )
 
938
  # Results and Preview Section
939
  with gr.Row():
940
  with gr.Column(scale=1):
941
+ # Enhanced crop preview with OpenCV overlay
942
  with gr.Group(visible=False, elem_classes=["page-preview"]) as preview_group:
943
+ gr.HTML(f"<h4>Page Preview with OpenCV-Enhanced Crop Visualization {'& Text Block Overlay' if HAS_OPENCV_SUPPORT else ''}</h4>")
944
  crop_preview = gr.Image(
945
+ label="High-Resolution Page Preview with OpenCV Enhancement",
946
  interactive=False,
947
  height=500,
948
  show_label=False
949
  )
950
 
951
+ if HAS_OPENCV_SUPPORT:
952
+ gr.HTML("""
953
+ <p style="font-size: 0.9em; color: #666; text-align: center;">
954
+ <strong>Red areas:</strong> Will be removed | <strong>Green outline:</strong> Content area |
955
+ <strong>Blue rectangles:</strong> OpenCV text blocks | <strong>Orange rectangles:</strong> Bold headers |
956
+ <strong>Enhanced:</strong> 2x resolution processing with OpenCV analysis
957
+ </p>
958
+ """)
959
+ else:
960
+ gr.HTML("""
961
+ <p style="font-size: 0.9em; color: #666; text-align: center;">
962
+ <strong>Red areas:</strong> Will be removed | <strong>Green outline:</strong> Content area |
963
+ <strong>Enhanced:</strong> 2x resolution processing
964
+ </p>
965
+ """)
966
 
967
  with gr.Column(scale=2):
968
  with gr.Group(elem_classes=["results-panel"]):
 
977
 
978
  # Extracted text output
979
  text_output = gr.Textbox(
980
+ label=f"Extracted Text (OpenCV {'✅' if HAS_OPENCV_SUPPORT else '❌'} Enhanced with Comprehensive Indentation Detection & Text Classification)",
981
+ placeholder=f"Processed text with {'OpenCV text block analysis, bold detection, ' if HAS_OPENCV_SUPPORT else ''}comprehensive indentation detection, intelligent text classification, HTML enhancement, and preserved formatting will appear here...",
982
  lines=20,
983
  max_lines=30,
984
  interactive=False,
 
989
  metadata_output = gr.Textbox(
990
  label="Processing Information & Document Analysis",
991
  interactive=False,
992
+ lines=10
993
  )
994
 
995
  # Enhanced download buttons
 
1000
  variant="secondary"
1001
  )
1002
  download_docx_btn = gr.DownloadButton(
1003
+ f"Download Enhanced DOCX (with OpenCV {'✅' if HAS_OPENCV_SUPPORT else '❌'} + Indentation & Classification)",
1004
  visible=False,
1005
  variant="secondary"
1006
  )
 
1012
 
1013
  # Service Status at the bottom
1014
  with gr.Group(elem_classes=["status-box"]):
1015
+ gr.HTML("<h4>Service Status & OpenCV-Enhanced Capabilities</h4>")
1016
  service_status = gr.Markdown(
1017
+ value=check_opencv_enhanced_service_status()
1018
  )
1019
 
1020
  # Refresh status button
1021
  refresh_btn = gr.Button("Refresh Status", size="sm")
1022
 
1023
+ # Event handlers with OpenCV enhancement
1024
 
1025
  # PDF upload handler
1026
  pdf_input.change(
 
1031
 
1032
  # Method info handler
1033
  method_choice.change(
1034
+ fn=get_opencv_enhanced_method_info,
1035
  inputs=[method_choice],
1036
  outputs=[method_info]
1037
  )
 
1053
  outputs=[crop_preview]
1054
  )
1055
 
1056
+ # Crop parameter handlers - update preview in real-time with OpenCV enhancement
1057
  for crop_input in [crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages]:
1058
  crop_input.change(
1059
  fn=update_crop_preview_interactive,
 
1087
 
1088
  # Status refresh handler
1089
  refresh_btn.click(
1090
+ fn=check_opencv_enhanced_service_status,
1091
  outputs=[service_status]
1092
  )
1093
 
1094
+ # Main processing handler with OpenCV enhancement
1095
  process_btn.click(
1096
+ fn=prepare_opencv_enhanced_downloads,
1097
  inputs=[pdf_input, method_choice, enable_header_footer_removal,
1098
  crop_top, crop_bottom, crop_left, crop_right,
1099
  apply_to_all_pages, page_selector],
 
1103
 
1104
  return interface
1105
 
1106
+ def launch_opencv_enhanced_ui():
1107
+ """Launch the OpenCV-enhanced Gradio interface with text block analysis and bold detection"""
1108
  try:
1109
+ interface = create_opencv_enhanced_interface()
1110
  interface.launch(
1111
  server_name="0.0.0.0",
1112
  server_port=7860,
 
1118
  pdf_manager.close()
1119
 
1120
  if __name__ == "__main__":
1121
+ launch_opencv_enhanced_ui()
backend.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- Backend Management Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
3
- Coordinates between UI and OCR services, handles file management and preprocessing
4
  """
5
  import re
6
  import os
@@ -26,7 +26,7 @@ from dotenv import load_dotenv
26
  load_dotenv()
27
 
28
  from ocr_service import OCRService
29
- from enhanced_indentation import EnhancedIndentationDetector
30
 
31
  # Configure logging
32
  logging.basicConfig(level=logging.INFO)
@@ -34,17 +34,18 @@ logger = logging.getLogger(__name__)
34
 
35
 
36
  class EnhancedDocumentExporter:
37
- """Advanced document export with comprehensive indentation support, parenthetical patterns, and text classification for HTML and DOCX"""
38
 
39
  def __init__(self):
40
  self.indent_detector = EnhancedIndentationDetector()
 
41
 
42
  @staticmethod
43
  def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
44
- """Create enhanced TXT file with improved formatting and indentation preservation"""
45
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
46
  temp_file = tempfile.NamedTemporaryFile(
47
- suffix=f'_extracted_text_{timestamp}.txt',
48
  delete=False,
49
  mode='w',
50
  encoding='utf-8'
@@ -52,8 +53,8 @@ class EnhancedDocumentExporter:
52
 
53
  try:
54
  # Add header
55
- temp_file.write("PDF OCR Extraction Results - Enhanced with Comprehensive Indentation Detection & Text Classification\n")
56
- temp_file.write("=" * 90 + "\n\n")
57
 
58
  # Add metadata
59
  if metadata_info:
@@ -63,22 +64,26 @@ class EnhancedDocumentExporter:
63
 
64
  # Add timestamp
65
  temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
66
- temp_file.write("=" * 90 + "\n\n")
67
 
68
- # Add feature list
69
- temp_file.write("Enhanced Features Applied:\n")
70
- temp_file.write("-" * 25 + "\n")
 
 
 
71
  temp_file.write("• Comprehensive Indentation Detection (20+ patterns)\n")
72
  temp_file.write("• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n")
73
  temp_file.write("• Intelligent Text Classification (headers, paragraphs, lists)\n")
74
  temp_file.write("• Multi-language Support (English, Thai)\n")
75
  temp_file.write("• HTML Intermediate Processing\n")
76
  temp_file.write("• Priority-based Pattern Matching\n")
77
- temp_file.write("• Document Structure Analysis\n\n")
 
78
 
79
  # Add main content
80
- temp_file.write("Extracted Text (Enhanced with Comprehensive Pattern Detection):\n")
81
- temp_file.write("-" * 60 + "\n\n")
82
  temp_file.write(text_content)
83
 
84
  temp_file.close()
@@ -90,9 +95,9 @@ class EnhancedDocumentExporter:
90
  raise
91
 
92
  def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str:
93
- """Create enhanced DOCX file with comprehensive indentation support, parenthetical patterns, and text classification"""
94
  try:
95
- class EnhancedDOCXHTMLParser(HTMLParser):
96
  def __init__(self, doc, processor):
97
  super().__init__()
98
  self.doc = doc
@@ -107,6 +112,7 @@ class EnhancedDocumentExporter:
107
  self.in_section_heading = False
108
  self.in_page_header = False
109
  self.in_content_header = False
 
110
  self.current_classes = []
111
 
112
  def handle_starttag(self, tag, attrs):
@@ -114,7 +120,13 @@ class EnhancedDocumentExporter:
114
  class_attr = attr_dict.get('class', '')
115
  self.current_classes = class_attr.split()
116
 
117
- if 'page' in class_attr and tag == 'div':
 
 
 
 
 
 
118
  if hasattr(self, 'has_content'):
119
  self.doc.add_paragraph()
120
  self.doc.add_paragraph()
@@ -139,7 +151,7 @@ class EnhancedDocumentExporter:
139
 
140
  elif tag == 'div' and 'paragraph' in class_attr:
141
  self.current_paragraph = self.doc.add_paragraph()
142
- self._apply_enhanced_formatting()
143
 
144
  elif tag == 'table':
145
  self.in_table = True
@@ -152,12 +164,23 @@ class EnhancedDocumentExporter:
152
  if self.current_paragraph:
153
  self.current_paragraph.add_run().add_break()
154
 
155
- def _apply_enhanced_formatting(self):
156
- """Apply enhanced formatting based on CSS classes and indentation detection"""
157
  if not self.current_paragraph:
158
  return
159
 
160
- # Extract indent level from classes
 
 
 
 
 
 
 
 
 
 
 
161
  for cls in self.current_classes:
162
  if cls.startswith('indent-level-'):
163
  try:
@@ -184,34 +207,36 @@ class EnhancedDocumentExporter:
184
  else:
185
  self.current_formatting_hint = 'normal_text'
186
 
187
- # Apply indentation
188
  if self.current_indent_level > 0:
189
  indent_inches = self.current_indent_level * 0.5
190
  self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
191
 
192
- # Apply hanging indent for bullets and parenthetical items
193
  if 'bullet' in self.current_formatting_hint or 'parenthetical' in self.current_formatting_hint:
194
- self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
195
 
196
- # Set line spacing and paragraph spacing
197
  self.current_paragraph.paragraph_format.line_spacing = 1.15
198
 
199
- # Apply spacing based on formatting hint
200
  if 'primary' in self.current_formatting_hint:
 
 
 
201
  self.current_paragraph.paragraph_format.space_before = Pt(10)
202
  self.current_paragraph.paragraph_format.space_after = Pt(8)
203
- elif 'secondary' in self.current_formatting_hint:
204
  self.current_paragraph.paragraph_format.space_before = Pt(8)
205
  self.current_paragraph.paragraph_format.space_after = Pt(6)
206
- elif 'tertiary' in self.current_formatting_hint:
207
- self.current_paragraph.paragraph_format.space_before = Pt(6)
208
- self.current_paragraph.paragraph_format.space_after = Pt(4)
209
  else:
210
- self.current_paragraph.paragraph_format.space_after = Pt(3)
211
 
212
  def handle_endtag(self, tag):
213
  if tag == 'div':
214
- if self.in_page_header:
 
 
215
  self.in_page_header = False
216
  elif self.in_content_header:
217
  self.in_content_header = False
@@ -245,8 +270,14 @@ class EnhancedDocumentExporter:
245
 
246
  run = self.current_paragraph.add_run(data.strip())
247
 
248
- # Apply formatting based on pattern, level, and text classification
249
- if self.in_title:
 
 
 
 
 
 
250
  run.bold = True
251
  run.font.size = Pt(16)
252
  run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
@@ -263,11 +294,11 @@ class EnhancedDocumentExporter:
263
  run.font.size = Pt(14)
264
  run.font.color.rgb = RGBColor(44, 62, 80)
265
  else:
266
- # Apply pattern-specific formatting
267
- self._apply_pattern_formatting(run, indent_info, text_classification)
268
 
269
- def _apply_pattern_formatting(self, run, indent_info, text_classification):
270
- """Apply formatting based on detected pattern, classification, and current formatting hint"""
271
  pattern_type = indent_info.get('pattern_type', 'normal')
272
  level = indent_info.get('level', 0)
273
  is_numbered = indent_info.get('is_numbered', False)
@@ -277,7 +308,7 @@ class EnhancedDocumentExporter:
277
  is_thai = indent_info.get('is_thai', False)
278
  is_parenthetical = indent_info.get('is_parenthetical', False)
279
 
280
- # Base font size
281
  run.font.size = Pt(11)
282
 
283
  # Apply formatting based on current formatting hint and detected pattern
@@ -337,7 +368,7 @@ class EnhancedDocumentExporter:
337
  run.font.color.rgb = RGBColor(85, 85, 85) # Dark gray
338
 
339
  else:
340
- # Default text formatting based on classification
341
  if text_classification.get('is_header'):
342
  run.bold = True
343
  run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
@@ -392,7 +423,7 @@ class EnhancedDocumentExporter:
392
  # Create DOCX document
393
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
394
  temp_file = tempfile.NamedTemporaryFile(
395
- suffix=f'_enhanced_document_{timestamp}.docx',
396
  delete=False
397
  )
398
  temp_file.close()
@@ -415,7 +446,7 @@ class EnhancedDocumentExporter:
415
 
416
  # Add subtitle
417
  subtitle_para = doc.add_paragraph()
418
- subtitle_run = subtitle_para.add_run('Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification')
419
  subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
420
  subtitle_run.italic = True
421
  subtitle_run.font.size = Pt(12)
@@ -423,7 +454,7 @@ class EnhancedDocumentExporter:
423
 
424
  # Add feature list
425
  features_para = doc.add_paragraph()
426
- features_run = features_para.add_run('Features: Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification')
427
  features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
428
  features_run.font.size = Pt(9)
429
  features_run.font.color.rgb = RGBColor(149, 165, 166)
@@ -447,39 +478,39 @@ class EnhancedDocumentExporter:
447
  doc.add_heading('Extracted Content', level=1)
448
 
449
  if html_content and '<div' in html_content:
450
- # Parse HTML with enhanced indentation processing and text classification
451
- parser = EnhancedDOCXHTMLParser(doc, self)
452
  parser.feed(html_content)
453
  else:
454
- # Fallback to text processing with enhanced indentation and classification
455
- self._process_text_content_enhanced(doc, text_content)
456
 
457
  # Add enhanced footer
458
  footer_section = doc.sections[0]
459
  footer = footer_section.footer
460
  footer_para = footer.paragraphs[0]
461
- footer_para.text = f"Generated by Enhanced PDF OCR Service with Comprehensive Indentation Detection & Text Classification on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
462
  footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
463
  footer_run = footer_para.runs[0]
464
  footer_run.font.size = Pt(8)
465
  footer_run.font.color.rgb = RGBColor(128, 128, 128)
466
 
467
  doc.save(temp_file.name)
468
- logger.info(f"Enhanced DOCX file with comprehensive indentation support and text classification created: {temp_file.name}")
469
  return temp_file.name
470
 
471
  except ImportError:
472
  raise ImportError("python-docx not installed. Cannot create DOCX files.")
473
  except Exception as e:
474
- logger.error(f"Error creating enhanced DOCX file: {e}")
475
  try:
476
  os.unlink(temp_file.name)
477
  except:
478
  pass
479
  raise
480
 
481
- def _process_text_content_enhanced(self, doc, text_content):
482
- """Process text content with enhanced indentation detection and text classification"""
483
  paragraphs = text_content.split('\n\n')
484
 
485
  for para_text in paragraphs:
@@ -491,43 +522,63 @@ class EnhancedDocumentExporter:
491
  if not line.strip():
492
  continue
493
 
494
- # Detect indentation and classify text
495
  indent_info = self.indent_detector.detect_indentation(line)
496
  text_classification = self.indent_detector.classify_text_type(line)
497
 
 
 
 
 
 
 
 
 
498
  if line.strip().startswith('==='):
499
  # Page headers
500
  page_header = doc.add_heading(line.strip(), level=1)
501
  page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
502
  header_run = page_header.runs[0]
503
  header_run.font.color.rgb = RGBColor(44, 62, 80)
 
 
 
 
 
 
 
 
 
504
  elif line.strip().startswith('##'):
505
  # Section headings
506
  heading_text = line.strip().lstrip('#').strip()
507
  heading = doc.add_heading(heading_text, level=2)
508
  heading_run = heading.runs[0]
509
  heading_run.font.color.rgb = RGBColor(52, 73, 94)
 
510
  elif text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.7:
511
- # Detected headers
512
  heading = doc.add_heading(indent_info.get('content', line.strip()), level=2)
513
  heading_run = heading.runs[0]
514
  heading_run.font.color.rgb = RGBColor(52, 73, 94)
 
515
  else:
516
- # Regular content with enhanced indentation and classification
517
  para = doc.add_paragraph()
518
 
519
- # Apply indentation based on detected level
520
  level = indent_info.get('level', 0)
521
- if level > 0:
522
- para.paragraph_format.left_indent = Inches(level * 0.5)
 
523
 
524
- # Apply pattern-specific formatting
525
  if indent_info.get('is_bullet', False) or indent_info.get('is_parenthetical', False):
526
- para.paragraph_format.first_line_indent = Inches(-0.25)
527
 
528
- # Set proper spacing
529
  para.paragraph_format.line_spacing = 1.15
530
- para.paragraph_format.space_after = Pt(3)
531
 
532
  # Add content with enhanced formatting
533
  content = indent_info.get('content', line.strip())
@@ -570,17 +621,17 @@ class EnhancedDocumentExporter:
570
 
571
  @staticmethod
572
  def create_html_file(html_content: str, metadata_info: str = "") -> str:
573
- """Create standalone HTML file with enhanced styling for comprehensive indentation and text classification"""
574
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
575
  temp_file = tempfile.NamedTemporaryFile(
576
- suffix=f'_extracted_document_{timestamp}.html',
577
  delete=False,
578
  mode='w',
579
  encoding='utf-8'
580
  )
581
 
582
  try:
583
- # Enhance HTML with better styling
584
  enhanced_html = html_content
585
 
586
  # Add comprehensive styling if not already present
@@ -616,7 +667,7 @@ class EnhancedDocumentExporter:
616
  margin-bottom: 25px;
617
  border-left: 4px solid #3498db;
618
  }
619
- .enhanced-features {
620
  background-color: #e8f5e8;
621
  padding: 10px;
622
  border-radius: 5px;
@@ -624,8 +675,17 @@ class EnhancedDocumentExporter:
624
  border-left: 4px solid #27ae60;
625
  font-size: 0.9em;
626
  }
627
- .classification-features {
628
- background-color: #fef9e7;
 
 
 
 
 
 
 
 
 
629
  padding: 10px;
630
  border-radius: 5px;
631
  margin-bottom: 20px;
@@ -643,17 +703,18 @@ class EnhancedDocumentExporter:
643
  <div class="container">
644
  <div class="header">
645
  <h1>PDF OCR Extraction Results</h1>
646
- <p>Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</p>
647
  </div>
648
- <div class="enhanced-features">
649
- <strong>Indentation Features:</strong> Hierarchical NumberingParenthetical Patterns ((1), (๑), (a), (i), (ก))
650
- Multi-level Bullets • Letter & Roman NumeralsThai Script Support
651
- Space-based Indentation Pattern Priority Detection
652
  </div>
653
- <div class="classification-features">
654
- <strong>Text Classification:</strong> Header Detection • Paragraph Recognition •
655
- List Item Identification Context AnalysisConfidence Scoring
656
- Document Structure Analysis
 
657
  </div>''' +
658
  (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
659
  )
@@ -670,24 +731,25 @@ class EnhancedDocumentExporter:
670
 
671
 
672
  class BackendManager:
673
- """Enhanced backend manager with comprehensive indentation detection, parenthetical patterns, text classification, and advanced export capabilities"""
674
 
675
  def __init__(self):
676
  self.ocr_service = OCRService()
677
  self.document_exporter = EnhancedDocumentExporter()
 
678
  self.processing_history = []
679
  self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))
680
 
681
  # Create directories for temporary files and logs
682
- self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service_enhanced_v2'
683
  self.temp_dir.mkdir(exist_ok=True)
684
 
685
- logger.info("Enhanced backend manager with comprehensive indentation detection and text classification initialized successfully")
686
 
687
  def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
688
  preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
689
  """
690
- Process PDF with enhanced resolution, comprehensive indentation detection, and intelligent text classification
691
 
692
  Args:
693
  pdf_path: Path to the PDF file
@@ -695,7 +757,7 @@ class BackendManager:
695
  preprocessing_options: Dictionary containing preprocessing settings
696
 
697
  Returns:
698
- Dict containing processing results with enhanced HTML content, indentation, and text classification
699
  """
700
  start_time = datetime.now()
701
 
@@ -727,7 +789,7 @@ class BackendManager:
727
  # Generate file hash for tracking
728
  file_hash = self._calculate_file_hash(pdf_path)
729
 
730
- logger.info(f"Processing PDF with enhanced indentation detection and text classification: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
731
  logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
732
 
733
  # Handle preprocessing if enabled
@@ -735,29 +797,40 @@ class BackendManager:
735
  preprocessing_applied = False
736
 
737
  if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
738
- logger.info("Applying enhanced preprocessing...")
739
  try:
740
  processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options)
741
  preprocessing_applied = True
742
- logger.info("Enhanced preprocessing completed successfully")
743
  except Exception as e:
744
  logger.error(f"Preprocessing failed: {e}")
745
  processed_pdf_path = pdf_path
746
 
747
  try:
748
- # Process with enhanced OCR, indentation detection, and text classification
749
  result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
750
 
751
  # Add processing metadata
752
  processing_time = (datetime.now() - start_time).total_seconds()
753
 
754
- # Analyze document structure with text classification if successful
755
  document_analysis = {}
 
 
756
  if result['success'] and result['text']:
757
  try:
758
  text_lines = result['text'].split('\n')
759
  detector = EnhancedIndentationDetector()
760
- document_analysis = detector.analyze_document_structure(text_lines)
 
 
 
 
 
 
 
 
 
761
  except Exception as analysis_error:
762
  logger.warning(f"Document structure analysis failed: {analysis_error}")
763
  document_analysis = {'analysis_failed': True}
@@ -767,14 +840,20 @@ class BackendManager:
767
  'file_size_mb': round(file_size / (1024*1024), 2),
768
  'processing_time_seconds': round(processing_time, 2),
769
  'timestamp': start_time.isoformat(),
 
 
 
 
770
  'enhanced_processing': True,
771
  'html_processing': True,
772
  'comprehensive_indentation': True,
773
  'parenthetical_patterns_supported': True,
774
  'intelligent_text_classification': True,
 
775
  'header_footer_removed': preprocessing_applied,
776
  'preprocessing_options': preprocessing_options if preprocessing_applied else None,
777
- 'document_structure_analysis': document_analysis
 
778
  })
779
 
780
  # Cleanup temporary preprocessed file
@@ -784,19 +863,20 @@ class BackendManager:
784
  except:
785
  pass
786
 
787
- # Log results with enhanced information
788
  if result['success']:
789
  text_length = len(result['text'])
790
  has_html = bool(result.get('html'))
791
  table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0
792
 
793
- logger.info(f"Enhanced processing completed successfully in {processing_time:.2f}s")
794
  logger.info(f"Method used: {result['method_used']}")
795
  logger.info(f"Text extracted: {text_length} characters")
796
  logger.info(f"HTML generated: {has_html}")
797
- logger.info(f"Comprehensive indentation detection: Enabled")
798
- logger.info(f"Parenthetical patterns supported: Enabled")
799
- logger.info(f"Intelligent text classification: Enabled")
 
800
 
801
  if table_count > 0:
802
  logger.info(f"Tables detected: {table_count}")
@@ -805,6 +885,9 @@ class BackendManager:
805
  if document_analysis and not document_analysis.get('analysis_failed'):
806
  logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}")
807
  logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items")
 
 
 
808
 
809
  # Add to processing history
810
  self._add_to_history({
@@ -817,14 +900,20 @@ class BackendManager:
817
  'processing_time': processing_time,
818
  'preprocessing_applied': preprocessing_applied,
819
  'html_generated': has_html,
 
 
 
 
820
  'enhanced_processing': True,
821
  'comprehensive_indentation': True,
822
  'parenthetical_patterns_supported': True,
823
  'intelligent_text_classification': True,
824
- 'document_analysis': document_analysis
 
 
825
  })
826
  else:
827
- logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
828
 
829
  # Add to processing history
830
  self._add_to_history({
@@ -835,16 +924,21 @@ class BackendManager:
835
  'error': result.get('error', 'Unknown error'),
836
  'processing_time': processing_time,
837
  'preprocessing_applied': preprocessing_applied,
 
 
 
 
838
  'enhanced_processing': True,
839
  'comprehensive_indentation': True,
840
  'parenthetical_patterns_supported': True,
841
- 'intelligent_text_classification': True
 
842
  })
843
 
844
  return result
845
 
846
  except Exception as e:
847
- logger.error(f"Unexpected error during enhanced processing: {e}")
848
 
849
  # Cleanup
850
  if preprocessing_applied and processed_pdf_path != pdf_path:
@@ -862,15 +956,20 @@ class BackendManager:
862
  'success': False,
863
  'error': str(e),
864
  'processing_time': processing_time,
 
 
 
 
865
  'enhanced_processing': True,
866
  'comprehensive_indentation': True,
867
  'parenthetical_patterns_supported': True,
868
- 'intelligent_text_classification': True
 
869
  })
870
 
871
  return {
872
  'success': False,
873
- 'error': f"Enhanced processing error: {str(e)}",
874
  'text': '',
875
  'html': '',
876
  'method_used': '',
@@ -878,15 +977,50 @@ class BackendManager:
878
  'file_hash': file_hash,
879
  'processing_time_seconds': round(processing_time, 2),
880
  'timestamp': start_time.isoformat(),
 
 
 
 
881
  'enhanced_processing': True,
882
  'comprehensive_indentation': True,
883
  'parenthetical_patterns_supported': True,
884
- 'intelligent_text_classification': True
 
885
  }
886
  }
887
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888
  def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
889
- """Apply enhanced preprocessing with high-resolution crop handling"""
890
  crop_settings = options.get('crop_settings', {})
891
  per_page_crops = crop_settings.get('per_page_crops', {})
892
  enhanced_resolution = crop_settings.get('enhanced_resolution', True)
@@ -894,7 +1028,7 @@ class BackendManager:
894
 
895
  # Create temporary file for processed PDF
896
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
897
- temp_pdf_path = self.temp_dir / f"enhanced_preprocessed_{timestamp}.pdf"
898
 
899
  doc = fitz.open(pdf_path)
900
  new_doc = fitz.open()
@@ -961,13 +1095,13 @@ class BackendManager:
961
  clip=new_rect
962
  )
963
 
964
- logger.debug(f"Page {page_num}: Applied crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%")
965
 
966
  new_doc.save(str(temp_pdf_path))
967
- logger.info(f"Enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages")
968
 
969
  except Exception as e:
970
- logger.error(f"Error in enhanced preprocessing: {e}")
971
  raise
972
  finally:
973
  doc.close()
@@ -977,41 +1111,41 @@ class BackendManager:
977
 
978
  def create_enhanced_downloads(self, text_content: str, html_content: str,
979
  metadata_info: str = "") -> Dict[str, str]:
980
- """Create enhanced download files with comprehensive indentation support, parenthetical patterns, and text classification"""
981
  download_files = {}
982
 
983
  try:
984
- # Create enhanced TXT file
985
  txt_path = EnhancedDocumentExporter.create_enhanced_txt_file(
986
  text_content, html_content, metadata_info
987
  )
988
  download_files['txt'] = txt_path
989
- logger.info(f"Enhanced TXT file created: {txt_path}")
990
 
991
- # Create enhanced DOCX file with comprehensive indentation support and text classification
992
  try:
993
  docx_path = self.document_exporter.create_enhanced_docx_file(
994
  text_content, html_content, metadata_info
995
  )
996
  download_files['docx'] = docx_path
997
- logger.info(f"Enhanced DOCX file with comprehensive indentation and text classification created: {docx_path}")
998
  except ImportError:
999
  logger.warning("python-docx not available. DOCX creation skipped.")
1000
  except Exception as e:
1001
- logger.error(f"Enhanced DOCX creation failed: {e}")
1002
 
1003
- # Create standalone HTML file
1004
  try:
1005
  html_path = EnhancedDocumentExporter.create_html_file(
1006
  html_content, metadata_info
1007
  )
1008
  download_files['html'] = html_path
1009
- logger.info(f"Enhanced HTML file created: {html_path}")
1010
  except Exception as e:
1011
  logger.error(f"HTML file creation failed: {e}")
1012
 
1013
  except Exception as e:
1014
- logger.error(f"Error creating enhanced downloads: {e}")
1015
  raise
1016
 
1017
  return download_files
@@ -1019,11 +1153,11 @@ class BackendManager:
1019
  def get_available_methods(self) -> List[str]:
1020
  """Get list of available OCR methods"""
1021
  methods = self.ocr_service.get_available_methods()
1022
- logger.info(f"Available enhanced OCR methods: {methods}")
1023
  return methods
1024
 
1025
  def get_service_status(self) -> Dict[str, Any]:
1026
- """Get comprehensive service status with enhanced features"""
1027
  available_methods = self.get_available_methods()
1028
 
1029
  # Check DOCX support
@@ -1033,6 +1167,13 @@ class BackendManager:
1033
  except ImportError:
1034
  docx_available = False
1035
 
 
 
 
 
 
 
 
1036
  status = {
1037
  'service_healthy': True,
1038
  'available_methods': available_methods,
@@ -1043,11 +1184,16 @@ class BackendManager:
1043
  'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
1044
  'temp_dir': str(self.temp_dir),
1045
  'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
 
 
 
 
1046
  'enhanced_processing': True,
1047
  'html_processing': True,
1048
  'comprehensive_indentation': True,
1049
  'parenthetical_patterns_supported': True,
1050
  'intelligent_text_classification': True,
 
1051
  'pattern_detection_count': len(EnhancedIndentationDetector().patterns),
1052
  'docx_export_available': docx_available,
1053
  'enhanced_crop_processing': True,
@@ -1104,7 +1250,7 @@ class BackendManager:
1104
  logger.error(f"Error during cleanup: {e}")
1105
 
1106
  def get_enhanced_statistics(self) -> Dict[str, Any]:
1107
- """Get enhanced processing statistics with indentation analysis and text classification"""
1108
  if not self.processing_history:
1109
  return {
1110
  'total_processed': 0,
@@ -1115,10 +1261,15 @@ class BackendManager:
1115
  'total_tables_processed': 0,
1116
  'preprocessing_usage': 0,
1117
  'html_generation_rate': 0,
 
 
 
 
1118
  'enhanced_processing_usage': 0,
1119
  'comprehensive_indentation_usage': 0,
1120
  'parenthetical_patterns_usage': 0,
1121
  'text_classification_usage': 0,
 
1122
  'document_analysis_success_rate': 0
1123
  }
1124
 
@@ -1138,10 +1289,15 @@ class BackendManager:
1138
 
1139
  preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
1140
  html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
 
 
 
 
1141
  enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
1142
  comprehensive_indentation = sum(1 for h in self.processing_history if h.get('comprehensive_indentation', False))
1143
  parenthetical_patterns = sum(1 for h in self.processing_history if h.get('parenthetical_patterns_supported', False))
1144
  text_classification = sum(1 for h in self.processing_history if h.get('intelligent_text_classification', False))
 
1145
 
1146
  # Document analysis statistics
1147
  doc_analysis_success = sum(1 for h in self.processing_history
@@ -1149,10 +1305,15 @@ class BackendManager:
1149
  doc_analysis_rate = (doc_analysis_success / total_processed) * 100 if total_processed > 0 else 0
1150
 
1151
  html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
 
 
 
 
1152
  enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
1153
  comprehensive_indentation_rate = (comprehensive_indentation / total_processed) * 100 if total_processed > 0 else 0
1154
  parenthetical_patterns_rate = (parenthetical_patterns / total_processed) * 100 if total_processed > 0 else 0
1155
  text_classification_rate = (text_classification / total_processed) * 100 if total_processed > 0 else 0
 
1156
 
1157
  return {
1158
  'total_processed': total_processed,
@@ -1165,6 +1326,14 @@ class BackendManager:
1165
  'failed_processes': total_processed - len(successful),
1166
  'preprocessing_usage': preprocessing_usage,
1167
  'html_generation_rate': round(html_generation_rate, 2),
 
 
 
 
 
 
 
 
1168
  'enhanced_processing_usage': enhanced_processing,
1169
  'enhanced_processing_rate': round(enhanced_processing_rate, 2),
1170
  'comprehensive_indentation_usage': comprehensive_indentation,
@@ -1173,6 +1342,8 @@ class BackendManager:
1173
  'parenthetical_patterns_rate': round(parenthetical_patterns_rate, 2),
1174
  'text_classification_usage': text_classification,
1175
  'text_classification_rate': round(text_classification_rate, 2),
 
 
1176
  'document_analysis_success_rate': round(doc_analysis_rate, 2)
1177
  }
1178
 
@@ -1181,7 +1352,7 @@ class BackendManager:
1181
  _backend_manager = None
1182
 
1183
  def get_backend_manager() -> BackendManager:
1184
- """Get global enhanced backend manager instance"""
1185
  global _backend_manager
1186
  if _backend_manager is None:
1187
  _backend_manager = BackendManager()
@@ -1189,18 +1360,36 @@ def get_backend_manager() -> BackendManager:
1189
 
1190
 
1191
  if __name__ == "__main__":
1192
- # Test the enhanced backend manager
1193
  manager = BackendManager()
1194
 
1195
- print("Enhanced Backend Manager with Comprehensive Indentation Detection & Text Classification Test")
1196
- print("=" * 100)
1197
  print(f"Available methods: {manager.get_available_methods()}")
1198
  print(f"Service status: {manager.get_service_status()}")
1199
  print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
1200
 
1201
- # Test indentation detector with parenthetical patterns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1202
  detector = EnhancedIndentationDetector()
1203
  test_cases = [
 
1204
  "1.2.3. Hierarchical item",
1205
  "(1) Parenthetical Arabic",
1206
  "(๑) Parenthetical Thai numeral",
@@ -1209,12 +1398,13 @@ if __name__ == "__main__":
1209
  "(ก) Parenthetical Thai letter"
1210
  ]
1211
 
1212
- print(f"\nIndentation Detection Test with Parenthetical Patterns:")
1213
  print("-" * 60)
1214
  for test_text in test_cases:
1215
  result = detector.detect_indentation(test_text)
1216
  classification = detector.classify_text_type(test_text)
1217
  print(f"Text: {test_text}")
1218
  print(f" Pattern: {result['pattern_type']}, Level: {result['level']}")
 
1219
  print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
1220
  print()
 
1
  """
2
+ Backend Management Module - ENHANCED VERSION with OpenCV Text Block Analysis and Bold Detection
3
+ Coordinates between UI and OCR services, handles file management and preprocessing with OpenCV integration
4
  """
5
  import re
6
  import os
 
26
  load_dotenv()
27
 
28
  from ocr_service import OCRService
29
+ from enhanced_indentation import EnhancedIndentationDetector, OpenCVTextAnalyzer
30
 
31
  # Configure logging
32
  logging.basicConfig(level=logging.INFO)
 
34
 
35
 
36
  class EnhancedDocumentExporter:
37
+ """Advanced document export with OpenCV-enhanced text analysis, bold detection, and comprehensive formatting"""
38
 
39
  def __init__(self):
40
  self.indent_detector = EnhancedIndentationDetector()
41
+ self.opencv_analyzer = OpenCVTextAnalyzer()
42
 
43
  @staticmethod
44
  def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
45
+ """Create enhanced TXT file with OpenCV-improved formatting and spacing analysis"""
46
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
47
  temp_file = tempfile.NamedTemporaryFile(
48
+ suffix=f'_extracted_text_opencv_{timestamp}.txt',
49
  delete=False,
50
  mode='w',
51
  encoding='utf-8'
 
53
 
54
  try:
55
  # Add header
56
+ temp_file.write("PDF OCR Extraction Results - Enhanced with OpenCV Text Block Analysis & Bold Detection\n")
57
+ temp_file.write("=" * 100 + "\n\n")
58
 
59
  # Add metadata
60
  if metadata_info:
 
64
 
65
  # Add timestamp
66
  temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
67
+ temp_file.write("=" * 100 + "\n\n")
68
 
69
+ # Add enhanced feature list
70
+ temp_file.write("OpenCV-Enhanced Features Applied:\n")
71
+ temp_file.write("-" * 35 + "\n")
72
+ temp_file.write("• OpenCV Text Block Detection & Analysis\n")
73
+ temp_file.write("• Bold Text Recognition for Headers\n")
74
+ temp_file.write("• Automatic Spacing & Paragraph Detection\n")
75
  temp_file.write("• Comprehensive Indentation Detection (20+ patterns)\n")
76
  temp_file.write("• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n")
77
  temp_file.write("• Intelligent Text Classification (headers, paragraphs, lists)\n")
78
  temp_file.write("• Multi-language Support (English, Thai)\n")
79
  temp_file.write("• HTML Intermediate Processing\n")
80
  temp_file.write("• Priority-based Pattern Matching\n")
81
+ temp_file.write("• Document Structure Analysis\n")
82
+ temp_file.write("• Header Indentation Suppression\n\n")
83
 
84
  # Add main content
85
+ temp_file.write("Extracted Text (OpenCV-Enhanced with Text Block Analysis):\n")
86
+ temp_file.write("-" * 70 + "\n\n")
87
  temp_file.write(text_content)
88
 
89
  temp_file.close()
 
95
  raise
96
 
97
  def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str:
98
+ """Create enhanced DOCX file with OpenCV-enhanced formatting, bold detection, and spacing analysis"""
99
  try:
100
+ class OpenCVEnhancedDOCXHTMLParser(HTMLParser):
101
  def __init__(self, doc, processor):
102
  super().__init__()
103
  self.doc = doc
 
112
  self.in_section_heading = False
113
  self.in_page_header = False
114
  self.in_content_header = False
115
+ self.in_opencv_bold_header = False
116
  self.current_classes = []
117
 
118
  def handle_starttag(self, tag, attrs):
 
120
  class_attr = attr_dict.get('class', '')
121
  self.current_classes = class_attr.split()
122
 
123
+ if 'opencv-bold-header' in class_attr:
124
+ # OpenCV detected bold header - special styling, no indentation
125
+ self.current_paragraph = self.doc.add_heading(level=1)
126
+ self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
127
+ self.in_opencv_bold_header = True
128
+
129
+ elif 'page' in class_attr and tag == 'div':
130
  if hasattr(self, 'has_content'):
131
  self.doc.add_paragraph()
132
  self.doc.add_paragraph()
 
151
 
152
  elif tag == 'div' and 'paragraph' in class_attr:
153
  self.current_paragraph = self.doc.add_paragraph()
154
+ self._apply_opencv_enhanced_formatting()
155
 
156
  elif tag == 'table':
157
  self.in_table = True
 
164
  if self.current_paragraph:
165
  self.current_paragraph.add_run().add_break()
166
 
167
+ def _apply_opencv_enhanced_formatting(self):
168
+ """Apply OpenCV-enhanced formatting with bold detection and spacing analysis"""
169
  if not self.current_paragraph:
170
  return
171
 
172
+ # Check if this is an OpenCV-detected bold header
173
+ is_opencv_bold_header = 'opencv-bold-header' in self.current_classes
174
+
175
+ if is_opencv_bold_header:
176
+ # Bold headers get no indentation and special formatting
177
+ self.current_indent_level = 0
178
+ self.current_paragraph.paragraph_format.left_indent = Inches(0)
179
+ self.current_paragraph.paragraph_format.space_before = Pt(15)
180
+ self.current_paragraph.paragraph_format.space_after = Pt(12)
181
+ return
182
+
183
+ # Extract indent level from classes (only for non-bold headers)
184
  for cls in self.current_classes:
185
  if cls.startswith('indent-level-'):
186
  try:
 
207
  else:
208
  self.current_formatting_hint = 'normal_text'
209
 
210
+ # Apply indentation (only for non-bold headers)
211
  if self.current_indent_level > 0:
212
  indent_inches = self.current_indent_level * 0.5
213
  self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
214
 
215
+ # Apply hanging indent for bullets and parenthetical items (4 spaces equivalent)
216
  if 'bullet' in self.current_formatting_hint or 'parenthetical' in self.current_formatting_hint:
217
+ self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.125) # Reduced for 4-space system
218
 
219
+ # Set line spacing and paragraph spacing with OpenCV-enhanced spacing
220
  self.current_paragraph.paragraph_format.line_spacing = 1.15
221
 
222
+ # Apply spacing based on formatting hint and OpenCV analysis
223
  if 'primary' in self.current_formatting_hint:
224
+ self.current_paragraph.paragraph_format.space_before = Pt(12)
225
+ self.current_paragraph.paragraph_format.space_after = Pt(10)
226
+ elif 'secondary' in self.current_formatting_hint:
227
  self.current_paragraph.paragraph_format.space_before = Pt(10)
228
  self.current_paragraph.paragraph_format.space_after = Pt(8)
229
+ elif 'tertiary' in self.current_formatting_hint:
230
  self.current_paragraph.paragraph_format.space_before = Pt(8)
231
  self.current_paragraph.paragraph_format.space_after = Pt(6)
 
 
 
232
  else:
233
+ self.current_paragraph.paragraph_format.space_after = Pt(4)
234
 
235
  def handle_endtag(self, tag):
236
  if tag == 'div':
237
+ if self.in_opencv_bold_header:
238
+ self.in_opencv_bold_header = False
239
+ elif self.in_page_header:
240
  self.in_page_header = False
241
  elif self.in_content_header:
242
  self.in_content_header = False
 
270
 
271
  run = self.current_paragraph.add_run(data.strip())
272
 
273
+ # Apply formatting based on context and OpenCV detection
274
+ if self.in_opencv_bold_header:
275
+ # Special formatting for OpenCV-detected bold headers
276
+ run.bold = True
277
+ run.font.size = Pt(16)
278
+ run.font.color.rgb = RGBColor(231, 76, 60) # Red color for emphasis
279
+ self.current_paragraph.paragraph_format.left_indent = Inches(0) # Force no indent
280
+ elif self.in_title:
281
  run.bold = True
282
  run.font.size = Pt(16)
283
  run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
 
294
  run.font.size = Pt(14)
295
  run.font.color.rgb = RGBColor(44, 62, 80)
296
  else:
297
+ # Apply pattern-specific formatting with OpenCV enhancement
298
+ self._apply_opencv_pattern_formatting(run, indent_info, text_classification)
299
 
300
+ def _apply_opencv_pattern_formatting(self, run, indent_info, text_classification):
301
+ """Apply formatting based on detected pattern, classification, and OpenCV analysis"""
302
  pattern_type = indent_info.get('pattern_type', 'normal')
303
  level = indent_info.get('level', 0)
304
  is_numbered = indent_info.get('is_numbered', False)
 
308
  is_thai = indent_info.get('is_thai', False)
309
  is_parenthetical = indent_info.get('is_parenthetical', False)
310
 
311
+ # Base font size with OpenCV-enhanced scaling
312
  run.font.size = Pt(11)
313
 
314
  # Apply formatting based on current formatting hint and detected pattern
 
368
  run.font.color.rgb = RGBColor(85, 85, 85) # Dark gray
369
 
370
  else:
371
+ # Default text formatting based on classification and OpenCV
372
  if text_classification.get('is_header'):
373
  run.bold = True
374
  run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
 
423
  # Create DOCX document
424
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
425
  temp_file = tempfile.NamedTemporaryFile(
426
+ suffix=f'_opencv_enhanced_document_{timestamp}.docx',
427
  delete=False
428
  )
429
  temp_file.close()
 
446
 
447
  # Add subtitle
448
  subtitle_para = doc.add_paragraph()
449
+ subtitle_run = subtitle_para.add_run('Enhanced with OpenCV Text Block Analysis & Bold Detection')
450
  subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
451
  subtitle_run.italic = True
452
  subtitle_run.font.size = Pt(12)
 
454
 
455
  # Add feature list
456
  features_para = doc.add_paragraph()
457
+ features_run = features_para.add_run('Features: OpenCV Text Block Detection • Bold Text Recognition • Spacing Analysis • Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification • Header Indentation Suppression')
458
  features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
459
  features_run.font.size = Pt(9)
460
  features_run.font.color.rgb = RGBColor(149, 165, 166)
 
478
  doc.add_heading('Extracted Content', level=1)
479
 
480
  if html_content and '<div' in html_content:
481
+ # Parse HTML with OpenCV-enhanced processing
482
+ parser = OpenCVEnhancedDOCXHTMLParser(doc, self)
483
  parser.feed(html_content)
484
  else:
485
+ # Fallback to text processing with OpenCV enhancement
486
+ self._process_text_content_opencv_enhanced(doc, text_content)
487
 
488
  # Add enhanced footer
489
  footer_section = doc.sections[0]
490
  footer = footer_section.footer
491
  footer_para = footer.paragraphs[0]
492
+ footer_para.text = f"Generated by OpenCV-Enhanced PDF OCR Service with Text Block Analysis & Bold Detection on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
493
  footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
494
  footer_run = footer_para.runs[0]
495
  footer_run.font.size = Pt(8)
496
  footer_run.font.color.rgb = RGBColor(128, 128, 128)
497
 
498
  doc.save(temp_file.name)
499
+ logger.info(f"OpenCV-enhanced DOCX file with text block analysis and bold detection created: {temp_file.name}")
500
  return temp_file.name
501
 
502
  except ImportError:
503
  raise ImportError("python-docx not installed. Cannot create DOCX files.")
504
  except Exception as e:
505
+ logger.error(f"Error creating OpenCV-enhanced DOCX file: {e}")
506
  try:
507
  os.unlink(temp_file.name)
508
  except:
509
  pass
510
  raise
511
 
512
+ def _process_text_content_opencv_enhanced(self, doc, text_content):
513
+ """Process text content with OpenCV-enhanced analysis, bold detection, and spacing"""
514
  paragraphs = text_content.split('\n\n')
515
 
516
  for para_text in paragraphs:
 
522
  if not line.strip():
523
  continue
524
 
525
+ # Detect indentation and classify text with OpenCV enhancement
526
  indent_info = self.indent_detector.detect_indentation(line)
527
  text_classification = self.indent_detector.classify_text_type(line)
528
 
529
+ # Check for OpenCV-style bold headers (simulated analysis)
530
+ is_opencv_bold_header = (
531
+ text_classification.get('is_header') and
532
+ text_classification.get('confidence', 0) > 0.8 and
533
+ len(line.strip()) < 80 and
534
+ line.strip().isupper() # Simple heuristic for bold headers
535
+ )
536
+
537
  if line.strip().startswith('==='):
538
  # Page headers
539
  page_header = doc.add_heading(line.strip(), level=1)
540
  page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
541
  header_run = page_header.runs[0]
542
  header_run.font.color.rgb = RGBColor(44, 62, 80)
543
+
544
+ elif is_opencv_bold_header:
545
+ # OpenCV-detected bold headers - no indentation
546
+ heading = doc.add_heading(line.strip(), level=1)
547
+ heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
548
+ heading_run = heading.runs[0]
549
+ heading_run.font.color.rgb = RGBColor(231, 76, 60) # Red for emphasis
550
+ heading_run.font.size = Pt(16)
551
+
552
  elif line.strip().startswith('##'):
553
  # Section headings
554
  heading_text = line.strip().lstrip('#').strip()
555
  heading = doc.add_heading(heading_text, level=2)
556
  heading_run = heading.runs[0]
557
  heading_run.font.color.rgb = RGBColor(52, 73, 94)
558
+
559
  elif text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.7:
560
+ # Regular detected headers
561
  heading = doc.add_heading(indent_info.get('content', line.strip()), level=2)
562
  heading_run = heading.runs[0]
563
  heading_run.font.color.rgb = RGBColor(52, 73, 94)
564
+
565
  else:
566
+ # Regular content with OpenCV-enhanced formatting
567
  para = doc.add_paragraph()
568
 
569
+ # Apply indentation based on detected level using 4 spaces per level (but not for bold headers)
570
  level = indent_info.get('level', 0)
571
+ if level > 0 and not is_opencv_bold_header:
572
+ # Use 4 spaces equivalent per level (0.25 inches per level)
573
+ para.paragraph_format.left_indent = Inches(level * 0.25)
574
 
575
+ # Apply pattern-specific formatting using 4 spaces equivalent
576
  if indent_info.get('is_bullet', False) or indent_info.get('is_parenthetical', False):
577
+ para.paragraph_format.first_line_indent = Inches(-0.125) # 4-space equivalent hanging indent
578
 
579
+ # Set proper spacing with OpenCV enhancement
580
  para.paragraph_format.line_spacing = 1.15
581
+ para.paragraph_format.space_after = Pt(4)
582
 
583
  # Add content with enhanced formatting
584
  content = indent_info.get('content', line.strip())
 
621
 
622
  @staticmethod
623
  def create_html_file(html_content: str, metadata_info: str = "") -> str:
624
+ """Create standalone HTML file with OpenCV-enhanced styling"""
625
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
626
  temp_file = tempfile.NamedTemporaryFile(
627
+ suffix=f'_opencv_enhanced_document_{timestamp}.html',
628
  delete=False,
629
  mode='w',
630
  encoding='utf-8'
631
  )
632
 
633
  try:
634
+ # Enhance HTML with better styling including OpenCV features
635
  enhanced_html = html_content
636
 
637
  # Add comprehensive styling if not already present
 
667
  margin-bottom: 25px;
668
  border-left: 4px solid #3498db;
669
  }
670
+ .opencv-features {
671
  background-color: #e8f5e8;
672
  padding: 10px;
673
  border-radius: 5px;
 
675
  border-left: 4px solid #27ae60;
676
  font-size: 0.9em;
677
  }
678
+ .opencv-bold-header {
679
+ font-weight: bold;
680
+ color: #e74c3c;
681
+ font-size: 1.3em;
682
+ margin: 20px 0 15px 0;
683
+ border-left: 4px solid #e74c3c;
684
+ padding-left: 12px;
685
+ background-color: #fdf2f2;
686
+ }
687
+ .text-analysis-features {
688
+ background-color: #fff9e7;
689
  padding: 10px;
690
  border-radius: 5px;
691
  margin-bottom: 20px;
 
703
  <div class="container">
704
  <div class="header">
705
  <h1>PDF OCR Extraction Results</h1>
706
+ <p>Enhanced with OpenCV Text Block Analysis & Bold Detection</p>
707
  </div>
708
+ <div class="opencv-features">
709
+ <strong>OpenCV Features:</strong> Text Block Detection Bold Text Recognition
710
+ Automatic Spacing & Paragraph AnalysisHeader Indentation Suppression
711
+ Visual Text Element Analysis
712
  </div>
713
+ <div class="text-analysis-features">
714
+ <strong>Text Analysis:</strong> Comprehensive Indentation Detection •
715
+ Parenthetical Patterns ((1), (๑), (a), (i), (ก)) Multi-level Bullets
716
+ Letter & Roman Numerals • Thai Script Support • Pattern Priority Detection •
717
+ Intelligent Text Classification
718
  </div>''' +
719
  (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
720
  )
 
731
 
732
 
733
  class BackendManager:
734
+ """Enhanced backend manager with OpenCV text block analysis, bold detection, and comprehensive formatting"""
735
 
736
  def __init__(self):
737
  self.ocr_service = OCRService()
738
  self.document_exporter = EnhancedDocumentExporter()
739
+ self.opencv_analyzer = OpenCVTextAnalyzer()
740
  self.processing_history = []
741
  self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))
742
 
743
  # Create directories for temporary files and logs
744
+ self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service_opencv_enhanced'
745
  self.temp_dir.mkdir(exist_ok=True)
746
 
747
+ logger.info("OpenCV-enhanced backend manager with text block analysis and bold detection initialized successfully")
748
 
749
  def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
750
  preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
751
  """
752
+ Process PDF with OpenCV-enhanced resolution, text block analysis, and bold detection
753
 
754
  Args:
755
  pdf_path: Path to the PDF file
 
757
  preprocessing_options: Dictionary containing preprocessing settings
758
 
759
  Returns:
760
+ Dict containing processing results with OpenCV-enhanced analysis
761
  """
762
  start_time = datetime.now()
763
 
 
789
  # Generate file hash for tracking
790
  file_hash = self._calculate_file_hash(pdf_path)
791
 
792
+ logger.info(f"Processing PDF with OpenCV text block analysis and bold detection: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
793
  logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
794
 
795
  # Handle preprocessing if enabled
 
797
  preprocessing_applied = False
798
 
799
  if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
800
+ logger.info("Applying enhanced preprocessing with OpenCV analysis...")
801
  try:
802
  processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options)
803
  preprocessing_applied = True
804
+ logger.info("OpenCV-enhanced preprocessing completed successfully")
805
  except Exception as e:
806
  logger.error(f"Preprocessing failed: {e}")
807
  processed_pdf_path = pdf_path
808
 
809
  try:
810
+ # Process with OpenCV-enhanced OCR
811
  result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
812
 
813
  # Add processing metadata
814
  processing_time = (datetime.now() - start_time).total_seconds()
815
 
816
+ # Analyze document structure with OpenCV enhancement if successful
817
  document_analysis = {}
818
+ opencv_global_analysis = {}
819
+
820
  if result['success'] and result['text']:
821
  try:
822
  text_lines = result['text'].split('\n')
823
  detector = EnhancedIndentationDetector()
824
+
825
+ # Perform global OpenCV analysis on the PDF
826
+ opencv_global_analysis = self._perform_global_opencv_analysis(pdf_path, text_lines)
827
+
828
+ # Enhanced document structure analysis
829
+ document_analysis = detector.analyze_document_structure_with_opencv(text_lines)
830
+
831
+ if opencv_global_analysis:
832
+ document_analysis['opencv_global_analysis'] = opencv_global_analysis
833
+
834
  except Exception as analysis_error:
835
  logger.warning(f"Document structure analysis failed: {analysis_error}")
836
  document_analysis = {'analysis_failed': True}
 
840
  'file_size_mb': round(file_size / (1024*1024), 2),
841
  'processing_time_seconds': round(processing_time, 2),
842
  'timestamp': start_time.isoformat(),
843
+ 'opencv_enhanced': True,
844
+ 'opencv_text_block_analysis': True,
845
+ 'opencv_bold_detection': True,
846
+ 'opencv_spacing_analysis': True,
847
  'enhanced_processing': True,
848
  'html_processing': True,
849
  'comprehensive_indentation': True,
850
  'parenthetical_patterns_supported': True,
851
  'intelligent_text_classification': True,
852
+ 'header_indentation_suppression': True,
853
  'header_footer_removed': preprocessing_applied,
854
  'preprocessing_options': preprocessing_options if preprocessing_applied else None,
855
+ 'document_structure_analysis': document_analysis,
856
+ 'opencv_global_analysis': opencv_global_analysis
857
  })
858
 
859
  # Cleanup temporary preprocessed file
 
863
  except:
864
  pass
865
 
866
+ # Log results with OpenCV enhancement information
867
  if result['success']:
868
  text_length = len(result['text'])
869
  has_html = bool(result.get('html'))
870
  table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0
871
 
872
+ logger.info(f"OpenCV-enhanced processing completed successfully in {processing_time:.2f}s")
873
  logger.info(f"Method used: {result['method_used']}")
874
  logger.info(f"Text extracted: {text_length} characters")
875
  logger.info(f"HTML generated: {has_html}")
876
+ logger.info(f"OpenCV text block analysis: Enabled")
877
+ logger.info(f"OpenCV bold detection: Enabled")
878
+ logger.info(f"OpenCV spacing analysis: Enabled")
879
+ logger.info(f"Header indentation suppression: Enabled")
880
 
881
  if table_count > 0:
882
  logger.info(f"Tables detected: {table_count}")
 
885
  if document_analysis and not document_analysis.get('analysis_failed'):
886
  logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}")
887
  logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items")
888
+ if opencv_global_analysis:
889
+ logger.info(f"OpenCV global analysis: {opencv_global_analysis.get('block_count', 0)} text blocks, {opencv_global_analysis.get('paragraph_count', 0)} paragraphs")
890
+ logger.info(f"Bold text detected: {opencv_global_analysis.get('bold_text_detected', False)}")
891
 
892
  # Add to processing history
893
  self._add_to_history({
 
900
  'processing_time': processing_time,
901
  'preprocessing_applied': preprocessing_applied,
902
  'html_generated': has_html,
903
+ 'opencv_enhanced': True,
904
+ 'opencv_text_block_analysis': True,
905
+ 'opencv_bold_detection': True,
906
+ 'opencv_spacing_analysis': True,
907
  'enhanced_processing': True,
908
  'comprehensive_indentation': True,
909
  'parenthetical_patterns_supported': True,
910
  'intelligent_text_classification': True,
911
+ 'header_indentation_suppression': True,
912
+ 'document_analysis': document_analysis,
913
+ 'opencv_global_analysis': opencv_global_analysis
914
  })
915
  else:
916
+ logger.error(f"OpenCV-enhanced processing failed: {result.get('error', 'Unknown error')}")
917
 
918
  # Add to processing history
919
  self._add_to_history({
 
924
  'error': result.get('error', 'Unknown error'),
925
  'processing_time': processing_time,
926
  'preprocessing_applied': preprocessing_applied,
927
+ 'opencv_enhanced': True,
928
+ 'opencv_text_block_analysis': True,
929
+ 'opencv_bold_detection': True,
930
+ 'opencv_spacing_analysis': True,
931
  'enhanced_processing': True,
932
  'comprehensive_indentation': True,
933
  'parenthetical_patterns_supported': True,
934
+ 'intelligent_text_classification': True,
935
+ 'header_indentation_suppression': True
936
  })
937
 
938
  return result
939
 
940
  except Exception as e:
941
+ logger.error(f"Unexpected error during OpenCV-enhanced processing: {e}")
942
 
943
  # Cleanup
944
  if preprocessing_applied and processed_pdf_path != pdf_path:
 
956
  'success': False,
957
  'error': str(e),
958
  'processing_time': processing_time,
959
+ 'opencv_enhanced': True,
960
+ 'opencv_text_block_analysis': True,
961
+ 'opencv_bold_detection': True,
962
+ 'opencv_spacing_analysis': True,
963
  'enhanced_processing': True,
964
  'comprehensive_indentation': True,
965
  'parenthetical_patterns_supported': True,
966
+ 'intelligent_text_classification': True,
967
+ 'header_indentation_suppression': True
968
  })
969
 
970
  return {
971
  'success': False,
972
+ 'error': f"OpenCV-enhanced processing error: {str(e)}",
973
  'text': '',
974
  'html': '',
975
  'method_used': '',
 
977
  'file_hash': file_hash,
978
  'processing_time_seconds': round(processing_time, 2),
979
  'timestamp': start_time.isoformat(),
980
+ 'opencv_enhanced': True,
981
+ 'opencv_text_block_analysis': True,
982
+ 'opencv_bold_detection': True,
983
+ 'opencv_spacing_analysis': True,
984
  'enhanced_processing': True,
985
  'comprehensive_indentation': True,
986
  'parenthetical_patterns_supported': True,
987
+ 'intelligent_text_classification': True,
988
+ 'header_indentation_suppression': True
989
  }
990
  }
991
 
992
def _perform_global_opencv_analysis(self, pdf_path: str, text_lines: List[str]) -> Dict[str, Any]:
    """Run the OpenCV text-block analysis on the first page of a PDF.

    The first page is rendered to a PNG through a 2.0 x 2.0 matrix, decoded
    with PIL, converted to a BGR array, and handed to
    ``self.opencv_analyzer.analyze_text_blocks`` together with ``text_lines``.

    Args:
        pdf_path: Path of the PDF file to open.
        text_lines: Text lines passed through unchanged to the analyzer.

    Returns:
        Whatever ``analyze_text_blocks`` returns, or an empty dict when any
        step fails (the failure is logged, never raised).
    """
    pdf_document = None
    try:
        # Only the first page is rendered for the global analysis
        pdf_document = fitz.open(pdf_path)
        page = pdf_document.load_page(0)

        # Render page to image
        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
        img_data = pix.tobytes("png")

        # Convert to OpenCV format (imports stay inside the try block so a
        # missing dependency is reported through the same error path)
        import io
        from PIL import Image
        pil_image = Image.open(io.BytesIO(img_data))
        img_array = np.array(pil_image)
        img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        # Perform OpenCV analysis
        return self.opencv_analyzer.analyze_text_blocks(img_cv, text_lines)

    except Exception as e:
        logger.error(f"Global OpenCV analysis failed: {e}")
        return {}

    finally:
        # Close the document on every path; previously it stayed open
        # whenever rendering, decoding, or the analysis raised.
        if pdf_document is not None:
            pdf_document.close()
1021
+
1022
  def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
1023
+ """Apply enhanced preprocessing with high-resolution crop handling and OpenCV analysis"""
1024
  crop_settings = options.get('crop_settings', {})
1025
  per_page_crops = crop_settings.get('per_page_crops', {})
1026
  enhanced_resolution = crop_settings.get('enhanced_resolution', True)
 
1028
 
1029
  # Create temporary file for processed PDF
1030
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1031
+ temp_pdf_path = self.temp_dir / f"opencv_enhanced_preprocessed_{timestamp}.pdf"
1032
 
1033
  doc = fitz.open(pdf_path)
1034
  new_doc = fitz.open()
 
1095
  clip=new_rect
1096
  )
1097
 
1098
+ logger.debug(f"Page {page_num}: Applied OpenCV-enhanced crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%")
1099
 
1100
  new_doc.save(str(temp_pdf_path))
1101
+ logger.info(f"OpenCV-enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages")
1102
 
1103
  except Exception as e:
1104
+ logger.error(f"Error in OpenCV-enhanced preprocessing: {e}")
1105
  raise
1106
  finally:
1107
  doc.close()
 
1111
 
1112
  def create_enhanced_downloads(self, text_content: str, html_content: str,
1113
  metadata_info: str = "") -> Dict[str, str]:
1114
+ """Create OpenCV-enhanced download files with text block analysis and bold detection"""
1115
  download_files = {}
1116
 
1117
  try:
1118
+ # Create OpenCV-enhanced TXT file
1119
  txt_path = EnhancedDocumentExporter.create_enhanced_txt_file(
1120
  text_content, html_content, metadata_info
1121
  )
1122
  download_files['txt'] = txt_path
1123
+ logger.info(f"OpenCV-enhanced TXT file created: {txt_path}")
1124
 
1125
+ # Create OpenCV-enhanced DOCX file with text block analysis and bold detection
1126
  try:
1127
  docx_path = self.document_exporter.create_enhanced_docx_file(
1128
  text_content, html_content, metadata_info
1129
  )
1130
  download_files['docx'] = docx_path
1131
+ logger.info(f"OpenCV-enhanced DOCX file with text block analysis and bold detection created: {docx_path}")
1132
  except ImportError:
1133
  logger.warning("python-docx not available. DOCX creation skipped.")
1134
  except Exception as e:
1135
+ logger.error(f"OpenCV-enhanced DOCX creation failed: {e}")
1136
 
1137
+ # Create standalone HTML file with OpenCV enhancements
1138
  try:
1139
  html_path = EnhancedDocumentExporter.create_html_file(
1140
  html_content, metadata_info
1141
  )
1142
  download_files['html'] = html_path
1143
+ logger.info(f"OpenCV-enhanced HTML file created: {html_path}")
1144
  except Exception as e:
1145
  logger.error(f"HTML file creation failed: {e}")
1146
 
1147
  except Exception as e:
1148
+ logger.error(f"Error creating OpenCV-enhanced downloads: {e}")
1149
  raise
1150
 
1151
  return download_files
 
1153
  def get_available_methods(self) -> List[str]:
1154
  """Get list of available OCR methods"""
1155
  methods = self.ocr_service.get_available_methods()
1156
+ logger.info(f"Available OpenCV-enhanced OCR methods: {methods}")
1157
  return methods
1158
 
1159
  def get_service_status(self) -> Dict[str, Any]:
1160
+ """Get comprehensive service status with OpenCV enhancements"""
1161
  available_methods = self.get_available_methods()
1162
 
1163
  # Check DOCX support
 
1167
  except ImportError:
1168
  docx_available = False
1169
 
1170
+ # Check OpenCV availability
1171
+ opencv_available = True
1172
+ try:
1173
+ import cv2
1174
+ except ImportError:
1175
+ opencv_available = False
1176
+
1177
  status = {
1178
  'service_healthy': True,
1179
  'available_methods': available_methods,
 
1184
  'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
1185
  'temp_dir': str(self.temp_dir),
1186
  'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
1187
+ 'opencv_available': opencv_available,
1188
+ 'opencv_text_block_analysis': opencv_available,
1189
+ 'opencv_bold_detection': opencv_available,
1190
+ 'opencv_spacing_analysis': opencv_available,
1191
  'enhanced_processing': True,
1192
  'html_processing': True,
1193
  'comprehensive_indentation': True,
1194
  'parenthetical_patterns_supported': True,
1195
  'intelligent_text_classification': True,
1196
+ 'header_indentation_suppression': True,
1197
  'pattern_detection_count': len(EnhancedIndentationDetector().patterns),
1198
  'docx_export_available': docx_available,
1199
  'enhanced_crop_processing': True,
 
1250
  logger.error(f"Error during cleanup: {e}")
1251
 
1252
  def get_enhanced_statistics(self) -> Dict[str, Any]:
1253
+ """Get enhanced processing statistics with OpenCV analysis"""
1254
  if not self.processing_history:
1255
  return {
1256
  'total_processed': 0,
 
1261
  'total_tables_processed': 0,
1262
  'preprocessing_usage': 0,
1263
  'html_generation_rate': 0,
1264
+ 'opencv_enhanced_usage': 0,
1265
+ 'opencv_text_block_analysis_usage': 0,
1266
+ 'opencv_bold_detection_usage': 0,
1267
+ 'opencv_spacing_analysis_usage': 0,
1268
  'enhanced_processing_usage': 0,
1269
  'comprehensive_indentation_usage': 0,
1270
  'parenthetical_patterns_usage': 0,
1271
  'text_classification_usage': 0,
1272
+ 'header_indentation_suppression_usage': 0,
1273
  'document_analysis_success_rate': 0
1274
  }
1275
 
 
1289
 
1290
  preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
1291
  html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
1292
+ opencv_enhanced = sum(1 for h in self.processing_history if h.get('opencv_enhanced', False))
1293
+ opencv_text_block_analysis = sum(1 for h in self.processing_history if h.get('opencv_text_block_analysis', False))
1294
+ opencv_bold_detection = sum(1 for h in self.processing_history if h.get('opencv_bold_detection', False))
1295
+ opencv_spacing_analysis = sum(1 for h in self.processing_history if h.get('opencv_spacing_analysis', False))
1296
  enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
1297
  comprehensive_indentation = sum(1 for h in self.processing_history if h.get('comprehensive_indentation', False))
1298
  parenthetical_patterns = sum(1 for h in self.processing_history if h.get('parenthetical_patterns_supported', False))
1299
  text_classification = sum(1 for h in self.processing_history if h.get('intelligent_text_classification', False))
1300
+ header_indentation_suppression = sum(1 for h in self.processing_history if h.get('header_indentation_suppression', False))
1301
 
1302
  # Document analysis statistics
1303
  doc_analysis_success = sum(1 for h in self.processing_history
 
1305
  doc_analysis_rate = (doc_analysis_success / total_processed) * 100 if total_processed > 0 else 0
1306
 
1307
  html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
1308
+ opencv_enhanced_rate = (opencv_enhanced / total_processed) * 100 if total_processed > 0 else 0
1309
+ opencv_text_block_analysis_rate = (opencv_text_block_analysis / total_processed) * 100 if total_processed > 0 else 0
1310
+ opencv_bold_detection_rate = (opencv_bold_detection / total_processed) * 100 if total_processed > 0 else 0
1311
+ opencv_spacing_analysis_rate = (opencv_spacing_analysis / total_processed) * 100 if total_processed > 0 else 0
1312
  enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
1313
  comprehensive_indentation_rate = (comprehensive_indentation / total_processed) * 100 if total_processed > 0 else 0
1314
  parenthetical_patterns_rate = (parenthetical_patterns / total_processed) * 100 if total_processed > 0 else 0
1315
  text_classification_rate = (text_classification / total_processed) * 100 if total_processed > 0 else 0
1316
+ header_indentation_suppression_rate = (header_indentation_suppression / total_processed) * 100 if total_processed > 0 else 0
1317
 
1318
  return {
1319
  'total_processed': total_processed,
 
1326
  'failed_processes': total_processed - len(successful),
1327
  'preprocessing_usage': preprocessing_usage,
1328
  'html_generation_rate': round(html_generation_rate, 2),
1329
+ 'opencv_enhanced_usage': opencv_enhanced,
1330
+ 'opencv_enhanced_rate': round(opencv_enhanced_rate, 2),
1331
+ 'opencv_text_block_analysis_usage': opencv_text_block_analysis,
1332
+ 'opencv_text_block_analysis_rate': round(opencv_text_block_analysis_rate, 2),
1333
+ 'opencv_bold_detection_usage': opencv_bold_detection,
1334
+ 'opencv_bold_detection_rate': round(opencv_bold_detection_rate, 2),
1335
+ 'opencv_spacing_analysis_usage': opencv_spacing_analysis,
1336
+ 'opencv_spacing_analysis_rate': round(opencv_spacing_analysis_rate, 2),
1337
  'enhanced_processing_usage': enhanced_processing,
1338
  'enhanced_processing_rate': round(enhanced_processing_rate, 2),
1339
  'comprehensive_indentation_usage': comprehensive_indentation,
 
1342
  'parenthetical_patterns_rate': round(parenthetical_patterns_rate, 2),
1343
  'text_classification_usage': text_classification,
1344
  'text_classification_rate': round(text_classification_rate, 2),
1345
+ 'header_indentation_suppression_usage': header_indentation_suppression,
1346
+ 'header_indentation_suppression_rate': round(header_indentation_suppression_rate, 2),
1347
  'document_analysis_success_rate': round(doc_analysis_rate, 2)
1348
  }
1349
 
 
1352
  _backend_manager = None
1353
 
1354
  def get_backend_manager() -> BackendManager:
1355
+ """Get global OpenCV-enhanced backend manager instance"""
1356
  global _backend_manager
1357
  if _backend_manager is None:
1358
  _backend_manager = BackendManager()
 
1360
 
1361
 
1362
  if __name__ == "__main__":
1363
+ # Test the OpenCV-enhanced backend manager
1364
  manager = BackendManager()
1365
 
1366
+ print("OpenCV-Enhanced Backend Manager with Text Block Analysis & Bold Detection Test")
1367
+ print("=" * 110)
1368
  print(f"Available methods: {manager.get_available_methods()}")
1369
  print(f"Service status: {manager.get_service_status()}")
1370
  print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
1371
 
1372
+ # Test OpenCV analyzer
1373
+ opencv_analyzer = OpenCVTextAnalyzer()
1374
+ test_image_path = "test_page.png" # This would be a real image path in practice
1375
+ test_text_lines = [
1376
+ "CHAPTER 1: INTRODUCTION",
1377
+ "1.1. Overview of the System",
1378
+ "This document provides comprehensive information...",
1379
+ "1.2. Key Features",
1380
+ "• Feature one with detailed explanation",
1381
+ "• Feature two with additional notes"
1382
+ ]
1383
+
1384
+ print(f"\nOpenCV Text Analysis Test:")
1385
+ print("-" * 40)
1386
+ # opencv_analysis = opencv_analyzer.analyze_text_blocks(test_image_path, test_text_lines)
1387
+ # print(f"Analysis result: {opencv_analysis}")
1388
+
1389
+ # Test indentation detector with OpenCV integration
1390
  detector = EnhancedIndentationDetector()
1391
  test_cases = [
1392
+ "INTRODUCTION TO THE SYSTEM", # Should be detected as bold header
1393
  "1.2.3. Hierarchical item",
1394
  "(1) Parenthetical Arabic",
1395
  "(๑) Parenthetical Thai numeral",
 
1398
  "(ก) Parenthetical Thai letter"
1399
  ]
1400
 
1401
+ print(f"\nOpenCV-Enhanced Indentation Detection Test:")
1402
  print("-" * 60)
1403
  for test_text in test_cases:
1404
  result = detector.detect_indentation(test_text)
1405
  classification = detector.classify_text_type(test_text)
1406
  print(f"Text: {test_text}")
1407
  print(f" Pattern: {result['pattern_type']}, Level: {result['level']}")
1408
+ print(f" Is Header: {result['is_header']}, Suppress Indent: {result['suppress_indentation']}")
1409
  print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
1410
  print()
enhanced_indentation.py CHANGED
@@ -1,19 +1,375 @@
1
  """
2
- Enhanced Indentation Detection System
3
- Comprehensive regex-based system for detecting hierarchical numbering and indentation levels
4
- For PDF OCR Service with HTML and DOCX output support including parenthetical patterns
5
  """
6
  import re
7
  import logging
 
 
8
  from typing import Dict, Tuple, Optional, List, Any
9
  from collections import Counter
 
 
 
10
 
11
  logger = logging.getLogger(__name__)
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class EnhancedIndentationDetector:
14
- """Advanced indentation detection with comprehensive pattern matching including parenthetical patterns"""
15
 
16
  def __init__(self):
 
 
17
  # Define comprehensive patterns for different numbering styles
18
  self.patterns = {
19
  # Hierarchical decimal numbering (1.1.1.1.1...)
@@ -192,7 +548,7 @@ class EnhancedIndentationDetector:
192
 
193
  # Bullet points with various symbols
194
  'bullet_symbols': {
195
- 'pattern': r'^\s*([•·▪▫◦‣⁃⁌⁍◘◙○●▶▷►★☆♦♠♣♥◆◇■□▲△▼▽❖❀❁❂❃❄❅❆❇❈❉❊❋❍❏❐❑❒❖])\s+',
196
  'example': '•',
197
  'level_func': self._calculate_bullet_level,
198
  'priority': 2
@@ -208,7 +564,7 @@ class EnhancedIndentationDetector:
208
 
209
  # Arrow bullets
210
  'arrow_bullets': {
211
- 'pattern': r'^\s*([\→\←\\\\\\\\\\⇐\\\\\\\\\\\\\\\\\\\\\\\\\\\➾])\s+',
212
  'example': '→',
213
  'level_func': self._calculate_bullet_level,
214
  'priority': 2
@@ -242,6 +598,69 @@ class EnhancedIndentationDetector:
242
  'capitalized_words': lambda text: sum(1 for word in text.split() if word and word[0].isupper()) / max(len(text.split()), 1) > 0.5
243
  }
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  def detect_indentation(self, text: str, base_margin: float = 0) -> Dict[str, Any]:
246
  """
247
  Detect indentation pattern and level for given text
@@ -258,7 +677,7 @@ class EnhancedIndentationDetector:
258
 
259
  text_stripped = text.strip()
260
 
261
- # Count leading whitespace for additional indentation
262
  leading_spaces = len(text) - len(text.lstrip())
263
  space_indent_level = leading_spaces // 4 # 4 spaces = 1 level
264
 
@@ -302,7 +721,9 @@ class EnhancedIndentationDetector:
302
  'is_thai': self._is_thai_pattern(pattern_name),
303
  'is_parenthetical': self._is_parenthetical_pattern(pattern_name),
304
  'formatting_hint': self._get_formatting_hint(pattern_name, total_level),
305
- 'priority': pattern_info['priority']
 
 
306
  }
307
 
308
  # No pattern found - check for basic indentation
@@ -324,19 +745,22 @@ class EnhancedIndentationDetector:
324
  'is_thai': False,
325
  'is_parenthetical': False,
326
  'formatting_hint': 'indented_text',
327
- 'priority': 0
 
 
328
  }
329
 
330
  # No indentation at all
331
  return self._create_empty_result(text)
332
 
333
- def classify_text_type(self, text: str, context: Dict = None) -> Dict[str, Any]:
334
  """
335
- Classify text as header, paragraph, or list item based on patterns and context
336
 
337
  Args:
338
  text: Text to classify
339
  context: Additional context like position, formatting, etc.
 
340
 
341
  Returns:
342
  Dict with classification results
@@ -347,7 +771,16 @@ class EnhancedIndentationDetector:
347
  text_stripped = text.strip()
348
  context = context or {}
349
 
350
- # Check for indentation patterns first
 
 
 
 
 
 
 
 
 
351
  indent_result = self.detect_indentation(text)
352
 
353
  # Initialize classification scores
@@ -357,6 +790,10 @@ class EnhancedIndentationDetector:
357
  'list_item': 0.0
358
  }
359
 
 
 
 
 
360
  # List item indicators
361
  if indent_result['has_pattern']:
362
  scores['list_item'] += 0.8
@@ -417,9 +854,51 @@ class EnhancedIndentationDetector:
417
  'indentation': indent_result,
418
  'is_header': classification[0] == 'header',
419
  'is_paragraph': classification[0] == 'paragraph',
420
- 'is_list_item': classification[0] == 'list_item'
 
421
  }
422
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  def _create_empty_result(self, text: str) -> Dict[str, Any]:
424
  """Create result for text with no indentation pattern"""
425
  return {
@@ -439,7 +918,9 @@ class EnhancedIndentationDetector:
439
  'is_thai': False,
440
  'is_parenthetical': False,
441
  'formatting_hint': 'normal_text',
442
- 'priority': 0
 
 
443
  }
444
 
445
  def _calculate_decimal_level(self, marker: str) -> int:
@@ -623,7 +1104,7 @@ if __name__ == "__main__":
623
  "This is a regular paragraph with some text that should be classified as paragraph content.",
624
  ]
625
 
626
- print("Enhanced Indentation Detection Results with Parenthetical Patterns:")
627
  print("=" * 80)
628
 
629
  for i, test_text in enumerate(test_cases, 1):
@@ -637,6 +1118,8 @@ if __name__ == "__main__":
637
  print(f" Content: {result['content']!r}")
638
  print(f" Hint: {result['formatting_hint']}")
639
  print(f" Priority: {result['priority']}")
 
 
640
  print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
641
  print()
642
 
 
1
  """
2
+ Enhanced Indentation Detection System with OpenCV Text Block Analysis
3
+ Comprehensive system for detecting hierarchical numbering, text blocks, and spacing
4
+ For PDF OCR Service with HTML and DOCX output support including OpenCV-based analysis
5
  """
6
  import re
7
  import logging
8
+ import cv2
9
+ import numpy as np
10
  from typing import Dict, Tuple, Optional, List, Any
11
  from collections import Counter
12
+ from PIL import Image, ImageFont, ImageDraw
13
+ import tempfile
14
+ import os
15
 
16
  logger = logging.getLogger(__name__)
17
 
18
class OpenCVTextAnalyzer:
    """Heuristic OpenCV analysis of a page image: text blocks, bold lines, spacing.

    Blocks are bounding boxes of contours found on an Otsu-thresholded image,
    paragraphs are runs of blocks separated by small vertical gaps, and "bold"
    lines are lines whose filtered mask is dense over most of their width.
    """

    def __init__(self):
        # Debug flag is read once from the OPENCV_DEBUG environment variable.
        self.debug_mode = os.getenv('OPENCV_DEBUG', 'false').lower() == 'true'

    def analyze_text_blocks(self, image_path_or_array,
                            page_text_lines: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Analyze text blocks using OpenCV to detect paragraphs and spacing

        Args:
            image_path_or_array: Path to an image (str or os.PathLike) or a
                numpy image array (2-D grayscale, or 3-D with 1, 3 or 4 channels)
            page_text_lines: Optional list of text lines for correlation

        Returns:
            Dict with block analysis results. On any failure the structure from
            ``_empty_analysis`` (``success`` False) is returned instead of raising.
        """
        try:
            image = self._load_image(image_path_or_array)
            if image is None:
                return self._empty_analysis()

            gray = self._to_grayscale(image)

            text_blocks = self._detect_text_blocks(gray)
            bold_regions = self._detect_bold_text(gray)
            spacing_analysis = self._analyze_spacing(gray, text_blocks)
            paragraphs = self._detect_paragraphs(text_blocks, spacing_analysis)

            if page_text_lines:
                correlated = self._correlate_with_text(
                    text_blocks, bold_regions, paragraphs, page_text_lines
                )
            else:
                correlated = {
                    'text_blocks': text_blocks,
                    'bold_regions': bold_regions,
                    'paragraphs': paragraphs
                }

            return self._build_result(image.shape, correlated, spacing_analysis)

        except Exception as e:
            # Deliberate best-effort: analysis is optional, so log and degrade.
            logger.error(f"OpenCV text analysis error: {e}")
            return self._empty_analysis()

    @staticmethod
    def _load_image(image_path_or_array):
        """Return an image array for a path or array input, or None if unavailable."""
        if image_path_or_array is None:
            return None
        if isinstance(image_path_or_array, (str, os.PathLike)):
            # cv2.imread returns None (no exception) when the file cannot be read.
            return cv2.imread(os.fspath(image_path_or_array))
        # Copy so the caller's array is never modified by later processing.
        return image_path_or_array.copy()

    @staticmethod
    def _to_grayscale(image):
        """Return a single-channel view/conversion of *image*.

        A plain BGR2GRAY conversion raises on inputs that are already
        grayscale or that carry an alpha channel, so those are handled here.
        """
        if image.ndim == 2:
            return image
        channels = image.shape[2]
        if channels == 1:
            return np.ascontiguousarray(image[:, :, 0])
        if channels == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    @staticmethod
    def _build_result(image_shape, correlated: Dict[str, Any],
                      spacing_analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Assemble the success result, keeping line mappings when they exist."""
        text_blocks = correlated['text_blocks']
        bold_regions = correlated['bold_regions']
        paragraphs = correlated['paragraphs']
        return {
            'success': True,
            'image_shape': image_shape,
            'text_blocks': text_blocks,
            'bold_regions': bold_regions,
            'paragraphs': paragraphs,
            # Consumers such as classify_text_type look up per-line bold flags
            # here; the key is an empty list when no text lines were supplied.
            'line_mappings': correlated.get('line_mappings', []),
            'spacing_analysis': spacing_analysis,
            'block_count': len(text_blocks),
            'paragraph_count': len(paragraphs),
            'bold_text_detected': len(bold_regions) > 0
        }

    @staticmethod
    def _default_spacing() -> Dict[str, Any]:
        """Fallback spacing values used when gaps cannot be measured."""
        return {'average_line_spacing': 20, 'paragraph_spacing': 40}

    def _detect_text_blocks(self, gray_image):
        """Detect text blocks using morphological operations.

        Returns a list of dicts (x, y, width, height, area, aspect_ratio,
        center_x, center_y) sorted top to bottom.
        """
        # Inverted Otsu threshold: ink becomes white (255) on black.
        _, binary = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

        # Noise removal
        noise_kernel = np.ones((2, 2), np.uint8)
        binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, noise_kernel)

        # Merge nearby strokes into text regions
        text_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        text_regions = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, text_kernel)

        contours, _ = cv2.findContours(text_regions, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        text_blocks = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            # Filter small noise
            if w > 10 and h > 5:
                text_blocks.append({
                    'x': x, 'y': y, 'width': w, 'height': h,
                    'area': w * h,
                    'aspect_ratio': w / h,
                    'center_x': x + w // 2,
                    'center_y': y + h // 2
                })

        # Sort by vertical position (top to bottom)
        text_blocks.sort(key=lambda block: block['y'])
        return text_blocks

    def _detect_bold_text(self, gray_image):
        """Detect lines that appear bold across (almost) their full width.

        Returns one dict per qualifying line with its bounding box, average
        density, bold coverage ratio and an ``is_likely_header`` flag.
        """
        _, binary = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

        # NOTE(review): dilate followed by erode is a closing-style operation,
        # not the "what survives erosion" test the original comment describes.
        # Left unchanged because the density thresholds below depend on it;
        # verify that this really separates bold from regular strokes.
        bold_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        dilated = cv2.dilate(binary, bold_kernel, iterations=1)
        eroded = cv2.erode(dilated, bold_kernel, iterations=1)
        bold_regions = cv2.bitwise_and(binary, eroded)

        # All text regions, used as the reference width of each line
        text_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        all_text_regions = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, text_kernel)

        bold_contours, _ = cv2.findContours(bold_regions, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        all_text_contours, _ = cv2.findContours(all_text_regions, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Group contours into lines: a contour joins the first existing line
        # whose key y is within `tolerance` pixels, else it starts a new line.
        tolerance = 10
        line_groups = {}
        for contour in all_text_contours:
            x, y, w, h = cv2.boundingRect(contour)
            if not (w > 10 and h > 5):  # Filter noise
                continue
            line_key = next(
                (existing_y for existing_y in line_groups if abs(y - existing_y) <= tolerance),
                None,
            )
            if line_key is None:
                line_key = y
                line_groups[line_key] = {'all_text': [], 'bold_text': []}
            line_groups[line_key]['all_text'].append((x, y, w, h))

        # Attach sufficiently large and dense bold contours to their line
        for contour in bold_contours:
            x, y, w, h = cv2.boundingRect(contour)
            if not (w > 20 and h > 8):  # Filter for reasonable text size
                continue
            roi = bold_regions[y:y + h, x:x + w]
            # Plain float keeps the result (and flags derived from it)
            # free of numpy scalar types.
            density = float(np.sum(roi) / (255 * w * h))
            if density <= 0.1:  # Threshold for bold text density
                continue
            for line_y, line_data in line_groups.items():
                if abs(y - line_y) <= tolerance:
                    line_data['bold_text'].append({
                        'x': x, 'y': y, 'width': w, 'height': h,
                        'density': density
                    })
                    break

        # Keep only lines whose bold extent spans more than 90% of the line
        bold_header_blocks = []
        for line_y, line_data in line_groups.items():
            all_text_blocks = line_data['all_text']
            bold_text_blocks = line_data['bold_text']
            if not all_text_blocks or not bold_text_blocks:
                continue

            line_left = min(block[0] for block in all_text_blocks)
            line_right = max(block[0] + block[2] for block in all_text_blocks)
            total_text_width = line_right - line_left

            bold_left = min(block['x'] for block in bold_text_blocks)
            bold_right = max(block['x'] + block['width'] for block in bold_text_blocks)
            total_bold_width = bold_right - bold_left

            bold_coverage = total_bold_width / max(total_text_width, 1)
            if bold_coverage <= 0.9:
                continue

            line_height = max(block[3] for block in all_text_blocks)
            avg_density = sum(block['density'] for block in bold_text_blocks) / len(bold_text_blocks)

            bold_header_blocks.append({
                'x': line_left,
                'y': line_y,
                'width': total_text_width,
                'height': line_height,
                'density': avg_density,
                'center_x': line_left + total_text_width // 2,
                'center_y': line_y + line_height // 2,
                'is_likely_header': line_height < 40 and total_text_width > 30 and avg_density > 0.12,
                'bold_coverage': bold_coverage
            })

        return bold_header_blocks

    def _analyze_spacing(self, gray_image, text_blocks):
        """Analyze vertical spacing between consecutive text blocks.

        ``gray_image`` is accepted for signature compatibility but not read.
        Returns the fallback spacing when fewer than two blocks or no positive
        gaps exist; otherwise quartile-based line and paragraph spacing.
        """
        if len(text_blocks) < 2:
            return self._default_spacing()

        # Positive gaps only: overlapping or side-by-side blocks are ignored.
        vertical_gaps = []
        for upper, lower in zip(text_blocks, text_blocks[1:]):
            gap = lower['y'] - (upper['y'] + upper['height'])
            if gap > 0:
                vertical_gaps.append(gap)

        if not vertical_gaps:
            return self._default_spacing()

        vertical_gaps.sort()

        # First quartile is taken as line spacing, third as paragraph spacing.
        q1 = np.percentile(vertical_gaps, 25)
        q2 = np.percentile(vertical_gaps, 50)  # median
        q3 = np.percentile(vertical_gaps, 75)

        return {
            'vertical_gaps': vertical_gaps,
            'average_line_spacing': float(q1),
            'paragraph_spacing': float(q3),
            'median_spacing': float(q2),
            'spacing_variance': float(np.var(vertical_gaps))
        }

    @staticmethod
    def _make_paragraph(blocks):
        """Describe one paragraph (its blocks and vertical extent)."""
        return {
            'blocks': blocks,
            'start_y': blocks[0]['y'],
            'end_y': blocks[-1]['y'] + blocks[-1]['height'],
            'block_count': len(blocks)
        }

    def _detect_paragraphs(self, text_blocks, spacing_analysis):
        """Group text blocks into paragraphs based on spacing.

        A new paragraph starts whenever the gap to the previous block exceeds
        ``spacing_analysis['paragraph_spacing']`` (40 when the key is absent).
        """
        if not text_blocks:
            return []

        paragraph_threshold = spacing_analysis.get('paragraph_spacing', 40)
        groups = [[text_blocks[0]]]
        for previous, current in zip(text_blocks, text_blocks[1:]):
            gap = current['y'] - (previous['y'] + previous['height'])
            if gap > paragraph_threshold:
                groups.append([current])
            else:
                groups[-1].append(current)

        return [self._make_paragraph(group) for group in groups]

    def _correlate_with_text(self, text_blocks, bold_regions, paragraphs, text_lines):
        """Correlate OpenCV analysis with actual text content.

        Adds a ``line_mappings`` list with one entry per non-blank text line.
        """
        line_to_block_mapping = []

        for i, line in enumerate(text_lines):
            line_strip = line.strip()
            if not line_strip:
                continue

            # Rough estimate: line i is assumed to sit at y = i * 20 pixels.
            # NOTE(review): this ignores the measured spacing and page offset.
            estimated_y = i * 20

            # Nearest block by top edge; the first block wins on ties.
            best_block = min(
                text_blocks,
                key=lambda block: abs(block['y'] - estimated_y),
                default=None,
            )

            is_bold = bool(best_block) and any(
                abs(region['y'] - best_block['y']) < 20 and region['is_likely_header']
                for region in bold_regions
            )

            line_to_block_mapping.append({
                'line_index': i,
                'text': line_strip,
                'block': best_block,
                'is_bold': is_bold,
                'is_likely_header': is_bold and (len(line_strip) < 100),
                'estimated_y': estimated_y
            })

        return {
            'text_blocks': text_blocks,
            'bold_regions': bold_regions,
            'paragraphs': paragraphs,
            'line_mappings': line_to_block_mapping
        }

    def _empty_analysis(self):
        """Return empty analysis structure (same keys as a successful result)."""
        return {
            'success': False,
            'image_shape': None,
            'text_blocks': [],
            'bold_regions': [],
            'paragraphs': [],
            'line_mappings': [],
            'spacing_analysis': self._default_spacing(),
            'block_count': 0,
            'paragraph_count': 0,
            'bold_text_detected': False
        }
365
+
366
+
367
  class EnhancedIndentationDetector:
368
+ """Advanced indentation detection with comprehensive pattern matching and OpenCV integration"""
369
 
370
  def __init__(self):
371
+ self.opencv_analyzer = OpenCVTextAnalyzer()
372
+
373
  # Define comprehensive patterns for different numbering styles
374
  self.patterns = {
375
  # Hierarchical decimal numbering (1.1.1.1.1...)
 
548
 
549
  # Bullet points with various symbols
550
  'bullet_symbols': {
551
+ 'pattern': r'^\s*([•·▪▫◦‣⁃‧⌐◘◙○●▶▷►»★☆♦♠♣♥◆◇■□▲△▼▽▬─‒―‖•‚„…†‡•‰‹›€™])\s+',
552
  'example': '•',
553
  'level_func': self._calculate_bullet_level,
554
  'priority': 2
 
564
 
565
  # Arrow bullets
566
  'arrow_bullets': {
567
+ 'pattern': r'^\s*([\→↑↓↔↕↖↗↘↙⇒⇑⇓⇔⇕➔➜➤➪➫➬➭➮➯➱➲➳➴➵➶➷➸➹➺➻➼➽➾])\s+',
568
  'example': '→',
569
  'level_func': self._calculate_bullet_level,
570
  'priority': 2
 
598
  'capitalized_words': lambda text: sum(1 for word in text.split() if word and word[0].isupper()) / max(len(text.split()), 1) > 0.5
599
  }
600
 
601
def detect_indentation_with_opencv(self, text: str, opencv_analysis: Dict = None,
                                   line_mapping: Dict = None) -> Dict[str, Any]:
    """
    Run pattern-based indentation detection, then refine it with OpenCV data.

    Args:
        text: Text line to analyze
        opencv_analysis: OpenCV analysis results (only 'spacing_analysis' is read)
        line_mapping: Mapping entry for this line ('is_bold', 'is_likely_header', 'block')

    Returns:
        The dict from detect_indentation, updated in place: bold headers are
        forced to level 0 with indentation suppressed; otherwise the level is
        raised to the visual indent when that is larger.
    """
    result = self.detect_indentation(text)

    # Without both OpenCV inputs the regex-based result is final.
    if not (opencv_analysis and line_mapping):
        return result

    # Bold header lines are never indented, whatever pattern matched.
    if line_mapping.get('is_bold', False) and line_mapping.get('is_likely_header', False):
        result.update({
            'is_header': True,
            'level': 0,  # Headers don't get indentation
            'formatting_hint': 'header',
            'opencv_detected_bold': True,
            'suppress_indentation': True
        })
        return result

    if 'spacing_analysis' not in opencv_analysis:
        return result

    # The visual level can only raise the detected level, never lower it.
    visual_indent = self._calculate_visual_indentation(
        line_mapping, opencv_analysis['spacing_analysis']
    )
    if visual_indent > 0:
        result['level'] = max(result['level'], visual_indent)
        result['opencv_visual_indent'] = visual_indent

    return result
643
+
644
+ def _calculate_visual_indentation(self, line_mapping: Dict, spacing_analysis: Dict) -> int:
645
+ """Calculate indentation level based on visual position using 4-space equivalent"""
646
+ if not line_mapping or 'block' not in line_mapping:
647
+ return 0
648
+
649
+ block = line_mapping['block']
650
+ if not block:
651
+ return 0
652
+
653
+ # Use horizontal position to determine indentation
654
+ # Assume standard margin is around x=50-100
655
+ base_margin = 80
656
+ indent_width = 20 # pixels per indent level (adjusted for 4-space system)
657
+
658
+ if block['x'] > base_margin:
659
+ visual_level = (block['x'] - base_margin) // indent_width
660
+ return max(0, min(visual_level, 10)) # Cap at 10 levels
661
+
662
+ return 0
663
+
664
  def detect_indentation(self, text: str, base_margin: float = 0) -> Dict[str, Any]:
665
  """
666
  Detect indentation pattern and level for given text
 
677
 
678
  text_stripped = text.strip()
679
 
680
+ # Count leading whitespace for additional indentation (4 spaces = 1 level)
681
  leading_spaces = len(text) - len(text.lstrip())
682
  space_indent_level = leading_spaces // 4 # 4 spaces = 1 level
683
 
 
721
  'is_thai': self._is_thai_pattern(pattern_name),
722
  'is_parenthetical': self._is_parenthetical_pattern(pattern_name),
723
  'formatting_hint': self._get_formatting_hint(pattern_name, total_level),
724
+ 'priority': pattern_info['priority'],
725
+ 'is_header': False,
726
+ 'suppress_indentation': False
727
  }
728
 
729
  # No pattern found - check for basic indentation
 
745
  'is_thai': False,
746
  'is_parenthetical': False,
747
  'formatting_hint': 'indented_text',
748
+ 'priority': 0,
749
+ 'is_header': False,
750
+ 'suppress_indentation': False
751
  }
752
 
753
  # No indentation at all
754
  return self._create_empty_result(text)
755
 
756
+ def classify_text_type(self, text: str, context: Dict = None, opencv_analysis: Dict = None) -> Dict[str, Any]:
757
  """
758
+ Classify text as header, paragraph, or list item with OpenCV enhancement
759
 
760
  Args:
761
  text: Text to classify
762
  context: Additional context like position, formatting, etc.
763
+ opencv_analysis: OpenCV analysis results
764
 
765
  Returns:
766
  Dict with classification results
 
771
  text_stripped = text.strip()
772
  context = context or {}
773
 
774
+ # Check OpenCV bold detection first
775
+ opencv_is_bold = False
776
+ if opencv_analysis and 'line_mappings' in opencv_analysis:
777
+ # Find this text in the mappings
778
+ for mapping in opencv_analysis['line_mappings']:
779
+ if mapping.get('text', '').strip() == text_stripped:
780
+ opencv_is_bold = mapping.get('is_bold', False)
781
+ break
782
+
783
+ # Check for indentation patterns
784
  indent_result = self.detect_indentation(text)
785
 
786
  # Initialize classification scores
 
790
  'list_item': 0.0
791
  }
792
 
793
+ # OpenCV bold detection boost
794
+ if opencv_is_bold:
795
+ scores['header'] += 0.6
796
+
797
  # List item indicators
798
  if indent_result['has_pattern']:
799
  scores['list_item'] += 0.8
 
854
  'indentation': indent_result,
855
  'is_header': classification[0] == 'header',
856
  'is_paragraph': classification[0] == 'paragraph',
857
+ 'is_list_item': classification[0] == 'list_item',
858
+ 'opencv_detected_bold': opencv_is_bold
859
  }
860
 
861
def analyze_document_structure_with_opencv(self, text_lines: List[str],
                                           image_path: str = None) -> Dict[str, Any]:
    """
    Analyze document structure and, when an image is given, add OpenCV results.

    Args:
        text_lines: List of text lines to analyze
        image_path: Optional path to page image for OpenCV analysis

    Returns:
        The dict from analyze_document_structure. When the OpenCV analysis
        succeeded it additionally carries the raw analysis, its block and
        paragraph counts, the bold flag, and a per-type line count under
        'opencv_enhanced_classification'.
    """
    # The image pass runs first; it is skipped entirely without an image.
    opencv_analysis = (
        self.opencv_analyzer.analyze_text_blocks(image_path, text_lines)
        if image_path else None
    )

    analysis = self.analyze_document_structure(text_lines)

    # Nothing to merge when OpenCV was not run or reported failure.
    if not (opencv_analysis and opencv_analysis['success']):
        return analysis

    analysis.update({
        'opencv_analysis': opencv_analysis,
        'opencv_text_blocks': opencv_analysis['block_count'],
        'opencv_paragraphs': opencv_analysis['paragraph_count'],
        'opencv_bold_detected': opencv_analysis['bold_text_detected'],
        'opencv_enhanced': True
    })

    # Re-classify every non-blank line with the OpenCV data available.
    type_counts = Counter(
        self.classify_text_type(line, opencv_analysis=opencv_analysis)['type']
        for line in text_lines
        if line.strip()
    )
    analysis['opencv_enhanced_classification'] = dict(type_counts)

    return analysis
901
+
902
  def _create_empty_result(self, text: str) -> Dict[str, Any]:
903
  """Create result for text with no indentation pattern"""
904
  return {
 
918
  'is_thai': False,
919
  'is_parenthetical': False,
920
  'formatting_hint': 'normal_text',
921
+ 'priority': 0,
922
+ 'is_header': False,
923
+ 'suppress_indentation': False
924
  }
925
 
926
  def _calculate_decimal_level(self, marker: str) -> int:
 
1104
  "This is a regular paragraph with some text that should be classified as paragraph content.",
1105
  ]
1106
 
1107
+ print("Enhanced Indentation Detection with OpenCV Integration:")
1108
  print("=" * 80)
1109
 
1110
  for i, test_text in enumerate(test_cases, 1):
 
1118
  print(f" Content: {result['content']!r}")
1119
  print(f" Hint: {result['formatting_hint']}")
1120
  print(f" Priority: {result['priority']}")
1121
+ print(f" Is Header: {result['is_header']}")
1122
+ print(f" Suppress Indent: {result['suppress_indentation']}")
1123
  print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
1124
  print()
1125
 
ocr_service.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- OCR Service Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
3
- Handles PDF to text conversion with proper indentation, spacing, page numbering, and intelligent text analysis
4
  """
5
  import re
6
  import os
@@ -8,6 +8,8 @@ import logging
8
  from typing import Optional, Dict, Any, Tuple, List
9
  import tempfile
10
  from pathlib import Path
 
 
11
 
12
  # Load environment variables
13
  from dotenv import load_dotenv
@@ -22,16 +24,14 @@ from azure.core.exceptions import AzureError
22
  try:
23
  import pytesseract
24
  from PIL import Image
25
- import cv2
26
- import numpy as np
27
  TESSERACT_AVAILABLE = True
28
  except ImportError:
29
  TESSERACT_AVAILABLE = False
30
 
31
  import fitz # PyMuPDF
32
 
33
- # Enhanced indentation detection
34
- from enhanced_indentation import EnhancedIndentationDetector
35
 
36
  # Configure logging
37
  logging.basicConfig(level=logging.INFO)
@@ -39,14 +39,15 @@ logger = logging.getLogger(__name__)
39
 
40
 
41
  class EnhancedHTMLProcessor:
42
- """Process OCR results through HTML with comprehensive indentation detection and intelligent text classification"""
43
 
44
  def __init__(self):
45
  self.indent_detector = EnhancedIndentationDetector()
 
46
 
47
  @staticmethod
48
- def create_html_from_azure_result(analysis_result) -> str:
49
- """Create structured HTML from Azure Document Intelligence result with enhanced indentation and text classification"""
50
  processor = EnhancedHTMLProcessor()
51
 
52
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
@@ -80,18 +81,44 @@ class EnhancedHTMLProcessor:
80
  letter-spacing: 1px;
81
  }
82
 
83
- /* Enhanced indentation levels */
 
 
 
 
 
 
 
 
 
 
 
 
84
  .indent-level-0 { margin-left: 0em; }
85
- .indent-level-1 { margin-left: 1.5em; }
86
- .indent-level-2 { margin-left: 3.0em; }
87
- .indent-level-3 { margin-left: 4.5em; }
88
- .indent-level-4 { margin-left: 6.0em; }
89
- .indent-level-5 { margin-left: 7.5em; }
90
- .indent-level-6 { margin-left: 9.0em; }
91
- .indent-level-7 { margin-left: 10.5em; }
92
- .indent-level-8 { margin-left: 12.0em; }
93
- .indent-level-9 { margin-left: 13.5em; }
94
- .indent-level-10 { margin-left: 15.0em; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  /* Text classification styles */
97
  .content-header {
@@ -315,6 +342,20 @@ class EnhancedHTMLProcessor:
315
  font-size: 9pt;
316
  margin-top: -5px;
317
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  ''')
319
  html_parts.append('</style></head><body>')
320
 
@@ -326,24 +367,54 @@ class EnhancedHTMLProcessor:
326
  html_parts.append(f'<div class="page">')
327
  html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
328
 
329
- # Process content with enhanced indentation detection and text classification
330
- content_items = processor._extract_page_content_enhanced(page, analysis_result, page_num)
 
 
 
 
 
 
 
 
 
 
331
  content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
332
 
333
- # Generate HTML for each content item with enhanced formatting and classification
334
  for item in content_items:
335
  if item['type'] == 'table':
336
  html_parts.append(processor._table_to_html(item['content'], item['table_idx']))
337
  else:
338
- html_parts.append(processor._text_to_html_enhanced(item))
339
 
340
  html_parts.append('</div>')
341
 
342
  html_parts.append('</body></html>')
343
  return '\n'.join(html_parts)
344
 
345
- def _extract_page_content_enhanced(self, page, analysis_result, page_num):
346
- """Extract page content with enhanced indentation detection and intelligent text classification"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  content_items = []
348
 
349
  # Handle tables (existing logic)
@@ -375,7 +446,7 @@ class EnhancedHTMLProcessor:
375
  'x_pos': x_pos
376
  })
377
 
378
- # Process text content with enhanced indentation detection and text classification
379
  if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
380
  page_paragraphs = [p for p in analysis_result.paragraphs if
381
  p.bounding_regions and
@@ -391,10 +462,23 @@ class EnhancedHTMLProcessor:
391
  y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
392
  x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
393
 
394
- # Enhanced indentation detection
395
- indent_info = self.indent_detector.detect_indentation(para.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
396
 
397
- # Intelligent text classification with context
398
  context = {
399
  'y_position': y_pos,
400
  'x_position': x_pos,
@@ -403,7 +487,9 @@ class EnhancedHTMLProcessor:
403
  'page_number': page_num
404
  }
405
 
406
- text_classification = self.indent_detector.classify_text_type(para.content, context)
 
 
407
 
408
  content_items.append({
409
  'type': 'paragraph',
@@ -413,18 +499,19 @@ class EnhancedHTMLProcessor:
413
  'x_pos': x_pos,
414
  'indent_info': indent_info,
415
  'text_classification': text_classification,
 
416
  'preserve_spacing': True
417
  })
418
 
419
  elif page.lines:
420
- # Process lines with enhanced indentation detection and classification
421
- processed_lines = self._process_lines_enhanced(page.lines, table_regions)
422
  content_items.extend(processed_lines)
423
 
424
  return content_items
425
 
426
- def _process_lines_enhanced(self, lines, table_regions):
427
- """Process lines with enhanced indentation detection and text classification"""
428
  content_items = []
429
  processed_content = set()
430
 
@@ -445,16 +532,31 @@ class EnhancedHTMLProcessor:
445
  y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
446
  x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
447
 
448
- # Enhanced indentation detection
449
- indent_info = self.indent_detector.detect_indentation(line.content)
 
 
 
 
 
450
 
451
- # Text classification with context
 
 
 
 
 
 
 
 
452
  context = {
453
  'y_position': y_pos,
454
  'x_position': x_pos
455
  }
456
 
457
- text_classification = self.indent_detector.classify_text_type(line.content, context)
 
 
458
 
459
  content_items.append({
460
  'type': 'line',
@@ -464,55 +566,76 @@ class EnhancedHTMLProcessor:
464
  'x_pos': x_pos,
465
  'indent_info': indent_info,
466
  'text_classification': text_classification,
 
467
  'preserve_spacing': True
468
  })
469
 
470
  return content_items
471
 
472
- def _text_to_html_enhanced(self, item):
473
- """Convert text item to HTML with enhanced indentation formatting and intelligent classification"""
474
  content = item['content']
475
  role = item.get('role', 'paragraph')
476
  indent_info = item.get('indent_info', {})
477
  text_classification = item.get('text_classification', {})
 
478
  preserve_spacing = item.get('preserve_spacing', False)
479
 
480
- # Build CSS classes based on indentation info and text classification
481
  css_classes = ['paragraph']
482
 
 
 
 
 
 
 
483
  # Add text classification class
484
  if text_classification.get('type'):
485
  css_classes.append(f"content-{text_classification['type']}")
486
 
487
- # Add indentation level class
488
- level = indent_info.get('level', 0)
489
- css_classes.append(f'indent-level-{min(level, 10)}')
 
490
 
491
- # Add pattern-specific formatting
492
- formatting_hint = indent_info.get('formatting_hint', 'normal_text')
493
- if formatting_hint != 'normal_text':
494
- css_classes.append(formatting_hint)
 
495
 
496
- # Add space indent class if needed
497
- if indent_info.get('pattern_type') == 'space_indent':
498
  css_classes.append('space-indent')
499
 
 
 
 
 
 
500
  # Preserve internal spacing
501
  if preserve_spacing:
502
  content = re.sub(r' +', lambda m: '&nbsp;' * len(m.group()), content)
503
  content = content.replace('\n', '<br>')
504
 
505
- # Add pattern marker if needed (but not for bullets as CSS handles them)
506
  pattern_marker = indent_info.get('pattern_marker', '')
507
- if pattern_marker and not indent_info.get('is_bullet', False):
 
 
508
  # For numbered/lettered items, include the marker
509
  content = f"{pattern_marker} {content}"
510
 
511
- # Build final HTML with enhanced classification
512
  class_str = f' class="{" ".join(css_classes)}"'
513
 
514
- # Use text classification to determine HTML structure
515
- if text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.6:
 
 
 
 
516
  return f'<div class="content-header"{class_str}>{content}</div>'
517
  elif role == 'title':
518
  return f'<div class="title"{class_str}>{content}</div>'
@@ -662,10 +785,10 @@ class EnhancedHTMLProcessor:
662
 
663
  @staticmethod
664
  def html_to_formatted_text_enhanced(html_content):
665
- """Convert HTML back to formatted text with enhanced indentation preservation and text classification"""
666
  from html.parser import HTMLParser
667
 
668
- class EnhancedTextExtractor(HTMLParser):
669
  def __init__(self):
670
  super().__init__()
671
  self.text_parts = []
@@ -680,13 +803,17 @@ class EnhancedHTMLProcessor:
680
  self.in_page_header = False
681
  self.current_classes = []
682
  self.in_content_header = False
 
683
 
684
  def handle_starttag(self, tag, attrs):
685
  attr_dict = dict(attrs)
686
  class_attr = attr_dict.get('class', '')
687
  self.current_classes = class_attr.split()
688
 
689
- if 'page-header' in class_attr:
 
 
 
690
  self.in_page_header = True
691
  if len(self.text_parts) > 0:
692
  self.text_parts.append('\n\n' + '=' * 80 + '\n')
@@ -704,16 +831,19 @@ class EnhancedHTMLProcessor:
704
  elif tag == 'br':
705
  self.text_parts.append('\n')
706
 
707
- # Extract indent level from class
708
- for cls in self.current_classes:
709
- if cls.startswith('indent-level-'):
710
- try:
711
- self.current_indent_level = int(cls.split('-')[-1])
712
- except ValueError:
713
- self.current_indent_level = 0
714
- break
 
 
 
715
  else:
716
- self.current_indent_level = 0
717
 
718
  # Extract formatting hint
719
  formatting_hints = [
@@ -734,7 +864,10 @@ class EnhancedHTMLProcessor:
734
  self.current_formatting_hint = 'normal_text'
735
 
736
  def handle_endtag(self, tag):
737
- if tag == 'div' and self.in_page_header:
 
 
 
738
  self.text_parts.append('\n' + '=' * 80 + '\n\n')
739
  self.in_page_header = False
740
  elif tag == 'div' and self.in_content_header:
@@ -752,7 +885,7 @@ class EnhancedHTMLProcessor:
752
  elif tag == 'tr' and self.current_table_row:
753
  self.table_data.append(self.current_table_row[:])
754
  elif tag == 'div' and not self.in_table:
755
- if not self.in_title and not self.in_section_heading and not self.in_page_header and not self.in_content_header:
756
  self.text_parts.append('\n')
757
 
758
  # Reset state
@@ -771,22 +904,25 @@ class EnhancedHTMLProcessor:
771
  page_num = int(page_match.group(1))
772
  page_header = f"PAGE {page_num}"
773
  self.text_parts.append(page_header.center(80))
 
 
 
774
  elif self.in_content_header:
775
- indent_str = " " * self.current_indent_level
776
  self.text_parts.append(f'\n{indent_str}# {data.strip()}')
777
  elif self.in_title:
778
- indent_str = " " * self.current_indent_level
779
  self.text_parts.append(f'\n{indent_str}## {data.strip()}')
780
  elif self.in_section_heading:
781
- indent_str = " " * self.current_indent_level
782
  self.text_parts.append(f'\n{indent_str}### {data.strip()}')
783
  elif self.in_table:
784
  self.current_table_row.append(data.strip())
785
  else:
786
- # Apply enhanced indentation formatting
787
- indent_str = " " * self.current_indent_level
788
 
789
- # Handle different formatting hints including parenthetical
790
  if 'bullet' in self.current_formatting_hint:
791
  # Use appropriate bullet symbol based on level
792
  if 'primary' in self.current_formatting_hint:
@@ -807,11 +943,11 @@ class EnhancedHTMLProcessor:
807
  self.text_parts.append(f'{indent_str}{data.strip()}')
808
 
809
  elif 'space-indent' in self.current_formatting_hint:
810
- # Simple indented text
811
  self.text_parts.append(f'{indent_str}{data.strip()}')
812
 
813
  else:
814
- # Regular text with indentation
815
  self.text_parts.append(f'{indent_str}{data.strip()}')
816
 
817
  def _format_table(self):
@@ -855,7 +991,7 @@ class EnhancedHTMLProcessor:
855
 
856
  self.text_parts.append('\n')
857
 
858
- extractor = EnhancedTextExtractor()
859
  extractor.feed(html_content)
860
 
861
  result = ''.join(extractor.text_parts)
@@ -870,7 +1006,7 @@ class EnhancedHTMLProcessor:
870
 
871
 
872
  class OCRService:
873
- """Main OCR service with enhanced HTML processing, comprehensive indentation detection, and intelligent text classification"""
874
 
875
  def __init__(self):
876
  self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
@@ -892,14 +1028,14 @@ class OCRService:
892
 
893
  def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
894
  """
895
- Convert PDF to text using specified method with enhanced HTML processing and intelligent text classification
896
 
897
  Args:
898
  pdf_path: Path to the PDF file
899
  method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
900
 
901
  Returns:
902
- Dict containing text content, HTML, metadata, and processing info
903
  """
904
  result = {
905
  'success': False,
@@ -926,11 +1062,11 @@ class OCRService:
926
  # Try primary method
927
  try:
928
  if method == "azure" and self.azure_client:
929
- result = self._azure_ocr_with_enhanced_html(pdf_path)
930
  elif method == "tesseract":
931
- result = self._tesseract_ocr(pdf_path)
932
  elif method == "pymupdf":
933
- result = self._pymupdf_extract(pdf_path)
934
  else:
935
  result['error'] = f"Method '{method}' not available or not configured"
936
 
@@ -945,18 +1081,57 @@ class OCRService:
945
 
946
  return result
947
 
948
- def _azure_ocr_with_enhanced_html(self, pdf_path: str) -> Dict[str, Any]:
949
- """Azure Document Intelligence OCR with enhanced HTML processing, indentation detection, and intelligent text classification"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
  result = {
951
  'success': False,
952
  'text': '',
953
  'html': '',
954
- 'method_used': 'azure_document_intelligence_enhanced_v2',
955
  'metadata': {},
956
  'error': None
957
  }
958
 
959
  try:
 
 
 
960
  with open(pdf_path, 'rb') as pdf_file:
961
  file_content = pdf_file.read()
962
 
@@ -982,16 +1157,32 @@ class OCRService:
982
 
983
  analysis_result = poller.result()
984
 
985
- # Generate HTML with enhanced indentation processing and text classification
986
- html_content = EnhancedHTMLProcessor.create_html_from_azure_result(analysis_result)
 
 
987
 
988
- # Convert HTML to formatted text with enhanced indentation preservation and classification
989
  formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
990
 
991
- # Analyze document structure with text classification
992
  detector = EnhancedIndentationDetector()
993
  text_lines = formatted_text.split('\n')
994
- document_analysis = detector.analyze_document_structure(text_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
995
 
996
  result.update({
997
  'success': True,
@@ -1003,31 +1194,36 @@ class OCRService:
1003
  'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
1004
  'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
1005
  'html_generated': True,
 
 
 
1006
  'enhanced_indentation': True,
1007
  'intelligent_text_classification': True,
1008
  'parenthetical_patterns_supported': True,
1009
  'page_numbers_added': True,
1010
  'comprehensive_formatting': True,
1011
  'azure_analysis': analysis_result,
1012
- 'document_structure_analysis': document_analysis
 
1013
  }
1014
  })
1015
 
1016
- logger.info("Azure OCR with enhanced indentation processing and intelligent text classification completed successfully")
 
1017
 
1018
  except Exception as e:
1019
- logger.error(f"Azure OCR error: {e}")
1020
- result['error'] = f"Azure OCR error: {e}"
1021
 
1022
  return result
1023
 
1024
- def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
1025
- """Tesseract OCR with enhanced HTML generation, indentation detection, and text classification"""
1026
  result = {
1027
  'success': False,
1028
  'text': '',
1029
  'html': '',
1030
- 'method_used': 'tesseract_enhanced_v2',
1031
  'metadata': {},
1032
  'error': None
1033
  }
@@ -1047,6 +1243,7 @@ class OCRService:
1047
  .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
1048
  .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
1049
  .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
 
1050
  .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
1051
  .content-paragraph { margin-bottom: 1em; }
1052
  .content-list-item { margin-bottom: 0.5em; }
@@ -1054,6 +1251,7 @@ class OCRService:
1054
  html_parts.append('</style></head><body>')
1055
 
1056
  indent_detector = EnhancedIndentationDetector()
 
1057
 
1058
  for page_num in range(page_count):
1059
  # Add page header to text
@@ -1068,11 +1266,15 @@ class OCRService:
1068
  img_data = pix.tobytes("png")
1069
 
1070
  temp_img_path = None
 
1071
  try:
1072
  with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
1073
  temp_img.write(img_data)
1074
  temp_img_path = temp_img.name
1075
 
 
 
 
1076
  processed_img = self._preprocess_image(temp_img_path)
1077
 
1078
  custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
@@ -1080,23 +1282,55 @@ class OCRService:
1080
 
1081
  all_text.append(text)
1082
 
1083
- # Add to HTML with enhanced indentation processing and text classification
 
 
 
 
1084
  html_parts.append(f'<div class="page">')
1085
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1086
 
1087
- # Process each line for indentation and classification
1088
  lines = text.split('\n')
1089
  for line in lines:
1090
  if line.strip():
1091
- indent_info = indent_detector.detect_indentation(line)
1092
- text_classification = indent_detector.classify_text_type(line)
 
 
 
 
 
1093
 
1094
- level = indent_info.get('level', 0)
1095
- formatting_hint = indent_info.get('formatting_hint', 'normal_text')
 
 
 
 
 
1096
 
1097
- css_classes = [f'indent-level-{min(level, 10)}']
1098
- if formatting_hint != 'normal_text':
1099
- css_classes.append(formatting_hint)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1100
 
1101
  # Add text classification class
1102
  if text_classification.get('type'):
@@ -1105,10 +1339,11 @@ class OCRService:
1105
  class_str = f' class="paragraph {" ".join(css_classes)}"'
1106
  content = indent_info.get('content', line.strip())
1107
 
1108
- # Add marker for non-bullet items
1109
- marker = indent_info.get('pattern_marker', '')
1110
- if marker and not indent_info.get('is_bullet', False):
1111
- content = f"{marker} {content}"
 
1112
 
1113
  html_parts.append(f'<div{class_str}>{content}</div>')
1114
  else:
@@ -1136,6 +1371,9 @@ class OCRService:
1136
  'metadata': {
1137
  'pages': page_count,
1138
  'html_generated': True,
 
 
 
1139
  'enhanced_indentation': True,
1140
  'intelligent_text_classification': True,
1141
  'parenthetical_patterns_supported': True,
@@ -1144,11 +1382,11 @@ class OCRService:
1144
  }
1145
  })
1146
 
1147
- logger.info("Tesseract OCR with enhanced indentation processing and text classification completed successfully")
1148
 
1149
  except Exception as e:
1150
- logger.error(f"Tesseract OCR error: {e}")
1151
- result['error'] = f"Tesseract OCR error: {e}"
1152
  finally:
1153
  if pdf_document is not None:
1154
  try:
@@ -1158,13 +1396,13 @@ class OCRService:
1158
 
1159
  return result
1160
 
1161
- def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
1162
- """PyMuPDF text extraction with enhanced HTML generation, indentation detection, and text classification"""
1163
  result = {
1164
  'success': False,
1165
  'text': '',
1166
  'html': '',
1167
- 'method_used': 'pymupdf_enhanced_v2',
1168
  'metadata': {},
1169
  'error': None
1170
  }
@@ -1180,6 +1418,7 @@ class OCRService:
1180
  .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
1181
  .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
1182
  .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
 
1183
  .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
1184
  .content-paragraph { margin-bottom: 1em; }
1185
  .content-list-item { margin-bottom: 0.5em; }
@@ -1187,6 +1426,7 @@ class OCRService:
1187
  html_parts.append('</style></head><body>')
1188
 
1189
  indent_detector = EnhancedIndentationDetector()
 
1190
 
1191
  for page_num in range(page_count):
1192
  # Add page header to text
@@ -1195,26 +1435,69 @@ class OCRService:
1195
 
1196
  page = pdf_document.load_page(page_num)
1197
  text = page.get_text()
1198
-
1199
  all_text.append(text)
1200
 
1201
- # Add to HTML with enhanced indentation processing and text classification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1202
  html_parts.append(f'<div class="page">')
1203
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1204
 
1205
- # Process each line for indentation and classification
1206
  lines = text.split('\n')
1207
  for line in lines:
1208
  if line.strip():
1209
- indent_info = indent_detector.detect_indentation(line)
1210
- text_classification = indent_detector.classify_text_type(line)
 
 
 
 
 
1211
 
1212
- level = indent_info.get('level', 0)
1213
- formatting_hint = indent_info.get('formatting_hint', 'normal_text')
 
 
 
 
 
1214
 
1215
- css_classes = [f'indent-level-{min(level, 10)}']
1216
- if formatting_hint != 'normal_text':
1217
- css_classes.append(formatting_hint)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1218
 
1219
  # Add text classification class
1220
  if text_classification.get('type'):
@@ -1223,10 +1506,11 @@ class OCRService:
1223
  class_str = f' class="paragraph {" ".join(css_classes)}"'
1224
  content = indent_info.get('content', line.strip())
1225
 
1226
- # Add marker for non-bullet items
1227
- marker = indent_info.get('pattern_marker', '')
1228
- if marker and not indent_info.get('is_bullet', False):
1229
- content = f"{marker} {content}"
 
1230
 
1231
  html_parts.append(f'<div{class_str}>{content}</div>')
1232
  else:
@@ -1247,6 +1531,9 @@ class OCRService:
1247
  'metadata': {
1248
  'pages': page_count,
1249
  'html_generated': True,
 
 
 
1250
  'enhanced_indentation': True,
1251
  'intelligent_text_classification': True,
1252
  'parenthetical_patterns_supported': True,
@@ -1255,11 +1542,11 @@ class OCRService:
1255
  }
1256
  })
1257
 
1258
- logger.info("PyMuPDF extraction with enhanced indentation processing and text classification completed successfully")
1259
 
1260
  except Exception as e:
1261
- logger.error(f"PyMuPDF error: {e}")
1262
- result['error'] = f"PyMuPDF error: {e}"
1263
  finally:
1264
  if pdf_document is not None:
1265
  try:
@@ -1292,11 +1579,11 @@ class OCRService:
1292
  logger.info(f"Trying fallback method: {method}")
1293
  try:
1294
  if method == "azure":
1295
- result = self._azure_ocr_with_enhanced_html(pdf_path)
1296
  elif method == "tesseract":
1297
- result = self._tesseract_ocr(pdf_path)
1298
  elif method == "pymupdf":
1299
- result = self._pymupdf_extract(pdf_path)
1300
 
1301
  if result['success']:
1302
  result['method_used'] += '_fallback'
 
1
  """
2
+ OCR Service Module - ENHANCED VERSION with OpenCV Text Block Analysis and Bold Detection
3
+ Handles PDF to text conversion with OpenCV-based spacing analysis, bold text detection, and improved formatting
4
  """
5
  import re
6
  import os
 
8
  from typing import Optional, Dict, Any, Tuple, List
9
  import tempfile
10
  from pathlib import Path
11
+ import cv2
12
+ import numpy as np
13
 
14
  # Load environment variables
15
  from dotenv import load_dotenv
 
24
  try:
25
  import pytesseract
26
  from PIL import Image
 
 
27
  TESSERACT_AVAILABLE = True
28
  except ImportError:
29
  TESSERACT_AVAILABLE = False
30
 
31
  import fitz # PyMuPDF
32
 
33
+ # Enhanced indentation detection with OpenCV
34
+ from enhanced_indentation import EnhancedIndentationDetector, OpenCVTextAnalyzer
35
 
36
  # Configure logging
37
  logging.basicConfig(level=logging.INFO)
 
39
 
40
 
41
  class EnhancedHTMLProcessor:
42
+ """Process OCR results through HTML with OpenCV-enhanced text block analysis and bold detection"""
43
 
44
  def __init__(self):
45
  self.indent_detector = EnhancedIndentationDetector()
46
+ self.opencv_analyzer = OpenCVTextAnalyzer()
47
 
48
  @staticmethod
49
+ def create_html_from_azure_result(analysis_result, page_images=None) -> str:
50
+ """Create structured HTML from Azure Document Intelligence result with OpenCV enhancement"""
51
  processor = EnhancedHTMLProcessor()
52
 
53
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
 
81
  letter-spacing: 1px;
82
  }
83
 
84
+ /* OpenCV-enhanced bold headers */
85
+ .opencv-bold-header {
86
+ font-weight: bold;
87
+ color: #2c3e50;
88
+ font-size: 1.3em;
89
+ margin: 20px 0 15px 0;
90
+ border-left: 4px solid #e74c3c;
91
+ padding-left: 12px;
92
+ background-color: #fdf2f2;
93
+ line-height: 1.4;
94
+ }
95
+
96
+ /* Enhanced indentation levels - 4 spaces per level system */
97
  .indent-level-0 { margin-left: 0em; }
98
+ .indent-level-1 { margin-left: 1.0em; } /* 4 spaces */
99
+ .indent-level-2 { margin-left: 2.0em; } /* 8 spaces */
100
+ .indent-level-3 { margin-left: 3.0em; } /* 12 spaces */
101
+ .indent-level-4 { margin-left: 4.0em; } /* 16 spaces */
102
+ .indent-level-5 { margin-left: 5.0em; } /* 20 spaces */
103
+ .indent-level-6 { margin-left: 6.0em; } /* 24 spaces */
104
+ .indent-level-7 { margin-left: 7.0em; } /* 28 spaces */
105
+ .indent-level-8 { margin-left: 8.0em; } /* 32 spaces */
106
+ .indent-level-9 { margin-left: 9.0em; } /* 36 spaces */
107
+ .indent-level-10 { margin-left: 10.0em; } /* 40 spaces */
108
+
109
+ /* OpenCV-detected headers have no indentation */
110
+ .opencv-bold-header.indent-level-1,
111
+ .opencv-bold-header.indent-level-2,
112
+ .opencv-bold-header.indent-level-3,
113
+ .opencv-bold-header.indent-level-4,
114
+ .opencv-bold-header.indent-level-5,
115
+ .opencv-bold-header.indent-level-6,
116
+ .opencv-bold-header.indent-level-7,
117
+ .opencv-bold-header.indent-level-8,
118
+ .opencv-bold-header.indent-level-9,
119
+ .opencv-bold-header.indent-level-10 {
120
+ margin-left: 0em !important;
121
+ }
122
 
123
  /* Text classification styles */
124
  .content-header {
 
342
  font-size: 9pt;
343
  margin-top: -5px;
344
  }
345
+
346
+ /* OpenCV block analysis indicators */
347
+ .opencv-paragraph-block {
348
+ border-left: 2px solid #27ae60;
349
+ padding-left: 8px;
350
+ margin: 10px 0;
351
+ }
352
+
353
+ .opencv-text-block {
354
+ background-color: #f8f9fa;
355
+ border-radius: 3px;
356
+ padding: 5px;
357
+ margin: 5px 0;
358
+ }
359
  ''')
360
  html_parts.append('</style></head><body>')
361
 
 
367
  html_parts.append(f'<div class="page">')
368
  html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
369
 
370
+ # Get OpenCV analysis for this page if available
371
+ opencv_analysis = None
372
+ if page_images and page_num in page_images:
373
+ page_text_lines = processor._extract_page_text_lines(page, analysis_result, page_num)
374
+ opencv_analysis = processor.opencv_analyzer.analyze_text_blocks(
375
+ page_images[page_num], page_text_lines
376
+ )
377
+
378
+ # Process content with OpenCV-enhanced indentation detection and text classification
379
+ content_items = processor._extract_page_content_enhanced(
380
+ page, analysis_result, page_num, opencv_analysis
381
+ )
382
  content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
383
 
384
+ # Generate HTML for each content item with OpenCV enhancement
385
  for item in content_items:
386
  if item['type'] == 'table':
387
  html_parts.append(processor._table_to_html(item['content'], item['table_idx']))
388
  else:
389
+ html_parts.append(processor._text_to_html_opencv_enhanced(item))
390
 
391
  html_parts.append('</div>')
392
 
393
  html_parts.append('</body></html>')
394
  return '\n'.join(html_parts)
395
 
396
+ def _extract_page_text_lines(self, page, analysis_result, page_num):
397
+ """Extract text lines for OpenCV correlation"""
398
+ text_lines = []
399
+
400
+ if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
401
+ page_paragraphs = [p for p in analysis_result.paragraphs if
402
+ p.bounding_regions and
403
+ p.bounding_regions[0].page_number == page_num]
404
+
405
+ for para in page_paragraphs:
406
+ if para.content.strip():
407
+ text_lines.append(para.content.strip())
408
+
409
+ elif page.lines:
410
+ for line in page.lines:
411
+ if line.content.strip():
412
+ text_lines.append(line.content.strip())
413
+
414
+ return text_lines
415
+
416
+ def _extract_page_content_enhanced(self, page, analysis_result, page_num, opencv_analysis=None):
417
+ """Extract page content with OpenCV-enhanced text block analysis and bold detection"""
418
  content_items = []
419
 
420
  # Handle tables (existing logic)
 
446
  'x_pos': x_pos
447
  })
448
 
449
+ # Process text content with OpenCV-enhanced analysis
450
  if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
451
  page_paragraphs = [p for p in analysis_result.paragraphs if
452
  p.bounding_regions and
 
462
  y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
463
  x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
464
 
465
+ # Find corresponding OpenCV analysis
466
+ opencv_line_mapping = None
467
+ if opencv_analysis and opencv_analysis.get('success') and 'line_mappings' in opencv_analysis:
468
+ for mapping in opencv_analysis['line_mappings']:
469
+ if mapping.get('text', '').strip() == para.content.strip():
470
+ opencv_line_mapping = mapping
471
+ break
472
+
473
+ # Enhanced indentation detection with OpenCV
474
+ if opencv_line_mapping:
475
+ indent_info = self.indent_detector.detect_indentation_with_opencv(
476
+ para.content, opencv_analysis, opencv_line_mapping
477
+ )
478
+ else:
479
+ indent_info = self.indent_detector.detect_indentation(para.content)
480
 
481
+ # Intelligent text classification with OpenCV context
482
  context = {
483
  'y_position': y_pos,
484
  'x_position': x_pos,
 
487
  'page_number': page_num
488
  }
489
 
490
+ text_classification = self.indent_detector.classify_text_type(
491
+ para.content, context, opencv_analysis
492
+ )
493
 
494
  content_items.append({
495
  'type': 'paragraph',
 
499
  'x_pos': x_pos,
500
  'indent_info': indent_info,
501
  'text_classification': text_classification,
502
+ 'opencv_analysis': opencv_line_mapping,
503
  'preserve_spacing': True
504
  })
505
 
506
  elif page.lines:
507
+ # Process lines with OpenCV-enhanced analysis
508
+ processed_lines = self._process_lines_opencv_enhanced(page.lines, table_regions, opencv_analysis)
509
  content_items.extend(processed_lines)
510
 
511
  return content_items
512
 
513
+ def _process_lines_opencv_enhanced(self, lines, table_regions, opencv_analysis=None):
514
+ """Process lines with OpenCV-enhanced text block analysis and bold detection"""
515
  content_items = []
516
  processed_content = set()
517
 
 
532
  y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
533
  x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
534
 
535
+ # Find corresponding OpenCV analysis
536
+ opencv_line_mapping = None
537
+ if opencv_analysis and opencv_analysis.get('success') and 'line_mappings' in opencv_analysis:
538
+ for mapping in opencv_analysis['line_mappings']:
539
+ if mapping.get('text', '').strip() == line.content.strip():
540
+ opencv_line_mapping = mapping
541
+ break
542
 
543
+ # Enhanced indentation detection with OpenCV
544
+ if opencv_line_mapping:
545
+ indent_info = self.indent_detector.detect_indentation_with_opencv(
546
+ line.content, opencv_analysis, opencv_line_mapping
547
+ )
548
+ else:
549
+ indent_info = self.indent_detector.detect_indentation(line.content)
550
+
551
+ # Text classification with OpenCV context
552
  context = {
553
  'y_position': y_pos,
554
  'x_position': x_pos
555
  }
556
 
557
+ text_classification = self.indent_detector.classify_text_type(
558
+ line.content, context, opencv_analysis
559
+ )
560
 
561
  content_items.append({
562
  'type': 'line',
 
566
  'x_pos': x_pos,
567
  'indent_info': indent_info,
568
  'text_classification': text_classification,
569
+ 'opencv_analysis': opencv_line_mapping,
570
  'preserve_spacing': True
571
  })
572
 
573
  return content_items
574
 
575
+ def _text_to_html_opencv_enhanced(self, item):
576
+ """Convert text item to HTML with OpenCV-enhanced formatting and bold detection"""
577
  content = item['content']
578
  role = item.get('role', 'paragraph')
579
  indent_info = item.get('indent_info', {})
580
  text_classification = item.get('text_classification', {})
581
+ opencv_analysis = item.get('opencv_analysis', {})
582
  preserve_spacing = item.get('preserve_spacing', False)
583
 
584
+ # Build CSS classes based on indentation info, text classification, and OpenCV
585
  css_classes = ['paragraph']
586
 
587
+ # Check if OpenCV detected this as a bold header
588
+ is_opencv_bold_header = False
589
+ if opencv_analysis and opencv_analysis.get('is_bold') and opencv_analysis.get('is_likely_header'):
590
+ is_opencv_bold_header = True
591
+ css_classes.append('opencv-bold-header')
592
+
593
  # Add text classification class
594
  if text_classification.get('type'):
595
  css_classes.append(f"content-{text_classification['type']}")
596
 
597
+ # Add indentation level class ONLY if not a bold header
598
+ if not is_opencv_bold_header and not indent_info.get('suppress_indentation', False):
599
+ level = indent_info.get('level', 0)
600
+ css_classes.append(f'indent-level-{min(level, 10)}')
601
 
602
+ # Add pattern-specific formatting ONLY if not a bold header
603
+ if not is_opencv_bold_header:
604
+ formatting_hint = indent_info.get('formatting_hint', 'normal_text')
605
+ if formatting_hint != 'normal_text':
606
+ css_classes.append(formatting_hint)
607
 
608
+ # Add space indent class if needed and not a bold header
609
+ if not is_opencv_bold_header and indent_info.get('pattern_type') == 'space_indent':
610
  css_classes.append('space-indent')
611
 
612
+ # Add OpenCV analysis indicators
613
+ if opencv_analysis:
614
+ if opencv_analysis.get('is_bold'):
615
+ css_classes.append('opencv-text-block')
616
+
617
  # Preserve internal spacing
618
  if preserve_spacing:
619
  content = re.sub(r' +', lambda m: '&nbsp;' * len(m.group()), content)
620
  content = content.replace('\n', '<br>')
621
 
622
+ # Add pattern marker if needed (but not for bullets or bold headers)
623
  pattern_marker = indent_info.get('pattern_marker', '')
624
+ if (pattern_marker and
625
+ not indent_info.get('is_bullet', False) and
626
+ not is_opencv_bold_header):
627
  # For numbered/lettered items, include the marker
628
  content = f"{pattern_marker} {content}"
629
 
630
+ # Build final HTML with OpenCV enhancement
631
  class_str = f' class="{" ".join(css_classes)}"'
632
 
633
+ # Use OpenCV and text classification to determine HTML structure
634
+ if is_opencv_bold_header:
635
+ return f'<div class="opencv-bold-header"{class_str}>{content}</div>'
636
+ elif (text_classification.get('is_header') and
637
+ text_classification.get('confidence', 0) > 0.6 and
638
+ not is_opencv_bold_header):
639
  return f'<div class="content-header"{class_str}>{content}</div>'
640
  elif role == 'title':
641
  return f'<div class="title"{class_str}>{content}</div>'
 
785
 
786
  @staticmethod
787
  def html_to_formatted_text_enhanced(html_content):
788
+ """Convert HTML back to formatted text with OpenCV-enhanced preservation"""
789
  from html.parser import HTMLParser
790
 
791
+ class OpenCVEnhancedTextExtractor(HTMLParser):
792
  def __init__(self):
793
  super().__init__()
794
  self.text_parts = []
 
803
  self.in_page_header = False
804
  self.current_classes = []
805
  self.in_content_header = False
806
+ self.in_opencv_bold_header = False
807
 
808
  def handle_starttag(self, tag, attrs):
809
  attr_dict = dict(attrs)
810
  class_attr = attr_dict.get('class', '')
811
  self.current_classes = class_attr.split()
812
 
813
+ if 'opencv-bold-header' in class_attr:
814
+ self.in_opencv_bold_header = True
815
+ # Bold headers get special treatment - no indentation
816
+ elif 'page-header' in class_attr:
817
  self.in_page_header = True
818
  if len(self.text_parts) > 0:
819
  self.text_parts.append('\n\n' + '=' * 80 + '\n')
 
831
  elif tag == 'br':
832
  self.text_parts.append('\n')
833
 
834
+ # Extract indent level from class ONLY if not OpenCV bold header
835
+ if not self.in_opencv_bold_header:
836
+ for cls in self.current_classes:
837
+ if cls.startswith('indent-level-'):
838
+ try:
839
+ self.current_indent_level = int(cls.split('-')[-1])
840
+ except ValueError:
841
+ self.current_indent_level = 0
842
+ break
843
+ else:
844
+ self.current_indent_level = 0
845
  else:
846
+ self.current_indent_level = 0 # Force no indentation for bold headers
847
 
848
  # Extract formatting hint
849
  formatting_hints = [
 
864
  self.current_formatting_hint = 'normal_text'
865
 
866
  def handle_endtag(self, tag):
867
+ if tag == 'div' and self.in_opencv_bold_header:
868
+ self.text_parts.append('\n\n')
869
+ self.in_opencv_bold_header = False
870
+ elif tag == 'div' and self.in_page_header:
871
  self.text_parts.append('\n' + '=' * 80 + '\n\n')
872
  self.in_page_header = False
873
  elif tag == 'div' and self.in_content_header:
 
885
  elif tag == 'tr' and self.current_table_row:
886
  self.table_data.append(self.current_table_row[:])
887
  elif tag == 'div' and not self.in_table:
888
+ if not self.in_title and not self.in_section_heading and not self.in_page_header and not self.in_content_header and not self.in_opencv_bold_header:
889
  self.text_parts.append('\n')
890
 
891
  # Reset state
 
904
  page_num = int(page_match.group(1))
905
  page_header = f"PAGE {page_num}"
906
  self.text_parts.append(page_header.center(80))
907
+ elif self.in_opencv_bold_header:
908
+ # OpenCV detected bold headers - no indentation, special formatting
909
+ self.text_parts.append(f'\n## {data.strip().upper()}')
910
  elif self.in_content_header:
911
+ indent_str = " " * self.current_indent_level # 4 spaces per level
912
  self.text_parts.append(f'\n{indent_str}# {data.strip()}')
913
  elif self.in_title:
914
+ indent_str = " " * self.current_indent_level # 4 spaces per level
915
  self.text_parts.append(f'\n{indent_str}## {data.strip()}')
916
  elif self.in_section_heading:
917
+ indent_str = " " * self.current_indent_level # 4 spaces per level
918
  self.text_parts.append(f'\n{indent_str}### {data.strip()}')
919
  elif self.in_table:
920
  self.current_table_row.append(data.strip())
921
  else:
922
+ # Apply OpenCV-enhanced indentation formatting using 4 spaces per level
923
+ indent_str = " " * self.current_indent_level # 4 spaces per level
924
 
925
+ # Handle different formatting hints including parenthetical using 4 spaces
926
  if 'bullet' in self.current_formatting_hint:
927
  # Use appropriate bullet symbol based on level
928
  if 'primary' in self.current_formatting_hint:
 
943
  self.text_parts.append(f'{indent_str}{data.strip()}')
944
 
945
  elif 'space-indent' in self.current_formatting_hint:
946
+ # Simple indented text using 4 spaces
947
  self.text_parts.append(f'{indent_str}{data.strip()}')
948
 
949
  else:
950
+ # Regular text with indentation using 4 spaces
951
  self.text_parts.append(f'{indent_str}{data.strip()}')
952
 
953
  def _format_table(self):
 
991
 
992
  self.text_parts.append('\n')
993
 
994
+ extractor = OpenCVEnhancedTextExtractor()
995
  extractor.feed(html_content)
996
 
997
  result = ''.join(extractor.text_parts)
 
1006
 
1007
 
1008
  class OCRService:
1009
+ """Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
1010
 
1011
  def __init__(self):
1012
  self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
 
1028
 
1029
  def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
1030
  """
1031
+ Convert PDF to text using specified method with OpenCV-enhanced processing
1032
 
1033
  Args:
1034
  pdf_path: Path to the PDF file
1035
  method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
1036
 
1037
  Returns:
1038
+ Dict containing text content, HTML, metadata, and OpenCV analysis
1039
  """
1040
  result = {
1041
  'success': False,
 
1062
  # Try primary method
1063
  try:
1064
  if method == "azure" and self.azure_client:
1065
+ result = self._azure_ocr_with_opencv_enhancement(pdf_path)
1066
  elif method == "tesseract":
1067
+ result = self._tesseract_ocr_with_opencv(pdf_path)
1068
  elif method == "pymupdf":
1069
+ result = self._pymupdf_extract_with_opencv(pdf_path)
1070
  else:
1071
  result['error'] = f"Method '{method}' not available or not configured"
1072
 
 
1081
 
1082
  return result
1083
 
1084
def _extract_page_images_from_pdf(self, pdf_path: str) -> Dict[int, np.ndarray]:
    """Render every page of a PDF into an image array for OpenCV analysis.

    Each page is rasterized with PyMuPDF at 2x zoom, decoded through
    Pillow into a NumPy array and, when it has colour channels, reordered
    to BGR (the channel order OpenCV functions expect).

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        Dict mapping 1-based page numbers to image arrays. Extraction is
        best-effort: if anything fails the error is logged and the pages
        rendered so far (possibly none) are returned.
    """
    # Imported once per call instead of once per page.
    import io
    from PIL import Image

    page_images = {}
    pdf_document = None

    try:
        pdf_document = fitz.open(pdf_path)

        # The zoom matrix is loop-invariant; 2x gives a higher-resolution
        # raster for the downstream analysis.
        zoom = fitz.Matrix(2.0, 2.0)

        for page_index in range(len(pdf_document)):
            page = pdf_document.load_page(page_index)
            pix = page.get_pixmap(matrix=zoom)

            # Round-trip through PNG so Pillow decides the pixel layout.
            pil_image = Image.open(io.BytesIO(pix.tobytes("png")))
            img_array = np.array(pil_image)

            # Colour images are reordered for OpenCV; 2-D (single-channel)
            # arrays are passed through untouched.
            if img_array.ndim == 3:
                if img_array.shape[2] == 4:
                    # RGB2BGR rejects 4-channel input, which previously
                    # aborted extraction for all remaining pages.
                    img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
                else:
                    img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

            # Keys are 1-based page numbers.
            page_images[page_index + 1] = img_array

    except Exception as e:
        # Deliberate best-effort: report the problem and return what we have.
        logger.error(f"Error extracting page images: {e}")
    finally:
        # Compare against None explicitly: the document object's truthiness
        # is not a reliable "was opened" signal.
        if pdf_document is not None:
            pdf_document.close()

    return page_images
1119
+
1120
+ def _azure_ocr_with_opencv_enhancement(self, pdf_path: str) -> Dict[str, Any]:
1121
+ """Azure Document Intelligence OCR with OpenCV-enhanced text analysis and bold detection"""
1122
  result = {
1123
  'success': False,
1124
  'text': '',
1125
  'html': '',
1126
+ 'method_used': 'azure_document_intelligence_opencv_enhanced',
1127
  'metadata': {},
1128
  'error': None
1129
  }
1130
 
1131
  try:
1132
+ # Extract page images for OpenCV analysis
1133
+ page_images = self._extract_page_images_from_pdf(pdf_path)
1134
+
1135
  with open(pdf_path, 'rb') as pdf_file:
1136
  file_content = pdf_file.read()
1137
 
 
1157
 
1158
  analysis_result = poller.result()
1159
 
1160
+ # Generate HTML with OpenCV-enhanced processing
1161
+ html_content = EnhancedHTMLProcessor.create_html_from_azure_result(
1162
+ analysis_result, page_images
1163
+ )
1164
 
1165
+ # Convert HTML to formatted text with OpenCV enhancement
1166
  formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
1167
 
1168
+ # Analyze document structure with OpenCV enhancement
1169
  detector = EnhancedIndentationDetector()
1170
  text_lines = formatted_text.split('\n')
1171
+
1172
+ # Perform OpenCV analysis on first page for overall document analysis
1173
+ opencv_document_analysis = None
1174
+ if page_images:
1175
+ first_page_image = list(page_images.values())[0]
1176
+ opencv_document_analysis = detector.opencv_analyzer.analyze_text_blocks(
1177
+ first_page_image, text_lines
1178
+ )
1179
+
1180
+ document_analysis = detector.analyze_document_structure_with_opencv(
1181
+ text_lines, None # We already have the OpenCV analysis
1182
+ )
1183
+
1184
+ if opencv_document_analysis:
1185
+ document_analysis['opencv_global_analysis'] = opencv_document_analysis
1186
 
1187
  result.update({
1188
  'success': True,
 
1194
  'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
1195
  'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
1196
  'html_generated': True,
1197
+ 'opencv_enhanced': True,
1198
+ 'opencv_bold_detection': True,
1199
+ 'opencv_spacing_analysis': True,
1200
  'enhanced_indentation': True,
1201
  'intelligent_text_classification': True,
1202
  'parenthetical_patterns_supported': True,
1203
  'page_numbers_added': True,
1204
  'comprehensive_formatting': True,
1205
  'azure_analysis': analysis_result,
1206
+ 'document_structure_analysis': document_analysis,
1207
+ 'page_images_processed': len(page_images)
1208
  }
1209
  })
1210
 
1211
+ logger.info("Azure OCR with OpenCV enhancement completed successfully")
1212
+ logger.info(f"OpenCV analysis: {len(page_images)} pages processed with text block and bold detection")
1213
 
1214
  except Exception as e:
1215
+ logger.error(f"Azure OCR with OpenCV error: {e}")
1216
+ result['error'] = f"Azure OCR with OpenCV error: {e}"
1217
 
1218
  return result
1219
 
1220
+ def _tesseract_ocr_with_opencv(self, pdf_path: str) -> Dict[str, Any]:
1221
+ """Tesseract OCR with OpenCV-enhanced text analysis and bold detection"""
1222
  result = {
1223
  'success': False,
1224
  'text': '',
1225
  'html': '',
1226
+ 'method_used': 'tesseract_opencv_enhanced',
1227
  'metadata': {},
1228
  'error': None
1229
  }
 
1243
  .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
1244
  .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
1245
  .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
1246
+ .opencv-bold-header { font-weight: bold; color: #2c3e50; font-size: 1.3em; margin: 20px 0 15px 0; border-left: 4px solid #e74c3c; padding-left: 12px; background-color: #fdf2f2; }
1247
  .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
1248
  .content-paragraph { margin-bottom: 1em; }
1249
  .content-list-item { margin-bottom: 0.5em; }
 
1251
  html_parts.append('</style></head><body>')
1252
 
1253
  indent_detector = EnhancedIndentationDetector()
1254
+ opencv_analyzer = OpenCVTextAnalyzer()
1255
 
1256
  for page_num in range(page_count):
1257
  # Add page header to text
 
1266
  img_data = pix.tobytes("png")
1267
 
1268
  temp_img_path = None
1269
+ opencv_analysis = None
1270
  try:
1271
  with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
1272
  temp_img.write(img_data)
1273
  temp_img_path = temp_img.name
1274
 
1275
+ # Convert to OpenCV format for analysis
1276
+ img_cv = cv2.imread(temp_img_path)
1277
+
1278
  processed_img = self._preprocess_image(temp_img_path)
1279
 
1280
  custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
 
1282
 
1283
  all_text.append(text)
1284
 
1285
+ # Perform OpenCV analysis
1286
+ text_lines = text.split('\n')
1287
+ opencv_analysis = opencv_analyzer.analyze_text_blocks(img_cv, text_lines)
1288
+
1289
+ # Add to HTML with OpenCV-enhanced processing
1290
  html_parts.append(f'<div class="page">')
1291
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1292
 
1293
+ # Process each line with OpenCV enhancement
1294
  lines = text.split('\n')
1295
  for line in lines:
1296
  if line.strip():
1297
+ # Find OpenCV mapping for this line
1298
+ opencv_line_mapping = None
1299
+ if opencv_analysis and opencv_analysis.get('success') and 'line_mappings' in opencv_analysis:
1300
+ for mapping in opencv_analysis['line_mappings']:
1301
+ if mapping.get('text', '').strip() == line.strip():
1302
+ opencv_line_mapping = mapping
1303
+ break
1304
 
1305
+ # Enhanced indentation detection with OpenCV
1306
+ if opencv_line_mapping:
1307
+ indent_info = indent_detector.detect_indentation_with_opencv(
1308
+ line, opencv_analysis, opencv_line_mapping
1309
+ )
1310
+ else:
1311
+ indent_info = indent_detector.detect_indentation(line)
1312
 
1313
+ text_classification = indent_detector.classify_text_type(
1314
+ line, opencv_analysis=opencv_analysis
1315
+ )
1316
+
1317
+ # Build CSS classes
1318
+ css_classes = []
1319
+
1320
+ # Check if OpenCV detected bold header
1321
+ is_opencv_bold_header = (opencv_line_mapping and
1322
+ opencv_line_mapping.get('is_bold') and
1323
+ opencv_line_mapping.get('is_likely_header'))
1324
+
1325
+ if is_opencv_bold_header:
1326
+ css_classes.append('opencv-bold-header')
1327
+ else:
1328
+ level = indent_info.get('level', 0)
1329
+ css_classes.append(f'indent-level-{min(level, 10)}')
1330
+
1331
+ formatting_hint = indent_info.get('formatting_hint', 'normal_text')
1332
+ if formatting_hint != 'normal_text':
1333
+ css_classes.append(formatting_hint)
1334
 
1335
  # Add text classification class
1336
  if text_classification.get('type'):
 
1339
  class_str = f' class="paragraph {" ".join(css_classes)}"'
1340
  content = indent_info.get('content', line.strip())
1341
 
1342
+ # Add marker for non-bullet items (unless bold header)
1343
+ if not is_opencv_bold_header:
1344
+ marker = indent_info.get('pattern_marker', '')
1345
+ if marker and not indent_info.get('is_bullet', False):
1346
+ content = f"{marker} {content}"
1347
 
1348
  html_parts.append(f'<div{class_str}>{content}</div>')
1349
  else:
 
1371
  'metadata': {
1372
  'pages': page_count,
1373
  'html_generated': True,
1374
+ 'opencv_enhanced': True,
1375
+ 'opencv_bold_detection': True,
1376
+ 'opencv_spacing_analysis': True,
1377
  'enhanced_indentation': True,
1378
  'intelligent_text_classification': True,
1379
  'parenthetical_patterns_supported': True,
 
1382
  }
1383
  })
1384
 
1385
+ logger.info("Tesseract OCR with OpenCV enhancement completed successfully")
1386
 
1387
  except Exception as e:
1388
+ logger.error(f"Tesseract OCR with OpenCV error: {e}")
1389
+ result['error'] = f"Tesseract OCR with OpenCV error: {e}"
1390
  finally:
1391
  if pdf_document is not None:
1392
  try:
 
1396
 
1397
  return result
1398
 
1399
+ def _pymupdf_extract_with_opencv(self, pdf_path: str) -> Dict[str, Any]:
1400
+ """PyMuPDF text extraction with OpenCV-enhanced analysis and bold detection"""
1401
  result = {
1402
  'success': False,
1403
  'text': '',
1404
  'html': '',
1405
+ 'method_used': 'pymupdf_opencv_enhanced',
1406
  'metadata': {},
1407
  'error': None
1408
  }
 
1418
  .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
1419
  .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
1420
  .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
1421
+ .opencv-bold-header { font-weight: bold; color: #2c3e50; font-size: 1.3em; margin: 20px 0 15px 0; border-left: 4px solid #e74c3c; padding-left: 12px; background-color: #fdf2f2; }
1422
  .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
1423
  .content-paragraph { margin-bottom: 1em; }
1424
  .content-list-item { margin-bottom: 0.5em; }
 
1426
  html_parts.append('</style></head><body>')
1427
 
1428
  indent_detector = EnhancedIndentationDetector()
1429
+ opencv_analyzer = OpenCVTextAnalyzer()
1430
 
1431
  for page_num in range(page_count):
1432
  # Add page header to text
 
1435
 
1436
  page = pdf_document.load_page(page_num)
1437
  text = page.get_text()
 
1438
  all_text.append(text)
1439
 
1440
+ # Get page image for OpenCV analysis
1441
+ mat = fitz.Matrix(2.0, 2.0)
1442
+ pix = page.get_pixmap(matrix=mat)
1443
+ img_data = pix.tobytes("png")
1444
+
1445
+ # Convert to OpenCV format
1446
+ import io
1447
+ from PIL import Image
1448
+ pil_image = Image.open(io.BytesIO(img_data))
1449
+ img_array = np.array(pil_image)
1450
+ img_cv = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
1451
+
1452
+ # Perform OpenCV analysis
1453
+ text_lines = text.split('\n')
1454
+ opencv_analysis = opencv_analyzer.analyze_text_blocks(img_cv, text_lines)
1455
+
1456
+ # Add to HTML with OpenCV-enhanced processing
1457
  html_parts.append(f'<div class="page">')
1458
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1459
 
1460
+ # Process each line with OpenCV enhancement
1461
  lines = text.split('\n')
1462
  for line in lines:
1463
  if line.strip():
1464
+ # Find OpenCV mapping for this line
1465
+ opencv_line_mapping = None
1466
+ if opencv_analysis and opencv_analysis.get('success') and 'line_mappings' in opencv_analysis:
1467
+ for mapping in opencv_analysis['line_mappings']:
1468
+ if mapping.get('text', '').strip() == line.strip():
1469
+ opencv_line_mapping = mapping
1470
+ break
1471
 
1472
+ # Enhanced indentation detection with OpenCV
1473
+ if opencv_line_mapping:
1474
+ indent_info = indent_detector.detect_indentation_with_opencv(
1475
+ line, opencv_analysis, opencv_line_mapping
1476
+ )
1477
+ else:
1478
+ indent_info = indent_detector.detect_indentation(line)
1479
 
1480
+ text_classification = indent_detector.classify_text_type(
1481
+ line, opencv_analysis=opencv_analysis
1482
+ )
1483
+
1484
+ # Build CSS classes
1485
+ css_classes = []
1486
+
1487
+ # Check if OpenCV detected bold header
1488
+ is_opencv_bold_header = (opencv_line_mapping and
1489
+ opencv_line_mapping.get('is_bold') and
1490
+ opencv_line_mapping.get('is_likely_header'))
1491
+
1492
+ if is_opencv_bold_header:
1493
+ css_classes.append('opencv-bold-header')
1494
+ else:
1495
+ level = indent_info.get('level', 0)
1496
+ css_classes.append(f'indent-level-{min(level, 10)}')
1497
+
1498
+ formatting_hint = indent_info.get('formatting_hint', 'normal_text')
1499
+ if formatting_hint != 'normal_text':
1500
+ css_classes.append(formatting_hint)
1501
 
1502
  # Add text classification class
1503
  if text_classification.get('type'):
 
1506
  class_str = f' class="paragraph {" ".join(css_classes)}"'
1507
  content = indent_info.get('content', line.strip())
1508
 
1509
+ # Add marker for non-bullet items (unless bold header)
1510
+ if not is_opencv_bold_header:
1511
+ marker = indent_info.get('pattern_marker', '')
1512
+ if marker and not indent_info.get('is_bullet', False):
1513
+ content = f"{marker} {content}"
1514
 
1515
  html_parts.append(f'<div{class_str}>{content}</div>')
1516
  else:
 
1531
  'metadata': {
1532
  'pages': page_count,
1533
  'html_generated': True,
1534
+ 'opencv_enhanced': True,
1535
+ 'opencv_bold_detection': True,
1536
+ 'opencv_spacing_analysis': True,
1537
  'enhanced_indentation': True,
1538
  'intelligent_text_classification': True,
1539
  'parenthetical_patterns_supported': True,
 
1542
  }
1543
  })
1544
 
1545
+ logger.info("PyMuPDF extraction with OpenCV enhancement completed successfully")
1546
 
1547
  except Exception as e:
1548
+ logger.error(f"PyMuPDF with OpenCV error: {e}")
1549
+ result['error'] = f"PyMuPDF with OpenCV error: {e}"
1550
  finally:
1551
  if pdf_document is not None:
1552
  try:
 
1579
  logger.info(f"Trying fallback method: {method}")
1580
  try:
1581
  if method == "azure":
1582
+ result = self._azure_ocr_with_opencv_enhancement(pdf_path)
1583
  elif method == "tesseract":
1584
+ result = self._tesseract_ocr_with_opencv(pdf_path)
1585
  elif method == "pymupdf":
1586
+ result = self._pymupdf_extract_with_opencv(pdf_path)
1587
 
1588
  if result['success']:
1589
  result['method_used'] += '_fallback'
requirements.txt CHANGED
@@ -1,91 +1,112 @@
1
- # PDF OCR Service Requirements - Enhanced Version with Comprehensive Indentation Detection & Text Classification
2
-
3
- # Core web framework and UI
4
- gradio>=4.0.0
5
-
6
- # Environment configuration
7
- python-dotenv>=1.0.0
8
-
9
- # Azure Document Intelligence
10
- azure-ai-documentintelligence>=1.0.0b1
11
- azure-core>=1.28.0
12
-
13
- # OCR and image processing
14
- pytesseract>=0.3.10
15
- Pillow>=10.0.0
16
- opencv-python>=4.8.0
17
- numpy>=1.24.0
18
-
19
- # PDF processing and manipulation
20
- PyMuPDF>=1.23.0
21
-
22
- # Document export formats (ENHANCED)
23
- python-docx>=0.8.11
24
-
25
- # HTML processing and parsing
26
- beautifulsoup4>=4.12.0
27
- lxml>=4.9.0
28
-
29
- # Enhanced text processing and pattern detection
30
- regex>=2023.10.3 # For advanced regex patterns including parenthetical detection
31
-
32
- # Data handling and analysis
33
- pandas>=2.0.0 # For document structure analysis
34
- collections-extended>=2.0.2 # For enhanced counter operations
35
-
36
- # Text classification and analysis
37
- scikit-learn>=1.3.0 # For advanced text classification algorithms (optional)
38
- nltk>=3.8 # Natural language processing toolkit (optional)
39
-
40
- # Additional dependencies for enhanced preprocessing
41
- matplotlib>=3.7.0 # For image visualization in development
42
- scikit-image>=0.21.0 # Advanced image processing (optional)
43
-
44
- # Performance and utility libraries
45
- tqdm>=4.65.0 # Progress bars for long operations
46
- requests>=2.31.0 # HTTP requests for external services
47
-
48
- # Logging and monitoring
49
- colorlog>=6.7.0 # Enhanced logging with colors
50
- structlog>=23.1.0 # Structured logging for better debugging
51
-
52
- # File handling and temporary file management
53
- pathlib2>=2.3.7 # Enhanced path operations
54
-
55
- # Date and time handling
56
- python-dateutil>=2.8.2 # Enhanced date parsing
57
-
58
- # Enhanced Unicode and text processing
59
- unicodedata2>=15.0.0 # Enhanced Unicode support for Thai and other scripts
60
- ftfy>=6.1.1 # Text fixing and encoding repair
61
-
62
- # Configuration and validation
63
- pydantic>=2.0.0 # Data validation and settings management
64
- confuse>=2.0.0 # Configuration file handling
65
-
66
- # Development and testing (optional)
67
- pytest>=7.0.0
68
- pytest-cov>=4.0.0
69
- pytest-asyncio>=0.21.0 # For async testing
70
- pytest-mock>=3.11.0 # For mocking in tests
71
- black>=23.0.0 # Code formatting
72
- flake8>=6.0.0 # Code linting
73
- mypy>=1.5.0 # Type checking
74
- isort>=5.12.0 # Import sorting
75
-
76
- # Performance monitoring (optional)
77
- memory-profiler>=0.60.0
78
- psutil>=5.9.0 # System monitoring
79
- py-spy>=0.3.14 # Performance profiling
80
-
81
- # Enhanced error handling and debugging
82
- rich>=13.0.0 # Rich console output for debugging
83
- icecream>=2.1.3 # Enhanced debugging print statements
84
-
85
- # Enhanced file type detection
86
- python-magic>=0.4.27 # File type detection
87
- filetype>=1.2.0 # Alternative file type detection
88
-
89
- # Additional text processing utilities
90
- Unidecode>=1.3.6 # ASCII transliteration for Unicode text
91
- langdetect>=1.0.9 # Language detection for multi-language documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF OCR Service Requirements - Enhanced with OpenCV Text Block Analysis & Bold Detection
2
+
3
+ # Core web framework and UI
4
+ gradio>=4.0.0
5
+
6
+ # Environment configuration
7
+ python-dotenv>=1.0.0
8
+
9
+ # Azure Document Intelligence
10
+ azure-ai-documentintelligence>=1.0.0b1
11
+ azure-core>=1.28.0
12
+
13
+ # OCR and image processing
14
+ pytesseract>=0.3.10
15
+ Pillow>=10.0.0
16
+ numpy>=1.24.0
17
+
18
+ # OpenCV for text block analysis and bold detection
19
+ opencv-python>=4.8.0
20
+ opencv-contrib-python>=4.8.0 # Main + extra (contrib) OpenCV modules; NOTE: superset of opencv-python - upstream advises installing only one of the two
21
+
22
+ # PDF processing and manipulation
23
+ PyMuPDF>=1.23.0
24
+
25
+ # Document export formats (ENHANCED)
26
+ python-docx>=0.8.11
27
+
28
+ # HTML processing and parsing
29
+ beautifulsoup4>=4.12.0
30
+ lxml>=4.9.0
31
+
32
+ # Enhanced text processing and pattern detection
33
+ regex>=2023.10.3 # For advanced regex patterns including parenthetical detection
34
+
35
+ # Data handling and analysis for OpenCV processing
36
+ pandas>=2.0.0 # For document structure analysis
37
+ collections-extended>=2.0.2 # For enhanced counter operations
38
+
39
+ # Text classification and analysis
40
+ scikit-learn>=1.3.0 # For advanced text classification algorithms (optional)
41
+ nltk>=3.8 # Natural language processing toolkit (optional)
42
+
43
+ # OpenCV-related image processing dependencies
44
+ scipy>=1.11.0 # Scientific computing routines (complements NumPy/OpenCV; not an OpenCV dependency)
45
+ matplotlib>=3.7.0 # For image visualization in development
46
+ scikit-image>=0.21.0 # Advanced image processing (optional, complements OpenCV)
47
+
48
+ # Performance and utility libraries
49
+ tqdm>=4.65.0 # Progress bars for long operations
50
+ requests>=2.31.0 # HTTP requests for external services
51
+
52
+ # Logging and monitoring
53
+ colorlog>=6.7.0 # Enhanced logging with colors
54
+ structlog>=23.1.0 # Structured logging for better debugging
55
+
56
+ # File handling and temporary file management
57
+ pathlib2>=2.3.7 # Enhanced path operations
58
+
59
+ # Date and time handling
60
+ python-dateutil>=2.8.2 # Enhanced date parsing
61
+
62
+ # Enhanced Unicode and text processing
63
+ unicodedata2>=15.0.0 # Enhanced Unicode support for Thai and other scripts
64
+ ftfy>=6.1.1 # Text fixing and encoding repair
65
+
66
+ # Configuration and validation
67
+ pydantic>=2.0.0 # Data validation and settings management
68
+ confuse>=2.0.0 # Configuration file handling
69
+
70
+ # Image format support for OpenCV
71
+ imageio>=2.31.0 # Additional image format support
72
+ imageio-ffmpeg>=0.4.8 # Video processing support (optional)
73
+
74
+ # Mathematical operations for OpenCV analysis
75
+ sympy>=1.12 # Symbolic mathematics (optional)
76
+ numba>=0.57.0 # Just-in-time compilation for performance (optional)
77
+
78
+ # Development and testing (optional)
79
+ pytest>=7.0.0
80
+ pytest-cov>=4.0.0
81
+ pytest-asyncio>=0.21.0 # For async testing
82
+ pytest-mock>=3.11.0 # For mocking in tests
83
+ black>=23.0.0 # Code formatting
84
+ flake8>=6.0.0 # Code linting
85
+ mypy>=1.5.0 # Type checking
86
+ isort>=5.12.0 # Import sorting
87
+
88
+ # Performance monitoring (optional)
89
+ memory-profiler>=0.60.0
90
+ psutil>=5.9.0 # System monitoring
91
+ py-spy>=0.3.14 # Performance profiling
92
+
93
+ # Enhanced error handling and debugging
94
+ rich>=13.0.0 # Rich console output for debugging
95
+ icecream>=2.1.3 # Enhanced debugging print statements
96
+
97
+ # Enhanced file type detection
98
+ python-magic>=0.4.27 # File type detection
99
+ filetype>=1.2.0 # Alternative file type detection
100
+
101
+ # Additional text processing utilities
102
+ Unidecode>=1.3.6 # ASCII transliteration for Unicode text
103
+ langdetect>=1.0.9 # Language detection for multi-language documents
104
+
105
+ # OpenCV-specific utilities and extensions
106
+ imutils>=0.5.4 # Convenience functions for OpenCV operations
107
+
108
+ # Threading and parallel processing for OpenCV operations
109
+ joblib>=1.3.0 # Efficient parallel processing
110
+ # Terminal output utilities (colored text, display width of characters)
111
+ colorama>=0.4.6 # Cross-platform colored terminal text
112
+ wcwidth