Chirapath committed on
Commit
cf0c0b1
·
verified ·
1 Parent(s): 5ba08f1

Upload 11 files

Browse files
Files changed (5) hide show
  1. app.py +171 -56
  2. backend.py +533 -181
  3. enhanced_indentation.py +648 -0
  4. ocr_service.py +580 -346
  5. requirements.txt +275 -12
app.py CHANGED
@@ -17,6 +17,7 @@ from dotenv import load_dotenv
17
  load_dotenv()
18
 
19
  from backend import BackendManager
 
20
 
21
  # Configure logging
22
  logging.basicConfig(level=logging.INFO)
@@ -25,6 +26,9 @@ logger = logging.getLogger(__name__)
25
  # Initialize backend manager
26
  backend_manager = BackendManager()
27
 
 
 
 
28
  # Check if python-docx is available
29
  try:
30
  from docx import Document
@@ -303,16 +307,16 @@ def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_
303
  logger.error(f"Error updating crop preview: {e}")
304
  return None
305
 
306
- def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer_removal,
307
- crop_top, crop_bottom, crop_left, crop_right,
308
- apply_to_all_pages, current_page_selection,
309
- progress=gr.Progress()):
310
- """Process PDF with HTML enhancement and improved table handling - FIXED"""
311
  if pdf_file is None:
312
  return "No file uploaded.", "", "", "Error: No file selected"
313
 
314
  try:
315
- progress(0.1, desc="Initializing HTML-enhanced processing...")
316
 
317
  # Prepare enhanced preprocessing options
318
  preprocessing_options = {
@@ -321,19 +325,19 @@ def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer
321
  'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
322
  }
323
 
324
- progress(0.3, desc="Processing with HTML enhancement...")
325
 
326
- # Process the PDF with enhanced preprocessing
327
  result = backend_manager.process_pdf_with_enhanced_resolution(
328
  pdf_file.name, ocr_method, preprocessing_options
329
  )
330
 
331
- progress(0.9, desc="Finalizing HTML processing...")
332
  progress(1.0, desc="Complete!")
333
 
334
  if result['success']:
335
  metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
336
- status = f"Success: Processed using {result['method_used']} with HTML enhancement"
337
 
338
  # Return text, HTML, metadata, and status
339
  return (result['text'],
@@ -345,11 +349,11 @@ def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer
345
  return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
346
 
347
  except Exception as e:
348
- logger.error(f"HTML-enhanced processing error: {e}")
349
  return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
350
 
351
  def format_enhanced_metadata(metadata, method_used):
352
- """Enhanced metadata formatting with HTML processing info"""
353
  if not metadata:
354
  return f"Method used: {method_used}"
355
 
@@ -364,6 +368,15 @@ def format_enhanced_metadata(metadata, method_used):
364
  if metadata.get('html_processing', False):
365
  info_lines.append("HTML generation: Enabled")
366
 
 
 
 
 
 
 
 
 
 
367
  if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
368
  info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
369
 
@@ -373,6 +386,25 @@ def format_enhanced_metadata(metadata, method_used):
373
  if 'tables' in metadata:
374
  info_lines.append(f"Tables detected: {metadata['tables']}")
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  if 'processing_time_seconds' in metadata:
377
  info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
378
 
@@ -381,8 +413,8 @@ def format_enhanced_metadata(metadata, method_used):
381
  def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
382
  crop_top, crop_bottom, crop_left, crop_right,
383
  apply_to_all_pages, current_page_selection):
384
- """Prepare enhanced downloads with HTML processing"""
385
- text, html, metadata, status = process_pdf_with_html_enhancement(
386
  pdf_file, method, enable_header_footer_removal,
387
  crop_top, crop_bottom, crop_left, crop_right,
388
  apply_to_all_pages, current_page_selection
@@ -417,59 +449,95 @@ def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
417
  gr.update(visible=False))
418
 
419
  def get_enhanced_method_info(method):
420
- """Get information about selected OCR method with HTML processing"""
421
  method_descriptions = {
422
- "auto": "**Auto Selection**: Automatically chooses the best available method with HTML processing and enhanced table handling.",
423
- "azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with HTML generation, layout preservation, and smart table detection.",
424
- "tesseract": "**Tesseract OCR**: Open-source OCR with HTML output, enhanced image preprocessing, and resolution scaling.",
425
- "pymupdf": "**PyMuPDF**: Fast extraction enhanced with HTML processing and improved formatting preservation."
426
  }
427
 
428
  return method_descriptions.get(method, "Select a method to see details.")
429
 
430
  def check_enhanced_service_status():
431
- """Check and display enhanced service status"""
432
  available_methods = backend_manager.get_available_methods()
433
 
434
- status_lines = ["**Available OCR Methods (Enhanced with HTML Processing):**"]
435
 
436
  if "azure" in available_methods:
437
- status_lines.append(" Azure Document Intelligence - Ready (HTML + Tables)")
438
  else:
439
- status_lines.append(" Azure Document Intelligence - Not configured")
440
 
441
  if "tesseract" in available_methods:
442
- status_lines.append(" Tesseract OCR - Ready (HTML Enhanced)")
443
  else:
444
- status_lines.append(" Tesseract OCR - Not available")
445
 
446
  if "pymupdf" in available_methods:
447
- status_lines.append(" PyMuPDF - Ready (HTML Enhanced)")
448
  else:
449
- status_lines.append(" PyMuPDF - Not available")
450
 
451
  # Add enhanced features status
452
- status_lines.append("✓ HTML Processing - Available")
453
- status_lines.append(" Enhanced Table Handling - Available")
454
- status_lines.append(" Smart Text Preservation - Available")
455
- status_lines.append(" Multi-Page Crop Preview - Available")
456
- status_lines.append(" Per-Page Crop Customization - Available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  if HAS_DOCX_SUPPORT:
459
- status_lines.append(" Enhanced DOCX Export - Available")
460
  else:
461
- status_lines.append(" Enhanced DOCX Export - Install python-docx to enable")
 
 
 
462
 
463
- status_lines.append("✓ HTML File Export - Available")
464
- status_lines.append("✓ Enhanced Text Export - Available")
 
465
 
466
  return "\n".join(status_lines)
467
 
468
  def create_enhanced_interface():
469
- """Create enhanced Gradio interface with improved layout and HTML processing"""
470
 
471
  with gr.Blocks(
472
- title="PDF OCR Service - Enhanced with HTML Processing",
473
  theme=gr.themes.Soft(),
474
  css="""
475
  .main-header { text-align: center; margin-bottom: 2rem; }
@@ -484,14 +552,14 @@ def create_enhanced_interface():
484
 
485
  gr.HTML("""
486
  <div class="main-header">
487
- <h1>PDF OCR Service - Enhanced with HTML Processing</h1>
488
- <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, and format preservation</p>
489
  </div>
490
  """)
491
 
492
  # Instructions at the top
493
  with gr.Group(elem_classes=["instructions-panel"]):
494
- gr.HTML("<h3>Instructions & Features</h3>")
495
  gr.HTML("""
496
  <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
497
  <h4>How to Use:</h4>
@@ -499,19 +567,66 @@ def create_enhanced_interface():
499
  <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
500
  <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
501
  <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
502
- <li><strong>Process:</strong> Click the process button to extract text with HTML enhancement</li>
503
- <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format</li>
504
  </ol>
505
 
506
- <h4>Enhanced Features:</h4>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  <ul>
508
  <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
509
  <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
510
- <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads</li>
511
  <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
512
  <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
513
- <li><strong>Page Numbers:</strong> Automatic page numbering in extracted content</li>
514
- <li><strong>Proper Indentation:</strong> Preserved spacing and formatting</li>
 
515
  </ul>
516
  </div>
517
  """)
@@ -543,7 +658,7 @@ def create_enhanced_interface():
543
  choices=["auto", "azure", "tesseract", "pymupdf"],
544
  value="auto",
545
  label="OCR Method",
546
- info="Choose OCR method (all enhanced with HTML processing)"
547
  )
548
 
549
  # Method information display
@@ -628,7 +743,7 @@ def create_enhanced_interface():
628
 
629
  # Process button
630
  process_btn = gr.Button(
631
- "Process PDF with HTML Enhancement",
632
  variant="primary",
633
  size="lg"
634
  )
@@ -666,8 +781,8 @@ def create_enhanced_interface():
666
 
667
  # Extracted text output
668
  text_output = gr.Textbox(
669
- label="Extracted Text (Enhanced with Proper Formatting and Page Numbers)",
670
- placeholder="Processed text with HTML enhancement and preserved formatting will appear here...",
671
  lines=20,
672
  max_lines=30,
673
  interactive=False,
@@ -676,9 +791,9 @@ def create_enhanced_interface():
676
 
677
  # Metadata information
678
  metadata_output = gr.Textbox(
679
- label="Processing Information",
680
  interactive=False,
681
- lines=4
682
  )
683
 
684
  # Enhanced download buttons
@@ -689,7 +804,7 @@ def create_enhanced_interface():
689
  variant="secondary"
690
  )
691
  download_docx_btn = gr.DownloadButton(
692
- "Download Enhanced DOCX",
693
  visible=False,
694
  variant="secondary"
695
  )
@@ -701,7 +816,7 @@ def create_enhanced_interface():
701
 
702
  # Service Status at the bottom
703
  with gr.Group(elem_classes=["status-box"]):
704
- gr.HTML("<h4>Service Status</h4>")
705
  service_status = gr.Markdown(
706
  value=check_enhanced_service_status()
707
  )
@@ -793,7 +908,7 @@ def create_enhanced_interface():
793
  return interface
794
 
795
  def launch_enhanced_ui():
796
- """Launch the enhanced Gradio interface with HTML processing"""
797
  try:
798
  interface = create_enhanced_interface()
799
  interface.launch(
 
17
  load_dotenv()
18
 
19
  from backend import BackendManager
20
+ from enhanced_indentation import EnhancedIndentationDetector
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
 
26
  # Initialize backend manager
27
  backend_manager = BackendManager()
28
 
29
+ # Initialize enhanced indentation detector
30
+ indent_detector = EnhancedIndentationDetector()
31
+
32
  # Check if python-docx is available
33
  try:
34
  from docx import Document
 
307
  logger.error(f"Error updating crop preview: {e}")
308
  return None
309
 
310
+ def process_pdf_with_enhanced_indentation(pdf_file, ocr_method, enable_header_footer_removal,
311
+ crop_top, crop_bottom, crop_left, crop_right,
312
+ apply_to_all_pages, current_page_selection,
313
+ progress=gr.Progress()):
314
+ """Process PDF with enhanced indentation detection, text classification, and comprehensive formatting"""
315
  if pdf_file is None:
316
  return "No file uploaded.", "", "", "Error: No file selected"
317
 
318
  try:
319
+ progress(0.1, desc="Initializing enhanced processing with comprehensive indentation detection and intelligent text classification...")
320
 
321
  # Prepare enhanced preprocessing options
322
  preprocessing_options = {
 
325
  'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
326
  }
327
 
328
+ progress(0.3, desc="Processing with enhanced indentation detection and text classification...")
329
 
330
+ # Process the PDF with enhanced preprocessing, indentation detection, and text classification
331
  result = backend_manager.process_pdf_with_enhanced_resolution(
332
  pdf_file.name, ocr_method, preprocessing_options
333
  )
334
 
335
+ progress(0.9, desc="Finalizing enhanced processing...")
336
  progress(1.0, desc="Complete!")
337
 
338
  if result['success']:
339
  metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
340
+ status = f"Success: Processed using {result['method_used']} with comprehensive indentation detection and intelligent text classification"
341
 
342
  # Return text, HTML, metadata, and status
343
  return (result['text'],
 
349
  return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
350
 
351
  except Exception as e:
352
+ logger.error(f"Enhanced processing error: {e}")
353
  return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
354
 
355
  def format_enhanced_metadata(metadata, method_used):
356
+ """Enhanced metadata formatting with comprehensive indentation processing and text classification info"""
357
  if not metadata:
358
  return f"Method used: {method_used}"
359
 
 
368
  if metadata.get('html_processing', False):
369
  info_lines.append("HTML generation: Enabled")
370
 
371
+ if metadata.get('comprehensive_indentation', False):
372
+ info_lines.append("Comprehensive indentation detection: Enabled")
373
+
374
+ if metadata.get('intelligent_text_classification', False):
375
+ info_lines.append("Intelligent text classification: Enabled")
376
+
377
+ if metadata.get('parenthetical_patterns_supported', False):
378
+ info_lines.append("Parenthetical patterns: Supported (Arabic, Thai, Letters, Roman)")
379
+
380
  if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
381
  info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
382
 
 
386
  if 'tables' in metadata:
387
  info_lines.append(f"Tables detected: {metadata['tables']}")
388
 
389
+ # Document structure analysis information
390
+ if 'document_structure_analysis' in metadata:
391
+ analysis = metadata['document_structure_analysis']
392
+ if not analysis.get('analysis_failed', False):
393
+ info_lines.append(f"Patterned lines detected: {analysis.get('patterned_lines', 0)}")
394
+ info_lines.append(f"Maximum indentation level: {analysis.get('max_level', 0)}")
395
+ info_lines.append(f"Pattern coverage: {analysis.get('coverage_percentage', 0):.1f}%")
396
+
397
+ # Text classification results
398
+ if 'text_classification' in analysis:
399
+ classification = analysis['text_classification']
400
+ info_lines.append(f"Headers detected: {analysis.get('header_count', 0)}")
401
+ info_lines.append(f"Paragraphs detected: {analysis.get('paragraph_count', 0)}")
402
+ info_lines.append(f"List items detected: {analysis.get('list_item_count', 0)}")
403
+
404
+ if analysis.get('dominant_patterns'):
405
+ dominant = analysis['dominant_patterns'][0][0] if analysis['dominant_patterns'] else 'None'
406
+ info_lines.append(f"Dominant pattern: {dominant}")
407
+
408
  if 'processing_time_seconds' in metadata:
409
  info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
410
 
 
413
  def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
414
  crop_top, crop_bottom, crop_left, crop_right,
415
  apply_to_all_pages, current_page_selection):
416
+ """Prepare enhanced downloads with comprehensive indentation processing and text classification"""
417
+ text, html, metadata, status = process_pdf_with_enhanced_indentation(
418
  pdf_file, method, enable_header_footer_removal,
419
  crop_top, crop_bottom, crop_left, crop_right,
420
  apply_to_all_pages, current_page_selection
 
449
  gr.update(visible=False))
450
 
451
  def get_enhanced_method_info(method):
452
+ """Get information about selected OCR method with comprehensive indentation processing and text classification"""
453
  method_descriptions = {
454
+ "auto": "**Auto Selection**: Automatically chooses the best available method with comprehensive indentation detection, intelligent text classification, HTML processing, enhanced pattern recognition for hierarchical numbering (including parenthetical patterns like (1), (๑), (a)), bullets, and multi-language support.",
455
+ "azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with comprehensive indentation detection, intelligent text classification, HTML generation, layout preservation, smart table detection, and support for complex document structures including hierarchical numbering and parenthetical patterns.",
456
+ "tesseract": "**Tesseract OCR**: Open-source OCR enhanced with comprehensive indentation detection, intelligent text classification, HTML output, advanced image preprocessing, resolution scaling, and pattern recognition for various numbering styles including parenthetical patterns and bullet points.",
457
+ "pymupdf": "**PyMuPDF**: Fast extraction enhanced with comprehensive indentation detection, intelligent text classification, HTML processing, improved formatting preservation, and pattern recognition for maintaining document structure and hierarchy including parenthetical numbering."
458
  }
459
 
460
  return method_descriptions.get(method, "Select a method to see details.")
461
 
462
  def check_enhanced_service_status():
463
+ """Check and display enhanced service status with indentation detection and text classification capabilities"""
464
  available_methods = backend_manager.get_available_methods()
465
 
466
+ status_lines = ["**Available OCR Methods (Enhanced with Comprehensive Indentation Detection & Text Classification):**"]
467
 
468
  if "azure" in available_methods:
469
+ status_lines.append(" Azure Document Intelligence - Ready (HTML + Tables + Comprehensive Indentation + Text Classification)")
470
  else:
471
+ status_lines.append(" Azure Document Intelligence - Not configured")
472
 
473
  if "tesseract" in available_methods:
474
+ status_lines.append(" Tesseract OCR - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
475
  else:
476
+ status_lines.append(" Tesseract OCR - Not available")
477
 
478
  if "pymupdf" in available_methods:
479
+ status_lines.append(" PyMuPDF - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
480
  else:
481
+ status_lines.append(" PyMuPDF - Not available")
482
 
483
  # Add enhanced features status
484
+ status_lines.append("")
485
+ status_lines.append("**Comprehensive Indentation Detection Features:**")
486
+ status_lines.append(" Hierarchical Decimal Numbering (1.1.1.1.1...)")
487
+ status_lines.append(" Mixed Hierarchical Numbering (1.2.a.i.A...)")
488
+ status_lines.append(" Legal Numbering (1.1.1(a)(i))")
489
+ status_lines.append("✅ Outline Numbering (I.A.1.a.i.)")
490
+ status_lines.append("✅ Section Numbering (§1.2.3, Article 1.1.1)")
491
+ status_lines.append("✅ Parenthetical Arabic Numerals ((1), (2), (3))")
492
+ status_lines.append("✅ Parenthetical Thai Numerals ((๑), (๒), (๓))")
493
+ status_lines.append("✅ Parenthetical Letters ((a), (b), (A), (B))")
494
+ status_lines.append("✅ Parenthetical Roman Numerals ((i), (ii), (I), (II))")
495
+ status_lines.append("✅ Parenthetical Thai Letters ((ก), (ข), (ค))")
496
+ status_lines.append("✅ Thai Script Support (มาตรา, ข้อ, ก.ข.ค.)")
497
+ status_lines.append("✅ Multiple Bullet Styles (•◦▪→ and more)")
498
+ status_lines.append("✅ Checkbox Items ([x], [ ], [✓])")
499
+ status_lines.append("✅ Roman Numerals (I.II.III, i.ii.iii)")
500
+ status_lines.append("✅ Letter Lists (A.B.C, a.b.c)")
501
+ status_lines.append("✅ Space-based Indentation Detection")
502
+ status_lines.append("✅ Priority-based Pattern Matching")
503
+
504
+ status_lines.append("")
505
+ status_lines.append("**Intelligent Text Classification Features:**")
506
+ status_lines.append("✅ Header Detection (title case, all caps, short lines)")
507
+ status_lines.append("✅ Paragraph Classification (long text, proper punctuation)")
508
+ status_lines.append("✅ List Item Recognition (patterned content)")
509
+ status_lines.append("✅ Context-aware Analysis (position, font size)")
510
+ status_lines.append("✅ Confidence Scoring")
511
+ status_lines.append("✅ Document Structure Analysis")
512
+
513
+ status_lines.append("")
514
+ status_lines.append("**Enhanced Processing Features:**")
515
+ status_lines.append("✅ HTML Processing - Available")
516
+ status_lines.append("✅ Enhanced Table Handling - Available")
517
+ status_lines.append("✅ Smart Text Preservation - Available")
518
+ status_lines.append("✅ Multi-Page Crop Preview - Available")
519
+ status_lines.append("✅ Per-Page Crop Customization - Available")
520
+ status_lines.append("✅ Document Structure Analysis - Available")
521
 
522
  if HAS_DOCX_SUPPORT:
523
+ status_lines.append(" Enhanced DOCX Export - Available (with indentation formatting)")
524
  else:
525
+ status_lines.append(" Enhanced DOCX Export - Install python-docx to enable")
526
+
527
+ status_lines.append("✅ HTML File Export - Available")
528
+ status_lines.append("✅ Enhanced Text Export - Available")
529
 
530
+ # Add pattern detection statistics
531
+ pattern_count = len(indent_detector.patterns)
532
+ status_lines.append(f"✅ Pattern Detection Engine - {pattern_count} patterns supported")
533
 
534
  return "\n".join(status_lines)
535
 
536
  def create_enhanced_interface():
537
+ """Create enhanced Gradio interface with comprehensive indentation detection and text classification"""
538
 
539
  with gr.Blocks(
540
+ title="PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Text Classification",
541
  theme=gr.themes.Soft(),
542
  css="""
543
  .main-header { text-align: center; margin-bottom: 2rem; }
 
552
 
553
  gr.HTML("""
554
  <div class="main-header">
555
+ <h1>PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</h1>
556
+ <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, comprehensive indentation pattern recognition including parenthetical patterns like (1), (๑), (a), and intelligent text classification for headers, paragraphs, and list items</p>
557
  </div>
558
  """)
559
 
560
  # Instructions at the top
561
  with gr.Group(elem_classes=["instructions-panel"]):
562
+ gr.HTML("<h3>Instructions & Enhanced Features</h3>")
563
  gr.HTML("""
564
  <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
565
  <h4>How to Use:</h4>
 
567
  <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
568
  <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
569
  <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
570
+ <li><strong>Process:</strong> Click the process button to extract text with comprehensive indentation detection and text classification</li>
571
+ <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format with preserved formatting</li>
572
  </ol>
573
 
574
+ <h4>Comprehensive Indentation Detection & Text Classification Features:</h4>
575
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
576
+ <div>
577
+ <strong>Hierarchical Numbering:</strong>
578
+ <ul>
579
+ <li>Decimal: 1.1.1.1.1...</li>
580
+ <li>Mixed: 1.2.a.i.A...</li>
581
+ <li>Legal: 1.1.1(a)(i)</li>
582
+ <li>Outline: I.A.1.a.i.</li>
583
+ <li>Section: §1.2.3, Article 1.1.1</li>
584
+ </ul>
585
+ </div>
586
+ <div>
587
+ <strong>Parenthetical Patterns:</strong>
588
+ <ul>
589
+ <li>Arabic: (1), (2), (3)</li>
590
+ <li>Thai Numerals: (๑), (๒), (๓)</li>
591
+ <li>Letters: (a), (b), (A), (B)</li>
592
+ <li>Roman: (i), (ii), (I), (II)</li>
593
+ <li>Thai Letters: (ก), (ข), (ค)</li>
594
+ </ul>
595
+ </div>
596
+ </div>
597
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
598
+ <div>
599
+ <strong>Multi-Language & Symbols:</strong>
600
+ <ul>
601
+ <li>Thai Script: มาตรา, ข้อ, ก.ข.ค.</li>
602
+ <li>Bullets: •◦▪→ and 20+ more</li>
603
+ <li>Roman: I.II.III, i.ii.iii</li>
604
+ <li>Letters: A.B.C, a.b.c</li>
605
+ <li>Checkboxes: [x], [ ], [✓]</li>
606
+ </ul>
607
+ </div>
608
+ <div>
609
+ <strong>Intelligent Text Classification:</strong>
610
+ <ul>
611
+ <li>Header Detection: Title case, all caps, short lines</li>
612
+ <li>Paragraph Recognition: Long text, proper punctuation</li>
613
+ <li>List Item Identification: Patterned content</li>
614
+ <li>Context Analysis: Position, font size, formatting</li>
615
+ <li>Confidence Scoring: Reliability assessment</li>
616
+ </ul>
617
+ </div>
618
+ </div>
619
+
620
+ <h4>Technical Enhancements:</h4>
621
  <ul>
622
  <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
623
  <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
624
+ <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads with preserved indentation</li>
625
  <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
626
  <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
627
+ <li><strong>Document Analysis:</strong> Automatic structure detection and statistics</li>
628
+ <li><strong>Priority Pattern Matching:</strong> Intelligent pattern detection with priority ranking</li>
629
+ <li><strong>Text Classification:</strong> Automated header, paragraph, and list item detection</li>
630
  </ul>
631
  </div>
632
  """)
 
658
  choices=["auto", "azure", "tesseract", "pymupdf"],
659
  value="auto",
660
  label="OCR Method",
661
+ info="Choose OCR method (all enhanced with comprehensive indentation detection and text classification)"
662
  )
663
 
664
  # Method information display
 
743
 
744
  # Process button
745
  process_btn = gr.Button(
746
+ "Process PDF with Comprehensive Indentation Detection & Text Classification",
747
  variant="primary",
748
  size="lg"
749
  )
 
781
 
782
  # Extracted text output
783
  text_output = gr.Textbox(
784
+ label="Extracted Text (Enhanced with Comprehensive Indentation Detection & Text Classification)",
785
+ placeholder="Processed text with comprehensive indentation detection, intelligent text classification, HTML enhancement, and preserved formatting will appear here...",
786
  lines=20,
787
  max_lines=30,
788
  interactive=False,
 
791
 
792
  # Metadata information
793
  metadata_output = gr.Textbox(
794
+ label="Processing Information & Document Analysis",
795
  interactive=False,
796
+ lines=8
797
  )
798
 
799
  # Enhanced download buttons
 
804
  variant="secondary"
805
  )
806
  download_docx_btn = gr.DownloadButton(
807
+ "Download Enhanced DOCX (with Indentation & Classification)",
808
  visible=False,
809
  variant="secondary"
810
  )
 
816
 
817
  # Service Status at the bottom
818
  with gr.Group(elem_classes=["status-box"]):
819
+ gr.HTML("<h4>Service Status & Capabilities</h4>")
820
  service_status = gr.Markdown(
821
  value=check_enhanced_service_status()
822
  )
 
908
  return interface
909
 
910
  def launch_enhanced_ui():
911
+ """Launch the enhanced Gradio interface with comprehensive indentation detection and text classification"""
912
  try:
913
  interface = create_enhanced_interface()
914
  interface.launch(
backend.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Backend Management Module - FIXED VERSION with Corrected Crop Processing
3
  Coordinates between UI and OCR services, handles file management and preprocessing
4
  """
5
  import re
@@ -14,24 +14,34 @@ from datetime import datetime
14
  import cv2
15
  import numpy as np
16
  import fitz # PyMuPDF
 
 
 
 
 
 
17
 
18
  # Load environment variables
19
  from dotenv import load_dotenv
20
  load_dotenv()
21
 
22
  from ocr_service import OCRService
 
23
 
24
  # Configure logging
25
  logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
 
28
 
29
- class DocumentExporter:
30
- """Advanced document export with HTML-based formatting"""
 
 
 
31
 
32
  @staticmethod
33
  def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
34
- """Create enhanced TXT file with improved formatting"""
35
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
36
  temp_file = tempfile.NamedTemporaryFile(
37
  suffix=f'_extracted_text_{timestamp}.txt',
@@ -42,8 +52,8 @@ class DocumentExporter:
42
 
43
  try:
44
  # Add header
45
- temp_file.write("PDF OCR Extraction Results - Enhanced with HTML Processing\n")
46
- temp_file.write("=" * 70 + "\n\n")
47
 
48
  # Add metadata
49
  if metadata_info:
@@ -53,11 +63,22 @@ class DocumentExporter:
53
 
54
  # Add timestamp
55
  temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
56
- temp_file.write("=" * 70 + "\n\n")
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # Add main content
59
- temp_file.write("Extracted Text (Formatted):\n")
60
- temp_file.write("-" * 30 + "\n\n")
61
  temp_file.write(text_content)
62
 
63
  temp_file.close()
@@ -68,67 +89,57 @@ class DocumentExporter:
68
  temp_file.close()
69
  raise
70
 
71
- @staticmethod
72
- def create_enhanced_docx_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
73
- """Create enhanced DOCX file from HTML content with proper spacing and indentation"""
74
  try:
75
- from docx import Document
76
- from docx.shared import Inches, Pt, RGBColor
77
- from docx.enum.text import WD_ALIGN_PARAGRAPH
78
- from docx.enum.table import WD_TABLE_ALIGNMENT
79
- from docx.oxml.shared import OxmlElement, qn
80
- from html.parser import HTMLParser
81
-
82
- # Enhanced HTML to DOCX parser with spacing preservation
83
  class EnhancedDOCXHTMLParser(HTMLParser):
84
- def __init__(self, doc):
85
  super().__init__()
86
  self.doc = doc
 
87
  self.current_paragraph = None
88
- self.current_run = None
89
  self.in_table = False
90
- self.current_table = None
91
- self.current_row = None
92
- self.current_cell = None
93
  self.table_data = []
94
  self.current_table_row = []
95
- self.current_indent_em = 0
96
- self.is_bold = False
97
- self.is_title = False
98
- self.is_heading = False
99
- self.is_bullet_point = False
 
 
100
 
101
  def handle_starttag(self, tag, attrs):
102
  attr_dict = dict(attrs)
103
  class_attr = attr_dict.get('class', '')
104
- style_attr = attr_dict.get('style', '')
105
 
106
- if tag == 'div' and 'page' in class_attr:
107
- # Add minimal page separation (just paragraph spacing, no page break)
108
  if hasattr(self, 'has_content'):
109
- # Add just 2 line breaks worth of spacing
110
  self.doc.add_paragraph()
111
  self.doc.add_paragraph()
112
  self.has_content = True
113
 
114
- elif tag == 'div' and 'page-header' in class_attr:
115
  self.current_paragraph = self.doc.add_heading(level=1)
116
  self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
 
117
 
118
- elif tag == 'div' and 'title' in class_attr:
 
 
 
 
119
  self.current_paragraph = self.doc.add_heading(level=1)
120
- self.is_title = True
121
- self._apply_spacing_from_style(style_attr)
122
 
123
- elif tag == 'div' and 'section-heading' in class_attr:
124
  self.current_paragraph = self.doc.add_heading(level=2)
125
- self.is_heading = True
126
- self._apply_spacing_from_style(style_attr)
127
 
128
  elif tag == 'div' and 'paragraph' in class_attr:
129
  self.current_paragraph = self.doc.add_paragraph()
130
- self.is_bullet_point = 'bullet-point' in class_attr
131
- self._apply_spacing_from_style(style_attr)
132
 
133
  elif tag == 'table':
134
  self.in_table = True
@@ -137,47 +148,81 @@ class DocumentExporter:
137
  elif tag == 'tr':
138
  self.current_table_row = []
139
 
140
- elif tag == 'th' or tag == 'td':
141
- pass # Will be handled in handle_data
142
-
143
  elif tag == 'br':
144
  if self.current_paragraph:
145
  self.current_paragraph.add_run().add_break()
146
 
147
- def _apply_spacing_from_style(self, style_attr):
148
- """Apply spacing and indentation from HTML style to DOCX paragraph"""
149
  if not self.current_paragraph:
150
  return
151
 
152
- # Extract margin-left for indentation
153
- import re
154
- margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
155
- if margin_match:
156
- em_value = float(margin_match.group(1))
157
- # Convert em to inches (1em ≈ 12pt, 72pt = 1 inch)
158
- indent_inches = (em_value * 12) / 72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
160
-
161
- # For bullet points, add hanging indent
162
- if self.is_bullet_point:
163
- self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
164
 
165
- # Set line spacing for better readability
166
- from docx.shared import Length
 
 
 
167
  self.current_paragraph.paragraph_format.line_spacing = 1.15
168
 
169
- # Add appropriate spacing after paragraphs
170
- self.current_paragraph.paragraph_format.space_after = Pt(6)
 
 
 
 
 
 
 
 
 
 
171
 
172
  def handle_endtag(self, tag):
173
- if tag == 'div' and (self.is_title or self.is_heading):
174
- self.is_title = False
175
- self.is_heading = False
176
- self.current_paragraph = None
177
-
178
- elif tag == 'div' and self.current_paragraph and not self.in_table:
179
- self.is_bullet_point = False
 
 
180
  self.current_paragraph = None
 
 
 
181
 
182
  elif tag == 'table':
183
  self.in_table = False
@@ -189,28 +234,123 @@ class DocumentExporter:
189
 
190
  def handle_data(self, data):
191
  if data.strip():
192
- # Convert &nbsp; back to regular spaces
193
  data = data.replace('&nbsp;', ' ')
194
 
195
  if self.in_table:
196
  self.current_table_row.append(data.strip())
197
  elif self.current_paragraph is not None:
198
- run = self.current_paragraph.add_run(data)
199
- if self.is_title:
 
 
 
 
 
 
200
  run.bold = True
201
  run.font.size = Pt(16)
202
- elif self.is_heading:
 
 
 
 
 
 
 
 
 
203
  run.bold = True
204
  run.font.size = Pt(14)
 
205
  else:
206
- # Regular text formatting
207
- run.font.size = Pt(11)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  def _create_enhanced_docx_table(self):
 
210
  if not self.table_data:
211
  return
212
 
213
- # Create table with proper formatting
214
  rows = len(self.table_data)
215
  cols = max(len(row) for row in self.table_data) if self.table_data else 1
216
 
@@ -218,10 +358,7 @@ class DocumentExporter:
218
  table.style = 'Table Grid'
219
  table.alignment = WD_TABLE_ALIGNMENT.LEFT
220
 
221
- # Set table margins
222
- table.autofit = False
223
-
224
- # Fill table data with proper formatting
225
  for row_idx, row_data in enumerate(self.table_data):
226
  table_row = table.rows[row_idx]
227
  for col_idx, cell_data in enumerate(row_data):
@@ -235,15 +372,19 @@ class DocumentExporter:
235
  for run in paragraph.runs:
236
  run.bold = True
237
  run.font.size = Pt(10)
 
238
  paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
 
 
 
 
 
239
  else:
240
  # Regular data cells
241
  for paragraph in cell.paragraphs:
242
  for run in paragraph.runs:
243
  run.font.size = Pt(10)
244
-
245
- # Set cell margins for better spacing
246
- cell.vertical_alignment = WD_ALIGN_PARAGRAPH.LEFT
247
 
248
  # Add spacing after table
249
  self.doc.add_paragraph()
@@ -251,14 +392,14 @@ class DocumentExporter:
251
  # Create DOCX document
252
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
253
  temp_file = tempfile.NamedTemporaryFile(
254
- suffix=f'_extracted_document_{timestamp}.docx',
255
  delete=False
256
  )
257
  temp_file.close()
258
 
259
  doc = Document()
260
 
261
- # Set document margins for better spacing
262
  sections = doc.sections
263
  for section in sections:
264
  section.top_margin = Inches(1)
@@ -266,84 +407,65 @@ class DocumentExporter:
266
  section.left_margin = Inches(1)
267
  section.right_margin = Inches(1)
268
 
269
- # Title with better formatting
270
  title = doc.add_heading('PDF OCR Extraction Results', 0)
271
  title.alignment = WD_ALIGN_PARAGRAPH.CENTER
 
 
272
 
273
- # Add subtitle with enhanced styling
274
  subtitle_para = doc.add_paragraph()
275
- subtitle_run = subtitle_para.add_run('Enhanced with HTML Processing and Preserved Formatting')
276
  subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
277
  subtitle_run.italic = True
278
  subtitle_run.font.size = Pt(12)
279
  subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
280
 
281
- # Metadata section with better formatting
 
 
 
 
 
 
 
282
  if metadata_info:
283
  doc.add_heading('Processing Information', level=1)
284
  meta_para = doc.add_paragraph()
285
  meta_run = meta_para.add_run(metadata_info)
286
  meta_run.font.size = Pt(10)
287
  meta_para.style = 'Intense Quote'
288
- doc.add_paragraph() # Add spacing
 
 
 
 
 
 
289
 
290
- # Process HTML content with enhanced spacing
291
  doc.add_heading('Extracted Content', level=1)
292
 
293
- if html_content and '<table' in html_content:
294
- # Parse HTML and convert to DOCX with spacing preservation
295
- parser = EnhancedDOCXHTMLParser(doc)
296
  parser.feed(html_content)
297
  else:
298
- # Fallback to text content with enhanced formatting
299
- paragraphs = text_content.split('\n\n')
300
- for para in paragraphs:
301
- if para.strip():
302
- if para.strip().startswith('==='):
303
- # Page headers with minimal separation
304
- page_header = doc.add_heading(para.strip(), level=1)
305
- page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
306
- elif para.strip().startswith('#'):
307
- # Titles
308
- title_text = para.strip().lstrip('#').strip()
309
- title_para = doc.add_heading(title_text, level=1)
310
- elif para.strip().startswith('##'):
311
- # Section headings
312
- heading_text = para.strip().lstrip('#').strip()
313
- heading_para = doc.add_heading(heading_text, level=2)
314
- else:
315
- # Regular paragraphs with spacing preservation
316
- lines = para.split('\n')
317
- for line in lines:
318
- if line.strip():
319
- para_element = doc.add_paragraph()
320
-
321
- # Calculate indentation from leading spaces
322
- leading_spaces = len(line) - len(line.lstrip())
323
- if leading_spaces > 0:
324
- indent_level = leading_spaces // 2 # 2 spaces = 1 indent level
325
- para_element.paragraph_format.left_indent = Inches(0.5 * indent_level)
326
-
327
- # Add the text content
328
- run = para_element.add_run(line.strip())
329
- run.font.size = Pt(11)
330
-
331
- # Set line spacing
332
- para_element.paragraph_format.line_spacing = 1.15
333
- para_element.paragraph_format.space_after = Pt(3)
334
-
335
- # Enhanced footer
336
  footer_section = doc.sections[0]
337
  footer = footer_section.footer
338
  footer_para = footer.paragraphs[0]
339
- footer_para.text = f"Generated by Enhanced PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
340
  footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
341
  footer_run = footer_para.runs[0]
342
- footer_run.font.size = Pt(9)
343
  footer_run.font.color.rgb = RGBColor(128, 128, 128)
344
 
345
  doc.save(temp_file.name)
346
- logger.info(f"Enhanced DOCX file with proper spacing created: {temp_file.name}")
347
  return temp_file.name
348
 
349
  except ImportError:
@@ -356,9 +478,99 @@ class DocumentExporter:
356
  pass
357
  raise
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  @staticmethod
360
  def create_html_file(html_content: str, metadata_info: str = "") -> str:
361
- """Create standalone HTML file"""
362
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
363
  temp_file = tempfile.NamedTemporaryFile(
364
  suffix=f'_extracted_document_{timestamp}.html',
@@ -368,26 +580,80 @@ class DocumentExporter:
368
  )
369
 
370
  try:
371
- # Enhanced HTML with better styling
372
- enhanced_html = html_content.replace(
373
- '<style>',
374
- '''<style>
375
- body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; margin: 20px; background-color: #f9f9f9; }
376
- .container { max-width: 1200px; margin: 0 auto; background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
377
- .header { text-align: center; margin-bottom: 30px; border-bottom: 3px solid #2c3e50; padding-bottom: 20px; }
378
- .metadata { background-color: #ecf0f1; padding: 15px; border-radius: 5px; margin-bottom: 25px; border-left: 4px solid #3498db; }
379
- '''
380
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- # Wrap content in container
383
- if '<body>' in enhanced_html:
384
  enhanced_html = enhanced_html.replace(
385
  '<body>',
386
  '''<body>
387
  <div class="container">
388
  <div class="header">
389
  <h1>PDF OCR Extraction Results</h1>
390
- <p>Enhanced with HTML Processing and Format Preservation</p>
 
 
 
 
 
 
 
 
 
 
391
  </div>''' +
392
  (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
393
  )
@@ -404,23 +670,24 @@ class DocumentExporter:
404
 
405
 
406
  class BackendManager:
407
- """Enhanced backend manager with FIXED crop processing and advanced export capabilities"""
408
 
409
  def __init__(self):
410
  self.ocr_service = OCRService()
 
411
  self.processing_history = []
412
  self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))
413
 
414
  # Create directories for temporary files and logs
415
- self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
416
  self.temp_dir.mkdir(exist_ok=True)
417
 
418
- logger.info("Enhanced backend manager with fixed crop processing initialized successfully")
419
 
420
  def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
421
  preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
422
  """
423
- Process PDF with enhanced resolution and HTML generation
424
 
425
  Args:
426
  pdf_path: Path to the PDF file
@@ -428,7 +695,7 @@ class BackendManager:
428
  preprocessing_options: Dictionary containing preprocessing settings
429
 
430
  Returns:
431
- Dict containing processing results with HTML content
432
  """
433
  start_time = datetime.now()
434
 
@@ -460,7 +727,7 @@ class BackendManager:
460
  # Generate file hash for tracking
461
  file_hash = self._calculate_file_hash(pdf_path)
462
 
463
- logger.info(f"Processing PDF with enhanced resolution: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
464
  logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
465
 
466
  # Handle preprocessing if enabled
@@ -478,12 +745,23 @@ class BackendManager:
478
  processed_pdf_path = pdf_path
479
 
480
  try:
481
- # Process with enhanced OCR
482
  result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
483
 
484
  # Add processing metadata
485
  processing_time = (datetime.now() - start_time).total_seconds()
486
 
 
 
 
 
 
 
 
 
 
 
 
487
  result['metadata'].update({
488
  'file_hash': file_hash,
489
  'file_size_mb': round(file_size / (1024*1024), 2),
@@ -491,8 +769,12 @@ class BackendManager:
491
  'timestamp': start_time.isoformat(),
492
  'enhanced_processing': True,
493
  'html_processing': True,
 
 
 
494
  'header_footer_removed': preprocessing_applied,
495
- 'preprocessing_options': preprocessing_options if preprocessing_applied else None
 
496
  })
497
 
498
  # Cleanup temporary preprocessed file
@@ -502,7 +784,7 @@ class BackendManager:
502
  except:
503
  pass
504
 
505
- # Log results
506
  if result['success']:
507
  text_length = len(result['text'])
508
  has_html = bool(result.get('html'))
@@ -512,10 +794,17 @@ class BackendManager:
512
  logger.info(f"Method used: {result['method_used']}")
513
  logger.info(f"Text extracted: {text_length} characters")
514
  logger.info(f"HTML generated: {has_html}")
 
 
 
 
515
  if table_count > 0:
516
  logger.info(f"Tables detected: {table_count}")
517
  if preprocessing_applied:
518
  logger.info("Enhanced preprocessing applied")
 
 
 
519
 
520
  # Add to processing history
521
  self._add_to_history({
@@ -528,7 +817,11 @@ class BackendManager:
528
  'processing_time': processing_time,
529
  'preprocessing_applied': preprocessing_applied,
530
  'html_generated': has_html,
531
- 'enhanced_processing': True
 
 
 
 
532
  })
533
  else:
534
  logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
@@ -542,7 +835,10 @@ class BackendManager:
542
  'error': result.get('error', 'Unknown error'),
543
  'processing_time': processing_time,
544
  'preprocessing_applied': preprocessing_applied,
545
- 'enhanced_processing': True
 
 
 
546
  })
547
 
548
  return result
@@ -566,7 +862,10 @@ class BackendManager:
566
  'success': False,
567
  'error': str(e),
568
  'processing_time': processing_time,
569
- 'enhanced_processing': True
 
 
 
570
  })
571
 
572
  return {
@@ -579,12 +878,15 @@ class BackendManager:
579
  'file_hash': file_hash,
580
  'processing_time_seconds': round(processing_time, 2),
581
  'timestamp': start_time.isoformat(),
582
- 'enhanced_processing': True
 
 
 
583
  }
584
  }
585
 
586
  def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
587
- """Apply enhanced preprocessing with high-resolution crop handling - FIXED"""
588
  crop_settings = options.get('crop_settings', {})
589
  per_page_crops = crop_settings.get('per_page_crops', {})
590
  enhanced_resolution = crop_settings.get('enhanced_resolution', True)
@@ -602,7 +904,7 @@ class BackendManager:
602
  page = doc.load_page(page_num)
603
  page_rect = page.rect
604
 
605
- # Get crop settings for this page - FIXED indexing
606
  page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
607
  'top': 0, 'bottom': 0, 'left': 0, 'right': 0
608
  }))
@@ -636,7 +938,6 @@ class BackendManager:
636
 
637
  # Create new page with enhanced resolution if enabled
638
  if enhanced_resolution:
639
- # Use high resolution for better quality
640
  new_page = new_doc.new_page(
641
  width=new_rect.width,
642
  height=new_rect.height
@@ -676,36 +977,36 @@ class BackendManager:
676
 
677
  def create_enhanced_downloads(self, text_content: str, html_content: str,
678
  metadata_info: str = "") -> Dict[str, str]:
679
- """Create enhanced download files with HTML processing"""
680
  download_files = {}
681
 
682
  try:
683
  # Create enhanced TXT file
684
- txt_path = DocumentExporter.create_enhanced_txt_file(
685
  text_content, html_content, metadata_info
686
  )
687
  download_files['txt'] = txt_path
688
  logger.info(f"Enhanced TXT file created: {txt_path}")
689
 
690
- # Create enhanced DOCX file if possible
691
  try:
692
- docx_path = DocumentExporter.create_enhanced_docx_file(
693
  text_content, html_content, metadata_info
694
  )
695
  download_files['docx'] = docx_path
696
- logger.info(f"Enhanced DOCX file created: {docx_path}")
697
  except ImportError:
698
  logger.warning("python-docx not available. DOCX creation skipped.")
699
  except Exception as e:
700
- logger.error(f"DOCX creation failed: {e}")
701
 
702
  # Create standalone HTML file
703
  try:
704
- html_path = DocumentExporter.create_html_file(
705
  html_content, metadata_info
706
  )
707
  download_files['html'] = html_path
708
- logger.info(f"HTML file created: {html_path}")
709
  except Exception as e:
710
  logger.error(f"HTML file creation failed: {e}")
711
 
@@ -744,10 +1045,18 @@ class BackendManager:
744
  'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
745
  'enhanced_processing': True,
746
  'html_processing': True,
 
 
 
 
747
  'docx_export_available': docx_available,
748
  'enhanced_crop_processing': True,
749
  'multi_resolution_support': True,
750
- 'crop_processing_fixed': True
 
 
 
 
751
  }
752
 
753
  return status
@@ -795,7 +1104,7 @@ class BackendManager:
795
  logger.error(f"Error during cleanup: {e}")
796
 
797
  def get_enhanced_statistics(self) -> Dict[str, Any]:
798
- """Get enhanced processing statistics"""
799
  if not self.processing_history:
800
  return {
801
  'total_processed': 0,
@@ -806,7 +1115,11 @@ class BackendManager:
806
  'total_tables_processed': 0,
807
  'preprocessing_usage': 0,
808
  'html_generation_rate': 0,
809
- 'enhanced_processing_usage': 0
 
 
 
 
810
  }
811
 
812
  total_processed = len(self.processing_history)
@@ -826,9 +1139,20 @@ class BackendManager:
826
  preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
827
  html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
828
  enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
 
 
 
 
 
 
 
 
829
 
830
  html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
831
  enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
 
 
 
832
 
833
  return {
834
  'total_processed': total_processed,
@@ -842,7 +1166,14 @@ class BackendManager:
842
  'preprocessing_usage': preprocessing_usage,
843
  'html_generation_rate': round(html_generation_rate, 2),
844
  'enhanced_processing_usage': enhanced_processing,
845
- 'enhanced_processing_rate': round(enhanced_processing_rate, 2)
 
 
 
 
 
 
 
846
  }
847
 
848
 
@@ -861,8 +1192,29 @@ if __name__ == "__main__":
861
  # Test the enhanced backend manager
862
  manager = BackendManager()
863
 
864
- print("Enhanced Backend Manager with Fixed Crop Processing Test")
865
- print("=" * 60)
866
  print(f"Available methods: {manager.get_available_methods()}")
867
  print(f"Service status: {manager.get_service_status()}")
868
- print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Backend Management Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
3
  Coordinates between UI and OCR services, handles file management and preprocessing
4
  """
5
  import re
 
14
  import cv2
15
  import numpy as np
16
  import fitz # PyMuPDF
17
+ from docx import Document
18
+ from docx.shared import Inches, Pt, RGBColor
19
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
20
+ from docx.enum.table import WD_TABLE_ALIGNMENT
21
+ from docx.oxml.shared import OxmlElement, qn
22
+ from html.parser import HTMLParser
23
 
24
  # Load environment variables
25
  from dotenv import load_dotenv
26
  load_dotenv()
27
 
28
  from ocr_service import OCRService
29
+ from enhanced_indentation import EnhancedIndentationDetector
30
 
31
  # Configure logging
32
  logging.basicConfig(level=logging.INFO)
33
  logger = logging.getLogger(__name__)
34
 
35
 
36
+ class EnhancedDocumentExporter:
37
+ """Advanced document export with comprehensive indentation support, parenthetical patterns, and text classification for HTML and DOCX"""
38
+
39
+ def __init__(self):
40
+ self.indent_detector = EnhancedIndentationDetector()
41
 
42
  @staticmethod
43
  def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
44
+ """Create enhanced TXT file with improved formatting and indentation preservation"""
45
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
46
  temp_file = tempfile.NamedTemporaryFile(
47
  suffix=f'_extracted_text_{timestamp}.txt',
 
52
 
53
  try:
54
  # Add header
55
+ temp_file.write("PDF OCR Extraction Results - Enhanced with Comprehensive Indentation Detection & Text Classification\n")
56
+ temp_file.write("=" * 90 + "\n\n")
57
 
58
  # Add metadata
59
  if metadata_info:
 
63
 
64
  # Add timestamp
65
  temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
66
+ temp_file.write("=" * 90 + "\n\n")
67
+
68
+ # Add feature list
69
+ temp_file.write("Enhanced Features Applied:\n")
70
+ temp_file.write("-" * 25 + "\n")
71
+ temp_file.write("• Comprehensive Indentation Detection (20+ patterns)\n")
72
+ temp_file.write("• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n")
73
+ temp_file.write("• Intelligent Text Classification (headers, paragraphs, lists)\n")
74
+ temp_file.write("• Multi-language Support (English, Thai)\n")
75
+ temp_file.write("• HTML Intermediate Processing\n")
76
+ temp_file.write("• Priority-based Pattern Matching\n")
77
+ temp_file.write("• Document Structure Analysis\n\n")
78
 
79
  # Add main content
80
+ temp_file.write("Extracted Text (Enhanced with Comprehensive Pattern Detection):\n")
81
+ temp_file.write("-" * 60 + "\n\n")
82
  temp_file.write(text_content)
83
 
84
  temp_file.close()
 
89
  temp_file.close()
90
  raise
91
 
92
+ def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str:
93
+ """Create enhanced DOCX file with comprehensive indentation support, parenthetical patterns, and text classification"""
 
94
  try:
 
 
 
 
 
 
 
 
95
  class EnhancedDOCXHTMLParser(HTMLParser):
96
def __init__(self, doc, processor):
    """Initialise the parser state used while converting HTML into *doc*.

    Args:
        doc: Document object that paragraphs, headings and tables are
            added to by the tag handlers.
        processor: Owning exporter; kept so ``handle_data`` can reach
            ``processor.indent_detector``.
    """
    super().__init__()

    # Collaborators supplied by the caller.
    self.doc = doc
    self.processor = processor

    # Block currently receiving text, plus the styling derived from the
    # classes of the <div> that opened it.
    self.current_paragraph = None
    self.current_classes = []
    self.current_indent_level = 0
    self.current_formatting_hint = 'normal_text'

    # Table accumulation: rows are collected here and rendered later.
    self.in_table = False
    self.table_data = []
    self.current_table_row = []

    # Flags telling handle_data which heading-like <div> is open.
    self.in_title = False
    self.in_section_heading = False
    self.in_page_header = False
    self.in_content_header = False

    # NOTE: ``has_content`` is intentionally not created here;
    # handle_starttag uses hasattr() on it to detect the first page.
 
112
  def handle_starttag(self, tag, attrs):
113
  attr_dict = dict(attrs)
114
  class_attr = attr_dict.get('class', '')
115
+ self.current_classes = class_attr.split()
116
 
117
+ if 'page' in class_attr and tag == 'div':
 
118
  if hasattr(self, 'has_content'):
 
119
  self.doc.add_paragraph()
120
  self.doc.add_paragraph()
121
  self.has_content = True
122
 
123
+ elif 'page-header' in class_attr:
124
  self.current_paragraph = self.doc.add_heading(level=1)
125
  self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
126
+ self.in_page_header = True
127
 
128
+ elif 'content-header' in class_attr:
129
+ self.current_paragraph = self.doc.add_heading(level=2)
130
+ self.in_content_header = True
131
+
132
+ elif 'title' in class_attr:
133
  self.current_paragraph = self.doc.add_heading(level=1)
134
+ self.in_title = True
 
135
 
136
+ elif 'section-heading' in class_attr:
137
  self.current_paragraph = self.doc.add_heading(level=2)
138
+ self.in_section_heading = True
 
139
 
140
  elif tag == 'div' and 'paragraph' in class_attr:
141
  self.current_paragraph = self.doc.add_paragraph()
142
+ self._apply_enhanced_formatting()
 
143
 
144
  elif tag == 'table':
145
  self.in_table = True
 
148
  elif tag == 'tr':
149
  self.current_table_row = []
150
 
 
 
 
151
  elif tag == 'br':
152
  if self.current_paragraph:
153
  self.current_paragraph.add_run().add_break()
154
 
155
def _apply_enhanced_formatting(self):
    """Format the current paragraph from the CSS classes of its <div>.

    Updates ``self.current_indent_level`` and
    ``self.current_formatting_hint`` from ``self.current_classes``, then
    sets left/hanging indentation, line spacing and before/after spacing
    on ``self.current_paragraph``. Does nothing when no paragraph is open.
    """
    paragraph = self.current_paragraph
    if not paragraph:
        return

    fmt = paragraph.paragraph_format
    classes = self.current_classes

    # Only the first "indent-level-N" class is considered; a malformed
    # suffix falls back to level 0. Without such a class the previous
    # level is left untouched.
    level_class = next(
        (name for name in classes if name.startswith('indent-level-')),
        None,
    )
    if level_class is not None:
        try:
            self.current_indent_level = int(level_class.rsplit('-', 1)[-1])
        except ValueError:
            self.current_indent_level = 0

    # Priority follows the order of this tuple, not the order in which
    # the classes appear on the element.
    known_hints = (
        'numbered-primary', 'numbered-secondary', 'numbered-tertiary',
        'numbered-quaternary', 'numbered-quinary',
        'parenthetical-primary', 'parenthetical-secondary',
        'parenthetical-tertiary', 'parenthetical-quaternary',
        'bullet-primary', 'bullet-secondary', 'bullet-tertiary',
        'bullet-quaternary',
        'lettered-primary', 'lettered-secondary',
        'roman-primary', 'roman-secondary',
        'thai-primary', 'thai-secondary',
        'indented_text', 'space-indent',
    )
    self.current_formatting_hint = next(
        (candidate for candidate in known_hints if candidate in classes),
        'normal_text',
    )
    hint = self.current_formatting_hint

    # Half an inch of left indent per nesting level.
    if self.current_indent_level > 0:
        fmt.left_indent = Inches(self.current_indent_level * 0.5)

    # Bullets and parenthetical items get a hanging first line.
    if any(kind in hint for kind in ('bullet', 'parenthetical')):
        fmt.first_line_indent = Inches(-0.25)

    fmt.line_spacing = 1.15

    # (tier keyword, space before in pt, space after in pt)
    tier_spacing = (
        ('primary', 10, 8),
        ('secondary', 8, 6),
        ('tertiary', 6, 4),
    )
    for tier, before, after in tier_spacing:
        if tier in hint:
            fmt.space_before = Pt(before)
            fmt.space_after = Pt(after)
            break
    else:
        # Everything else only gets a small gap after the paragraph.
        fmt.space_after = Pt(3)
211
 
212
  def handle_endtag(self, tag):
213
+ if tag == 'div':
214
+ if self.in_page_header:
215
+ self.in_page_header = False
216
+ elif self.in_content_header:
217
+ self.in_content_header = False
218
+ elif self.in_title:
219
+ self.in_title = False
220
+ elif self.in_section_heading:
221
+ self.in_section_heading = False
222
  self.current_paragraph = None
223
+ self.current_indent_level = 0
224
+ self.current_formatting_hint = 'normal_text'
225
+ self.current_classes = []
226
 
227
  elif tag == 'table':
228
  self.in_table = False
 
234
 
235
def handle_data(self, data):
    """Route a chunk of text to the open table row or the current paragraph.

    Whitespace-only chunks are ignored. While inside a table the stripped
    text is appended to ``self.current_table_row``. Otherwise, when a
    paragraph is open, the stripped text is added as a run and styled
    according to the heading flags or, failing that, by
    ``_apply_pattern_formatting``.

    Args:
        data: Text content reported by ``HTMLParser``.
    """
    if not data.strip():
        return

    # The parser is constructed with the default convert_charrefs=True,
    # so "&nbsp;" reaches this handler already decoded as U+00A0. Replace
    # both the literal entity and the decoded character so non-breaking
    # spaces really do become ordinary spaces in the output.
    data = data.replace('&nbsp;', ' ').replace('\xa0', ' ')

    if self.in_table:
        self.current_table_row.append(data.strip())
        return

    if self.current_paragraph is None:
        return

    # Detect patterns in the text for additional formatting.
    detector = self.processor.indent_detector
    indent_info = detector.detect_indentation(data)
    text_classification = detector.classify_text_type(data)

    run = self.current_paragraph.add_run(data.strip())

    # Heading-like contexts take precedence over pattern-based styling.
    if self.in_title:
        run.bold = True
        run.font.size = Pt(16)
        run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
    elif self.in_content_header or text_classification.get('is_header'):
        run.bold = True
        run.font.size = Pt(14)
        run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
    elif self.in_section_heading:
        run.bold = True
        run.font.size = Pt(14)
        run.font.color.rgb = RGBColor(52, 73, 94)  # Darker blue
    elif self.in_page_header:
        run.bold = True
        run.font.size = Pt(14)
        run.font.color.rgb = RGBColor(44, 62, 80)
    else:
        # Ordinary body text: style by detected pattern/classification.
        self._apply_pattern_formatting(run, indent_info, text_classification)
268
+
269
def _apply_pattern_formatting(self, run, indent_info, text_classification):
    """Style a body-text run from the formatting hint and detected pattern.

    The run always gets an 11pt font. The first matching pattern family
    (numbered, parenthetical, bullet, lettered, roman, thai, space-indent)
    then decides colour and emphasis; a family matches when its keyword
    is in ``self.current_formatting_hint`` or the corresponding ``is_*``
    flag in ``indent_info`` is truthy. If no family matches, the run is
    styled from ``text_classification``.

    Args:
        run: Run object whose ``bold``/``italic``/``font`` are set.
        indent_info: Mapping read with ``.get`` for ``level`` and the
            ``is_*`` pattern flags.
        text_classification: Mapping read with ``.get`` for
            ``is_header`` and ``is_list_item``.
    """
    hint = self.current_formatting_hint
    level = indent_info.get('level', 0)

    def family(keyword, flag):
        # A family applies via the CSS hint or the detector's flag.
        return keyword in hint or indent_info.get(flag, False)

    def tier(first_level):
        # 0/1/2 for primary/secondary/tertiary (by hint keyword or by the
        # detected level counted from first_level), 3 for anything else.
        for offset, word in enumerate(('primary', 'secondary', 'tertiary')):
            if word in hint or level == first_level + offset:
                return offset
        return 3

    # Base font size
    run.font.size = Pt(11)

    if family('numbered', 'is_numbered'):
        shades = (
            (44, 62, 80),     # Dark blue
            (52, 73, 94),     # Medium blue
            (85, 85, 85),     # Dark gray
            (102, 102, 102),  # Gray
        )
        rank = tier(1)
        if rank == 0:
            run.bold = True
        run.font.color.rgb = RGBColor(*shades[rank])

    elif family('parenthetical', 'is_parenthetical'):
        # Parenthetical items start one level deeper than numbered ones.
        shades = (
            (142, 68, 173),   # Purple
            (155, 89, 182),   # Light purple
            (175, 122, 197),  # Lighter purple
            (195, 155, 211),  # Very light purple
        )
        rank = tier(2)
        if rank == 0:
            run.bold = True
        run.font.color.rgb = RGBColor(*shades[rank])

    elif family('bullet', 'is_bullet'):
        shades = (
            (52, 152, 219),   # Blue
            (149, 165, 166),  # Gray
            (189, 195, 199),  # Light gray
            (189, 195, 199),  # Light gray
        )
        run.font.color.rgb = RGBColor(*shades[tier(1)])

    elif family('lettered', 'is_lettered'):
        run.italic = True
        if 'primary' in hint:
            run.font.color.rgb = RGBColor(142, 68, 173)  # Purple
        else:
            run.font.color.rgb = RGBColor(155, 89, 182)  # Light purple

    elif family('roman', 'is_roman'):
        run.font.color.rgb = RGBColor(211, 84, 0)  # Orange
        run.font.name = 'Times New Roman'  # Roman style font

    elif family('thai', 'is_thai'):
        if 'primary' in hint:
            run.bold = True
            run.font.color.rgb = RGBColor(22, 160, 133)  # Teal
        else:
            run.font.color.rgb = RGBColor(26, 188, 156)  # Light teal

    elif 'space-indent' in hint:
        run.italic = True
        run.font.color.rgb = RGBColor(85, 85, 85)  # Dark gray

    elif text_classification.get('is_header'):
        # No pattern family matched: fall back to the text classification.
        run.bold = True
        run.font.color.rgb = RGBColor(44, 62, 80)  # Dark blue
    elif text_classification.get('is_list_item'):
        run.font.color.rgb = RGBColor(52, 152, 219)  # Blue
    else:
        run.font.color.rgb = RGBColor(0, 0, 0)  # Black
348
 
349
  def _create_enhanced_docx_table(self):
350
+ """Create table with enhanced formatting"""
351
  if not self.table_data:
352
  return
353
 
 
354
  rows = len(self.table_data)
355
  cols = max(len(row) for row in self.table_data) if self.table_data else 1
356
 
 
358
  table.style = 'Table Grid'
359
  table.alignment = WD_TABLE_ALIGNMENT.LEFT
360
 
361
+ # Fill table data with enhanced formatting
 
 
 
362
  for row_idx, row_data in enumerate(self.table_data):
363
  table_row = table.rows[row_idx]
364
  for col_idx, cell_data in enumerate(row_data):
 
372
  for run in paragraph.runs:
373
  run.bold = True
374
  run.font.size = Pt(10)
375
+ run.font.color.rgb = RGBColor(44, 62, 80)
376
  paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
377
+
378
+ # Add background color to header
379
+ shading_elm_1 = OxmlElement('w:shd')
380
+ shading_elm_1.set(qn('w:fill'), 'ECF0F1')
381
+ paragraph._element.get_or_add_pPr().append(shading_elm_1)
382
  else:
383
  # Regular data cells
384
  for paragraph in cell.paragraphs:
385
  for run in paragraph.runs:
386
  run.font.size = Pt(10)
387
+ paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
 
 
388
 
389
  # Add spacing after table
390
  self.doc.add_paragraph()
 
392
  # Create DOCX document
393
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
394
  temp_file = tempfile.NamedTemporaryFile(
395
+ suffix=f'_enhanced_document_{timestamp}.docx',
396
  delete=False
397
  )
398
  temp_file.close()
399
 
400
  doc = Document()
401
 
402
+ # Set document margins for better layout
403
  sections = doc.sections
404
  for section in sections:
405
  section.top_margin = Inches(1)
 
407
  section.left_margin = Inches(1)
408
  section.right_margin = Inches(1)
409
 
410
+ # Add title with enhanced styling
411
  title = doc.add_heading('PDF OCR Extraction Results', 0)
412
  title.alignment = WD_ALIGN_PARAGRAPH.CENTER
413
+ title_run = title.runs[0]
414
+ title_run.font.color.rgb = RGBColor(44, 62, 80)
415
 
416
+ # Add subtitle
417
  subtitle_para = doc.add_paragraph()
418
+ subtitle_run = subtitle_para.add_run('Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification')
419
  subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
420
  subtitle_run.italic = True
421
  subtitle_run.font.size = Pt(12)
422
  subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
423
 
424
+ # Add feature list
425
+ features_para = doc.add_paragraph()
426
+ features_run = features_para.add_run('Features: Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification')
427
+ features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
428
+ features_run.font.size = Pt(9)
429
+ features_run.font.color.rgb = RGBColor(149, 165, 166)
430
+
431
+ # Add metadata section
432
  if metadata_info:
433
  doc.add_heading('Processing Information', level=1)
434
  meta_para = doc.add_paragraph()
435
  meta_run = meta_para.add_run(metadata_info)
436
  meta_run.font.size = Pt(10)
437
  meta_para.style = 'Intense Quote'
438
+
439
+ # Add background to metadata
440
+ shading_elm = OxmlElement('w:shd')
441
+ shading_elm.set(qn('w:fill'), 'F8F9FA')
442
+ meta_para._element.get_or_add_pPr().append(shading_elm)
443
+
444
+ doc.add_paragraph()
445
 
446
+ # Process content
447
  doc.add_heading('Extracted Content', level=1)
448
 
449
+ if html_content and '<div' in html_content:
450
+ # Parse HTML with enhanced indentation processing and text classification
451
+ parser = EnhancedDOCXHTMLParser(doc, self)
452
  parser.feed(html_content)
453
  else:
454
+ # Fallback to text processing with enhanced indentation and classification
455
+ self._process_text_content_enhanced(doc, text_content)
456
+
457
+ # Add enhanced footer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  footer_section = doc.sections[0]
459
  footer = footer_section.footer
460
  footer_para = footer.paragraphs[0]
461
+ footer_para.text = f"Generated by Enhanced PDF OCR Service with Comprehensive Indentation Detection & Text Classification on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
462
  footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
463
  footer_run = footer_para.runs[0]
464
+ footer_run.font.size = Pt(8)
465
  footer_run.font.color.rgb = RGBColor(128, 128, 128)
466
 
467
  doc.save(temp_file.name)
468
+ logger.info(f"Enhanced DOCX file with comprehensive indentation support and text classification created: {temp_file.name}")
469
  return temp_file.name
470
 
471
  except ImportError:
 
478
  pass
479
  raise
480
 
481
+ def _process_text_content_enhanced(self, doc, text_content):
482
+ """Process text content with enhanced indentation detection and text classification"""
483
+ paragraphs = text_content.split('\n\n')
484
+
485
+ for para_text in paragraphs:
486
+ if not para_text.strip():
487
+ continue
488
+
489
+ lines = para_text.split('\n')
490
+ for line in lines:
491
+ if not line.strip():
492
+ continue
493
+
494
+ # Detect indentation and classify text
495
+ indent_info = self.indent_detector.detect_indentation(line)
496
+ text_classification = self.indent_detector.classify_text_type(line)
497
+
498
+ if line.strip().startswith('==='):
499
+ # Page headers
500
+ page_header = doc.add_heading(line.strip(), level=1)
501
+ page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
502
+ header_run = page_header.runs[0]
503
+ header_run.font.color.rgb = RGBColor(44, 62, 80)
504
+ elif line.strip().startswith('##'):
505
+ # Section headings
506
+ heading_text = line.strip().lstrip('#').strip()
507
+ heading = doc.add_heading(heading_text, level=2)
508
+ heading_run = heading.runs[0]
509
+ heading_run.font.color.rgb = RGBColor(52, 73, 94)
510
+ elif text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.7:
511
+ # Detected headers
512
+ heading = doc.add_heading(indent_info.get('content', line.strip()), level=2)
513
+ heading_run = heading.runs[0]
514
+ heading_run.font.color.rgb = RGBColor(52, 73, 94)
515
+ else:
516
+ # Regular content with enhanced indentation and classification
517
+ para = doc.add_paragraph()
518
+
519
+ # Apply indentation based on detected level
520
+ level = indent_info.get('level', 0)
521
+ if level > 0:
522
+ para.paragraph_format.left_indent = Inches(level * 0.5)
523
+
524
+ # Apply pattern-specific formatting
525
+ if indent_info.get('is_bullet', False) or indent_info.get('is_parenthetical', False):
526
+ para.paragraph_format.first_line_indent = Inches(-0.25)
527
+
528
+ # Set proper spacing
529
+ para.paragraph_format.line_spacing = 1.15
530
+ para.paragraph_format.space_after = Pt(3)
531
+
532
+ # Add content with enhanced formatting
533
+ content = indent_info.get('content', line.strip())
534
+ marker = indent_info.get('pattern_marker', '')
535
+
536
+ # Include marker for non-bullet items
537
+ if marker and not indent_info.get('is_bullet', False):
538
+ content = f"{marker} {content}"
539
+
540
+ run = para.add_run(content)
541
+ run.font.size = Pt(11)
542
+
543
+ # Apply color coding based on pattern type and classification
544
+ pattern_type = indent_info.get('pattern_type', 'normal')
545
+ if 'numbered' in pattern_type or 'decimal' in pattern_type:
546
+ if level == 1:
547
+ run.bold = True
548
+ run.font.color.rgb = RGBColor(44, 62, 80)
549
+ elif level == 2:
550
+ run.font.color.rgb = RGBColor(52, 73, 94)
551
+ else:
552
+ run.font.color.rgb = RGBColor(85, 85, 85)
553
+ elif 'parenthetical' in pattern_type:
554
+ if level <= 2:
555
+ run.bold = True
556
+ run.font.color.rgb = RGBColor(142, 68, 173) # Purple
557
+ else:
558
+ run.font.color.rgb = RGBColor(155, 89, 182) # Light purple
559
+ elif 'bullet' in pattern_type:
560
+ run.font.color.rgb = RGBColor(52, 152, 219)
561
+ elif 'lettered' in pattern_type:
562
+ run.italic = True
563
+ run.font.color.rgb = RGBColor(142, 68, 173)
564
+ elif 'roman' in pattern_type:
565
+ run.font.color.rgb = RGBColor(211, 84, 0)
566
+ elif 'thai' in pattern_type:
567
+ run.font.color.rgb = RGBColor(22, 160, 133)
568
+ elif text_classification.get('is_list_item'):
569
+ run.font.color.rgb = RGBColor(52, 152, 219)
570
+
571
  @staticmethod
572
  def create_html_file(html_content: str, metadata_info: str = "") -> str:
573
+ """Create standalone HTML file with enhanced styling for comprehensive indentation and text classification"""
574
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
575
  temp_file = tempfile.NamedTemporaryFile(
576
  suffix=f'_extracted_document_{timestamp}.html',
 
580
  )
581
 
582
  try:
583
+ # Enhance HTML with better styling
584
+ enhanced_html = html_content
585
+
586
+ # Add comprehensive styling if not already present
587
+ if '<style>' not in enhanced_html:
588
+ enhanced_html = enhanced_html.replace(
589
+ '<head>',
590
+ '''<head>
591
+ <style>
592
+ body {
593
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
594
+ line-height: 1.6;
595
+ margin: 20px;
596
+ background-color: #f9f9f9;
597
+ }
598
+ .container {
599
+ max-width: 1200px;
600
+ margin: 0 auto;
601
+ background-color: white;
602
+ padding: 30px;
603
+ border-radius: 8px;
604
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
605
+ }
606
+ .header {
607
+ text-align: center;
608
+ margin-bottom: 30px;
609
+ border-bottom: 3px solid #2c3e50;
610
+ padding-bottom: 20px;
611
+ }
612
+ .metadata {
613
+ background-color: #ecf0f1;
614
+ padding: 15px;
615
+ border-radius: 5px;
616
+ margin-bottom: 25px;
617
+ border-left: 4px solid #3498db;
618
+ }
619
+ .enhanced-features {
620
+ background-color: #e8f5e8;
621
+ padding: 10px;
622
+ border-radius: 5px;
623
+ margin-bottom: 20px;
624
+ border-left: 4px solid #27ae60;
625
+ font-size: 0.9em;
626
+ }
627
+ .classification-features {
628
+ background-color: #fef9e7;
629
+ padding: 10px;
630
+ border-radius: 5px;
631
+ margin-bottom: 20px;
632
+ border-left: 4px solid #f39c12;
633
+ font-size: 0.9em;
634
+ }
635
+ </style>'''
636
+ )
637
 
638
+ # Wrap content in container if not already wrapped
639
+ if '<body>' in enhanced_html and '.container' not in enhanced_html:
640
  enhanced_html = enhanced_html.replace(
641
  '<body>',
642
  '''<body>
643
  <div class="container">
644
  <div class="header">
645
  <h1>PDF OCR Extraction Results</h1>
646
+ <p>Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</p>
647
+ </div>
648
+ <div class="enhanced-features">
649
+ <strong>Indentation Features:</strong> Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a), (i), (ก)) •
650
+ Multi-level Bullets • Letter & Roman Numerals • Thai Script Support •
651
+ Space-based Indentation • Pattern Priority Detection
652
+ </div>
653
+ <div class="classification-features">
654
+ <strong>Text Classification:</strong> Header Detection • Paragraph Recognition •
655
+ List Item Identification • Context Analysis • Confidence Scoring •
656
+ Document Structure Analysis
657
  </div>''' +
658
  (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
659
  )
 
670
 
671
 
672
  class BackendManager:
673
+ """Enhanced backend manager with comprehensive indentation detection, parenthetical patterns, text classification, and advanced export capabilities"""
674
 
675
def __init__(self):
    """Create the OCR service and exporter, reset the history, and prepare the scratch directory."""
    self.ocr_service = OCRService()
    self.document_exporter = EnhancedDocumentExporter()

    # History starts empty; its size limit is read from MAX_HISTORY_SIZE (default 100)
    self.processing_history = []
    history_limit = os.getenv('MAX_HISTORY_SIZE', 100)
    self.max_history_size = int(history_limit)

    # Scratch directory under the system temp location
    scratch_root = Path(tempfile.gettempdir())
    self.temp_dir = scratch_root / 'pdf_ocr_service_enhanced_v2'
    self.temp_dir.mkdir(exist_ok=True)

    logger.info("Enhanced backend manager with comprehensive indentation detection and text classification initialized successfully")
686
 
687
  def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
688
  preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
689
  """
690
+ Process PDF with enhanced resolution, comprehensive indentation detection, and intelligent text classification
691
 
692
  Args:
693
  pdf_path: Path to the PDF file
 
695
  preprocessing_options: Dictionary containing preprocessing settings
696
 
697
  Returns:
698
+ Dict containing processing results with enhanced HTML content, indentation, and text classification
699
  """
700
  start_time = datetime.now()
701
 
 
727
  # Generate file hash for tracking
728
  file_hash = self._calculate_file_hash(pdf_path)
729
 
730
+ logger.info(f"Processing PDF with enhanced indentation detection and text classification: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
731
  logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
732
 
733
  # Handle preprocessing if enabled
 
745
  processed_pdf_path = pdf_path
746
 
747
  try:
748
+ # Process with enhanced OCR, indentation detection, and text classification
749
  result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
750
 
751
  # Add processing metadata
752
  processing_time = (datetime.now() - start_time).total_seconds()
753
 
754
+ # Analyze document structure with text classification if successful
755
+ document_analysis = {}
756
+ if result['success'] and result['text']:
757
+ try:
758
+ text_lines = result['text'].split('\n')
759
+ detector = EnhancedIndentationDetector()
760
+ document_analysis = detector.analyze_document_structure(text_lines)
761
+ except Exception as analysis_error:
762
+ logger.warning(f"Document structure analysis failed: {analysis_error}")
763
+ document_analysis = {'analysis_failed': True}
764
+
765
  result['metadata'].update({
766
  'file_hash': file_hash,
767
  'file_size_mb': round(file_size / (1024*1024), 2),
 
769
  'timestamp': start_time.isoformat(),
770
  'enhanced_processing': True,
771
  'html_processing': True,
772
+ 'comprehensive_indentation': True,
773
+ 'parenthetical_patterns_supported': True,
774
+ 'intelligent_text_classification': True,
775
  'header_footer_removed': preprocessing_applied,
776
+ 'preprocessing_options': preprocessing_options if preprocessing_applied else None,
777
+ 'document_structure_analysis': document_analysis
778
  })
779
 
780
  # Cleanup temporary preprocessed file
 
784
  except:
785
  pass
786
 
787
+ # Log results with enhanced information
788
  if result['success']:
789
  text_length = len(result['text'])
790
  has_html = bool(result.get('html'))
 
794
  logger.info(f"Method used: {result['method_used']}")
795
  logger.info(f"Text extracted: {text_length} characters")
796
  logger.info(f"HTML generated: {has_html}")
797
+ logger.info(f"Comprehensive indentation detection: Enabled")
798
+ logger.info(f"Parenthetical patterns supported: Enabled")
799
+ logger.info(f"Intelligent text classification: Enabled")
800
+
801
  if table_count > 0:
802
  logger.info(f"Tables detected: {table_count}")
803
  if preprocessing_applied:
804
  logger.info("Enhanced preprocessing applied")
805
+ if document_analysis and not document_analysis.get('analysis_failed'):
806
+ logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}")
807
+ logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items")
808
 
809
  # Add to processing history
810
  self._add_to_history({
 
817
  'processing_time': processing_time,
818
  'preprocessing_applied': preprocessing_applied,
819
  'html_generated': has_html,
820
+ 'enhanced_processing': True,
821
+ 'comprehensive_indentation': True,
822
+ 'parenthetical_patterns_supported': True,
823
+ 'intelligent_text_classification': True,
824
+ 'document_analysis': document_analysis
825
  })
826
  else:
827
  logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
 
835
  'error': result.get('error', 'Unknown error'),
836
  'processing_time': processing_time,
837
  'preprocessing_applied': preprocessing_applied,
838
+ 'enhanced_processing': True,
839
+ 'comprehensive_indentation': True,
840
+ 'parenthetical_patterns_supported': True,
841
+ 'intelligent_text_classification': True
842
  })
843
 
844
  return result
 
862
  'success': False,
863
  'error': str(e),
864
  'processing_time': processing_time,
865
+ 'enhanced_processing': True,
866
+ 'comprehensive_indentation': True,
867
+ 'parenthetical_patterns_supported': True,
868
+ 'intelligent_text_classification': True
869
  })
870
 
871
  return {
 
878
  'file_hash': file_hash,
879
  'processing_time_seconds': round(processing_time, 2),
880
  'timestamp': start_time.isoformat(),
881
+ 'enhanced_processing': True,
882
+ 'comprehensive_indentation': True,
883
+ 'parenthetical_patterns_supported': True,
884
+ 'intelligent_text_classification': True
885
  }
886
  }
887
 
888
  def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
889
+ """Apply enhanced preprocessing with high-resolution crop handling"""
890
  crop_settings = options.get('crop_settings', {})
891
  per_page_crops = crop_settings.get('per_page_crops', {})
892
  enhanced_resolution = crop_settings.get('enhanced_resolution', True)
 
904
  page = doc.load_page(page_num)
905
  page_rect = page.rect
906
 
907
+ # Get crop settings for this page
908
  page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
909
  'top': 0, 'bottom': 0, 'left': 0, 'right': 0
910
  }))
 
938
 
939
  # Create new page with enhanced resolution if enabled
940
  if enhanced_resolution:
 
941
  new_page = new_doc.new_page(
942
  width=new_rect.width,
943
  height=new_rect.height
 
977
 
978
  def create_enhanced_downloads(self, text_content: str, html_content: str,
979
  metadata_info: str = "") -> Dict[str, str]:
980
+ """Create enhanced download files with comprehensive indentation support, parenthetical patterns, and text classification"""
981
  download_files = {}
982
 
983
  try:
984
  # Create enhanced TXT file
985
+ txt_path = EnhancedDocumentExporter.create_enhanced_txt_file(
986
  text_content, html_content, metadata_info
987
  )
988
  download_files['txt'] = txt_path
989
  logger.info(f"Enhanced TXT file created: {txt_path}")
990
 
991
+ # Create enhanced DOCX file with comprehensive indentation support and text classification
992
  try:
993
+ docx_path = self.document_exporter.create_enhanced_docx_file(
994
  text_content, html_content, metadata_info
995
  )
996
  download_files['docx'] = docx_path
997
+ logger.info(f"Enhanced DOCX file with comprehensive indentation and text classification created: {docx_path}")
998
  except ImportError:
999
  logger.warning("python-docx not available. DOCX creation skipped.")
1000
  except Exception as e:
1001
+ logger.error(f"Enhanced DOCX creation failed: {e}")
1002
 
1003
  # Create standalone HTML file
1004
  try:
1005
+ html_path = EnhancedDocumentExporter.create_html_file(
1006
  html_content, metadata_info
1007
  )
1008
  download_files['html'] = html_path
1009
+ logger.info(f"Enhanced HTML file created: {html_path}")
1010
  except Exception as e:
1011
  logger.error(f"HTML file creation failed: {e}")
1012
 
 
1045
  'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
1046
  'enhanced_processing': True,
1047
  'html_processing': True,
1048
+ 'comprehensive_indentation': True,
1049
+ 'parenthetical_patterns_supported': True,
1050
+ 'intelligent_text_classification': True,
1051
+ 'pattern_detection_count': len(EnhancedIndentationDetector().patterns),
1052
  'docx_export_available': docx_available,
1053
  'enhanced_crop_processing': True,
1054
  'multi_resolution_support': True,
1055
+ 'crop_processing_fixed': True,
1056
+ 'document_structure_analysis': True,
1057
+ 'thai_script_support': True,
1058
+ 'multi_level_support': True,
1059
+ 'text_classification_features': True
1060
  }
1061
 
1062
  return status
 
1104
  logger.error(f"Error during cleanup: {e}")
1105
 
1106
  def get_enhanced_statistics(self) -> Dict[str, Any]:
1107
+ """Get enhanced processing statistics with indentation analysis and text classification"""
1108
  if not self.processing_history:
1109
  return {
1110
  'total_processed': 0,
 
1115
  'total_tables_processed': 0,
1116
  'preprocessing_usage': 0,
1117
  'html_generation_rate': 0,
1118
+ 'enhanced_processing_usage': 0,
1119
+ 'comprehensive_indentation_usage': 0,
1120
+ 'parenthetical_patterns_usage': 0,
1121
+ 'text_classification_usage': 0,
1122
+ 'document_analysis_success_rate': 0
1123
  }
1124
 
1125
  total_processed = len(self.processing_history)
 
1139
  preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
1140
  html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
1141
  enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
1142
+ comprehensive_indentation = sum(1 for h in self.processing_history if h.get('comprehensive_indentation', False))
1143
+ parenthetical_patterns = sum(1 for h in self.processing_history if h.get('parenthetical_patterns_supported', False))
1144
+ text_classification = sum(1 for h in self.processing_history if h.get('intelligent_text_classification', False))
1145
+
1146
+ # Document analysis statistics
1147
+ doc_analysis_success = sum(1 for h in self.processing_history
1148
+ if h.get('document_analysis', {}) and not h.get('document_analysis', {}).get('analysis_failed', False))
1149
+ doc_analysis_rate = (doc_analysis_success / total_processed) * 100 if total_processed > 0 else 0
1150
 
1151
  html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
1152
  enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
1153
+ comprehensive_indentation_rate = (comprehensive_indentation / total_processed) * 100 if total_processed > 0 else 0
1154
+ parenthetical_patterns_rate = (parenthetical_patterns / total_processed) * 100 if total_processed > 0 else 0
1155
+ text_classification_rate = (text_classification / total_processed) * 100 if total_processed > 0 else 0
1156
 
1157
  return {
1158
  'total_processed': total_processed,
 
1166
  'preprocessing_usage': preprocessing_usage,
1167
  'html_generation_rate': round(html_generation_rate, 2),
1168
  'enhanced_processing_usage': enhanced_processing,
1169
+ 'enhanced_processing_rate': round(enhanced_processing_rate, 2),
1170
+ 'comprehensive_indentation_usage': comprehensive_indentation,
1171
+ 'comprehensive_indentation_rate': round(comprehensive_indentation_rate, 2),
1172
+ 'parenthetical_patterns_usage': parenthetical_patterns,
1173
+ 'parenthetical_patterns_rate': round(parenthetical_patterns_rate, 2),
1174
+ 'text_classification_usage': text_classification,
1175
+ 'text_classification_rate': round(text_classification_rate, 2),
1176
+ 'document_analysis_success_rate': round(doc_analysis_rate, 2)
1177
  }
1178
 
1179
 
 
1192
  # Test the enhanced backend manager
1193
  manager = BackendManager()
1194
 
1195
+ print("Enhanced Backend Manager with Comprehensive Indentation Detection & Text Classification Test")
1196
+ print("=" * 100)
1197
  print(f"Available methods: {manager.get_available_methods()}")
1198
  print(f"Service status: {manager.get_service_status()}")
1199
+ print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
1200
+
1201
+ # Test indentation detector with parenthetical patterns
1202
+ detector = EnhancedIndentationDetector()
1203
+ test_cases = [
1204
+ "1.2.3. Hierarchical item",
1205
+ "(1) Parenthetical Arabic",
1206
+ "(๑) Parenthetical Thai numeral",
1207
+ "(a) Parenthetical letter",
1208
+ "(i) Parenthetical Roman",
1209
+ "(ก) Parenthetical Thai letter"
1210
+ ]
1211
+
1212
+ print(f"\nIndentation Detection Test with Parenthetical Patterns:")
1213
+ print("-" * 60)
1214
+ for test_text in test_cases:
1215
+ result = detector.detect_indentation(test_text)
1216
+ classification = detector.classify_text_type(test_text)
1217
+ print(f"Text: {test_text}")
1218
+ print(f" Pattern: {result['pattern_type']}, Level: {result['level']}")
1219
+ print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
1220
+ print()
enhanced_indentation.py ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Indentation Detection System
3
+ Comprehensive regex-based system for detecting hierarchical numbering and indentation levels
4
+ For PDF OCR Service with HTML and DOCX output support including parenthetical patterns
5
+ """
6
+ import re
7
+ import logging
8
+ from typing import Dict, Tuple, Optional, List, Any
9
+ from collections import Counter
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ class EnhancedIndentationDetector:
14
+ """Advanced indentation detection with comprehensive pattern matching including parenthetical patterns"""
15
+
16
+ def __init__(self):
17
+ # Define comprehensive patterns for different numbering styles
18
+ self.patterns = {
19
+ # Hierarchical decimal numbering (1.1.1.1.1...)
20
+ 'decimal_hierarchy': {
21
+ 'pattern': r'^\s*(\d+(?:\.\d+)*)\.\s+',
22
+ 'example': '1.2.3.4.5.',
23
+ 'level_func': self._calculate_decimal_level,
24
+ 'priority': 15
25
+ },
26
+
27
+ # Hierarchical numbering without final dot (1.1.1.1.1)
28
+ 'decimal_hierarchy_no_dot': {
29
+ 'pattern': r'^\s*(\d+(?:\.\d+)+)\s+',
30
+ 'example': '1.2.3.4.5',
31
+ 'level_func': self._calculate_decimal_level,
32
+ 'priority': 14
33
+ },
34
+
35
+ # Hierarchical numbering with parentheses (1.1.1) or 1.1.1)
36
+ 'decimal_hierarchy_paren': {
37
+ 'pattern': r'^\s*(\d+(?:\.\d+)*)\)\s+',
38
+ 'example': '1.2.3.4)',
39
+ 'level_func': self._calculate_decimal_level,
40
+ 'priority': 13
41
+ },
42
+
43
+ # Mixed hierarchical (1.1.a.i.A...)
44
+ 'mixed_hierarchy': {
45
+ 'pattern': r'^\s*(\d+(?:\.(?:\d+|[a-z]+|[A-Z]+|[ivxlcdm]+))+)\.\s+',
46
+ 'example': '1.2.a.i.A.',
47
+ 'level_func': self._calculate_mixed_level,
48
+ 'priority': 12
49
+ },
50
+
51
+ # Legal numbering (1.1.1.1(a)(i))
52
+ 'legal_numbering': {
53
+ 'pattern': r'^\s*(\d+(?:\.\d+)*(?:\([a-z]+\))*(?:\([ivxlcdm]+\))*)\s+',
54
+ 'example': '1.1.1(a)(i)',
55
+ 'level_func': self._calculate_legal_level,
56
+ 'priority': 11
57
+ },
58
+
59
+ # Outline numbering (I.A.1.a.i.)
60
+ 'outline_numbering': {
61
+ 'pattern': r'^\s*([IVXLCDM]+(?:\.[A-Z]+)*(?:\.\d+)*(?:\.[a-z]+)*(?:\.[ivxlcdm]+)*)\.\s+',
62
+ 'example': 'I.A.1.a.i.',
63
+ 'level_func': self._calculate_outline_level,
64
+ 'priority': 10
65
+ },
66
+
67
+ # Section numbering (§1.1.1, Article 1.1.1)
68
+ 'section_numbering': {
69
+ 'pattern': r'^\s*(?:§|Section|Article|Chapter|Part)\s*(\d+(?:\.\d+)*)\.\s+',
70
+ 'example': '§1.2.3.',
71
+ 'level_func': self._calculate_decimal_level,
72
+ 'priority': 9
73
+ },
74
+
75
+ # Thai section numbering (มาตรา, ข้อ, หมวด)
76
+ 'thai_section_numbering': {
77
+ 'pattern': r'^\s*(?:มาตรา|ข้อ|หมวด|ส่วน)\s*(\d+(?:\.\d+)*)\s+',
78
+ 'example': 'มาตรา 1.2.3',
79
+ 'level_func': self._calculate_decimal_level,
80
+ 'priority': 9
81
+ },
82
+
83
+ # Parenthetical numbering - Arabic numerals (1), (2), (3)
84
+ 'parenthetical_arabic': {
85
+ 'pattern': r'^\s*\((\d+)\)\s+',
86
+ 'example': '(1)',
87
+ 'level_func': lambda x: 2,
88
+ 'priority': 8
89
+ },
90
+
91
+ # Parenthetical numbering - Thai numerals (๑), (๒), (๓)
92
+ 'parenthetical_thai_numerals': {
93
+ 'pattern': r'^\s*\(([๐-๙]+)\)\s+',
94
+ 'example': '(๑)',
95
+ 'level_func': lambda x: 2,
96
+ 'priority': 8
97
+ },
98
+
99
+ # Parenthetical letters - lowercase (a), (b), (c)
100
+ 'parenthetical_letters_lower': {
101
+ 'pattern': r'^\s*\(([a-z]+)\)\s+',
102
+ 'example': '(a)',
103
+ 'level_func': lambda x: 3,
104
+ 'priority': 7
105
+ },
106
+
107
+ # Parenthetical letters - uppercase (A), (B), (C)
108
+ 'parenthetical_letters_upper': {
109
+ 'pattern': r'^\s*\(([A-Z]+)\)\s+',
110
+ 'example': '(A)',
111
+ 'level_func': lambda x: 2,
112
+ 'priority': 7
113
+ },
114
+
115
+ # Parenthetical Thai letters (ก), (ข), (ค)
116
+ 'parenthetical_thai_letters': {
117
+ 'pattern': r'^\s*\(([ก-ฮ]+)\)\s+',
118
+ 'example': '(ก)',
119
+ 'level_func': lambda x: 3,
120
+ 'priority': 7
121
+ },
122
+
123
+ # Parenthetical Roman numerals - lowercase (i), (ii), (iii)
124
+ 'parenthetical_roman_lower': {
125
+ 'pattern': r'^\s*\(([ivxlcdm]+)\)\s+',
126
+ 'example': '(i)',
127
+ 'level_func': lambda x: 4,
128
+ 'priority': 6
129
+ },
130
+
131
+ # Parenthetical Roman numerals - uppercase (I), (II), (III)
132
+ 'parenthetical_roman_upper': {
133
+ 'pattern': r'^\s*\(([IVXLCDM]+)\)\s+',
134
+ 'example': '(I)',
135
+ 'level_func': lambda x: 2,
136
+ 'priority': 6
137
+ },
138
+
139
+ # Simple numbered lists (1., 2., 3.)
140
+ 'simple_numbered': {
141
+ 'pattern': r'^\s*(\d+)\.\s+',
142
+ 'example': '1.',
143
+ 'level_func': lambda x: 1,
144
+ 'priority': 5
145
+ },
146
+
147
+ # Simple numbered with parens (1), 2), 3))
148
+ 'simple_numbered_paren': {
149
+ 'pattern': r'^\s*(\d+)\)\s+',
150
+ 'example': '1)',
151
+ 'level_func': lambda x: 1,
152
+ 'priority': 5
153
+ },
154
+
155
+ # Lettered lists (a., b., c.) and (A., B., C.)
156
+ 'lettered_lower': {
157
+ 'pattern': r'^\s*([a-z]+)\.\s+',
158
+ 'example': 'a.',
159
+ 'level_func': lambda x: 2,
160
+ 'priority': 4
161
+ },
162
+
163
+ 'lettered_upper': {
164
+ 'pattern': r'^\s*([A-Z]+)\.\s+',
165
+ 'example': 'A.',
166
+ 'level_func': lambda x: 1,
167
+ 'priority': 4
168
+ },
169
+
170
+ # Thai letters (ก., ข., ค.)
171
+ 'thai_lettered': {
172
+ 'pattern': r'^\s*([ก-ฮ]+)\.\s+',
173
+ 'example': 'ก.',
174
+ 'level_func': lambda x: 2,
175
+ 'priority': 4
176
+ },
177
+
178
+ # Roman numerals (i., ii., iii.) and (I., II., III.)
179
+ 'roman_lower': {
180
+ 'pattern': r'^\s*([ivxlcdm]+)\.\s+',
181
+ 'example': 'i.',
182
+ 'level_func': lambda x: 3,
183
+ 'priority': 3
184
+ },
185
+
186
+ 'roman_upper': {
187
+ 'pattern': r'^\s*([IVXLCDM]+)\.\s+',
188
+ 'example': 'I.',
189
+ 'level_func': lambda x: 1,
190
+ 'priority': 3
191
+ },
192
+
193
+ # Bullet points with various symbols
194
+ 'bullet_symbols': {
195
+ 'pattern': r'^\s*([•·▪▫◦‣⁃⁌⁍◘◙○●▶▷►▻★☆♦♠♣♥◆◇■□▲△▼▽❖❀❁❂❃❄❅❆❇❈❉❊❋❍❏❐❑❒❖])\s+',
196
+ 'example': '•',
197
+ 'level_func': self._calculate_bullet_level,
198
+ 'priority': 2
199
+ },
200
+
201
+ # Dash and asterisk bullets
202
+ 'dash_bullets': {
203
+ 'pattern': r'^\s*([\-\*\+~=])\s+',
204
+ 'example': '-',
205
+ 'level_func': self._calculate_bullet_level,
206
+ 'priority': 2
207
+ },
208
+
209
+ # Arrow bullets
210
+ 'arrow_bullets': {
211
+ 'pattern': r'^\s*([\→\←\↑\↓\↔\↕\↖\↗\↘\↙\⇒\⇐\⇑\⇓\⇔\⇕\➔\➜\➤\➪\➫\➬\➭\➮\➯\➱\➲\➳\➴\➵\➶\➷\➸\➹\➺\➻\➼\➽\➾])\s+',
212
+ 'example': '→',
213
+ 'level_func': self._calculate_bullet_level,
214
+ 'priority': 2
215
+ },
216
+
217
+ # Checkbox items ([x], [ ], [✓])
218
+ 'checkbox': {
219
+ 'pattern': r'^\s*\[([x✓✗\s])\]\s+',
220
+ 'example': '[x]',
221
+ 'level_func': lambda x: 2,
222
+ 'priority': 1
223
+ }
224
+ }
225
+
226
+ # Sort patterns by priority (higher priority first)
227
+ self.sorted_patterns = sorted(
228
+ self.patterns.items(),
229
+ key=lambda x: x[1]['priority'],
230
+ reverse=True
231
+ )
232
+
233
+ # Header detection patterns
234
+ self.header_patterns = {
235
+ 'title_case': r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$',
236
+ 'all_caps': r'^[A-Z\s]+$',
237
+ 'numbered_header': r'^\d+\.\s*[A-Z]',
238
+ 'section_header': r'^(?:SECTION|CHAPTER|PART|ARTICLE)\s+',
239
+ 'thai_header': r'^(?:หมวด|บท|ส่วน|มาตรา)\s+',
240
+ 'short_line': lambda text: len(text.strip()) < 50 and not text.strip().endswith('.'),
241
+ 'ends_without_period': lambda text: not text.strip().endswith('.') and not text.strip().endswith(':'),
242
+ 'capitalized_words': lambda text: sum(1 for word in text.split() if word and word[0].isupper()) / max(len(text.split()), 1) > 0.5
243
+ }
244
+
245
def detect_indentation(self, text: str, base_margin: float = 0) -> Dict[str, Any]:
    """
    Detect indentation pattern and level for given text

    The patterns in ``self.sorted_patterns`` are tried in order and the
    first one that matches at the start of the line wins. When no pattern
    matches, leading whitespace alone can still mark the line as indented.

    Args:
        text: Text line to analyze
        base_margin: Base left margin for relative positioning. Accepted
            for interface compatibility; this method does not read it.

    Returns:
        Dict with pattern info, level, and formatting details
    """
    if not text or not text.strip():
        return self._create_empty_result(text)

    text_stripped = text.strip()

    # Count leading whitespace for additional indentation. lstrip() removes
    # any whitespace character, so tabs are counted one-for-one as well.
    leading_spaces = len(text) - len(text.lstrip())
    space_indent_level = leading_spaces // 4  # 4 spaces = 1 level

    # Try each pattern in priority order
    for pattern_name, pattern_info in self.sorted_patterns:
        # Patterns that come in *_lower / *_upper pairs only differ by letter
        # case. Matching them case-insensitively would make the first of the
        # pair swallow every line and leave the other one unreachable, so
        # IGNORECASE is applied only to patterns that are not case-specific.
        case_specific = 'lower' in pattern_name or 'upper' in pattern_name
        flags = 0 if case_specific else re.IGNORECASE

        match = re.match(pattern_info['pattern'], text, flags)
        if not match:
            continue

        # Extract the numbering/bullet part
        marker = match.group(1) if match.groups() else match.group(0)

        # Calculate pattern-specific level; level_func may be a callable
        # or a plain value.
        level_func = pattern_info['level_func']
        if callable(level_func):
            try:
                pattern_level = level_func(marker)
            except Exception:
                # Best effort: a failing level function must not abort the
                # analysis of the line, so fall back to the first level.
                pattern_level = 1
        else:
            pattern_level = level_func

        # Combine pattern level with space indentation
        total_level = max(pattern_level + space_indent_level, 1)

        # Extract content after the marker
        content = text[match.end():].strip()

        return {
            'has_pattern': True,
            'pattern_type': pattern_name,
            'pattern_marker': marker,
            'level': min(total_level, 10),  # Cap at level 10
            'content': content,
            'original_text': text,
            'leading_spaces': leading_spaces,
            'space_indent_level': space_indent_level,
            'pattern_level': pattern_level,
            'is_bullet': self._is_bullet_pattern(pattern_name),
            'is_numbered': self._is_numbered_pattern(pattern_name),
            'is_lettered': self._is_lettered_pattern(pattern_name),
            'is_roman': self._is_roman_pattern(pattern_name),
            'is_thai': self._is_thai_pattern(pattern_name),
            'is_parenthetical': self._is_parenthetical_pattern(pattern_name),
            # The hint is derived from the uncapped level
            'formatting_hint': self._get_formatting_hint(pattern_name, total_level),
            'priority': pattern_info['priority']
        }

    # No pattern found - check for basic indentation
    if leading_spaces > 0:
        return {
            'has_pattern': False,
            'pattern_type': 'space_indent',
            'pattern_marker': '',
            'level': max(space_indent_level, 1),
            'content': text_stripped,
            'original_text': text,
            'leading_spaces': leading_spaces,
            'space_indent_level': space_indent_level,
            'pattern_level': 0,
            'is_bullet': False,
            'is_numbered': False,
            'is_lettered': False,
            'is_roman': False,
            'is_thai': False,
            'is_parenthetical': False,
            'formatting_hint': 'indented_text',
            'priority': 0
        }

    # No indentation at all
    return self._create_empty_result(text)
332
+
333
def classify_text_type(self, text: str, context: Dict = None) -> Dict[str, Any]:
    """
    Classify text as header, paragraph, or list item based on patterns and context

    Each candidate type accumulates a score from independent heuristics and
    the type with the highest score is returned. On a tie the first type in
    the order header, paragraph, list_item wins. The reported confidence is
    the raw winning score; it is not normalised and can exceed 1.0.

    Args:
        text: Text to classify
        context: Additional context. The keys read here are 'y_position',
            'font_size' and 'is_bold'; all are optional.

    Returns:
        Dict with classification results. For blank input only 'type'
        ('empty') and 'confidence' are present.
    """
    if not text or not text.strip():
        return {'type': 'empty', 'confidence': 1.0}

    text_stripped = text.strip()
    context = context or {}

    # Check for indentation patterns first
    indent_result = self.detect_indentation(text)

    # Initialize classification scores
    scores = {
        'header': 0.0,
        'paragraph': 0.0,
        'list_item': 0.0
    }

    # List item indicators
    if indent_result['has_pattern']:
        scores['list_item'] += 0.8
        if indent_result['is_numbered'] or indent_result['is_lettered']:
            scores['list_item'] += 0.1
        if indent_result['is_bullet']:
            scores['list_item'] += 0.1

    # Header indicators
    if len(text_stripped) < 100:  # Short text more likely to be header
        scores['header'] += 0.3

    if len(text_stripped) < 50:  # Very short text even more likely
        scores['header'] += 0.2

    # Check header patterns: entries are either predicates or regex strings
    for pattern in self.header_patterns.values():
        if callable(pattern):
            if pattern(text_stripped):
                scores['header'] += 0.2
        elif re.match(pattern, text_stripped):
            scores['header'] += 0.2

    # Position-based scoring from context. Compare against None rather than
    # relying on truthiness so that a position of exactly 0 (the very top of
    # the page) still earns the bonus.
    y_position = context.get('y_position')
    if y_position is not None and y_position < 100:  # Top of page
        scores['header'] += 0.3

    # Font size context
    if context.get('font_size'):
        if context['font_size'] > 12:  # Larger font
            scores['header'] += 0.2

    # Font weight context
    if context.get('is_bold'):
        scores['header'] += 0.2

    # Paragraph indicators
    if len(text_stripped) > 100:  # Long text more likely paragraph
        scores['paragraph'] += 0.4

    if text_stripped.endswith('.'):  # Ends with period
        scores['paragraph'] += 0.2

    if not indent_result['has_pattern'] and len(text_stripped) > 50:
        scores['paragraph'] += 0.3

    # Determine final classification
    best_type, best_score = max(scores.items(), key=lambda item: item[1])

    return {
        'type': best_type,
        'confidence': best_score,
        'scores': scores,
        'indentation': indent_result,
        'is_header': best_type == 'header',
        'is_paragraph': best_type == 'paragraph',
        'is_list_item': best_type == 'list_item'
    }
422
+
423
+ def _create_empty_result(self, text: str) -> Dict[str, Any]:
424
+ """Create result for text with no indentation pattern"""
425
+ return {
426
+ 'has_pattern': False,
427
+ 'pattern_type': 'normal',
428
+ 'pattern_marker': '',
429
+ 'level': 0,
430
+ 'content': text.strip(),
431
+ 'original_text': text,
432
+ 'leading_spaces': 0,
433
+ 'space_indent_level': 0,
434
+ 'pattern_level': 0,
435
+ 'is_bullet': False,
436
+ 'is_numbered': False,
437
+ 'is_lettered': False,
438
+ 'is_roman': False,
439
+ 'is_thai': False,
440
+ 'is_parenthetical': False,
441
+ 'formatting_hint': 'normal_text',
442
+ 'priority': 0
443
+ }
444
+
445
+ def _calculate_decimal_level(self, marker: str) -> int:
446
+ """Calculate level for decimal hierarchies (1.2.3.4 = level 4)"""
447
+ # Count dots to determine depth
448
+ dots = marker.count('.')
449
+ return dots + 1
450
+
451
+ def _calculate_mixed_level(self, marker: str) -> int:
452
+ """Calculate level for mixed hierarchies (1.2.a.i = level 4)"""
453
+ parts = marker.split('.')
454
+ return len([p for p in parts if p.strip()])
455
+
456
+ def _calculate_legal_level(self, marker: str) -> int:
457
+ """Calculate level for legal numbering (1.1.1(a)(i) = level 5)"""
458
+ # Count dots and parenthetical parts
459
+ dots = marker.count('.')
460
+ parens = marker.count('(')
461
+ return dots + parens + 1
462
+
463
+ def _calculate_outline_level(self, marker: str) -> int:
464
+ """Calculate level for outline numbering (I.A.1.a.i = level 5)"""
465
+ parts = marker.split('.')
466
+ return len([p for p in parts if p.strip()])
467
+
468
+ def _calculate_bullet_level(self, marker: str) -> int:
469
+ """Calculate level for bullet points based on symbol complexity"""
470
+ # More complex symbols typically indicate deeper levels
471
+ complex_bullets = ['◦', '‣', '⁃', '▪', '▫', '◘', '◙']
472
+ if marker in complex_bullets:
473
+ return 2
474
+ return 1
475
+
476
+ def _is_bullet_pattern(self, pattern_type: str) -> bool:
477
+ """Check if pattern is a bullet type"""
478
+ return any(bullet_type in pattern_type for bullet_type in ['bullet', 'dash', 'arrow', 'checkbox'])
479
+
480
+ def _is_numbered_pattern(self, pattern_type: str) -> bool:
481
+ """Check if pattern is a numbered type"""
482
+ return any(num_type in pattern_type for num_type in ['numbered', 'decimal', 'legal', 'section'])
483
+
484
+ def _is_lettered_pattern(self, pattern_type: str) -> bool:
485
+ """Check if pattern is a lettered type"""
486
+ return 'lettered' in pattern_type
487
+
488
+ def _is_roman_pattern(self, pattern_type: str) -> bool:
489
+ """Check if pattern is a roman numeral type"""
490
+ return 'roman' in pattern_type
491
+
492
+ def _is_thai_pattern(self, pattern_type: str) -> bool:
493
+ """Check if pattern is Thai-specific"""
494
+ return 'thai' in pattern_type
495
+
496
+ def _is_parenthetical_pattern(self, pattern_type: str) -> bool:
497
+ """Check if pattern is parenthetical type"""
498
+ return 'parenthetical' in pattern_type
499
+
500
+ def _get_formatting_hint(self, pattern_type: str, level: int) -> str:
501
+ """Get formatting hint for rendering"""
502
+ level_names = ['primary', 'secondary', 'tertiary', 'quaternary', 'quinary']
503
+ level_name = level_names[min(level-1, len(level_names)-1)] if level > 0 else 'normal'
504
+
505
+ if self._is_bullet_pattern(pattern_type):
506
+ return f'bullet_{level_name}'
507
+ elif self._is_numbered_pattern(pattern_type):
508
+ return f'numbered_{level_name}'
509
+ elif self._is_lettered_pattern(pattern_type):
510
+ return f'lettered_{level_name}'
511
+ elif self._is_roman_pattern(pattern_type):
512
+ return f'roman_{level_name}'
513
+ elif self._is_thai_pattern(pattern_type):
514
+ return f'thai_{level_name}'
515
+ elif self._is_parenthetical_pattern(pattern_type):
516
+ return f'parenthetical_{level_name}'
517
+ else:
518
+ return f'indent_{level_name}'
519
+
520
def analyze_document_structure(self, text_lines: List[str]) -> Dict[str, Any]:
    """
    Analyze entire document structure for consistent formatting

    Blank lines are skipped for classification but still count towards
    'total_lines', so 'coverage_percentage' is relative to every input line.

    Args:
        text_lines: List of text lines to analyze

    Returns:
        Dict with document structure analysis: line and per-type counts,
        Counter distributions of pattern types, levels, formatting hints and
        text classifications, the three most common patterns, the deepest
        level seen, and the percentage of lines that carry a pattern.
    """
    analysis = {
        'total_lines': len(text_lines),
        'patterned_lines': 0,
        'max_level': 0,
        'pattern_distribution': Counter(),
        'level_distribution': Counter(),
        'formatting_hints': Counter(),
        'text_classification': Counter(),
        'has_consistent_numbering': False,
        'dominant_patterns': [],
        'header_count': 0,
        'paragraph_count': 0,
        'list_item_count': 0
    }

    for line in text_lines:
        if not line.strip():
            continue

        # Text classification
        classification = self.classify_text_type(line)
        line_type = classification['type']
        analysis['text_classification'][line_type] += 1

        # Only header / paragraph / list_item have a dedicated counter
        count_key = f'{line_type}_count'
        if count_key in analysis:
            analysis[count_key] += 1

        # Indentation analysis: the classifier already ran the detector on
        # this line, so reuse its result instead of matching every regex a
        # second time. Fall back to a direct call if it is not provided.
        indent_result = classification.get('indentation')
        if indent_result is None:
            indent_result = self.detect_indentation(line)

        if indent_result['has_pattern']:
            analysis['patterned_lines'] += 1
            analysis['pattern_distribution'][indent_result['pattern_type']] += 1
            analysis['level_distribution'][indent_result['level']] += 1
            analysis['formatting_hints'][indent_result['formatting_hint']] += 1
            analysis['max_level'] = max(analysis['max_level'], indent_result['level'])

    # Determine dominant patterns
    if analysis['pattern_distribution']:
        analysis['dominant_patterns'] = analysis['pattern_distribution'].most_common(3)

    # Check for consistent numbering: true as soon as any numbered or
    # decimal pattern type was seen at least once.
    analysis['has_consistent_numbering'] = any(
        'numbered' in name or 'decimal' in name
        for name in analysis['pattern_distribution']
    )

    analysis['coverage_percentage'] = (
        analysis['patterned_lines'] / analysis['total_lines'] * 100
    ) if analysis['total_lines'] > 0 else 0

    return analysis
582
+
583
+
584
+ # Example usage and testing
585
if __name__ == "__main__":
    # Manual smoke test: run the detector and the classifier over a set of
    # sample lines and print what each of them reports.
    demo_detector = EnhancedIndentationDetector()

    # NOTE(review): the leading whitespace inside these samples may have been
    # collapsed by the tool that rendered this file; confirm the intended
    # indentation depth of each sample against the original source.
    sample_lines = [
        "1. First level item",
        " 1.1. Second level item",
        " 1.1.1. Third level item",
        " 1.1.1.1. Fourth level item",
        " 1.1.1.1.1. Fifth level item",
        "(1) Parenthetical Arabic",
        "(๑) Parenthetical Thai numeral",
        "(a) Parenthetical lowercase letter",
        "(A) Parenthetical uppercase letter",
        "(ก) Parenthetical Thai letter",
        "(i) Parenthetical lowercase Roman",
        "(I) Parenthetical uppercase Roman",
        "2. Another first level",
        " a. Letter sub-item",
        " i. Roman sub-sub-item",
        "• Bullet point",
        " ◦ Sub bullet",
        " ▪ Sub-sub bullet",
        "- Dash item",
        " * Asterisk sub-item",
        " + Plus sub-sub-item",
        "§1.2.3. Section numbering",
        "Article 1.1.1. Article numbering",
        "มาตรา 1.2.3 Thai section",
        "ก. Thai letter",
        "[x] Checkbox item",
        "→ Arrow item",
        "I. Roman numeral",
        " A. Capital letter",
        " 1. Number",
        " a. Lowercase letter",
        " i. Lowercase roman",
        " Normal indented text without pattern",
        "CHAPTER 1: INTRODUCTION",
        "This is a regular paragraph with some text that should be classified as paragraph content.",
    ]

    print("Enhanced Indentation Detection Results with Parenthetical Patterns:")
    print("=" * 80)

    # Per-sample report, numbered from 1
    for sample_number, sample in enumerate(sample_lines, 1):
        indent_info = demo_detector.detect_indentation(sample)
        text_kind = demo_detector.classify_text_type(sample)

        print(f"{sample_number:2d}. Text: {sample!r}")
        print(f" Pattern: {indent_info['pattern_type']}")
        print(f" Level: {indent_info['level']}")
        print(f" Marker: {indent_info['pattern_marker']!r}")
        print(f" Content: {indent_info['content']!r}")
        print(f" Hint: {indent_info['formatting_hint']}")
        print(f" Priority: {indent_info['priority']}")
        print(f" Classification: {text_kind['type']} (confidence: {text_kind['confidence']:.2f})")
        print()

    # Whole-document summary over the same samples
    print("\nDocument Structure Analysis:")
    print("=" * 40)
    structure_summary = demo_detector.analyze_document_structure(sample_lines)
    for field_name, field_value in structure_summary.items():
        print(f"{field_name}: {field_value}")
ocr_service.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- OCR Service Module - FIXED VERSION with Improved Text Formatting and Page Numbers
3
- Handles PDF to text conversion with proper indentation, spacing, and page numbering
4
  """
5
  import re
6
  import os
@@ -30,17 +30,25 @@ except ImportError:
30
 
31
  import fitz # PyMuPDF
32
 
 
 
 
33
  # Configure logging
34
  logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
 
37
 
38
- class HTMLProcessor:
39
- """Process OCR results through HTML for better formatting preservation - FIXED VERSION"""
 
 
 
40
 
41
  @staticmethod
42
  def create_html_from_azure_result(analysis_result) -> str:
43
- """Create structured HTML from Azure Document Intelligence result with proper spacing and page numbers"""
 
 
44
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
45
  html_parts.append('<style>')
46
  html_parts.append('''
@@ -71,12 +79,177 @@ class HTMLProcessor:
71
  text-transform: uppercase;
72
  letter-spacing: 1px;
73
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  .paragraph {
75
  margin-bottom: 0.8em;
76
  white-space: pre-wrap;
77
  font-family: 'Consolas', 'Courier New', monospace;
78
  line-height: 1.4;
79
  }
 
80
  .title {
81
  font-size: 1.4em;
82
  font-weight: bold;
@@ -124,24 +297,13 @@ class HTMLProcessor:
124
  .table tr:nth-child(even) {
125
  background-color: #f8f9fa;
126
  }
127
- .indented {
128
- display: inline-block;
129
- white-space: pre-wrap;
130
  }
131
- .bullet-point {
132
- position: relative;
133
- padding-left: 1.2em;
134
- margin-bottom: 0.3em;
135
- }
136
- .bullet-point:before {
137
- content: "•";
138
- position: absolute;
139
- left: 0;
140
- color: #3498db;
141
- font-weight: bold;
142
- }
143
- .spaced {
144
- margin-top: 10px;
145
  }
146
  .page-number {
147
  position: relative;
@@ -164,48 +326,46 @@ class HTMLProcessor:
164
  html_parts.append(f'<div class="page">')
165
  html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
166
 
167
- # Process content with proper ordering and spacing preservation
168
- content_items = HTMLProcessor._extract_page_content(page, analysis_result, page_num)
169
  content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
170
 
171
- # Generate HTML for each content item with preserved spacing
172
  for item in content_items:
173
  if item['type'] == 'table':
174
- html_parts.append(HTMLProcessor._table_to_html(item['content'], item['table_idx']))
175
  else:
176
- html_parts.append(HTMLProcessor._text_to_html(item))
177
 
178
  html_parts.append('</div>')
179
 
180
  html_parts.append('</body></html>')
181
  return '\n'.join(html_parts)
182
 
183
- @staticmethod
184
- def _extract_page_content(page, analysis_result, page_num):
185
- """Extract and organize page content without losing text with proper spacing"""
186
  content_items = []
187
 
188
- # First, collect all tables for this page
189
  page_tables = []
190
  table_regions = []
191
 
192
  if analysis_result.tables:
193
  for table_idx, table in enumerate(analysis_result.tables):
194
- if HTMLProcessor._is_table_on_page(table, page_num):
195
  page_tables.append((table_idx, table))
196
- # Store table regions for overlap detection
197
  if table.bounding_regions:
198
  table_regions.append({
199
  'polygon': table.bounding_regions[0].polygon,
200
  'table_idx': table_idx
201
  })
202
 
203
- # Add table items to content
204
  for table_idx, table in page_tables:
205
  if table.bounding_regions and table.bounding_regions[0].polygon:
206
  polygon = table.bounding_regions[0].polygon
207
- y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) # Top Y
208
- x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) # Left X
209
 
210
  content_items.append({
211
  'type': 'table',
@@ -215,51 +375,192 @@ class HTMLProcessor:
215
  'x_pos': x_pos
216
  })
217
 
218
- # Calculate page margins for proper indentation detection
219
- page_left_margin = HTMLProcessor._calculate_page_margins(page, analysis_result, page_num)
220
-
221
- # Process text content - use paragraphs if available, otherwise lines
222
  if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
223
- # Use paragraphs (better content grouping)
224
  page_paragraphs = [p for p in analysis_result.paragraphs if
225
  p.bounding_regions and
226
  p.bounding_regions[0].page_number == page_num]
227
 
228
  for para in page_paragraphs:
229
  if para.content.strip():
230
- # Check if this paragraph overlaps significantly with any table
231
- overlap_ratio = HTMLProcessor._calculate_table_overlap(para, table_regions)
232
 
233
- # Only exclude if heavily overlapping (>70%) with a table
234
- if overlap_ratio < 0.7:
235
  polygon = para.bounding_regions[0].polygon
236
  y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
237
  x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
238
 
239
- # Calculate proper indentation based on page margins
240
- indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, para.content)
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  content_items.append({
243
  'type': 'paragraph',
244
- 'content': para.content.strip(),
245
  'role': getattr(para, 'role', 'paragraph'),
246
  'y_pos': y_pos,
247
  'x_pos': x_pos,
248
- 'indent_level': indent_info['level'],
249
- 'indent_pixels': indent_info['pixels'],
250
- 'is_bullet': indent_info['is_bullet'],
251
  'preserve_spacing': True
252
  })
253
 
254
  elif page.lines:
255
- # Use lines as fallback with enhanced spacing preservation
256
- processed_lines = HTMLProcessor._process_lines_content_with_spacing(page.lines, table_regions, page_left_margin)
257
  content_items.extend(processed_lines)
258
 
259
  return content_items
260
 
261
- @staticmethod
262
- def _is_table_on_page(table, page_num):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  """Check if table belongs to the specified page"""
264
  if not table.cells:
265
  return False
@@ -270,9 +571,8 @@ class HTMLProcessor:
270
  return True
271
  return False
272
 
273
- @staticmethod
274
- def _calculate_table_overlap(content_item, table_regions):
275
- """Calculate overlap ratio between content and tables (FIXED)"""
276
  if not table_regions or not content_item.bounding_regions:
277
  return 0.0
278
 
@@ -316,120 +616,7 @@ class HTMLProcessor:
316
 
317
  return max_overlap_ratio
318
 
319
- @staticmethod
320
- def _calculate_page_margins(page, analysis_result, page_num):
321
- """Calculate page margins to determine proper indentation baseline"""
322
- left_positions = []
323
-
324
- # Collect x positions from paragraphs if available
325
- if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
326
- page_paragraphs = [p for p in analysis_result.paragraphs if
327
- p.bounding_regions and
328
- p.bounding_regions[0].page_number == page_num]
329
-
330
- for para in page_paragraphs:
331
- if para.bounding_regions and para.bounding_regions[0].polygon:
332
- polygon = para.bounding_regions[0].polygon
333
- x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
334
- left_positions.append(x_pos)
335
-
336
- # Fallback to lines if no paragraphs
337
- elif page.lines:
338
- for line in page.lines:
339
- if line.polygon:
340
- x_pos = min(line.polygon[0], line.polygon[2], line.polygon[4], line.polygon[6])
341
- left_positions.append(x_pos)
342
-
343
- # Find the most common left margin (baseline)
344
- if left_positions:
345
- left_positions.sort()
346
- # Take the most frequent left position as the main margin
347
- from collections import Counter
348
- position_counts = Counter([round(pos, -1) for pos in left_positions]) # Round to nearest 10
349
- base_margin = position_counts.most_common(1)[0][0]
350
- return base_margin
351
-
352
- return 50 # Default margin if no content found
353
-
354
- @staticmethod
355
- def _calculate_precise_indentation(x_pos, base_margin, content):
356
- """Calculate precise indentation based on x position and content analysis"""
357
- # Calculate indent distance from base margin
358
- indent_distance = max(0, x_pos - base_margin)
359
-
360
- # Define indentation levels based on distance
361
- # Each level represents approximately 0.5 inch or 36 points
362
- level_threshold = 30 # Reduced threshold for better sensitivity
363
- indent_level = int(indent_distance / level_threshold)
364
-
365
- # Detect bullet points or numbered lists
366
- is_bullet = False
367
- content_stripped = content.strip()
368
-
369
- # Common bullet point patterns
370
- bullet_patterns = [
371
- r'^\s*[•·▪▫◦‣⁃]\s+', # Bullet symbols
372
- r'^\s*[\-\*\+]\s+', # Dash, asterisk, plus
373
- r'^\s*\d+[\.\)]\s+', # Numbered lists (1. or 1))
374
- r'^\s*[a-zA-Z][\.\)]\s+', # Lettered lists (a. or a))
375
- r'^\s*[ivxlcdm]+[\.\)]\s+', # Roman numerals
376
- ]
377
-
378
- for pattern in bullet_patterns:
379
- if re.match(pattern, content_stripped, re.IGNORECASE):
380
- is_bullet = True
381
- break
382
-
383
- return {
384
- 'level': min(indent_level, 6), # Cap at level 6
385
- 'pixels': indent_distance,
386
- 'is_bullet': is_bullet
387
- }
388
-
389
- @staticmethod
390
- def _process_lines_content_with_spacing(lines, table_regions, page_left_margin):
391
- """Process lines content with enhanced spacing preservation"""
392
- content_items = []
393
- processed_content = set()
394
-
395
- for line in lines:
396
- if not line.content.strip():
397
- continue
398
-
399
- # Avoid duplicates
400
- content_key = line.content.strip().lower()
401
- if content_key in processed_content:
402
- continue
403
- processed_content.add(content_key)
404
-
405
- # Check table overlap
406
- overlap_ratio = HTMLProcessor._calculate_line_table_overlap(line, table_regions)
407
-
408
- # Only exclude if heavily overlapping with table
409
- if overlap_ratio < 0.7:
410
- polygon = line.polygon
411
- y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
412
- x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
413
-
414
- # Calculate precise indentation for lines
415
- indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, line.content)
416
-
417
- content_items.append({
418
- 'type': 'line',
419
- 'content': line.content.strip(),
420
- 'role': 'text',
421
- 'y_pos': y_pos,
422
- 'x_pos': x_pos,
423
- 'indent_level': indent_info['level'],
424
- 'indent_pixels': indent_info['pixels'],
425
- 'is_bullet': indent_info['is_bullet'],
426
- 'preserve_spacing': True
427
- })
428
-
429
- return content_items
430
-
431
- @staticmethod
432
- def _calculate_line_table_overlap(line, table_regions):
433
  """Calculate overlap between line and tables"""
434
  if not table_regions or not line.polygon:
435
  return 0.0
@@ -474,119 +661,37 @@ class HTMLProcessor:
474
  return max_overlap
475
 
476
  @staticmethod
477
- def _text_to_html(item):
478
- """Convert text item to HTML with proper formatting and preserved spacing"""
479
- content = item['content']
480
- role = item.get('role', 'paragraph')
481
- indent_level = item.get('indent_level', 0)
482
- indent_pixels = item.get('indent_pixels', 0)
483
- is_bullet = item.get('is_bullet', False)
484
- preserve_spacing = item.get('preserve_spacing', False)
485
-
486
- # Calculate CSS indentation
487
- css_indent = max(0, indent_level)
488
-
489
- # Build CSS classes and inline styles
490
- css_classes = []
491
- inline_styles = []
492
-
493
- if css_indent > 0:
494
- inline_styles.append(f"margin-left: {css_indent * 1.5}em")
495
- css_classes.append("indented")
496
-
497
- if is_bullet:
498
- css_classes.append("bullet-point")
499
-
500
- # Preserve internal spacing within content
501
- if preserve_spacing:
502
- # Replace multiple spaces with &nbsp; to preserve spacing
503
- content = re.sub(r' +', lambda m: '&nbsp;' * len(m.group()), content)
504
- # Preserve line breaks within content
505
- content = content.replace('\n', '<br>')
506
-
507
- # Combine CSS
508
- class_str = f' class="{" ".join(css_classes)}"' if css_classes else ''
509
- style_str = f' style="{"; ".join(inline_styles)}"' if inline_styles else ''
510
-
511
- if role == 'title':
512
- return f'<div class="title"{class_str}{style_str}>{content}</div>'
513
- elif role == 'sectionHeading':
514
- return f'<div class="section-heading"{class_str}{style_str}>{content}</div>'
515
- else:
516
- # Regular paragraphs with preserved formatting
517
- return f'<div class="paragraph"{class_str}{style_str}>{content}</div>'
518
-
519
- @staticmethod
520
- def _table_to_html(table, table_idx):
521
- """Convert table to HTML with proper structure"""
522
- if not table.cells:
523
- return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
524
-
525
- # Create table matrix
526
- max_row = max(cell.row_index for cell in table.cells) + 1
527
- max_col = max(cell.column_index for cell in table.cells) + 1
528
-
529
- table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
530
-
531
- # Fill matrix
532
- for cell in table.cells:
533
- content = (cell.content or "").strip()
534
- table_matrix[cell.row_index][cell.column_index] = content
535
-
536
- # Generate HTML
537
- html_parts = [f'<div class="table-container">']
538
- html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
539
- html_parts.append('<table class="table">')
540
-
541
- for row_idx, row in enumerate(table_matrix):
542
- if row_idx == 0 and any(cell.strip() for cell in row):
543
- # Header row
544
- html_parts.append('<tr>')
545
- for cell in row:
546
- html_parts.append(f'<th>{cell}</th>')
547
- html_parts.append('</tr>')
548
- else:
549
- # Data row
550
- if any(cell.strip() for cell in row): # Skip empty rows
551
- html_parts.append('<tr>')
552
- for cell in row:
553
- html_parts.append(f'<td>{cell}</td>')
554
- html_parts.append('</tr>')
555
-
556
- html_parts.append('</table></div>')
557
- return '\n'.join(html_parts)
558
-
559
- @staticmethod
560
- def html_to_formatted_text(html_content):
561
- """Convert HTML back to formatted text preserving structure, spacing, and adding page numbers"""
562
  from html.parser import HTMLParser
563
 
564
- class FixedSpacingTextExtractor(HTMLParser):
565
  def __init__(self):
566
  super().__init__()
567
  self.text_parts = []
 
568
  self.in_title = False
569
  self.in_section_heading = False
570
  self.in_table = False
571
- self.in_table_header = False
572
  self.current_table_row = []
573
  self.table_data = []
574
- self.current_indent = 0
575
- self.preserve_spacing = False
576
  self.in_page_header = False
577
- self.current_page_num = 0
 
578
 
579
  def handle_starttag(self, tag, attrs):
580
  attr_dict = dict(attrs)
581
  class_attr = attr_dict.get('class', '')
582
- style_attr = attr_dict.get('style', '')
583
 
584
  if 'page-header' in class_attr:
585
  self.in_page_header = True
586
- # Add proper page separation with page number
587
  if len(self.text_parts) > 0:
588
  self.text_parts.append('\n\n' + '=' * 80 + '\n')
589
-
 
590
  elif 'title' in class_attr:
591
  self.in_title = True
592
  elif 'section-heading' in class_attr:
@@ -594,32 +699,47 @@ class HTMLProcessor:
594
  elif tag == 'table':
595
  self.in_table = True
596
  self.table_data = []
597
- elif tag == 'th':
598
- self.in_table_header = True
599
  elif tag == 'tr':
600
  self.current_table_row = []
601
  elif tag == 'br':
602
  self.text_parts.append('\n')
603
 
604
- # Extract indentation from style
605
- if 'margin-left' in style_attr:
606
- import re
607
- margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
608
- if margin_match:
609
- self.current_indent = int(float(margin_match.group(1)))
610
- else:
611
- self.current_indent = 0
612
  else:
613
- # Count indented classes as fallback
614
- self.current_indent = class_attr.count('indented')
 
 
 
 
 
 
 
 
 
 
615
 
616
- # Check if we should preserve spacing
617
- self.preserve_spacing = 'paragraph' in class_attr or 'bullet-point' in class_attr
 
 
 
 
618
 
619
  def handle_endtag(self, tag):
620
  if tag == 'div' and self.in_page_header:
621
  self.text_parts.append('\n' + '=' * 80 + '\n\n')
622
  self.in_page_header = False
 
 
 
623
  elif tag == 'div' and self.in_title:
624
  self.text_parts.append('\n\n')
625
  self.in_title = False
@@ -629,69 +749,83 @@ class HTMLProcessor:
629
  elif tag == 'table':
630
  self.in_table = False
631
  self._format_table()
632
- elif tag == 'th':
633
- self.in_table_header = False
634
  elif tag == 'tr' and self.current_table_row:
635
  self.table_data.append(self.current_table_row[:])
636
- elif tag == 'div' and not self.in_table and not self.in_title and not self.in_section_heading and not self.in_page_header:
637
- if not self.preserve_spacing:
638
  self.text_parts.append('\n')
639
 
640
- # Reset indentation when closing div
641
  if tag == 'div':
642
- self.current_indent = 0
643
- self.preserve_spacing = False
 
644
 
645
  def handle_data(self, data):
646
  if data.strip():
647
- # Convert &nbsp; back to spaces for proper spacing
648
  data = data.replace('&nbsp;', ' ')
649
 
650
  if self.in_page_header:
651
- # Extract page number and format properly
652
  page_match = re.search(r'Page (\d+)', data)
653
  if page_match:
654
- self.current_page_num = int(page_match.group(1))
655
- page_header = f"PAGE {self.current_page_num}"
656
  self.text_parts.append(page_header.center(80))
 
 
 
657
  elif self.in_title:
658
- indent_str = " " * self.current_indent
659
  self.text_parts.append(f'\n{indent_str}## {data.strip()}')
660
  elif self.in_section_heading:
661
- indent_str = " " * self.current_indent
662
  self.text_parts.append(f'\n{indent_str}### {data.strip()}')
663
  elif self.in_table:
664
- if self.in_table_header or self.current_table_row is not None:
665
- self.current_table_row.append(data.strip())
666
  else:
667
- # Apply indentation and preserve internal spacing
668
- indent_str = " " * self.current_indent
669
 
670
- if self.preserve_spacing:
671
- # Keep the exact spacing from the data
672
- formatted_data = data
673
- else:
674
- # Clean up spacing for non-preserved content
675
- formatted_data = re.sub(r'\s+', ' ', data).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
 
677
- # Handle bullet points specially
678
- if 'bullet-point' in getattr(self, '_last_class', ''):
679
- # Remove the bullet symbol that CSS adds and format properly
680
- self.text_parts.append(f'{indent_str}• {formatted_data}')
681
  else:
682
- self.text_parts.append(f'{indent_str}{formatted_data}')
 
683
 
684
  def _format_table(self):
 
685
  if not self.table_data:
686
  return
687
 
688
  self.text_parts.append('\n\n')
689
 
690
- # Calculate column widths for better formatting
691
  if self.table_data:
692
  max_cols = max(len(row) for row in self.table_data)
693
  col_widths = [0] * max_cols
694
 
 
695
  for row in self.table_data:
696
  for i, cell in enumerate(row):
697
  if i < max_cols:
@@ -721,7 +855,7 @@ class HTMLProcessor:
721
 
722
  self.text_parts.append('\n')
723
 
724
- extractor = FixedSpacingTextExtractor()
725
  extractor.feed(html_content)
726
 
727
  result = ''.join(extractor.text_parts)
@@ -736,7 +870,7 @@ class HTMLProcessor:
736
 
737
 
738
  class OCRService:
739
- """Main OCR service with HTML processing and improved table handling"""
740
 
741
  def __init__(self):
742
  self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
@@ -758,7 +892,7 @@ class OCRService:
758
 
759
  def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
760
  """
761
- Convert PDF to text using specified method with HTML processing
762
 
763
  Args:
764
  pdf_path: Path to the PDF file
@@ -792,7 +926,7 @@ class OCRService:
792
  # Try primary method
793
  try:
794
  if method == "azure" and self.azure_client:
795
- result = self._azure_ocr_with_html(pdf_path)
796
  elif method == "tesseract":
797
  result = self._tesseract_ocr(pdf_path)
798
  elif method == "pymupdf":
@@ -811,13 +945,13 @@ class OCRService:
811
 
812
  return result
813
 
814
- def _azure_ocr_with_html(self, pdf_path: str) -> Dict[str, Any]:
815
- """Azure Document Intelligence OCR with HTML processing"""
816
  result = {
817
  'success': False,
818
  'text': '',
819
  'html': '',
820
- 'method_used': 'azure_document_intelligence',
821
  'metadata': {},
822
  'error': None
823
  }
@@ -848,11 +982,16 @@ class OCRService:
848
 
849
  analysis_result = poller.result()
850
 
851
- # Generate HTML first
852
- html_content = HTMLProcessor.create_html_from_azure_result(analysis_result)
853
 
854
- # Convert HTML to formatted text with proper page numbers and spacing
855
- formatted_text = HTMLProcessor.html_to_formatted_text(html_content)
 
 
 
 
 
856
 
857
  result.update({
858
  'success': True,
@@ -864,13 +1003,17 @@ class OCRService:
864
  'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
865
  'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
866
  'html_generated': True,
867
- 'improved_formatting': True,
 
 
868
  'page_numbers_added': True,
869
- 'azure_analysis': analysis_result
 
 
870
  }
871
  })
872
 
873
- logger.info("Azure OCR with improved HTML processing completed successfully")
874
 
875
  except Exception as e:
876
  logger.error(f"Azure OCR error: {e}")
@@ -879,12 +1022,12 @@ class OCRService:
879
  return result
880
 
881
  def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
882
- """Tesseract OCR with basic HTML generation and page numbers"""
883
  result = {
884
  'success': False,
885
  'text': '',
886
  'html': '',
887
- 'method_used': 'tesseract',
888
  'metadata': {},
889
  'error': None
890
  }
@@ -899,11 +1042,19 @@ class OCRService:
899
  page_count = len(pdf_document)
900
  all_text = []
901
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
902
- html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
903
- html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
904
- html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
 
 
 
 
 
 
905
  html_parts.append('</style></head><body>')
906
 
 
 
907
  for page_num in range(page_count):
908
  # Add page header to text
909
  page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
@@ -929,10 +1080,41 @@ class OCRService:
929
 
930
  all_text.append(text)
931
 
932
- # Add to HTML with page number
933
  html_parts.append(f'<div class="page">')
934
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
935
- html_parts.append(f'<pre>{text}</pre></div>')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
 
937
  finally:
938
  if temp_img_path and os.path.exists(temp_img_path):
@@ -943,19 +1125,26 @@ class OCRService:
943
 
944
  html_parts.append('</body></html>')
945
 
 
 
 
 
946
  result.update({
947
  'success': True,
948
- 'text': '\n'.join(all_text),
949
- 'html': '\n'.join(html_parts),
950
  'metadata': {
951
  'pages': page_count,
952
  'html_generated': True,
 
 
 
953
  'page_numbers_added': True,
954
- 'improved_formatting': True
955
  }
956
  })
957
 
958
- logger.info("Tesseract OCR with improved formatting completed successfully")
959
 
960
  except Exception as e:
961
  logger.error(f"Tesseract OCR error: {e}")
@@ -970,12 +1159,12 @@ class OCRService:
970
  return result
971
 
972
  def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
973
- """PyMuPDF text extraction with HTML generation and page numbers"""
974
  result = {
975
  'success': False,
976
  'text': '',
977
  'html': '',
978
- 'method_used': 'pymupdf',
979
  'metadata': {},
980
  'error': None
981
  }
@@ -986,11 +1175,19 @@ class OCRService:
986
  page_count = len(pdf_document)
987
  all_text = []
988
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
989
- html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
990
- html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
991
- html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
 
 
 
 
 
 
992
  html_parts.append('</style></head><body>')
993
 
 
 
994
  for page_num in range(page_count):
995
  # Add page header to text
996
  page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
@@ -1001,27 +1198,64 @@ class OCRService:
1001
 
1002
  all_text.append(text)
1003
 
1004
- # Add to HTML with better formatting and page numbers
1005
  html_parts.append(f'<div class="page">')
1006
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1007
- formatted_text = text.replace('\n', '<br>')
1008
- html_parts.append(f'<div>{formatted_text}</div></div>')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
 
1010
  html_parts.append('</body></html>')
1011
 
 
 
 
 
1012
  result.update({
1013
  'success': True,
1014
- 'text': '\n'.join(all_text),
1015
- 'html': '\n'.join(html_parts),
1016
  'metadata': {
1017
  'pages': page_count,
1018
  'html_generated': True,
 
 
 
1019
  'page_numbers_added': True,
1020
- 'improved_formatting': True
1021
  }
1022
  })
1023
 
1024
- logger.info("PyMuPDF extraction with improved formatting completed successfully")
1025
 
1026
  except Exception as e:
1027
  logger.error(f"PyMuPDF error: {e}")
@@ -1058,7 +1292,7 @@ class OCRService:
1058
  logger.info(f"Trying fallback method: {method}")
1059
  try:
1060
  if method == "azure":
1061
- result = self._azure_ocr_with_html(pdf_path)
1062
  elif method == "tesseract":
1063
  result = self._tesseract_ocr(pdf_path)
1064
  elif method == "pymupdf":
 
1
  """
2
+ OCR Service Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
3
+ Handles PDF to text conversion with proper indentation, spacing, page numbering, and intelligent text analysis
4
  """
5
  import re
6
  import os
 
30
 
31
  import fitz # PyMuPDF
32
 
33
+ # Enhanced indentation detection
34
+ from enhanced_indentation import EnhancedIndentationDetector
35
+
36
  # Configure logging
37
  logging.basicConfig(level=logging.INFO)
38
  logger = logging.getLogger(__name__)
39
 
40
 
41
+ class EnhancedHTMLProcessor:
42
+ """Process OCR results through HTML with comprehensive indentation detection and intelligent text classification"""
43
+
44
+ def __init__(self):
45
+ self.indent_detector = EnhancedIndentationDetector()
46
 
47
  @staticmethod
48
  def create_html_from_azure_result(analysis_result) -> str:
49
+ """Create structured HTML from Azure Document Intelligence result with enhanced indentation and text classification"""
50
+ processor = EnhancedHTMLProcessor()
51
+
52
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
53
  html_parts.append('<style>')
54
  html_parts.append('''
 
79
  text-transform: uppercase;
80
  letter-spacing: 1px;
81
  }
82
+
83
+ /* Enhanced indentation levels */
84
+ .indent-level-0 { margin-left: 0em; }
85
+ .indent-level-1 { margin-left: 1.5em; }
86
+ .indent-level-2 { margin-left: 3.0em; }
87
+ .indent-level-3 { margin-left: 4.5em; }
88
+ .indent-level-4 { margin-left: 6.0em; }
89
+ .indent-level-5 { margin-left: 7.5em; }
90
+ .indent-level-6 { margin-left: 9.0em; }
91
+ .indent-level-7 { margin-left: 10.5em; }
92
+ .indent-level-8 { margin-left: 12.0em; }
93
+ .indent-level-9 { margin-left: 13.5em; }
94
+ .indent-level-10 { margin-left: 15.0em; }
95
+
96
+ /* Text classification styles */
97
+ .content-header {
98
+ font-weight: bold;
99
+ color: #2c3e50;
100
+ font-size: 1.1em;
101
+ margin: 15px 0 8px 0;
102
+ border-left: 4px solid #3498db;
103
+ padding-left: 10px;
104
+ background-color: #f8f9fa;
105
+ }
106
+ .content-paragraph {
107
+ color: #333;
108
+ margin-bottom: 1em;
109
+ line-height: 1.5;
110
+ }
111
+ .content-list-item {
112
+ margin-bottom: 0.5em;
113
+ line-height: 1.4;
114
+ }
115
+
116
+ /* Pattern-specific styles */
117
+ .numbered-primary {
118
+ font-weight: bold;
119
+ color: #2c3e50;
120
+ border-left: 4px solid #3498db;
121
+ padding-left: 8px;
122
+ margin-bottom: 0.5em;
123
+ background-color: #f8f9fa;
124
+ }
125
+ .numbered-secondary {
126
+ font-weight: 600;
127
+ color: #34495e;
128
+ border-left: 3px solid #95a5a6;
129
+ padding-left: 6px;
130
+ margin-bottom: 0.4em;
131
+ background-color: #f9f9f9;
132
+ }
133
+ .numbered-tertiary {
134
+ color: #555;
135
+ border-left: 2px solid #bdc3c7;
136
+ padding-left: 4px;
137
+ margin-bottom: 0.3em;
138
+ }
139
+ .numbered-quaternary {
140
+ color: #666;
141
+ border-left: 1px solid #dee2e6;
142
+ padding-left: 3px;
143
+ margin-bottom: 0.2em;
144
+ }
145
+ .numbered-quinary {
146
+ color: #777;
147
+ padding-left: 2px;
148
+ margin-bottom: 0.2em;
149
+ }
150
+
151
+ /* Parenthetical styles */
152
+ .parenthetical-primary {
153
+ font-weight: 600;
154
+ color: #8e44ad;
155
+ border-left: 3px solid #9b59b6;
156
+ padding-left: 6px;
157
+ margin-bottom: 0.4em;
158
+ }
159
+ .parenthetical-secondary {
160
+ color: #9b59b6;
161
+ border-left: 2px solid #af7ac5;
162
+ padding-left: 4px;
163
+ margin-bottom: 0.3em;
164
+ }
165
+ .parenthetical-tertiary {
166
+ color: #af7ac5;
167
+ padding-left: 3px;
168
+ margin-bottom: 0.2em;
169
+ }
170
+ .parenthetical-quaternary {
171
+ color: #c39bd3;
172
+ padding-left: 2px;
173
+ margin-bottom: 0.2em;
174
+ }
175
+
176
+ .bullet-primary {
177
+ position: relative;
178
+ padding-left: 1.2em;
179
+ }
180
+ .bullet-primary::before {
181
+ content: "•";
182
+ position: absolute;
183
+ left: 0;
184
+ color: #3498db;
185
+ font-weight: bold;
186
+ }
187
+ .bullet-secondary {
188
+ position: relative;
189
+ padding-left: 1.2em;
190
+ }
191
+ .bullet-secondary::before {
192
+ content: "◦";
193
+ position: absolute;
194
+ left: 0;
195
+ color: #95a5a6;
196
+ }
197
+ .bullet-tertiary {
198
+ position: relative;
199
+ padding-left: 1.2em;
200
+ }
201
+ .bullet-tertiary::before {
202
+ content: "▪";
203
+ position: absolute;
204
+ left: 0;
205
+ color: #bdc3c7;
206
+ }
207
+ .bullet-quaternary {
208
+ position: relative;
209
+ padding-left: 1.2em;
210
+ }
211
+ .bullet-quaternary::before {
212
+ content: "‣";
213
+ position: absolute;
214
+ left: 0;
215
+ color: #dee2e6;
216
+ }
217
+
218
+ .lettered-primary {
219
+ font-style: italic;
220
+ color: #8e44ad;
221
+ font-weight: 600;
222
+ }
223
+ .lettered-secondary {
224
+ color: #9b59b6;
225
+ font-style: italic;
226
+ }
227
+
228
+ .roman-primary {
229
+ font-variant: small-caps;
230
+ color: #d35400;
231
+ font-weight: bold;
232
+ }
233
+ .roman-secondary {
234
+ color: #e67e22;
235
+ font-variant: small-caps;
236
+ }
237
+
238
+ .thai-primary {
239
+ color: #16a085;
240
+ font-weight: bold;
241
+ }
242
+ .thai-secondary {
243
+ color: #1abc9c;
244
+ }
245
+
246
  .paragraph {
247
  margin-bottom: 0.8em;
248
  white-space: pre-wrap;
249
  font-family: 'Consolas', 'Courier New', monospace;
250
  line-height: 1.4;
251
  }
252
+
253
  .title {
254
  font-size: 1.4em;
255
  font-weight: bold;
 
297
  .table tr:nth-child(even) {
298
  background-color: #f8f9fa;
299
  }
300
+ .indented_text {
301
+ color: #555;
302
+ font-style: italic;
303
  }
304
+ .space-indent {
305
+ border-left: 1px dotted #ccc;
306
+ padding-left: 5px;
 
 
 
 
 
 
 
 
 
 
 
307
  }
308
  .page-number {
309
  position: relative;
 
326
  html_parts.append(f'<div class="page">')
327
  html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
328
 
329
+ # Process content with enhanced indentation detection and text classification
330
+ content_items = processor._extract_page_content_enhanced(page, analysis_result, page_num)
331
  content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
332
 
333
+ # Generate HTML for each content item with enhanced formatting and classification
334
  for item in content_items:
335
  if item['type'] == 'table':
336
+ html_parts.append(processor._table_to_html(item['content'], item['table_idx']))
337
  else:
338
+ html_parts.append(processor._text_to_html_enhanced(item))
339
 
340
  html_parts.append('</div>')
341
 
342
  html_parts.append('</body></html>')
343
  return '\n'.join(html_parts)
344
 
345
+ def _extract_page_content_enhanced(self, page, analysis_result, page_num):
346
+ """Extract page content with enhanced indentation detection and intelligent text classification"""
 
347
  content_items = []
348
 
349
+ # Handle tables (existing logic)
350
  page_tables = []
351
  table_regions = []
352
 
353
  if analysis_result.tables:
354
  for table_idx, table in enumerate(analysis_result.tables):
355
+ if self._is_table_on_page(table, page_num):
356
  page_tables.append((table_idx, table))
 
357
  if table.bounding_regions:
358
  table_regions.append({
359
  'polygon': table.bounding_regions[0].polygon,
360
  'table_idx': table_idx
361
  })
362
 
363
+ # Add tables to content
364
  for table_idx, table in page_tables:
365
  if table.bounding_regions and table.bounding_regions[0].polygon:
366
  polygon = table.bounding_regions[0].polygon
367
+ y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])
368
+ x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
369
 
370
  content_items.append({
371
  'type': 'table',
 
375
  'x_pos': x_pos
376
  })
377
 
378
+ # Process text content with enhanced indentation detection and text classification
 
 
 
379
  if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
 
380
  page_paragraphs = [p for p in analysis_result.paragraphs if
381
  p.bounding_regions and
382
  p.bounding_regions[0].page_number == page_num]
383
 
384
  for para in page_paragraphs:
385
  if para.content.strip():
386
+ # Check table overlap
387
+ overlap_ratio = self._calculate_table_overlap(para, table_regions)
388
 
389
+ if overlap_ratio < 0.7: # Not heavily overlapping with table
 
390
  polygon = para.bounding_regions[0].polygon
391
  y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
392
  x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
393
 
394
+ # Enhanced indentation detection
395
+ indent_info = self.indent_detector.detect_indentation(para.content)
396
+
397
+ # Intelligent text classification with context
398
+ context = {
399
+ 'y_position': y_pos,
400
+ 'x_position': x_pos,
401
+ 'font_size': getattr(para, 'font_size', None),
402
+ 'is_bold': getattr(para, 'is_bold', False),
403
+ 'page_number': page_num
404
+ }
405
+
406
+ text_classification = self.indent_detector.classify_text_type(para.content, context)
407
 
408
  content_items.append({
409
  'type': 'paragraph',
410
+ 'content': indent_info['content'],
411
  'role': getattr(para, 'role', 'paragraph'),
412
  'y_pos': y_pos,
413
  'x_pos': x_pos,
414
+ 'indent_info': indent_info,
415
+ 'text_classification': text_classification,
 
416
  'preserve_spacing': True
417
  })
418
 
419
  elif page.lines:
420
+ # Process lines with enhanced indentation detection and classification
421
+ processed_lines = self._process_lines_enhanced(page.lines, table_regions)
422
  content_items.extend(processed_lines)
423
 
424
  return content_items
425
 
426
+ def _process_lines_enhanced(self, lines, table_regions):
427
+ """Process lines with enhanced indentation detection and text classification"""
428
+ content_items = []
429
+ processed_content = set()
430
+
431
+ for line in lines:
432
+ if not line.content.strip():
433
+ continue
434
+
435
+ content_key = line.content.strip().lower()
436
+ if content_key in processed_content:
437
+ continue
438
+ processed_content.add(content_key)
439
+
440
+ # Check table overlap
441
+ overlap_ratio = self._calculate_line_table_overlap(line, table_regions)
442
+
443
+ if overlap_ratio < 0.7:
444
+ polygon = line.polygon
445
+ y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
446
+ x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
447
+
448
+ # Enhanced indentation detection
449
+ indent_info = self.indent_detector.detect_indentation(line.content)
450
+
451
+ # Text classification with context
452
+ context = {
453
+ 'y_position': y_pos,
454
+ 'x_position': x_pos
455
+ }
456
+
457
+ text_classification = self.indent_detector.classify_text_type(line.content, context)
458
+
459
+ content_items.append({
460
+ 'type': 'line',
461
+ 'content': indent_info['content'],
462
+ 'role': 'text',
463
+ 'y_pos': y_pos,
464
+ 'x_pos': x_pos,
465
+ 'indent_info': indent_info,
466
+ 'text_classification': text_classification,
467
+ 'preserve_spacing': True
468
+ })
469
+
470
+ return content_items
471
+
472
+ def _text_to_html_enhanced(self, item):
473
+ """Convert text item to HTML with enhanced indentation formatting and intelligent classification"""
474
+ content = item['content']
475
+ role = item.get('role', 'paragraph')
476
+ indent_info = item.get('indent_info', {})
477
+ text_classification = item.get('text_classification', {})
478
+ preserve_spacing = item.get('preserve_spacing', False)
479
+
480
+ # Build CSS classes based on indentation info and text classification
481
+ css_classes = ['paragraph']
482
+
483
+ # Add text classification class
484
+ if text_classification.get('type'):
485
+ css_classes.append(f"content-{text_classification['type']}")
486
+
487
+ # Add indentation level class
488
+ level = indent_info.get('level', 0)
489
+ css_classes.append(f'indent-level-{min(level, 10)}')
490
+
491
+ # Add pattern-specific formatting
492
+ formatting_hint = indent_info.get('formatting_hint', 'normal_text')
493
+ if formatting_hint != 'normal_text':
494
+ css_classes.append(formatting_hint)
495
+
496
+ # Add space indent class if needed
497
+ if indent_info.get('pattern_type') == 'space_indent':
498
+ css_classes.append('space-indent')
499
+
500
+ # Preserve internal spacing
501
+ if preserve_spacing:
502
+ content = re.sub(r' +', lambda m: '&nbsp;' * len(m.group()), content)
503
+ content = content.replace('\n', '<br>')
504
+
505
+ # Add pattern marker if needed (but not for bullets as CSS handles them)
506
+ pattern_marker = indent_info.get('pattern_marker', '')
507
+ if pattern_marker and not indent_info.get('is_bullet', False):
508
+ # For numbered/lettered items, include the marker
509
+ content = f"{pattern_marker} {content}"
510
+
511
+ # Build final HTML with enhanced classification
512
+ class_str = f' class="{" ".join(css_classes)}"'
513
+
514
+ # Use text classification to determine HTML structure
515
+ if text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.6:
516
+ return f'<div class="content-header"{class_str}>{content}</div>'
517
+ elif role == 'title':
518
+ return f'<div class="title"{class_str}>{content}</div>'
519
+ elif role == 'sectionHeading':
520
+ return f'<div class="section-heading"{class_str}>{content}</div>'
521
+ else:
522
+ return f'<div{class_str}>{content}</div>'
523
+
524
+ def _table_to_html(self, table, table_idx):
525
+ """Convert table to HTML with proper structure"""
526
+ if not table.cells:
527
+ return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
528
+
529
+ # Create table matrix
530
+ max_row = max(cell.row_index for cell in table.cells) + 1
531
+ max_col = max(cell.column_index for cell in table.cells) + 1
532
+
533
+ table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
534
+
535
+ # Fill matrix
536
+ for cell in table.cells:
537
+ content = (cell.content or "").strip()
538
+ table_matrix[cell.row_index][cell.column_index] = content
539
+
540
+ # Generate HTML
541
+ html_parts = [f'<div class="table-container">']
542
+ html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
543
+ html_parts.append('<table class="table">')
544
+
545
+ for row_idx, row in enumerate(table_matrix):
546
+ if row_idx == 0 and any(cell.strip() for cell in row):
547
+ # Header row
548
+ html_parts.append('<tr>')
549
+ for cell in row:
550
+ html_parts.append(f'<th>{cell}</th>')
551
+ html_parts.append('</tr>')
552
+ else:
553
+ # Data row
554
+ if any(cell.strip() for cell in row): # Skip empty rows
555
+ html_parts.append('<tr>')
556
+ for cell in row:
557
+ html_parts.append(f'<td>{cell}</td>')
558
+ html_parts.append('</tr>')
559
+
560
+ html_parts.append('</table></div>')
561
+ return '\n'.join(html_parts)
562
+
563
+ def _is_table_on_page(self, table, page_num):
564
  """Check if table belongs to the specified page"""
565
  if not table.cells:
566
  return False
 
571
  return True
572
  return False
573
 
574
+ def _calculate_table_overlap(self, content_item, table_regions):
575
+ """Calculate overlap ratio between content and tables"""
 
576
  if not table_regions or not content_item.bounding_regions:
577
  return 0.0
578
 
 
616
 
617
  return max_overlap_ratio
618
 
619
+ def _calculate_line_table_overlap(self, line, table_regions):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  """Calculate overlap between line and tables"""
621
  if not table_regions or not line.polygon:
622
  return 0.0
 
661
  return max_overlap
662
 
663
  @staticmethod
664
+ def html_to_formatted_text_enhanced(html_content):
665
+ """Convert HTML back to formatted text with enhanced indentation preservation and text classification"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  from html.parser import HTMLParser
667
 
668
+ class EnhancedTextExtractor(HTMLParser):
669
  def __init__(self):
670
  super().__init__()
671
  self.text_parts = []
672
+ self.indent_detector = EnhancedIndentationDetector()
673
  self.in_title = False
674
  self.in_section_heading = False
675
  self.in_table = False
 
676
  self.current_table_row = []
677
  self.table_data = []
678
+ self.current_indent_level = 0
679
+ self.current_formatting_hint = 'normal_text'
680
  self.in_page_header = False
681
+ self.current_classes = []
682
+ self.in_content_header = False
683
 
684
  def handle_starttag(self, tag, attrs):
685
  attr_dict = dict(attrs)
686
  class_attr = attr_dict.get('class', '')
687
+ self.current_classes = class_attr.split()
688
 
689
  if 'page-header' in class_attr:
690
  self.in_page_header = True
 
691
  if len(self.text_parts) > 0:
692
  self.text_parts.append('\n\n' + '=' * 80 + '\n')
693
+ elif 'content-header' in class_attr:
694
+ self.in_content_header = True
695
  elif 'title' in class_attr:
696
  self.in_title = True
697
  elif 'section-heading' in class_attr:
 
699
  elif tag == 'table':
700
  self.in_table = True
701
  self.table_data = []
 
 
702
  elif tag == 'tr':
703
  self.current_table_row = []
704
  elif tag == 'br':
705
  self.text_parts.append('\n')
706
 
707
+ # Extract indent level from class
708
+ for cls in self.current_classes:
709
+ if cls.startswith('indent-level-'):
710
+ try:
711
+ self.current_indent_level = int(cls.split('-')[-1])
712
+ except ValueError:
713
+ self.current_indent_level = 0
714
+ break
715
  else:
716
+ self.current_indent_level = 0
717
+
718
+ # Extract formatting hint
719
+ formatting_hints = [
720
+ 'numbered-primary', 'numbered-secondary', 'numbered-tertiary', 'numbered-quaternary', 'numbered-quinary',
721
+ 'parenthetical-primary', 'parenthetical-secondary', 'parenthetical-tertiary', 'parenthetical-quaternary',
722
+ 'bullet-primary', 'bullet-secondary', 'bullet-tertiary', 'bullet-quaternary',
723
+ 'lettered-primary', 'lettered-secondary',
724
+ 'roman-primary', 'roman-secondary',
725
+ 'thai-primary', 'thai-secondary',
726
+ 'indented_text', 'space-indent'
727
+ ]
728
 
729
+ for hint in formatting_hints:
730
+ if hint in self.current_classes:
731
+ self.current_formatting_hint = hint
732
+ break
733
+ else:
734
+ self.current_formatting_hint = 'normal_text'
735
 
736
  def handle_endtag(self, tag):
737
  if tag == 'div' and self.in_page_header:
738
  self.text_parts.append('\n' + '=' * 80 + '\n\n')
739
  self.in_page_header = False
740
+ elif tag == 'div' and self.in_content_header:
741
+ self.text_parts.append('\n\n')
742
+ self.in_content_header = False
743
  elif tag == 'div' and self.in_title:
744
  self.text_parts.append('\n\n')
745
  self.in_title = False
 
749
  elif tag == 'table':
750
  self.in_table = False
751
  self._format_table()
 
 
752
  elif tag == 'tr' and self.current_table_row:
753
  self.table_data.append(self.current_table_row[:])
754
+ elif tag == 'div' and not self.in_table:
755
+ if not self.in_title and not self.in_section_heading and not self.in_page_header and not self.in_content_header:
756
  self.text_parts.append('\n')
757
 
758
+ # Reset state
759
  if tag == 'div':
760
+ self.current_indent_level = 0
761
+ self.current_formatting_hint = 'normal_text'
762
+ self.current_classes = []
763
 
764
  def handle_data(self, data):
765
  if data.strip():
 
766
  data = data.replace('&nbsp;', ' ')
767
 
768
  if self.in_page_header:
 
769
  page_match = re.search(r'Page (\d+)', data)
770
  if page_match:
771
+ page_num = int(page_match.group(1))
772
+ page_header = f"PAGE {page_num}"
773
  self.text_parts.append(page_header.center(80))
774
+ elif self.in_content_header:
775
+ indent_str = " " * self.current_indent_level
776
+ self.text_parts.append(f'\n{indent_str}# {data.strip()}')
777
  elif self.in_title:
778
+ indent_str = " " * self.current_indent_level
779
  self.text_parts.append(f'\n{indent_str}## {data.strip()}')
780
  elif self.in_section_heading:
781
+ indent_str = " " * self.current_indent_level
782
  self.text_parts.append(f'\n{indent_str}### {data.strip()}')
783
  elif self.in_table:
784
+ self.current_table_row.append(data.strip())
 
785
  else:
786
+ # Apply enhanced indentation formatting
787
+ indent_str = " " * self.current_indent_level
788
 
789
+ # Handle different formatting hints including parenthetical
790
+ if 'bullet' in self.current_formatting_hint:
791
+ # Use appropriate bullet symbol based on level
792
+ if 'primary' in self.current_formatting_hint:
793
+ bullet = '•'
794
+ elif 'secondary' in self.current_formatting_hint:
795
+ bullet = '◦'
796
+ elif 'tertiary' in self.current_formatting_hint:
797
+ bullet = '▪'
798
+ elif 'quaternary' in self.current_formatting_hint:
799
+ bullet = '‣'
800
+ else:
801
+ bullet = '•'
802
+
803
+ self.text_parts.append(f'{indent_str}{bullet} {data.strip()}')
804
+
805
+ elif any(pattern in self.current_formatting_hint for pattern in ['numbered', 'lettered', 'roman', 'thai', 'parenthetical']):
806
+ # For numbered/lettered/parenthetical items, the marker should already be in the text
807
+ self.text_parts.append(f'{indent_str}{data.strip()}')
808
+
809
+ elif 'space-indent' in self.current_formatting_hint:
810
+ # Simple indented text
811
+ self.text_parts.append(f'{indent_str}{data.strip()}')
812
 
 
 
 
 
813
  else:
814
+ # Regular text with indentation
815
+ self.text_parts.append(f'{indent_str}{data.strip()}')
816
 
817
  def _format_table(self):
818
+ """Format table with proper alignment"""
819
  if not self.table_data:
820
  return
821
 
822
  self.text_parts.append('\n\n')
823
 
 
824
  if self.table_data:
825
  max_cols = max(len(row) for row in self.table_data)
826
  col_widths = [0] * max_cols
827
 
828
+ # Calculate column widths
829
  for row in self.table_data:
830
  for i, cell in enumerate(row):
831
  if i < max_cols:
 
855
 
856
  self.text_parts.append('\n')
857
 
858
+ extractor = EnhancedTextExtractor()
859
  extractor.feed(html_content)
860
 
861
  result = ''.join(extractor.text_parts)
 
870
 
871
 
872
  class OCRService:
873
+ """Main OCR service with enhanced HTML processing, comprehensive indentation detection, and intelligent text classification"""
874
 
875
  def __init__(self):
876
  self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
 
892
 
893
  def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
894
  """
895
+ Convert PDF to text using specified method with enhanced HTML processing and intelligent text classification
896
 
897
  Args:
898
  pdf_path: Path to the PDF file
 
926
  # Try primary method
927
  try:
928
  if method == "azure" and self.azure_client:
929
+ result = self._azure_ocr_with_enhanced_html(pdf_path)
930
  elif method == "tesseract":
931
  result = self._tesseract_ocr(pdf_path)
932
  elif method == "pymupdf":
 
945
 
946
  return result
947
 
948
+ def _azure_ocr_with_enhanced_html(self, pdf_path: str) -> Dict[str, Any]:
949
+ """Azure Document Intelligence OCR with enhanced HTML processing, indentation detection, and intelligent text classification"""
950
  result = {
951
  'success': False,
952
  'text': '',
953
  'html': '',
954
+ 'method_used': 'azure_document_intelligence_enhanced_v2',
955
  'metadata': {},
956
  'error': None
957
  }
 
982
 
983
  analysis_result = poller.result()
984
 
985
+ # Generate HTML with enhanced indentation processing and text classification
986
+ html_content = EnhancedHTMLProcessor.create_html_from_azure_result(analysis_result)
987
 
988
+ # Convert HTML to formatted text with enhanced indentation preservation and classification
989
+ formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
990
+
991
+ # Analyze document structure with text classification
992
+ detector = EnhancedIndentationDetector()
993
+ text_lines = formatted_text.split('\n')
994
+ document_analysis = detector.analyze_document_structure(text_lines)
995
 
996
  result.update({
997
  'success': True,
 
1003
  'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
1004
  'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
1005
  'html_generated': True,
1006
+ 'enhanced_indentation': True,
1007
+ 'intelligent_text_classification': True,
1008
+ 'parenthetical_patterns_supported': True,
1009
  'page_numbers_added': True,
1010
+ 'comprehensive_formatting': True,
1011
+ 'azure_analysis': analysis_result,
1012
+ 'document_structure_analysis': document_analysis
1013
  }
1014
  })
1015
 
1016
+ logger.info("Azure OCR with enhanced indentation processing and intelligent text classification completed successfully")
1017
 
1018
  except Exception as e:
1019
  logger.error(f"Azure OCR error: {e}")
 
1022
  return result
1023
 
1024
  def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
1025
+ """Tesseract OCR with enhanced HTML generation, indentation detection, and text classification"""
1026
  result = {
1027
  'success': False,
1028
  'text': '',
1029
  'html': '',
1030
+ 'method_used': 'tesseract_enhanced_v2',
1031
  'metadata': {},
1032
  'error': None
1033
  }
 
1042
  page_count = len(pdf_document)
1043
  all_text = []
1044
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
1045
+ html_parts.append('''
1046
+ body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }
1047
+ .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
1048
+ .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
1049
+ .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
1050
+ .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
1051
+ .content-paragraph { margin-bottom: 1em; }
1052
+ .content-list-item { margin-bottom: 0.5em; }
1053
+ ''')
1054
  html_parts.append('</style></head><body>')
1055
 
1056
+ indent_detector = EnhancedIndentationDetector()
1057
+
1058
  for page_num in range(page_count):
1059
  # Add page header to text
1060
  page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
 
1080
 
1081
  all_text.append(text)
1082
 
1083
+ # Add to HTML with enhanced indentation processing and text classification
1084
  html_parts.append(f'<div class="page">')
1085
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1086
+
1087
+ # Process each line for indentation and classification
1088
+ lines = text.split('\n')
1089
+ for line in lines:
1090
+ if line.strip():
1091
+ indent_info = indent_detector.detect_indentation(line)
1092
+ text_classification = indent_detector.classify_text_type(line)
1093
+
1094
+ level = indent_info.get('level', 0)
1095
+ formatting_hint = indent_info.get('formatting_hint', 'normal_text')
1096
+
1097
+ css_classes = [f'indent-level-{min(level, 10)}']
1098
+ if formatting_hint != 'normal_text':
1099
+ css_classes.append(formatting_hint)
1100
+
1101
+ # Add text classification class
1102
+ if text_classification.get('type'):
1103
+ css_classes.append(f"content-{text_classification['type']}")
1104
+
1105
+ class_str = f' class="paragraph {" ".join(css_classes)}"'
1106
+ content = indent_info.get('content', line.strip())
1107
+
1108
+ # Add marker for non-bullet items
1109
+ marker = indent_info.get('pattern_marker', '')
1110
+ if marker and not indent_info.get('is_bullet', False):
1111
+ content = f"{marker} {content}"
1112
+
1113
+ html_parts.append(f'<div{class_str}>{content}</div>')
1114
+ else:
1115
+ html_parts.append('<div class="paragraph"><br></div>')
1116
+
1117
+ html_parts.append('</div>')
1118
 
1119
  finally:
1120
  if temp_img_path and os.path.exists(temp_img_path):
 
1125
 
1126
  html_parts.append('</body></html>')
1127
 
1128
+ # Convert HTML back to formatted text
1129
+ html_content = '\n'.join(html_parts)
1130
+ formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
1131
+
1132
  result.update({
1133
  'success': True,
1134
+ 'text': formatted_text,
1135
+ 'html': html_content,
1136
  'metadata': {
1137
  'pages': page_count,
1138
  'html_generated': True,
1139
+ 'enhanced_indentation': True,
1140
+ 'intelligent_text_classification': True,
1141
+ 'parenthetical_patterns_supported': True,
1142
  'page_numbers_added': True,
1143
+ 'comprehensive_formatting': True
1144
  }
1145
  })
1146
 
1147
+ logger.info("Tesseract OCR with enhanced indentation processing and text classification completed successfully")
1148
 
1149
  except Exception as e:
1150
  logger.error(f"Tesseract OCR error: {e}")
 
1159
  return result
1160
 
1161
  def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
1162
+ """PyMuPDF text extraction with enhanced HTML generation, indentation detection, and text classification"""
1163
  result = {
1164
  'success': False,
1165
  'text': '',
1166
  'html': '',
1167
+ 'method_used': 'pymupdf_enhanced_v2',
1168
  'metadata': {},
1169
  'error': None
1170
  }
 
1175
  page_count = len(pdf_document)
1176
  all_text = []
1177
  html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
1178
+ html_parts.append('''
1179
+ body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }
1180
+ .page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
1181
+ .page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
1182
+ .paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
1183
+ .content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
1184
+ .content-paragraph { margin-bottom: 1em; }
1185
+ .content-list-item { margin-bottom: 0.5em; }
1186
+ ''')
1187
  html_parts.append('</style></head><body>')
1188
 
1189
+ indent_detector = EnhancedIndentationDetector()
1190
+
1191
  for page_num in range(page_count):
1192
  # Add page header to text
1193
  page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
 
1198
 
1199
  all_text.append(text)
1200
 
1201
+ # Add to HTML with enhanced indentation processing and text classification
1202
  html_parts.append(f'<div class="page">')
1203
  html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
1204
+
1205
+ # Process each line for indentation and classification
1206
+ lines = text.split('\n')
1207
+ for line in lines:
1208
+ if line.strip():
1209
+ indent_info = indent_detector.detect_indentation(line)
1210
+ text_classification = indent_detector.classify_text_type(line)
1211
+
1212
+ level = indent_info.get('level', 0)
1213
+ formatting_hint = indent_info.get('formatting_hint', 'normal_text')
1214
+
1215
+ css_classes = [f'indent-level-{min(level, 10)}']
1216
+ if formatting_hint != 'normal_text':
1217
+ css_classes.append(formatting_hint)
1218
+
1219
+ # Add text classification class
1220
+ if text_classification.get('type'):
1221
+ css_classes.append(f"content-{text_classification['type']}")
1222
+
1223
+ class_str = f' class="paragraph {" ".join(css_classes)}"'
1224
+ content = indent_info.get('content', line.strip())
1225
+
1226
+ # Add marker for non-bullet items
1227
+ marker = indent_info.get('pattern_marker', '')
1228
+ if marker and not indent_info.get('is_bullet', False):
1229
+ content = f"{marker} {content}"
1230
+
1231
+ html_parts.append(f'<div{class_str}>{content}</div>')
1232
+ else:
1233
+ html_parts.append('<div class="paragraph"><br></div>')
1234
+
1235
+ html_parts.append('</div>')
1236
 
1237
  html_parts.append('</body></html>')
1238
 
1239
+ # Convert HTML back to formatted text
1240
+ html_content = '\n'.join(html_parts)
1241
+ formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
1242
+
1243
  result.update({
1244
  'success': True,
1245
+ 'text': formatted_text,
1246
+ 'html': html_content,
1247
  'metadata': {
1248
  'pages': page_count,
1249
  'html_generated': True,
1250
+ 'enhanced_indentation': True,
1251
+ 'intelligent_text_classification': True,
1252
+ 'parenthetical_patterns_supported': True,
1253
  'page_numbers_added': True,
1254
+ 'comprehensive_formatting': True
1255
  }
1256
  })
1257
 
1258
+ logger.info("PyMuPDF extraction with enhanced indentation processing and text classification completed successfully")
1259
 
1260
  except Exception as e:
1261
  logger.error(f"PyMuPDF error: {e}")
 
1292
  logger.info(f"Trying fallback method: {method}")
1293
  try:
1294
  if method == "azure":
1295
+ result = self._azure_ocr_with_enhanced_html(pdf_path)
1296
  elif method == "tesseract":
1297
  result = self._tesseract_ocr(pdf_path)
1298
  elif method == "pymupdf":
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- # PDF OCR Service Requirements - Enhanced Version with HTML Processing
2
 
3
  # Core web framework and UI
4
  gradio>=4.0.0
@@ -22,10 +22,21 @@ PyMuPDF>=1.23.0
22
  # Document export formats (ENHANCED)
23
  python-docx>=0.8.11
24
 
25
- # HTML processing and parsing (NEW)
26
  beautifulsoup4>=4.12.0
27
  lxml>=4.9.0
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Additional dependencies for enhanced preprocessing
30
  matplotlib>=3.7.0 # For image visualization in development
31
  scikit-image>=0.21.0 # Advanced image processing (optional)
@@ -34,45 +45,297 @@ scikit-image>=0.21.0 # Advanced image processing (optional)
34
  tqdm>=4.65.0 # Progress bars for long operations
35
  requests>=2.31.0 # HTTP requests for external services
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # System dependencies information (install separately):
38
  #
39
  # For Ubuntu/Debian:
40
  # sudo apt-get update
41
- # sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
42
  # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
43
  # sudo apt-get install -y python3-opencv # Alternative OpenCV installation
44
  # sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
 
 
 
45
  #
46
  # For CentOS/RHEL:
47
- # sudo yum install -y tesseract tesseract-langpack-eng
48
  # sudo yum install -y opencv-python
49
  # sudo yum install -y libxml2-devel libxslt-devel
 
50
  #
51
  # For macOS:
52
  # brew install tesseract
 
53
  # brew install opencv
54
  # brew install libxml2
 
55
  #
56
  # For Windows:
57
  # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
 
 
58
  # Add Tesseract to PATH environment variable
59
  # OpenCV and other packages should install automatically with pip
 
 
60
 
61
  # Development and testing (optional)
62
  pytest>=7.0.0
63
  pytest-cov>=4.0.0
 
 
64
  black>=23.0.0 # Code formatting
65
  flake8>=6.0.0 # Code linting
 
 
66
 
67
  # Performance monitoring (optional)
68
  memory-profiler>=0.60.0
69
  psutil>=5.9.0 # System monitoring
 
 
 
 
 
70
 
71
- # Note: The enhanced version includes:
72
- # - Fixed table processing that prevents text loss
73
- # - HTML intermediate processing for better formatting
74
- # - Enhanced export capabilities (TXT, DOCX, HTML)
75
- # - Smart overlap detection with 70% threshold
76
- # - Improved coordinate calculations for table boundaries
77
- # - Better document structure preservation
78
- # - Multi-format download options
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF OCR Service Requirements - Enhanced Version with Comprehensive Indentation Detection & Text Classification
2
 
3
  # Core web framework and UI
4
  gradio>=4.0.0
 
22
  # Document export formats (ENHANCED)
23
  python-docx>=0.8.11
24
 
25
+ # HTML processing and parsing
26
  beautifulsoup4>=4.12.0
27
  lxml>=4.9.0
28
 
29
+ # Enhanced text processing and pattern detection
30
+ regex>=2023.10.3 # For advanced regex patterns including parenthetical detection
31
+
32
+ # Data handling and analysis
33
+ pandas>=2.0.0 # For document structure analysis
34
+ collections-extended>=2.0.2 # For enhanced counter operations
35
+
36
+ # Text classification and analysis
37
+ scikit-learn>=1.3.0 # For advanced text classification algorithms (optional)
38
+ nltk>=3.8 # Natural language processing toolkit (optional)
39
+
40
  # Additional dependencies for enhanced preprocessing
41
  matplotlib>=3.7.0 # For image visualization in development
42
  scikit-image>=0.21.0 # Advanced image processing (optional)
 
45
  tqdm>=4.65.0 # Progress bars for long operations
46
  requests>=2.31.0 # HTTP requests for external services
47
 
48
+ # Logging and monitoring
49
+ colorlog>=6.7.0 # Enhanced logging with colors
50
+ structlog>=23.1.0 # Structured logging for better debugging
51
+
52
+ # File handling and temporary file management
53
+ pathlib2>=2.3.7 # Backport of the stdlib pathlib module (redundant on Python 3.8+, where pathlib is built in)
54
+ tempfile-plus>=1.2.0 # Advanced temporary file handling (TODO(review): confirm this distribution exists on PyPI; the stdlib tempfile module may suffice)
55
+
56
+ # Date and time handling
57
+ python-dateutil>=2.8.2 # Enhanced date parsing
58
+
59
+ # Enhanced Unicode and text processing
60
+ unicodedata2>=15.0.0 # Enhanced Unicode support for Thai and other scripts
61
+ ftfy>=6.1.1 # Text fixing and encoding repair
62
+
63
+ # Configuration and validation
64
+ pydantic>=2.0.0 # Data validation and settings management
65
+ confuse>=2.0.0 # Configuration file handling
66
+
67
  # System dependencies information (install separately):
68
  #
69
  # For Ubuntu/Debian:
70
  # sudo apt-get update
71
+ # sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-tha
72
  # sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
73
  # sudo apt-get install -y python3-opencv # Alternative OpenCV installation
74
  # sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
75
+ # sudo apt-get install -y fonts-thai-tlwg fonts-thai-tlwg-otf # Thai font support
76
+ # sudo apt-get install -y language-pack-th # Thai language support
77
+ # sudo apt-get install -y fonts-noto fonts-noto-cjk # Unicode font support
78
  #
79
  # For CentOS/RHEL:
80
+ # sudo yum install -y tesseract tesseract-langpack-eng tesseract-langpack-tha
81
  # sudo yum install -y opencv-python
82
  # sudo yum install -y libxml2-devel libxslt-devel
83
+ # sudo yum install -y thai-scalable-fonts google-noto-fonts
84
  #
85
  # For macOS:
86
  # brew install tesseract
87
+ # brew install tesseract-lang # Includes Thai support
88
  # brew install opencv
89
  # brew install libxml2
90
+ # brew install --cask font-noto-sans-thai font-noto-sans # Fonts are distributed as Homebrew casks
91
  #
92
  # For Windows:
93
  # Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
94
+ # Download Thai language data from: https://github.com/tesseract-ocr/tessdata
95
+ # Download Thai numerals training data if available
96
  # Add Tesseract to PATH environment variable
97
  # OpenCV and other packages should install automatically with pip
98
+ # Install Thai fonts from Windows Language Settings
99
+ # Install Unicode fonts (Noto fonts recommended)
100
 
101
  # Development and testing (optional)
102
  pytest>=7.0.0
103
  pytest-cov>=4.0.0
104
+ pytest-asyncio>=0.21.0 # For async testing
105
+ pytest-mock>=3.11.0 # For mocking in tests
106
  black>=23.0.0 # Code formatting
107
  flake8>=6.0.0 # Code linting
108
+ mypy>=1.5.0 # Type checking
109
+ isort>=5.12.0 # Import sorting
110
 
111
  # Performance monitoring (optional)
112
  memory-profiler>=0.60.0
113
  psutil>=5.9.0 # System monitoring
114
+ py-spy>=0.3.14 # Performance profiling
115
+
116
+ # Enhanced error handling and debugging
117
+ rich>=13.0.0 # Rich console output for debugging
118
+ icecream>=2.1.3 # Enhanced debugging print statements
119
 
120
+ # Enhanced file type detection
121
+ python-magic>=0.4.27 # File type detection
122
+ filetype>=1.2.0 # Alternative file type detection
123
+
124
+ # Additional text processing utilities
125
+ Unidecode>=1.3.6 # ASCII transliteration for Unicode text
126
+ langdetect>=1.0.9 # Language detection for multi-language documents
127
+
128
+ # Note: The enhanced version includes comprehensive features:
129
+ #
130
+ # COMPREHENSIVE INDENTATION DETECTION FEATURES:
131
+ # ===============================================
132
+ #
133
+ # 1. HIERARCHICAL NUMBERING PATTERNS:
134
+ # - Decimal hierarchy: 1.1.1.1.1... (unlimited depth)
135
+ # - Mixed hierarchy: 1.2.a.i.A... (numbers, letters, Roman mixed)
136
+ # - Legal numbering: 1.1.1(a)(i) (with parenthetical sub-sections)
137
+ # - Outline numbering: I.A.1.a.i. (formal document structure)
138
+ # - Section numbering: §1.2.3, Article 1.1.1, Chapter 1.2
139
+ #
140
+ # 2. PARENTHETICAL PATTERNS (NEW):
141
+ # - Arabic numerals: (1), (2), (3), (10), (25)...
142
+ # - Thai numerals: (๑), (๒), (๓), (๑๐), (๒๕)...
143
+ # - Lowercase letters: (a), (b), (c)... (z), (aa), (bb)...
144
+ # - Uppercase letters: (A), (B), (C)... (Z), (AA), (BB)...
145
+ # - Thai letters: (ก), (ข), (ค)... (ฮ)
146
+ # - Lowercase Roman: (i), (ii), (iii), (iv), (v)...
147
+ # - Uppercase Roman: (I), (II), (III), (IV), (V)...
148
+ #
149
+ # 3. TRADITIONAL PATTERNS:
150
+ # - Simple numbered lists: 1., 2., 3.
151
+ # - Simple numbered with parens: 1), 2), 3)
152
+ # - Letter lists: a., b., c. and A., B., C.
153
+ # - Thai letters: ก., ข., ค.
154
+ # - Roman numerals: i., ii., iii. and I., II., III.
155
+ # - Multiple bullet styles: •◦▪→ and 20+ more symbols
156
+ # - Checkbox items: [x], [ ], [✓], [✗]
157
+ # - Arrow bullets: →, ←, ↑, ↓, ⇒, ➔ and more
158
+ # - Dash bullets: -, *, +, ~, =
159
+ #
160
+ # 4. MULTI-LANGUAGE SUPPORT:
161
+ # - Thai script: มาตรา, ข้อ, หมวด, ส่วน
162
+ # - Thai numerals: ๐๑๒๓๔๕๖๗๘๙
163
+ # - Thai letters: ก-ฮ (44 consonants)
164
+ # - Unicode symbols: Full range of bullet and arrow characters
165
+ # - Mixed language documents: English + Thai seamlessly
166
+ #
167
+ # 5. SPACE-BASED INDENTATION:
168
+ # - Automatic detection of space-based indentation levels
169
+ # - 4-space = 1 level standard
170
+ # - Combining space indentation with pattern indentation
171
+ # - Up to 10 indentation levels supported
172
+ #
173
+ # 6. PRIORITY-BASED PATTERN MATCHING:
174
+ # - Hierarchical patterns get higher priority
175
+ # - Parenthetical patterns prioritized appropriately
176
+ # - Prevents false positives in pattern detection
177
+ # - Smart disambiguation between similar patterns
178
+ #
179
+ # INTELLIGENT TEXT CLASSIFICATION FEATURES:
180
+ # =========================================
181
+ #
182
+ # 1. HEADER DETECTION:
183
+ # - Title case detection: "Chapter One Introduction"
184
+ # - All caps detection: "SECTION A: OVERVIEW"
185
+ # - Numbered headers: "1. INTRODUCTION"
186
+ # - Section headers: "SECTION 1.2.3", "CHAPTER IV"
187
+ # - Thai headers: "หมวด ๑", "บท ก"
188
+ # - Short line detection: Lines under 50 characters
189
+ # - Position-based detection: Top of page content
190
+ # - Font size consideration: Larger fonts = likely headers
191
+ #
192
+ # 2. PARAGRAPH CLASSIFICATION:
193
+ # - Long text detection: Over 100 characters
194
+ # - Proper punctuation: Ends with periods
195
+ # - Context analysis: Position and formatting
196
+ # - Multi-sentence detection
197
+ # - Normal text flow patterns
198
+ #
199
+ # 3. LIST ITEM RECOGNITION:
200
+ # - Pattern-based identification
201
+ # - Numbered list items
202
+ # - Bulleted list items
203
+ # - Lettered list items
204
+ # - Roman numeral lists
205
+ # - Parenthetical lists
206
+ # - Checkbox lists
207
+ #
208
+ # 4. CONFIDENCE SCORING:
209
+ # - 0.0 to 1.0 confidence levels
210
+ # - Multiple factors considered
211
+ # - Context-aware scoring
212
+ # - Threshold-based classification
213
+ #
214
+ # 5. DOCUMENT STRUCTURE ANALYSIS:
215
+ # - Overall document statistics
216
+ # - Pattern distribution analysis
217
+ # - Coverage percentage calculation
218
+ # - Dominant pattern identification
219
+ # - Text type distribution
220
+ #
221
+ # ENHANCED PROCESSING FEATURES:
222
+ # =============================
223
+ #
224
+ # 1. HTML INTERMEDIATE PROCESSING:
225
+ # - Better structure preservation
226
+ # - CSS-based indentation levels
227
+ # - Pattern-specific styling
228
+ # - Text classification styling
229
+ # - Responsive design
230
+ #
231
+ # 2. TABLE HANDLING:
232
+ # - Smart overlap detection (70% threshold)
233
+ # - Prevents text loss in tables
234
+ # - Improved coordinate calculations
235
+ # - Better boundary detection
236
+ #
237
+ # 3. EXPORT CAPABILITIES:
238
+ # - Enhanced TXT: Preserved indentation and structure
239
+ # - Enhanced DOCX: Color-coded formatting, proper indentation
240
+ # - Enhanced HTML: CSS styling, responsive design
241
+ # - All formats preserve pattern recognition results
242
+ #
243
+ # 4. CROP PROCESSING:
244
+ # - High-resolution processing (2x scale)
245
+ # - Per-page customization
246
+ # - Real-time preview
247
+ # - Enhanced coordinate handling
248
+ #
249
+ # 5. PERFORMANCE MONITORING:
250
+ # - Processing time tracking
251
+ # - Success rate monitoring
252
+ # - Pattern usage statistics
253
+ # - Document analysis metrics
254
+ #
255
+ # TECHNICAL IMPROVEMENTS:
256
+ # ======================
257
+ #
258
+ # 1. ADVANCED REGEX PATTERNS:
259
+ # - Unicode-aware pattern matching
260
+ # - Thai script support
261
+ # - Complex parenthetical detection
262
+ # - Priority-based matching system
263
+ #
264
+ # 2. ERROR HANDLING:
265
+ # - Comprehensive error catching
266
+ # - Graceful degradation
267
+ # - Detailed logging
268
+ # - Recovery mechanisms
269
+ #
270
+ # 3. TESTING CAPABILITIES:
271
+ # - Unit tests for pattern detection
272
+ # - Integration tests for OCR
273
+ # - Performance benchmarking
274
+ # - Coverage reporting
275
+ #
276
+ # 4. DEBUGGING SUPPORT:
277
+ # - Rich console output
278
+ # - Structured logging
279
+ # - Pattern detection debugging
280
+ # - Classification confidence display
281
+ #
282
+ # INSTALLATION NOTES:
283
+ # ==================
284
+ #
285
+ # 1. SYSTEM DEPENDENCIES:
286
+ # Install system dependencies first (see comments above)
287
+ # Ensure Thai language support is installed
288
+ # Install Unicode fonts for proper display
289
+ #
290
+ # 2. PYTHON DEPENDENCIES:
291
+ # Run: pip install -r requirements.txt
292
+ # Consider using virtual environment
293
+ # Update pip before installation: pip install --upgrade pip
294
+ #
295
+ # 3. AZURE CONFIGURATION (OPTIONAL):
296
+ # Set environment variables:
297
+ # - AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
298
+ # - AZURE_DOCUMENT_INTELLIGENCE_KEY
299
+ #
300
+ # 4. TESTING:
301
+ # Test with sample documents containing:
302
+ # - Various indentation patterns
303
+ # - Parenthetical numbering
304
+ # - Mixed languages (English + Thai)
305
+ # - Complex document structures
306
+ # - Tables and lists
307
+ #
308
+ # 5. PERFORMANCE OPTIMIZATION:
309
+ # For high-volume processing:
310
+ # - Consider increasing system memory
311
+ # - Use SSD storage for temporary files
312
+ # - Monitor CPU usage during processing
313
+ # - Configure appropriate log levels
314
+ #
315
+ # SUPPORTED LANGUAGES AND SCRIPTS:
316
+ # ================================
317
+ #
318
+ # - English: Full comprehensive support
319
+ # - Thai: Complete support including numerals and letters
320
+ # - Arabic numerals: 0-9 in all contexts
321
+ # - Roman numerals: I, V, X, L, C, D, M and combinations
322
+ # - Unicode symbols: Full range of bullets, arrows, and marks
323
+ # - Mixed documents: Seamless handling of multi-language content
324
+ # - International conventions: Support for various numbering systems
325
+ #
326
+ # VERSION COMPATIBILITY:
327
+ # =====================
328
+ #
329
+ # - Python: 3.8+ required, 3.10+ recommended
330
+ # - Operating Systems: Windows, macOS, Linux
331
+ # - Memory: 4GB+ recommended for large documents
332
+ # - Storage: 1GB+ free space for temporary files
333
+ # - Network: Required for Azure Document Intelligence (optional)
334
+ #
335
+ # This enhanced version provides comprehensive indentation detection
336
+ # and text classification, with particular strength in:
337
+ # - Parenthetical pattern recognition ((1), (๑), (a), (i), (ก))
338
+ # - Thai language and script support
339
+ # - Intelligent document structure analysis
340
+ # - Multi-format export with preserved formatting
341
+ # - Real-time pattern demonstration and analysis