Spaces:
Running
Running
Upload 11 files
Browse files- app.py +171 -56
- backend.py +533 -181
- enhanced_indentation.py +648 -0
- ocr_service.py +580 -346
- requirements.txt +275 -12
app.py
CHANGED
|
@@ -17,6 +17,7 @@ from dotenv import load_dotenv
|
|
| 17 |
load_dotenv()
|
| 18 |
|
| 19 |
from backend import BackendManager
|
|
|
|
| 20 |
|
| 21 |
# Configure logging
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -25,6 +26,9 @@ logger = logging.getLogger(__name__)
|
|
| 25 |
# Initialize backend manager
|
| 26 |
backend_manager = BackendManager()
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
# Check if python-docx is available
|
| 29 |
try:
|
| 30 |
from docx import Document
|
|
@@ -303,16 +307,16 @@ def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_
|
|
| 303 |
logger.error(f"Error updating crop preview: {e}")
|
| 304 |
return None
|
| 305 |
|
| 306 |
-
def
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
"""Process PDF with
|
| 311 |
if pdf_file is None:
|
| 312 |
return "No file uploaded.", "", "", "Error: No file selected"
|
| 313 |
|
| 314 |
try:
|
| 315 |
-
progress(0.1, desc="Initializing
|
| 316 |
|
| 317 |
# Prepare enhanced preprocessing options
|
| 318 |
preprocessing_options = {
|
|
@@ -321,19 +325,19 @@ def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer
|
|
| 321 |
'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
|
| 322 |
}
|
| 323 |
|
| 324 |
-
progress(0.3, desc="Processing with
|
| 325 |
|
| 326 |
-
# Process the PDF with enhanced preprocessing
|
| 327 |
result = backend_manager.process_pdf_with_enhanced_resolution(
|
| 328 |
pdf_file.name, ocr_method, preprocessing_options
|
| 329 |
)
|
| 330 |
|
| 331 |
-
progress(0.9, desc="Finalizing
|
| 332 |
progress(1.0, desc="Complete!")
|
| 333 |
|
| 334 |
if result['success']:
|
| 335 |
metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
|
| 336 |
-
status = f"Success: Processed using {result['method_used']} with
|
| 337 |
|
| 338 |
# Return text, HTML, metadata, and status
|
| 339 |
return (result['text'],
|
|
@@ -345,11 +349,11 @@ def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer
|
|
| 345 |
return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
|
| 346 |
|
| 347 |
except Exception as e:
|
| 348 |
-
logger.error(f"
|
| 349 |
return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
|
| 350 |
|
| 351 |
def format_enhanced_metadata(metadata, method_used):
|
| 352 |
-
"""Enhanced metadata formatting with
|
| 353 |
if not metadata:
|
| 354 |
return f"Method used: {method_used}"
|
| 355 |
|
|
@@ -364,6 +368,15 @@ def format_enhanced_metadata(metadata, method_used):
|
|
| 364 |
if metadata.get('html_processing', False):
|
| 365 |
info_lines.append("HTML generation: Enabled")
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
|
| 368 |
info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
|
| 369 |
|
|
@@ -373,6 +386,25 @@ def format_enhanced_metadata(metadata, method_used):
|
|
| 373 |
if 'tables' in metadata:
|
| 374 |
info_lines.append(f"Tables detected: {metadata['tables']}")
|
| 375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
if 'processing_time_seconds' in metadata:
|
| 377 |
info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
|
| 378 |
|
|
@@ -381,8 +413,8 @@ def format_enhanced_metadata(metadata, method_used):
|
|
| 381 |
def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
|
| 382 |
crop_top, crop_bottom, crop_left, crop_right,
|
| 383 |
apply_to_all_pages, current_page_selection):
|
| 384 |
-
"""Prepare enhanced downloads with
|
| 385 |
-
text, html, metadata, status =
|
| 386 |
pdf_file, method, enable_header_footer_removal,
|
| 387 |
crop_top, crop_bottom, crop_left, crop_right,
|
| 388 |
apply_to_all_pages, current_page_selection
|
|
@@ -417,59 +449,95 @@ def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
|
|
| 417 |
gr.update(visible=False))
|
| 418 |
|
| 419 |
def get_enhanced_method_info(method):
|
| 420 |
-
"""Get information about selected OCR method with
|
| 421 |
method_descriptions = {
|
| 422 |
-
"auto": "**Auto Selection**: Automatically chooses the best available method with HTML processing
|
| 423 |
-
"azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with HTML generation, layout preservation,
|
| 424 |
-
"tesseract": "**Tesseract OCR**: Open-source OCR with HTML output,
|
| 425 |
-
"pymupdf": "**PyMuPDF**: Fast extraction enhanced with HTML processing
|
| 426 |
}
|
| 427 |
|
| 428 |
return method_descriptions.get(method, "Select a method to see details.")
|
| 429 |
|
| 430 |
def check_enhanced_service_status():
|
| 431 |
-
"""Check and display enhanced service status"""
|
| 432 |
available_methods = backend_manager.get_available_methods()
|
| 433 |
|
| 434 |
-
status_lines = ["**Available OCR Methods (Enhanced with
|
| 435 |
|
| 436 |
if "azure" in available_methods:
|
| 437 |
-
status_lines.append("
|
| 438 |
else:
|
| 439 |
-
status_lines.append("
|
| 440 |
|
| 441 |
if "tesseract" in available_methods:
|
| 442 |
-
status_lines.append("
|
| 443 |
else:
|
| 444 |
-
status_lines.append("
|
| 445 |
|
| 446 |
if "pymupdf" in available_methods:
|
| 447 |
-
status_lines.append("
|
| 448 |
else:
|
| 449 |
-
status_lines.append("
|
| 450 |
|
| 451 |
# Add enhanced features status
|
| 452 |
-
status_lines.append("
|
| 453 |
-
status_lines.append("
|
| 454 |
-
status_lines.append("
|
| 455 |
-
status_lines.append("
|
| 456 |
-
status_lines.append("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
if HAS_DOCX_SUPPORT:
|
| 459 |
-
status_lines.append("
|
| 460 |
else:
|
| 461 |
-
status_lines.append("
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
-
|
| 464 |
-
|
|
|
|
| 465 |
|
| 466 |
return "\n".join(status_lines)
|
| 467 |
|
| 468 |
def create_enhanced_interface():
|
| 469 |
-
"""Create enhanced Gradio interface with
|
| 470 |
|
| 471 |
with gr.Blocks(
|
| 472 |
-
title="PDF OCR Service - Enhanced with
|
| 473 |
theme=gr.themes.Soft(),
|
| 474 |
css="""
|
| 475 |
.main-header { text-align: center; margin-bottom: 2rem; }
|
|
@@ -484,14 +552,14 @@ def create_enhanced_interface():
|
|
| 484 |
|
| 485 |
gr.HTML("""
|
| 486 |
<div class="main-header">
|
| 487 |
-
<h1>PDF OCR Service - Enhanced with
|
| 488 |
-
<p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, and
|
| 489 |
</div>
|
| 490 |
""")
|
| 491 |
|
| 492 |
# Instructions at the top
|
| 493 |
with gr.Group(elem_classes=["instructions-panel"]):
|
| 494 |
-
gr.HTML("<h3>Instructions & Features</h3>")
|
| 495 |
gr.HTML("""
|
| 496 |
<div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
|
| 497 |
<h4>How to Use:</h4>
|
|
@@ -499,19 +567,66 @@ def create_enhanced_interface():
|
|
| 499 |
<li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
|
| 500 |
<li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
|
| 501 |
<li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
|
| 502 |
-
<li><strong>Process:</strong> Click the process button to extract text with
|
| 503 |
-
<li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format</li>
|
| 504 |
</ol>
|
| 505 |
|
| 506 |
-
<h4>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
<ul>
|
| 508 |
<li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
|
| 509 |
<li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
|
| 510 |
-
<li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads</li>
|
| 511 |
<li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
|
| 512 |
<li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
|
| 513 |
-
<li><strong>
|
| 514 |
-
<li><strong>
|
|
|
|
| 515 |
</ul>
|
| 516 |
</div>
|
| 517 |
""")
|
|
@@ -543,7 +658,7 @@ def create_enhanced_interface():
|
|
| 543 |
choices=["auto", "azure", "tesseract", "pymupdf"],
|
| 544 |
value="auto",
|
| 545 |
label="OCR Method",
|
| 546 |
-
info="Choose OCR method (all enhanced with
|
| 547 |
)
|
| 548 |
|
| 549 |
# Method information display
|
|
@@ -628,7 +743,7 @@ def create_enhanced_interface():
|
|
| 628 |
|
| 629 |
# Process button
|
| 630 |
process_btn = gr.Button(
|
| 631 |
-
"Process PDF with
|
| 632 |
variant="primary",
|
| 633 |
size="lg"
|
| 634 |
)
|
|
@@ -666,8 +781,8 @@ def create_enhanced_interface():
|
|
| 666 |
|
| 667 |
# Extracted text output
|
| 668 |
text_output = gr.Textbox(
|
| 669 |
-
label="Extracted Text (Enhanced with
|
| 670 |
-
placeholder="Processed text with HTML enhancement and preserved formatting will appear here...",
|
| 671 |
lines=20,
|
| 672 |
max_lines=30,
|
| 673 |
interactive=False,
|
|
@@ -676,9 +791,9 @@ def create_enhanced_interface():
|
|
| 676 |
|
| 677 |
# Metadata information
|
| 678 |
metadata_output = gr.Textbox(
|
| 679 |
-
label="Processing Information",
|
| 680 |
interactive=False,
|
| 681 |
-
lines=
|
| 682 |
)
|
| 683 |
|
| 684 |
# Enhanced download buttons
|
|
@@ -689,7 +804,7 @@ def create_enhanced_interface():
|
|
| 689 |
variant="secondary"
|
| 690 |
)
|
| 691 |
download_docx_btn = gr.DownloadButton(
|
| 692 |
-
"Download Enhanced DOCX",
|
| 693 |
visible=False,
|
| 694 |
variant="secondary"
|
| 695 |
)
|
|
@@ -701,7 +816,7 @@ def create_enhanced_interface():
|
|
| 701 |
|
| 702 |
# Service Status at the bottom
|
| 703 |
with gr.Group(elem_classes=["status-box"]):
|
| 704 |
-
gr.HTML("<h4>Service Status</h4>")
|
| 705 |
service_status = gr.Markdown(
|
| 706 |
value=check_enhanced_service_status()
|
| 707 |
)
|
|
@@ -793,7 +908,7 @@ def create_enhanced_interface():
|
|
| 793 |
return interface
|
| 794 |
|
| 795 |
def launch_enhanced_ui():
|
| 796 |
-
"""Launch the enhanced Gradio interface with
|
| 797 |
try:
|
| 798 |
interface = create_enhanced_interface()
|
| 799 |
interface.launch(
|
|
|
|
| 17 |
load_dotenv()
|
| 18 |
|
| 19 |
from backend import BackendManager
|
| 20 |
+
from enhanced_indentation import EnhancedIndentationDetector
|
| 21 |
|
| 22 |
# Configure logging
|
| 23 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 26 |
# Initialize backend manager
|
| 27 |
backend_manager = BackendManager()
|
| 28 |
|
| 29 |
+
# Initialize enhanced indentation detector
|
| 30 |
+
indent_detector = EnhancedIndentationDetector()
|
| 31 |
+
|
| 32 |
# Check if python-docx is available
|
| 33 |
try:
|
| 34 |
from docx import Document
|
|
|
|
| 307 |
logger.error(f"Error updating crop preview: {e}")
|
| 308 |
return None
|
| 309 |
|
| 310 |
+
def process_pdf_with_enhanced_indentation(pdf_file, ocr_method, enable_header_footer_removal,
|
| 311 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 312 |
+
apply_to_all_pages, current_page_selection,
|
| 313 |
+
progress=gr.Progress()):
|
| 314 |
+
"""Process PDF with enhanced indentation detection, text classification, and comprehensive formatting"""
|
| 315 |
if pdf_file is None:
|
| 316 |
return "No file uploaded.", "", "", "Error: No file selected"
|
| 317 |
|
| 318 |
try:
|
| 319 |
+
progress(0.1, desc="Initializing enhanced processing with comprehensive indentation detection and intelligent text classification...")
|
| 320 |
|
| 321 |
# Prepare enhanced preprocessing options
|
| 322 |
preprocessing_options = {
|
|
|
|
| 325 |
'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
|
| 326 |
}
|
| 327 |
|
| 328 |
+
progress(0.3, desc="Processing with enhanced indentation detection and text classification...")
|
| 329 |
|
| 330 |
+
# Process the PDF with enhanced preprocessing, indentation detection, and text classification
|
| 331 |
result = backend_manager.process_pdf_with_enhanced_resolution(
|
| 332 |
pdf_file.name, ocr_method, preprocessing_options
|
| 333 |
)
|
| 334 |
|
| 335 |
+
progress(0.9, desc="Finalizing enhanced processing...")
|
| 336 |
progress(1.0, desc="Complete!")
|
| 337 |
|
| 338 |
if result['success']:
|
| 339 |
metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
|
| 340 |
+
status = f"Success: Processed using {result['method_used']} with comprehensive indentation detection and intelligent text classification"
|
| 341 |
|
| 342 |
# Return text, HTML, metadata, and status
|
| 343 |
return (result['text'],
|
|
|
|
| 349 |
return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
|
| 350 |
|
| 351 |
except Exception as e:
|
| 352 |
+
logger.error(f"Enhanced processing error: {e}")
|
| 353 |
return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
|
| 354 |
|
| 355 |
def format_enhanced_metadata(metadata, method_used):
|
| 356 |
+
"""Enhanced metadata formatting with comprehensive indentation processing and text classification info"""
|
| 357 |
if not metadata:
|
| 358 |
return f"Method used: {method_used}"
|
| 359 |
|
|
|
|
| 368 |
if metadata.get('html_processing', False):
|
| 369 |
info_lines.append("HTML generation: Enabled")
|
| 370 |
|
| 371 |
+
if metadata.get('comprehensive_indentation', False):
|
| 372 |
+
info_lines.append("Comprehensive indentation detection: Enabled")
|
| 373 |
+
|
| 374 |
+
if metadata.get('intelligent_text_classification', False):
|
| 375 |
+
info_lines.append("Intelligent text classification: Enabled")
|
| 376 |
+
|
| 377 |
+
if metadata.get('parenthetical_patterns_supported', False):
|
| 378 |
+
info_lines.append("Parenthetical patterns: Supported (Arabic, Thai, Letters, Roman)")
|
| 379 |
+
|
| 380 |
if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
|
| 381 |
info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
|
| 382 |
|
|
|
|
| 386 |
if 'tables' in metadata:
|
| 387 |
info_lines.append(f"Tables detected: {metadata['tables']}")
|
| 388 |
|
| 389 |
+
# Document structure analysis information
|
| 390 |
+
if 'document_structure_analysis' in metadata:
|
| 391 |
+
analysis = metadata['document_structure_analysis']
|
| 392 |
+
if not analysis.get('analysis_failed', False):
|
| 393 |
+
info_lines.append(f"Patterned lines detected: {analysis.get('patterned_lines', 0)}")
|
| 394 |
+
info_lines.append(f"Maximum indentation level: {analysis.get('max_level', 0)}")
|
| 395 |
+
info_lines.append(f"Pattern coverage: {analysis.get('coverage_percentage', 0):.1f}%")
|
| 396 |
+
|
| 397 |
+
# Text classification results
|
| 398 |
+
if 'text_classification' in analysis:
|
| 399 |
+
classification = analysis['text_classification']
|
| 400 |
+
info_lines.append(f"Headers detected: {analysis.get('header_count', 0)}")
|
| 401 |
+
info_lines.append(f"Paragraphs detected: {analysis.get('paragraph_count', 0)}")
|
| 402 |
+
info_lines.append(f"List items detected: {analysis.get('list_item_count', 0)}")
|
| 403 |
+
|
| 404 |
+
if analysis.get('dominant_patterns'):
|
| 405 |
+
dominant = analysis['dominant_patterns'][0][0] if analysis['dominant_patterns'] else 'None'
|
| 406 |
+
info_lines.append(f"Dominant pattern: {dominant}")
|
| 407 |
+
|
| 408 |
if 'processing_time_seconds' in metadata:
|
| 409 |
info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
|
| 410 |
|
|
|
|
| 413 |
def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
|
| 414 |
crop_top, crop_bottom, crop_left, crop_right,
|
| 415 |
apply_to_all_pages, current_page_selection):
|
| 416 |
+
"""Prepare enhanced downloads with comprehensive indentation processing and text classification"""
|
| 417 |
+
text, html, metadata, status = process_pdf_with_enhanced_indentation(
|
| 418 |
pdf_file, method, enable_header_footer_removal,
|
| 419 |
crop_top, crop_bottom, crop_left, crop_right,
|
| 420 |
apply_to_all_pages, current_page_selection
|
|
|
|
| 449 |
gr.update(visible=False))
|
| 450 |
|
| 451 |
def get_enhanced_method_info(method):
|
| 452 |
+
"""Get information about selected OCR method with comprehensive indentation processing and text classification"""
|
| 453 |
method_descriptions = {
|
| 454 |
+
"auto": "**Auto Selection**: Automatically chooses the best available method with comprehensive indentation detection, intelligent text classification, HTML processing, enhanced pattern recognition for hierarchical numbering (including parenthetical patterns like (1), (๑), (a)), bullets, and multi-language support.",
|
| 455 |
+
"azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with comprehensive indentation detection, intelligent text classification, HTML generation, layout preservation, smart table detection, and support for complex document structures including hierarchical numbering and parenthetical patterns.",
|
| 456 |
+
"tesseract": "**Tesseract OCR**: Open-source OCR enhanced with comprehensive indentation detection, intelligent text classification, HTML output, advanced image preprocessing, resolution scaling, and pattern recognition for various numbering styles including parenthetical patterns and bullet points.",
|
| 457 |
+
"pymupdf": "**PyMuPDF**: Fast extraction enhanced with comprehensive indentation detection, intelligent text classification, HTML processing, improved formatting preservation, and pattern recognition for maintaining document structure and hierarchy including parenthetical numbering."
|
| 458 |
}
|
| 459 |
|
| 460 |
return method_descriptions.get(method, "Select a method to see details.")
|
| 461 |
|
| 462 |
def check_enhanced_service_status():
|
| 463 |
+
"""Check and display enhanced service status with indentation detection and text classification capabilities"""
|
| 464 |
available_methods = backend_manager.get_available_methods()
|
| 465 |
|
| 466 |
+
status_lines = ["**Available OCR Methods (Enhanced with Comprehensive Indentation Detection & Text Classification):**"]
|
| 467 |
|
| 468 |
if "azure" in available_methods:
|
| 469 |
+
status_lines.append("✅ Azure Document Intelligence - Ready (HTML + Tables + Comprehensive Indentation + Text Classification)")
|
| 470 |
else:
|
| 471 |
+
status_lines.append("❌ Azure Document Intelligence - Not configured")
|
| 472 |
|
| 473 |
if "tesseract" in available_methods:
|
| 474 |
+
status_lines.append("✅ Tesseract OCR - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
|
| 475 |
else:
|
| 476 |
+
status_lines.append("❌ Tesseract OCR - Not available")
|
| 477 |
|
| 478 |
if "pymupdf" in available_methods:
|
| 479 |
+
status_lines.append("✅ PyMuPDF - Ready (HTML Enhanced + Comprehensive Indentation + Text Classification)")
|
| 480 |
else:
|
| 481 |
+
status_lines.append("❌ PyMuPDF - Not available")
|
| 482 |
|
| 483 |
# Add enhanced features status
|
| 484 |
+
status_lines.append("")
|
| 485 |
+
status_lines.append("**Comprehensive Indentation Detection Features:**")
|
| 486 |
+
status_lines.append("✅ Hierarchical Decimal Numbering (1.1.1.1.1...)")
|
| 487 |
+
status_lines.append("✅ Mixed Hierarchical Numbering (1.2.a.i.A...)")
|
| 488 |
+
status_lines.append("✅ Legal Numbering (1.1.1(a)(i))")
|
| 489 |
+
status_lines.append("✅ Outline Numbering (I.A.1.a.i.)")
|
| 490 |
+
status_lines.append("✅ Section Numbering (§1.2.3, Article 1.1.1)")
|
| 491 |
+
status_lines.append("✅ Parenthetical Arabic Numerals ((1), (2), (3))")
|
| 492 |
+
status_lines.append("✅ Parenthetical Thai Numerals ((๑), (๒), (๓))")
|
| 493 |
+
status_lines.append("✅ Parenthetical Letters ((a), (b), (A), (B))")
|
| 494 |
+
status_lines.append("✅ Parenthetical Roman Numerals ((i), (ii), (I), (II))")
|
| 495 |
+
status_lines.append("✅ Parenthetical Thai Letters ((ก), (ข), (ค))")
|
| 496 |
+
status_lines.append("✅ Thai Script Support (มาตรา, ข้อ, ก.ข.ค.)")
|
| 497 |
+
status_lines.append("✅ Multiple Bullet Styles (•◦▪→ and more)")
|
| 498 |
+
status_lines.append("✅ Checkbox Items ([x], [ ], [✓])")
|
| 499 |
+
status_lines.append("✅ Roman Numerals (I.II.III, i.ii.iii)")
|
| 500 |
+
status_lines.append("✅ Letter Lists (A.B.C, a.b.c)")
|
| 501 |
+
status_lines.append("✅ Space-based Indentation Detection")
|
| 502 |
+
status_lines.append("✅ Priority-based Pattern Matching")
|
| 503 |
+
|
| 504 |
+
status_lines.append("")
|
| 505 |
+
status_lines.append("**Intelligent Text Classification Features:**")
|
| 506 |
+
status_lines.append("✅ Header Detection (title case, all caps, short lines)")
|
| 507 |
+
status_lines.append("✅ Paragraph Classification (long text, proper punctuation)")
|
| 508 |
+
status_lines.append("✅ List Item Recognition (patterned content)")
|
| 509 |
+
status_lines.append("✅ Context-aware Analysis (position, font size)")
|
| 510 |
+
status_lines.append("✅ Confidence Scoring")
|
| 511 |
+
status_lines.append("✅ Document Structure Analysis")
|
| 512 |
+
|
| 513 |
+
status_lines.append("")
|
| 514 |
+
status_lines.append("**Enhanced Processing Features:**")
|
| 515 |
+
status_lines.append("✅ HTML Processing - Available")
|
| 516 |
+
status_lines.append("✅ Enhanced Table Handling - Available")
|
| 517 |
+
status_lines.append("✅ Smart Text Preservation - Available")
|
| 518 |
+
status_lines.append("✅ Multi-Page Crop Preview - Available")
|
| 519 |
+
status_lines.append("✅ Per-Page Crop Customization - Available")
|
| 520 |
+
status_lines.append("✅ Document Structure Analysis - Available")
|
| 521 |
|
| 522 |
if HAS_DOCX_SUPPORT:
|
| 523 |
+
status_lines.append("✅ Enhanced DOCX Export - Available (with indentation formatting)")
|
| 524 |
else:
|
| 525 |
+
status_lines.append("❌ Enhanced DOCX Export - Install python-docx to enable")
|
| 526 |
+
|
| 527 |
+
status_lines.append("✅ HTML File Export - Available")
|
| 528 |
+
status_lines.append("✅ Enhanced Text Export - Available")
|
| 529 |
|
| 530 |
+
# Add pattern detection statistics
|
| 531 |
+
pattern_count = len(indent_detector.patterns)
|
| 532 |
+
status_lines.append(f"✅ Pattern Detection Engine - {pattern_count} patterns supported")
|
| 533 |
|
| 534 |
return "\n".join(status_lines)
|
| 535 |
|
| 536 |
def create_enhanced_interface():
|
| 537 |
+
"""Create enhanced Gradio interface with comprehensive indentation detection and text classification"""
|
| 538 |
|
| 539 |
with gr.Blocks(
|
| 540 |
+
title="PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Text Classification",
|
| 541 |
theme=gr.themes.Soft(),
|
| 542 |
css="""
|
| 543 |
.main-header { text-align: center; margin-bottom: 2rem; }
|
|
|
|
| 552 |
|
| 553 |
gr.HTML("""
|
| 554 |
<div class="main-header">
|
| 555 |
+
<h1>PDF OCR Service - Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</h1>
|
| 556 |
+
<p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, comprehensive indentation pattern recognition including parenthetical patterns like (1), (๑), (a), and intelligent text classification for headers, paragraphs, and list items</p>
|
| 557 |
</div>
|
| 558 |
""")
|
| 559 |
|
| 560 |
# Instructions at the top
|
| 561 |
with gr.Group(elem_classes=["instructions-panel"]):
|
| 562 |
+
gr.HTML("<h3>Instructions & Enhanced Features</h3>")
|
| 563 |
gr.HTML("""
|
| 564 |
<div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
|
| 565 |
<h4>How to Use:</h4>
|
|
|
|
| 567 |
<li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
|
| 568 |
<li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
|
| 569 |
<li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
|
| 570 |
+
<li><strong>Process:</strong> Click the process button to extract text with comprehensive indentation detection and text classification</li>
|
| 571 |
+
<li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format with preserved formatting</li>
|
| 572 |
</ol>
|
| 573 |
|
| 574 |
+
<h4>Comprehensive Indentation Detection & Text Classification Features:</h4>
|
| 575 |
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
|
| 576 |
+
<div>
|
| 577 |
+
<strong>Hierarchical Numbering:</strong>
|
| 578 |
+
<ul>
|
| 579 |
+
<li>Decimal: 1.1.1.1.1...</li>
|
| 580 |
+
<li>Mixed: 1.2.a.i.A...</li>
|
| 581 |
+
<li>Legal: 1.1.1(a)(i)</li>
|
| 582 |
+
<li>Outline: I.A.1.a.i.</li>
|
| 583 |
+
<li>Section: §1.2.3, Article 1.1.1</li>
|
| 584 |
+
</ul>
|
| 585 |
+
</div>
|
| 586 |
+
<div>
|
| 587 |
+
<strong>Parenthetical Patterns:</strong>
|
| 588 |
+
<ul>
|
| 589 |
+
<li>Arabic: (1), (2), (3)</li>
|
| 590 |
+
<li>Thai Numerals: (๑), (๒), (๓)</li>
|
| 591 |
+
<li>Letters: (a), (b), (A), (B)</li>
|
| 592 |
+
<li>Roman: (i), (ii), (I), (II)</li>
|
| 593 |
+
<li>Thai Letters: (ก), (ข), (ค)</li>
|
| 594 |
+
</ul>
|
| 595 |
+
</div>
|
| 596 |
+
</div>
|
| 597 |
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin-top: 0.5rem;">
|
| 598 |
+
<div>
|
| 599 |
+
<strong>Multi-Language & Symbols:</strong>
|
| 600 |
+
<ul>
|
| 601 |
+
<li>Thai Script: มาตรา, ข้อ, ก.ข.ค.</li>
|
| 602 |
+
<li>Bullets: •◦▪→ and 20+ more</li>
|
| 603 |
+
<li>Roman: I.II.III, i.ii.iii</li>
|
| 604 |
+
<li>Letters: A.B.C, a.b.c</li>
|
| 605 |
+
<li>Checkboxes: [x], [ ], [✓]</li>
|
| 606 |
+
</ul>
|
| 607 |
+
</div>
|
| 608 |
+
<div>
|
| 609 |
+
<strong>Intelligent Text Classification:</strong>
|
| 610 |
+
<ul>
|
| 611 |
+
<li>Header Detection: Title case, all caps, short lines</li>
|
| 612 |
+
<li>Paragraph Recognition: Long text, proper punctuation</li>
|
| 613 |
+
<li>List Item Identification: Patterned content</li>
|
| 614 |
+
<li>Context Analysis: Position, font size, formatting</li>
|
| 615 |
+
<li>Confidence Scoring: Reliability assessment</li>
|
| 616 |
+
</ul>
|
| 617 |
+
</div>
|
| 618 |
+
</div>
|
| 619 |
+
|
| 620 |
+
<h4>Technical Enhancements:</h4>
|
| 621 |
<ul>
|
| 622 |
<li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
|
| 623 |
<li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
|
| 624 |
+
<li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads with preserved indentation</li>
|
| 625 |
<li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
|
| 626 |
<li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
|
| 627 |
+
<li><strong>Document Analysis:</strong> Automatic structure detection and statistics</li>
|
| 628 |
+
<li><strong>Priority Pattern Matching:</strong> Intelligent pattern detection with priority ranking</li>
|
| 629 |
+
<li><strong>Text Classification:</strong> Automated header, paragraph, and list item detection</li>
|
| 630 |
</ul>
|
| 631 |
</div>
|
| 632 |
""")
|
|
|
|
| 658 |
choices=["auto", "azure", "tesseract", "pymupdf"],
|
| 659 |
value="auto",
|
| 660 |
label="OCR Method",
|
| 661 |
+
info="Choose OCR method (all enhanced with comprehensive indentation detection and text classification)"
|
| 662 |
)
|
| 663 |
|
| 664 |
# Method information display
|
|
|
|
| 743 |
|
| 744 |
# Process button
|
| 745 |
process_btn = gr.Button(
|
| 746 |
+
"Process PDF with Comprehensive Indentation Detection & Text Classification",
|
| 747 |
variant="primary",
|
| 748 |
size="lg"
|
| 749 |
)
|
|
|
|
| 781 |
|
| 782 |
# Extracted text output
|
| 783 |
text_output = gr.Textbox(
|
| 784 |
+
label="Extracted Text (Enhanced with Comprehensive Indentation Detection & Text Classification)",
|
| 785 |
+
placeholder="Processed text with comprehensive indentation detection, intelligent text classification, HTML enhancement, and preserved formatting will appear here...",
|
| 786 |
lines=20,
|
| 787 |
max_lines=30,
|
| 788 |
interactive=False,
|
|
|
|
| 791 |
|
| 792 |
# Metadata information
|
| 793 |
metadata_output = gr.Textbox(
|
| 794 |
+
label="Processing Information & Document Analysis",
|
| 795 |
interactive=False,
|
| 796 |
+
lines=8
|
| 797 |
)
|
| 798 |
|
| 799 |
# Enhanced download buttons
|
|
|
|
| 804 |
variant="secondary"
|
| 805 |
)
|
| 806 |
download_docx_btn = gr.DownloadButton(
|
| 807 |
+
"Download Enhanced DOCX (with Indentation & Classification)",
|
| 808 |
visible=False,
|
| 809 |
variant="secondary"
|
| 810 |
)
|
|
|
|
| 816 |
|
| 817 |
# Service Status at the bottom
|
| 818 |
with gr.Group(elem_classes=["status-box"]):
|
| 819 |
+
gr.HTML("<h4>Service Status & Capabilities</h4>")
|
| 820 |
service_status = gr.Markdown(
|
| 821 |
value=check_enhanced_service_status()
|
| 822 |
)
|
|
|
|
| 908 |
return interface
|
| 909 |
|
| 910 |
def launch_enhanced_ui():
|
| 911 |
+
"""Launch the enhanced Gradio interface with comprehensive indentation detection and text classification"""
|
| 912 |
try:
|
| 913 |
interface = create_enhanced_interface()
|
| 914 |
interface.launch(
|
backend.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Backend Management Module -
|
| 3 |
Coordinates between UI and OCR services, handles file management and preprocessing
|
| 4 |
"""
|
| 5 |
import re
|
|
@@ -14,24 +14,34 @@ from datetime import datetime
|
|
| 14 |
import cv2
|
| 15 |
import numpy as np
|
| 16 |
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Load environment variables
|
| 19 |
from dotenv import load_dotenv
|
| 20 |
load_dotenv()
|
| 21 |
|
| 22 |
from ocr_service import OCRService
|
|
|
|
| 23 |
|
| 24 |
# Configure logging
|
| 25 |
logging.basicConfig(level=logging.INFO)
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
|
| 29 |
-
class
|
| 30 |
-
"""Advanced document export with HTML
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
@staticmethod
|
| 33 |
def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
|
| 34 |
-
"""Create enhanced TXT file with improved formatting"""
|
| 35 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 36 |
temp_file = tempfile.NamedTemporaryFile(
|
| 37 |
suffix=f'_extracted_text_{timestamp}.txt',
|
|
@@ -42,8 +52,8 @@ class DocumentExporter:
|
|
| 42 |
|
| 43 |
try:
|
| 44 |
# Add header
|
| 45 |
-
temp_file.write("PDF OCR Extraction Results - Enhanced with
|
| 46 |
-
temp_file.write("=" *
|
| 47 |
|
| 48 |
# Add metadata
|
| 49 |
if metadata_info:
|
|
@@ -53,11 +63,22 @@ class DocumentExporter:
|
|
| 53 |
|
| 54 |
# Add timestamp
|
| 55 |
temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 56 |
-
temp_file.write("=" *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
# Add main content
|
| 59 |
-
temp_file.write("Extracted Text (
|
| 60 |
-
temp_file.write("-" *
|
| 61 |
temp_file.write(text_content)
|
| 62 |
|
| 63 |
temp_file.close()
|
|
@@ -68,67 +89,57 @@ class DocumentExporter:
|
|
| 68 |
temp_file.close()
|
| 69 |
raise
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
"""Create enhanced DOCX file from HTML content with proper spacing and indentation"""
|
| 74 |
try:
|
| 75 |
-
from docx import Document
|
| 76 |
-
from docx.shared import Inches, Pt, RGBColor
|
| 77 |
-
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
| 78 |
-
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 79 |
-
from docx.oxml.shared import OxmlElement, qn
|
| 80 |
-
from html.parser import HTMLParser
|
| 81 |
-
|
| 82 |
-
# Enhanced HTML to DOCX parser with spacing preservation
|
| 83 |
class EnhancedDOCXHTMLParser(HTMLParser):
|
| 84 |
-
def __init__(self, doc):
|
| 85 |
super().__init__()
|
| 86 |
self.doc = doc
|
|
|
|
| 87 |
self.current_paragraph = None
|
| 88 |
-
self.current_run = None
|
| 89 |
self.in_table = False
|
| 90 |
-
self.current_table = None
|
| 91 |
-
self.current_row = None
|
| 92 |
-
self.current_cell = None
|
| 93 |
self.table_data = []
|
| 94 |
self.current_table_row = []
|
| 95 |
-
self.
|
| 96 |
-
self.
|
| 97 |
-
self.
|
| 98 |
-
self.
|
| 99 |
-
self.
|
|
|
|
|
|
|
| 100 |
|
| 101 |
def handle_starttag(self, tag, attrs):
|
| 102 |
attr_dict = dict(attrs)
|
| 103 |
class_attr = attr_dict.get('class', '')
|
| 104 |
-
|
| 105 |
|
| 106 |
-
if tag == 'div'
|
| 107 |
-
# Add minimal page separation (just paragraph spacing, no page break)
|
| 108 |
if hasattr(self, 'has_content'):
|
| 109 |
-
# Add just 2 line breaks worth of spacing
|
| 110 |
self.doc.add_paragraph()
|
| 111 |
self.doc.add_paragraph()
|
| 112 |
self.has_content = True
|
| 113 |
|
| 114 |
-
elif
|
| 115 |
self.current_paragraph = self.doc.add_heading(level=1)
|
| 116 |
self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
|
|
| 117 |
|
| 118 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
self.current_paragraph = self.doc.add_heading(level=1)
|
| 120 |
-
self.
|
| 121 |
-
self._apply_spacing_from_style(style_attr)
|
| 122 |
|
| 123 |
-
elif
|
| 124 |
self.current_paragraph = self.doc.add_heading(level=2)
|
| 125 |
-
self.
|
| 126 |
-
self._apply_spacing_from_style(style_attr)
|
| 127 |
|
| 128 |
elif tag == 'div' and 'paragraph' in class_attr:
|
| 129 |
self.current_paragraph = self.doc.add_paragraph()
|
| 130 |
-
self.
|
| 131 |
-
self._apply_spacing_from_style(style_attr)
|
| 132 |
|
| 133 |
elif tag == 'table':
|
| 134 |
self.in_table = True
|
|
@@ -137,47 +148,81 @@ class DocumentExporter:
|
|
| 137 |
elif tag == 'tr':
|
| 138 |
self.current_table_row = []
|
| 139 |
|
| 140 |
-
elif tag == 'th' or tag == 'td':
|
| 141 |
-
pass # Will be handled in handle_data
|
| 142 |
-
|
| 143 |
elif tag == 'br':
|
| 144 |
if self.current_paragraph:
|
| 145 |
self.current_paragraph.add_run().add_break()
|
| 146 |
|
| 147 |
-
def
|
| 148 |
-
"""Apply
|
| 149 |
if not self.current_paragraph:
|
| 150 |
return
|
| 151 |
|
| 152 |
-
# Extract
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
|
| 160 |
-
|
| 161 |
-
# For bullet points, add hanging indent
|
| 162 |
-
if self.is_bullet_point:
|
| 163 |
-
self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
|
| 164 |
|
| 165 |
-
#
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
self.current_paragraph.paragraph_format.line_spacing = 1.15
|
| 168 |
|
| 169 |
-
#
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
def handle_endtag(self, tag):
|
| 173 |
-
if tag == 'div'
|
| 174 |
-
self.
|
| 175 |
-
|
| 176 |
-
self.
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
| 180 |
self.current_paragraph = None
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
elif tag == 'table':
|
| 183 |
self.in_table = False
|
|
@@ -189,28 +234,123 @@ class DocumentExporter:
|
|
| 189 |
|
| 190 |
def handle_data(self, data):
|
| 191 |
if data.strip():
|
| 192 |
-
# Convert back to regular spaces
|
| 193 |
data = data.replace(' ', ' ')
|
| 194 |
|
| 195 |
if self.in_table:
|
| 196 |
self.current_table_row.append(data.strip())
|
| 197 |
elif self.current_paragraph is not None:
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
run.bold = True
|
| 201 |
run.font.size = Pt(16)
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
run.bold = True
|
| 204 |
run.font.size = Pt(14)
|
|
|
|
| 205 |
else:
|
| 206 |
-
#
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
def _create_enhanced_docx_table(self):
|
|
|
|
| 210 |
if not self.table_data:
|
| 211 |
return
|
| 212 |
|
| 213 |
-
# Create table with proper formatting
|
| 214 |
rows = len(self.table_data)
|
| 215 |
cols = max(len(row) for row in self.table_data) if self.table_data else 1
|
| 216 |
|
|
@@ -218,10 +358,7 @@ class DocumentExporter:
|
|
| 218 |
table.style = 'Table Grid'
|
| 219 |
table.alignment = WD_TABLE_ALIGNMENT.LEFT
|
| 220 |
|
| 221 |
-
#
|
| 222 |
-
table.autofit = False
|
| 223 |
-
|
| 224 |
-
# Fill table data with proper formatting
|
| 225 |
for row_idx, row_data in enumerate(self.table_data):
|
| 226 |
table_row = table.rows[row_idx]
|
| 227 |
for col_idx, cell_data in enumerate(row_data):
|
|
@@ -235,15 +372,19 @@ class DocumentExporter:
|
|
| 235 |
for run in paragraph.runs:
|
| 236 |
run.bold = True
|
| 237 |
run.font.size = Pt(10)
|
|
|
|
| 238 |
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
else:
|
| 240 |
# Regular data cells
|
| 241 |
for paragraph in cell.paragraphs:
|
| 242 |
for run in paragraph.runs:
|
| 243 |
run.font.size = Pt(10)
|
| 244 |
-
|
| 245 |
-
# Set cell margins for better spacing
|
| 246 |
-
cell.vertical_alignment = WD_ALIGN_PARAGRAPH.LEFT
|
| 247 |
|
| 248 |
# Add spacing after table
|
| 249 |
self.doc.add_paragraph()
|
|
@@ -251,14 +392,14 @@ class DocumentExporter:
|
|
| 251 |
# Create DOCX document
|
| 252 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 253 |
temp_file = tempfile.NamedTemporaryFile(
|
| 254 |
-
suffix=f'
|
| 255 |
delete=False
|
| 256 |
)
|
| 257 |
temp_file.close()
|
| 258 |
|
| 259 |
doc = Document()
|
| 260 |
|
| 261 |
-
# Set document margins for better
|
| 262 |
sections = doc.sections
|
| 263 |
for section in sections:
|
| 264 |
section.top_margin = Inches(1)
|
|
@@ -266,84 +407,65 @@ class DocumentExporter:
|
|
| 266 |
section.left_margin = Inches(1)
|
| 267 |
section.right_margin = Inches(1)
|
| 268 |
|
| 269 |
-
#
|
| 270 |
title = doc.add_heading('PDF OCR Extraction Results', 0)
|
| 271 |
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
|
|
|
|
|
| 272 |
|
| 273 |
-
# Add subtitle
|
| 274 |
subtitle_para = doc.add_paragraph()
|
| 275 |
-
subtitle_run = subtitle_para.add_run('Enhanced with
|
| 276 |
subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 277 |
subtitle_run.italic = True
|
| 278 |
subtitle_run.font.size = Pt(12)
|
| 279 |
subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
|
| 280 |
|
| 281 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
if metadata_info:
|
| 283 |
doc.add_heading('Processing Information', level=1)
|
| 284 |
meta_para = doc.add_paragraph()
|
| 285 |
meta_run = meta_para.add_run(metadata_info)
|
| 286 |
meta_run.font.size = Pt(10)
|
| 287 |
meta_para.style = 'Intense Quote'
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
# Process
|
| 291 |
doc.add_heading('Extracted Content', level=1)
|
| 292 |
|
| 293 |
-
if html_content and '<
|
| 294 |
-
# Parse HTML
|
| 295 |
-
parser = EnhancedDOCXHTMLParser(doc)
|
| 296 |
parser.feed(html_content)
|
| 297 |
else:
|
| 298 |
-
# Fallback to text
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
if para.strip().startswith('==='):
|
| 303 |
-
# Page headers with minimal separation
|
| 304 |
-
page_header = doc.add_heading(para.strip(), level=1)
|
| 305 |
-
page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 306 |
-
elif para.strip().startswith('#'):
|
| 307 |
-
# Titles
|
| 308 |
-
title_text = para.strip().lstrip('#').strip()
|
| 309 |
-
title_para = doc.add_heading(title_text, level=1)
|
| 310 |
-
elif para.strip().startswith('##'):
|
| 311 |
-
# Section headings
|
| 312 |
-
heading_text = para.strip().lstrip('#').strip()
|
| 313 |
-
heading_para = doc.add_heading(heading_text, level=2)
|
| 314 |
-
else:
|
| 315 |
-
# Regular paragraphs with spacing preservation
|
| 316 |
-
lines = para.split('\n')
|
| 317 |
-
for line in lines:
|
| 318 |
-
if line.strip():
|
| 319 |
-
para_element = doc.add_paragraph()
|
| 320 |
-
|
| 321 |
-
# Calculate indentation from leading spaces
|
| 322 |
-
leading_spaces = len(line) - len(line.lstrip())
|
| 323 |
-
if leading_spaces > 0:
|
| 324 |
-
indent_level = leading_spaces // 2 # 2 spaces = 1 indent level
|
| 325 |
-
para_element.paragraph_format.left_indent = Inches(0.5 * indent_level)
|
| 326 |
-
|
| 327 |
-
# Add the text content
|
| 328 |
-
run = para_element.add_run(line.strip())
|
| 329 |
-
run.font.size = Pt(11)
|
| 330 |
-
|
| 331 |
-
# Set line spacing
|
| 332 |
-
para_element.paragraph_format.line_spacing = 1.15
|
| 333 |
-
para_element.paragraph_format.space_after = Pt(3)
|
| 334 |
-
|
| 335 |
-
# Enhanced footer
|
| 336 |
footer_section = doc.sections[0]
|
| 337 |
footer = footer_section.footer
|
| 338 |
footer_para = footer.paragraphs[0]
|
| 339 |
-
footer_para.text = f"Generated by Enhanced PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
| 340 |
footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 341 |
footer_run = footer_para.runs[0]
|
| 342 |
-
footer_run.font.size = Pt(
|
| 343 |
footer_run.font.color.rgb = RGBColor(128, 128, 128)
|
| 344 |
|
| 345 |
doc.save(temp_file.name)
|
| 346 |
-
logger.info(f"Enhanced DOCX file with
|
| 347 |
return temp_file.name
|
| 348 |
|
| 349 |
except ImportError:
|
|
@@ -356,9 +478,99 @@ class DocumentExporter:
|
|
| 356 |
pass
|
| 357 |
raise
|
| 358 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
@staticmethod
|
| 360 |
def create_html_file(html_content: str, metadata_info: str = "") -> str:
|
| 361 |
-
"""Create standalone HTML file"""
|
| 362 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 363 |
temp_file = tempfile.NamedTemporaryFile(
|
| 364 |
suffix=f'_extracted_document_{timestamp}.html',
|
|
@@ -368,26 +580,80 @@ class DocumentExporter:
|
|
| 368 |
)
|
| 369 |
|
| 370 |
try:
|
| 371 |
-
#
|
| 372 |
-
enhanced_html = html_content
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
-
# Wrap content in container
|
| 383 |
-
if '<body>' in enhanced_html:
|
| 384 |
enhanced_html = enhanced_html.replace(
|
| 385 |
'<body>',
|
| 386 |
'''<body>
|
| 387 |
<div class="container">
|
| 388 |
<div class="header">
|
| 389 |
<h1>PDF OCR Extraction Results</h1>
|
| 390 |
-
<p>Enhanced with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
</div>''' +
|
| 392 |
(f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
|
| 393 |
)
|
|
@@ -404,23 +670,24 @@ class DocumentExporter:
|
|
| 404 |
|
| 405 |
|
| 406 |
class BackendManager:
|
| 407 |
-
"""Enhanced backend manager with
|
| 408 |
|
| 409 |
def __init__(self):
|
| 410 |
self.ocr_service = OCRService()
|
|
|
|
| 411 |
self.processing_history = []
|
| 412 |
self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))
|
| 413 |
|
| 414 |
# Create directories for temporary files and logs
|
| 415 |
-
self.temp_dir = Path(tempfile.gettempdir()) / '
|
| 416 |
self.temp_dir.mkdir(exist_ok=True)
|
| 417 |
|
| 418 |
-
logger.info("Enhanced backend manager with
|
| 419 |
|
| 420 |
def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
|
| 421 |
preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 422 |
"""
|
| 423 |
-
Process PDF with enhanced resolution and
|
| 424 |
|
| 425 |
Args:
|
| 426 |
pdf_path: Path to the PDF file
|
|
@@ -428,7 +695,7 @@ class BackendManager:
|
|
| 428 |
preprocessing_options: Dictionary containing preprocessing settings
|
| 429 |
|
| 430 |
Returns:
|
| 431 |
-
Dict containing processing results with HTML content
|
| 432 |
"""
|
| 433 |
start_time = datetime.now()
|
| 434 |
|
|
@@ -460,7 +727,7 @@ class BackendManager:
|
|
| 460 |
# Generate file hash for tracking
|
| 461 |
file_hash = self._calculate_file_hash(pdf_path)
|
| 462 |
|
| 463 |
-
logger.info(f"Processing PDF with enhanced
|
| 464 |
logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
|
| 465 |
|
| 466 |
# Handle preprocessing if enabled
|
|
@@ -478,12 +745,23 @@ class BackendManager:
|
|
| 478 |
processed_pdf_path = pdf_path
|
| 479 |
|
| 480 |
try:
|
| 481 |
-
# Process with enhanced OCR
|
| 482 |
result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
|
| 483 |
|
| 484 |
# Add processing metadata
|
| 485 |
processing_time = (datetime.now() - start_time).total_seconds()
|
| 486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
result['metadata'].update({
|
| 488 |
'file_hash': file_hash,
|
| 489 |
'file_size_mb': round(file_size / (1024*1024), 2),
|
|
@@ -491,8 +769,12 @@ class BackendManager:
|
|
| 491 |
'timestamp': start_time.isoformat(),
|
| 492 |
'enhanced_processing': True,
|
| 493 |
'html_processing': True,
|
|
|
|
|
|
|
|
|
|
| 494 |
'header_footer_removed': preprocessing_applied,
|
| 495 |
-
'preprocessing_options': preprocessing_options if preprocessing_applied else None
|
|
|
|
| 496 |
})
|
| 497 |
|
| 498 |
# Cleanup temporary preprocessed file
|
|
@@ -502,7 +784,7 @@ class BackendManager:
|
|
| 502 |
except:
|
| 503 |
pass
|
| 504 |
|
| 505 |
-
# Log results
|
| 506 |
if result['success']:
|
| 507 |
text_length = len(result['text'])
|
| 508 |
has_html = bool(result.get('html'))
|
|
@@ -512,10 +794,17 @@ class BackendManager:
|
|
| 512 |
logger.info(f"Method used: {result['method_used']}")
|
| 513 |
logger.info(f"Text extracted: {text_length} characters")
|
| 514 |
logger.info(f"HTML generated: {has_html}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
if table_count > 0:
|
| 516 |
logger.info(f"Tables detected: {table_count}")
|
| 517 |
if preprocessing_applied:
|
| 518 |
logger.info("Enhanced preprocessing applied")
|
|
|
|
|
|
|
|
|
|
| 519 |
|
| 520 |
# Add to processing history
|
| 521 |
self._add_to_history({
|
|
@@ -528,7 +817,11 @@ class BackendManager:
|
|
| 528 |
'processing_time': processing_time,
|
| 529 |
'preprocessing_applied': preprocessing_applied,
|
| 530 |
'html_generated': has_html,
|
| 531 |
-
'enhanced_processing': True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
})
|
| 533 |
else:
|
| 534 |
logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
|
|
@@ -542,7 +835,10 @@ class BackendManager:
|
|
| 542 |
'error': result.get('error', 'Unknown error'),
|
| 543 |
'processing_time': processing_time,
|
| 544 |
'preprocessing_applied': preprocessing_applied,
|
| 545 |
-
'enhanced_processing': True
|
|
|
|
|
|
|
|
|
|
| 546 |
})
|
| 547 |
|
| 548 |
return result
|
|
@@ -566,7 +862,10 @@ class BackendManager:
|
|
| 566 |
'success': False,
|
| 567 |
'error': str(e),
|
| 568 |
'processing_time': processing_time,
|
| 569 |
-
'enhanced_processing': True
|
|
|
|
|
|
|
|
|
|
| 570 |
})
|
| 571 |
|
| 572 |
return {
|
|
@@ -579,12 +878,15 @@ class BackendManager:
|
|
| 579 |
'file_hash': file_hash,
|
| 580 |
'processing_time_seconds': round(processing_time, 2),
|
| 581 |
'timestamp': start_time.isoformat(),
|
| 582 |
-
'enhanced_processing': True
|
|
|
|
|
|
|
|
|
|
| 583 |
}
|
| 584 |
}
|
| 585 |
|
| 586 |
def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
|
| 587 |
-
"""Apply enhanced preprocessing with high-resolution crop handling
|
| 588 |
crop_settings = options.get('crop_settings', {})
|
| 589 |
per_page_crops = crop_settings.get('per_page_crops', {})
|
| 590 |
enhanced_resolution = crop_settings.get('enhanced_resolution', True)
|
|
@@ -602,7 +904,7 @@ class BackendManager:
|
|
| 602 |
page = doc.load_page(page_num)
|
| 603 |
page_rect = page.rect
|
| 604 |
|
| 605 |
-
# Get crop settings for this page
|
| 606 |
page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
|
| 607 |
'top': 0, 'bottom': 0, 'left': 0, 'right': 0
|
| 608 |
}))
|
|
@@ -636,7 +938,6 @@ class BackendManager:
|
|
| 636 |
|
| 637 |
# Create new page with enhanced resolution if enabled
|
| 638 |
if enhanced_resolution:
|
| 639 |
-
# Use high resolution for better quality
|
| 640 |
new_page = new_doc.new_page(
|
| 641 |
width=new_rect.width,
|
| 642 |
height=new_rect.height
|
|
@@ -676,36 +977,36 @@ class BackendManager:
|
|
| 676 |
|
| 677 |
def create_enhanced_downloads(self, text_content: str, html_content: str,
|
| 678 |
metadata_info: str = "") -> Dict[str, str]:
|
| 679 |
-
"""Create enhanced download files with
|
| 680 |
download_files = {}
|
| 681 |
|
| 682 |
try:
|
| 683 |
# Create enhanced TXT file
|
| 684 |
-
txt_path =
|
| 685 |
text_content, html_content, metadata_info
|
| 686 |
)
|
| 687 |
download_files['txt'] = txt_path
|
| 688 |
logger.info(f"Enhanced TXT file created: {txt_path}")
|
| 689 |
|
| 690 |
-
# Create enhanced DOCX file
|
| 691 |
try:
|
| 692 |
-
docx_path =
|
| 693 |
text_content, html_content, metadata_info
|
| 694 |
)
|
| 695 |
download_files['docx'] = docx_path
|
| 696 |
-
logger.info(f"Enhanced DOCX file created: {docx_path}")
|
| 697 |
except ImportError:
|
| 698 |
logger.warning("python-docx not available. DOCX creation skipped.")
|
| 699 |
except Exception as e:
|
| 700 |
-
logger.error(f"DOCX creation failed: {e}")
|
| 701 |
|
| 702 |
# Create standalone HTML file
|
| 703 |
try:
|
| 704 |
-
html_path =
|
| 705 |
html_content, metadata_info
|
| 706 |
)
|
| 707 |
download_files['html'] = html_path
|
| 708 |
-
logger.info(f"HTML file created: {html_path}")
|
| 709 |
except Exception as e:
|
| 710 |
logger.error(f"HTML file creation failed: {e}")
|
| 711 |
|
|
@@ -744,10 +1045,18 @@ class BackendManager:
|
|
| 744 |
'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
|
| 745 |
'enhanced_processing': True,
|
| 746 |
'html_processing': True,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
'docx_export_available': docx_available,
|
| 748 |
'enhanced_crop_processing': True,
|
| 749 |
'multi_resolution_support': True,
|
| 750 |
-
'crop_processing_fixed': True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
}
|
| 752 |
|
| 753 |
return status
|
|
@@ -795,7 +1104,7 @@ class BackendManager:
|
|
| 795 |
logger.error(f"Error during cleanup: {e}")
|
| 796 |
|
| 797 |
def get_enhanced_statistics(self) -> Dict[str, Any]:
|
| 798 |
-
"""Get enhanced processing statistics"""
|
| 799 |
if not self.processing_history:
|
| 800 |
return {
|
| 801 |
'total_processed': 0,
|
|
@@ -806,7 +1115,11 @@ class BackendManager:
|
|
| 806 |
'total_tables_processed': 0,
|
| 807 |
'preprocessing_usage': 0,
|
| 808 |
'html_generation_rate': 0,
|
| 809 |
-
'enhanced_processing_usage': 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
}
|
| 811 |
|
| 812 |
total_processed = len(self.processing_history)
|
|
@@ -826,9 +1139,20 @@ class BackendManager:
|
|
| 826 |
preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
|
| 827 |
html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
|
| 828 |
enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 829 |
|
| 830 |
html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
|
| 831 |
enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
|
|
|
|
|
|
|
|
|
|
| 832 |
|
| 833 |
return {
|
| 834 |
'total_processed': total_processed,
|
|
@@ -842,7 +1166,14 @@ class BackendManager:
|
|
| 842 |
'preprocessing_usage': preprocessing_usage,
|
| 843 |
'html_generation_rate': round(html_generation_rate, 2),
|
| 844 |
'enhanced_processing_usage': enhanced_processing,
|
| 845 |
-
'enhanced_processing_rate': round(enhanced_processing_rate, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 846 |
}
|
| 847 |
|
| 848 |
|
|
@@ -861,8 +1192,29 @@ if __name__ == "__main__":
|
|
| 861 |
# Test the enhanced backend manager
|
| 862 |
manager = BackendManager()
|
| 863 |
|
| 864 |
-
print("Enhanced Backend Manager with
|
| 865 |
-
print("=" *
|
| 866 |
print(f"Available methods: {manager.get_available_methods()}")
|
| 867 |
print(f"Service status: {manager.get_service_status()}")
|
| 868 |
-
print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Backend Management Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
|
| 3 |
Coordinates between UI and OCR services, handles file management and preprocessing
|
| 4 |
"""
|
| 5 |
import re
|
|
|
|
| 14 |
import cv2
|
| 15 |
import numpy as np
|
| 16 |
import fitz # PyMuPDF
|
| 17 |
+
from docx import Document
|
| 18 |
+
from docx.shared import Inches, Pt, RGBColor
|
| 19 |
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
| 20 |
+
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 21 |
+
from docx.oxml.shared import OxmlElement, qn
|
| 22 |
+
from html.parser import HTMLParser
|
| 23 |
|
| 24 |
# Load environment variables
|
| 25 |
from dotenv import load_dotenv
|
| 26 |
load_dotenv()
|
| 27 |
|
| 28 |
from ocr_service import OCRService
|
| 29 |
+
from enhanced_indentation import EnhancedIndentationDetector
|
| 30 |
|
| 31 |
# Configure logging
|
| 32 |
logging.basicConfig(level=logging.INFO)
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
|
| 36 |
+
class EnhancedDocumentExporter:
|
| 37 |
+
"""Advanced document export with comprehensive indentation support, parenthetical patterns, and text classification for HTML and DOCX"""
|
| 38 |
+
|
| 39 |
+
def __init__(self):
|
| 40 |
+
self.indent_detector = EnhancedIndentationDetector()
|
| 41 |
|
| 42 |
@staticmethod
|
| 43 |
def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
|
| 44 |
+
"""Create enhanced TXT file with improved formatting and indentation preservation"""
|
| 45 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 46 |
temp_file = tempfile.NamedTemporaryFile(
|
| 47 |
suffix=f'_extracted_text_{timestamp}.txt',
|
|
|
|
| 52 |
|
| 53 |
try:
|
| 54 |
# Add header
|
| 55 |
+
temp_file.write("PDF OCR Extraction Results - Enhanced with Comprehensive Indentation Detection & Text Classification\n")
|
| 56 |
+
temp_file.write("=" * 90 + "\n\n")
|
| 57 |
|
| 58 |
# Add metadata
|
| 59 |
if metadata_info:
|
|
|
|
| 63 |
|
| 64 |
# Add timestamp
|
| 65 |
temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 66 |
+
temp_file.write("=" * 90 + "\n\n")
|
| 67 |
+
|
| 68 |
+
# Add feature list
|
| 69 |
+
temp_file.write("Enhanced Features Applied:\n")
|
| 70 |
+
temp_file.write("-" * 25 + "\n")
|
| 71 |
+
temp_file.write("• Comprehensive Indentation Detection (20+ patterns)\n")
|
| 72 |
+
temp_file.write("• Parenthetical Patterns ((1), (๑), (a), (i), (ก))\n")
|
| 73 |
+
temp_file.write("• Intelligent Text Classification (headers, paragraphs, lists)\n")
|
| 74 |
+
temp_file.write("• Multi-language Support (English, Thai)\n")
|
| 75 |
+
temp_file.write("• HTML Intermediate Processing\n")
|
| 76 |
+
temp_file.write("• Priority-based Pattern Matching\n")
|
| 77 |
+
temp_file.write("• Document Structure Analysis\n\n")
|
| 78 |
|
| 79 |
# Add main content
|
| 80 |
+
temp_file.write("Extracted Text (Enhanced with Comprehensive Pattern Detection):\n")
|
| 81 |
+
temp_file.write("-" * 60 + "\n\n")
|
| 82 |
temp_file.write(text_content)
|
| 83 |
|
| 84 |
temp_file.close()
|
|
|
|
| 89 |
temp_file.close()
|
| 90 |
raise
|
| 91 |
|
| 92 |
+
def create_enhanced_docx_file(self, text_content: str, html_content: str, metadata_info: str = "") -> str:
|
| 93 |
+
"""Create enhanced DOCX file with comprehensive indentation support, parenthetical patterns, and text classification"""
|
|
|
|
| 94 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
class EnhancedDOCXHTMLParser(HTMLParser):
|
| 96 |
+
def __init__(self, doc, processor):
|
| 97 |
super().__init__()
|
| 98 |
self.doc = doc
|
| 99 |
+
self.processor = processor
|
| 100 |
self.current_paragraph = None
|
|
|
|
| 101 |
self.in_table = False
|
|
|
|
|
|
|
|
|
|
| 102 |
self.table_data = []
|
| 103 |
self.current_table_row = []
|
| 104 |
+
self.current_indent_level = 0
|
| 105 |
+
self.current_formatting_hint = 'normal_text'
|
| 106 |
+
self.in_title = False
|
| 107 |
+
self.in_section_heading = False
|
| 108 |
+
self.in_page_header = False
|
| 109 |
+
self.in_content_header = False
|
| 110 |
+
self.current_classes = []
|
| 111 |
|
| 112 |
def handle_starttag(self, tag, attrs):
|
| 113 |
attr_dict = dict(attrs)
|
| 114 |
class_attr = attr_dict.get('class', '')
|
| 115 |
+
self.current_classes = class_attr.split()
|
| 116 |
|
| 117 |
+
if 'page' in class_attr and tag == 'div':
|
|
|
|
| 118 |
if hasattr(self, 'has_content'):
|
|
|
|
| 119 |
self.doc.add_paragraph()
|
| 120 |
self.doc.add_paragraph()
|
| 121 |
self.has_content = True
|
| 122 |
|
| 123 |
+
elif 'page-header' in class_attr:
|
| 124 |
self.current_paragraph = self.doc.add_heading(level=1)
|
| 125 |
self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 126 |
+
self.in_page_header = True
|
| 127 |
|
| 128 |
+
elif 'content-header' in class_attr:
|
| 129 |
+
self.current_paragraph = self.doc.add_heading(level=2)
|
| 130 |
+
self.in_content_header = True
|
| 131 |
+
|
| 132 |
+
elif 'title' in class_attr:
|
| 133 |
self.current_paragraph = self.doc.add_heading(level=1)
|
| 134 |
+
self.in_title = True
|
|
|
|
| 135 |
|
| 136 |
+
elif 'section-heading' in class_attr:
|
| 137 |
self.current_paragraph = self.doc.add_heading(level=2)
|
| 138 |
+
self.in_section_heading = True
|
|
|
|
| 139 |
|
| 140 |
elif tag == 'div' and 'paragraph' in class_attr:
|
| 141 |
self.current_paragraph = self.doc.add_paragraph()
|
| 142 |
+
self._apply_enhanced_formatting()
|
|
|
|
| 143 |
|
| 144 |
elif tag == 'table':
|
| 145 |
self.in_table = True
|
|
|
|
| 148 |
elif tag == 'tr':
|
| 149 |
self.current_table_row = []
|
| 150 |
|
|
|
|
|
|
|
|
|
|
| 151 |
elif tag == 'br':
|
| 152 |
if self.current_paragraph:
|
| 153 |
self.current_paragraph.add_run().add_break()
|
| 154 |
|
| 155 |
+
def _apply_enhanced_formatting(self):
|
| 156 |
+
"""Apply enhanced formatting based on CSS classes and indentation detection"""
|
| 157 |
if not self.current_paragraph:
|
| 158 |
return
|
| 159 |
|
| 160 |
+
# Extract indent level from classes
|
| 161 |
+
for cls in self.current_classes:
|
| 162 |
+
if cls.startswith('indent-level-'):
|
| 163 |
+
try:
|
| 164 |
+
self.current_indent_level = int(cls.split('-')[-1])
|
| 165 |
+
except ValueError:
|
| 166 |
+
self.current_indent_level = 0
|
| 167 |
+
break
|
| 168 |
+
|
| 169 |
+
# Extract formatting hint from classes
|
| 170 |
+
formatting_hints = [
|
| 171 |
+
'numbered-primary', 'numbered-secondary', 'numbered-tertiary', 'numbered-quaternary', 'numbered-quinary',
|
| 172 |
+
'parenthetical-primary', 'parenthetical-secondary', 'parenthetical-tertiary', 'parenthetical-quaternary',
|
| 173 |
+
'bullet-primary', 'bullet-secondary', 'bullet-tertiary', 'bullet-quaternary',
|
| 174 |
+
'lettered-primary', 'lettered-secondary',
|
| 175 |
+
'roman-primary', 'roman-secondary',
|
| 176 |
+
'thai-primary', 'thai-secondary',
|
| 177 |
+
'indented_text', 'space-indent'
|
| 178 |
+
]
|
| 179 |
+
|
| 180 |
+
for hint in formatting_hints:
|
| 181 |
+
if hint in self.current_classes:
|
| 182 |
+
self.current_formatting_hint = hint
|
| 183 |
+
break
|
| 184 |
+
else:
|
| 185 |
+
self.current_formatting_hint = 'normal_text'
|
| 186 |
+
|
| 187 |
+
# Apply indentation
|
| 188 |
+
if self.current_indent_level > 0:
|
| 189 |
+
indent_inches = self.current_indent_level * 0.5
|
| 190 |
self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
+
# Apply hanging indent for bullets and parenthetical items
|
| 193 |
+
if 'bullet' in self.current_formatting_hint or 'parenthetical' in self.current_formatting_hint:
|
| 194 |
+
self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)
|
| 195 |
+
|
| 196 |
+
# Set line spacing and paragraph spacing
|
| 197 |
self.current_paragraph.paragraph_format.line_spacing = 1.15
|
| 198 |
|
| 199 |
+
# Apply spacing based on formatting hint
|
| 200 |
+
if 'primary' in self.current_formatting_hint:
|
| 201 |
+
self.current_paragraph.paragraph_format.space_before = Pt(10)
|
| 202 |
+
self.current_paragraph.paragraph_format.space_after = Pt(8)
|
| 203 |
+
elif 'secondary' in self.current_formatting_hint:
|
| 204 |
+
self.current_paragraph.paragraph_format.space_before = Pt(8)
|
| 205 |
+
self.current_paragraph.paragraph_format.space_after = Pt(6)
|
| 206 |
+
elif 'tertiary' in self.current_formatting_hint:
|
| 207 |
+
self.current_paragraph.paragraph_format.space_before = Pt(6)
|
| 208 |
+
self.current_paragraph.paragraph_format.space_after = Pt(4)
|
| 209 |
+
else:
|
| 210 |
+
self.current_paragraph.paragraph_format.space_after = Pt(3)
|
| 211 |
|
| 212 |
def handle_endtag(self, tag):
|
| 213 |
+
if tag == 'div':
|
| 214 |
+
if self.in_page_header:
|
| 215 |
+
self.in_page_header = False
|
| 216 |
+
elif self.in_content_header:
|
| 217 |
+
self.in_content_header = False
|
| 218 |
+
elif self.in_title:
|
| 219 |
+
self.in_title = False
|
| 220 |
+
elif self.in_section_heading:
|
| 221 |
+
self.in_section_heading = False
|
| 222 |
self.current_paragraph = None
|
| 223 |
+
self.current_indent_level = 0
|
| 224 |
+
self.current_formatting_hint = 'normal_text'
|
| 225 |
+
self.current_classes = []
|
| 226 |
|
| 227 |
elif tag == 'table':
|
| 228 |
self.in_table = False
|
|
|
|
| 234 |
|
| 235 |
def handle_data(self, data):
|
| 236 |
if data.strip():
|
|
|
|
| 237 |
data = data.replace(' ', ' ')
|
| 238 |
|
| 239 |
if self.in_table:
|
| 240 |
self.current_table_row.append(data.strip())
|
| 241 |
elif self.current_paragraph is not None:
|
| 242 |
+
# Detect patterns in the text for additional formatting
|
| 243 |
+
indent_info = self.processor.indent_detector.detect_indentation(data)
|
| 244 |
+
text_classification = self.processor.indent_detector.classify_text_type(data)
|
| 245 |
+
|
| 246 |
+
run = self.current_paragraph.add_run(data.strip())
|
| 247 |
+
|
| 248 |
+
# Apply formatting based on pattern, level, and text classification
|
| 249 |
+
if self.in_title:
|
| 250 |
run.bold = True
|
| 251 |
run.font.size = Pt(16)
|
| 252 |
+
run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
|
| 253 |
+
elif self.in_content_header or text_classification.get('is_header'):
|
| 254 |
+
run.bold = True
|
| 255 |
+
run.font.size = Pt(14)
|
| 256 |
+
run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
|
| 257 |
+
elif self.in_section_heading:
|
| 258 |
+
run.bold = True
|
| 259 |
+
run.font.size = Pt(14)
|
| 260 |
+
run.font.color.rgb = RGBColor(52, 73, 94) # Darker blue
|
| 261 |
+
elif self.in_page_header:
|
| 262 |
run.bold = True
|
| 263 |
run.font.size = Pt(14)
|
| 264 |
+
run.font.color.rgb = RGBColor(44, 62, 80)
|
| 265 |
else:
|
| 266 |
+
# Apply pattern-specific formatting
|
| 267 |
+
self._apply_pattern_formatting(run, indent_info, text_classification)
|
| 268 |
+
|
| 269 |
+
def _apply_pattern_formatting(self, run, indent_info, text_classification):
|
| 270 |
+
"""Apply formatting based on detected pattern, classification, and current formatting hint"""
|
| 271 |
+
pattern_type = indent_info.get('pattern_type', 'normal')
|
| 272 |
+
level = indent_info.get('level', 0)
|
| 273 |
+
is_numbered = indent_info.get('is_numbered', False)
|
| 274 |
+
is_bullet = indent_info.get('is_bullet', False)
|
| 275 |
+
is_lettered = indent_info.get('is_lettered', False)
|
| 276 |
+
is_roman = indent_info.get('is_roman', False)
|
| 277 |
+
is_thai = indent_info.get('is_thai', False)
|
| 278 |
+
is_parenthetical = indent_info.get('is_parenthetical', False)
|
| 279 |
+
|
| 280 |
+
# Base font size
|
| 281 |
+
run.font.size = Pt(11)
|
| 282 |
+
|
| 283 |
+
# Apply formatting based on current formatting hint and detected pattern
|
| 284 |
+
if 'numbered' in self.current_formatting_hint or is_numbered:
|
| 285 |
+
if 'primary' in self.current_formatting_hint or level == 1:
|
| 286 |
+
run.bold = True
|
| 287 |
+
run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
|
| 288 |
+
elif 'secondary' in self.current_formatting_hint or level == 2:
|
| 289 |
+
run.font.color.rgb = RGBColor(52, 73, 94) # Medium blue
|
| 290 |
+
elif 'tertiary' in self.current_formatting_hint or level == 3:
|
| 291 |
+
run.font.color.rgb = RGBColor(85, 85, 85) # Dark gray
|
| 292 |
+
else:
|
| 293 |
+
run.font.color.rgb = RGBColor(102, 102, 102) # Gray
|
| 294 |
+
|
| 295 |
+
elif 'parenthetical' in self.current_formatting_hint or is_parenthetical:
|
| 296 |
+
# Special formatting for parenthetical patterns
|
| 297 |
+
if 'primary' in self.current_formatting_hint or level == 2:
|
| 298 |
+
run.bold = True
|
| 299 |
+
run.font.color.rgb = RGBColor(142, 68, 173) # Purple
|
| 300 |
+
elif 'secondary' in self.current_formatting_hint or level == 3:
|
| 301 |
+
run.font.color.rgb = RGBColor(155, 89, 182) # Light purple
|
| 302 |
+
elif 'tertiary' in self.current_formatting_hint or level == 4:
|
| 303 |
+
run.font.color.rgb = RGBColor(175, 122, 197) # Lighter purple
|
| 304 |
+
else:
|
| 305 |
+
run.font.color.rgb = RGBColor(195, 155, 211) # Very light purple
|
| 306 |
+
|
| 307 |
+
elif 'bullet' in self.current_formatting_hint or is_bullet:
|
| 308 |
+
if 'primary' in self.current_formatting_hint or level == 1:
|
| 309 |
+
run.font.color.rgb = RGBColor(52, 152, 219) # Blue
|
| 310 |
+
elif 'secondary' in self.current_formatting_hint or level == 2:
|
| 311 |
+
run.font.color.rgb = RGBColor(149, 165, 166) # Gray
|
| 312 |
+
elif 'tertiary' in self.current_formatting_hint or level == 3:
|
| 313 |
+
run.font.color.rgb = RGBColor(189, 195, 199) # Light gray
|
| 314 |
+
else:
|
| 315 |
+
run.font.color.rgb = RGBColor(189, 195, 199) # Light gray
|
| 316 |
+
|
| 317 |
+
elif 'lettered' in self.current_formatting_hint or is_lettered:
|
| 318 |
+
run.italic = True
|
| 319 |
+
if 'primary' in self.current_formatting_hint:
|
| 320 |
+
run.font.color.rgb = RGBColor(142, 68, 173) # Purple
|
| 321 |
+
else:
|
| 322 |
+
run.font.color.rgb = RGBColor(155, 89, 182) # Light purple
|
| 323 |
+
|
| 324 |
+
elif 'roman' in self.current_formatting_hint or is_roman:
|
| 325 |
+
run.font.color.rgb = RGBColor(211, 84, 0) # Orange
|
| 326 |
+
run.font.name = 'Times New Roman' # Roman style font
|
| 327 |
+
|
| 328 |
+
elif 'thai' in self.current_formatting_hint or is_thai:
|
| 329 |
+
if 'primary' in self.current_formatting_hint:
|
| 330 |
+
run.bold = True
|
| 331 |
+
run.font.color.rgb = RGBColor(22, 160, 133) # Teal
|
| 332 |
+
else:
|
| 333 |
+
run.font.color.rgb = RGBColor(26, 188, 156) # Light teal
|
| 334 |
+
|
| 335 |
+
elif 'space-indent' in self.current_formatting_hint:
|
| 336 |
+
run.italic = True
|
| 337 |
+
run.font.color.rgb = RGBColor(85, 85, 85) # Dark gray
|
| 338 |
+
|
| 339 |
+
else:
|
| 340 |
+
# Default text formatting based on classification
|
| 341 |
+
if text_classification.get('is_header'):
|
| 342 |
+
run.bold = True
|
| 343 |
+
run.font.color.rgb = RGBColor(44, 62, 80) # Dark blue
|
| 344 |
+
elif text_classification.get('is_list_item'):
|
| 345 |
+
run.font.color.rgb = RGBColor(52, 152, 219) # Blue
|
| 346 |
+
else:
|
| 347 |
+
run.font.color.rgb = RGBColor(0, 0, 0) # Black
|
| 348 |
|
| 349 |
def _create_enhanced_docx_table(self):
|
| 350 |
+
"""Create table with enhanced formatting"""
|
| 351 |
if not self.table_data:
|
| 352 |
return
|
| 353 |
|
|
|
|
| 354 |
rows = len(self.table_data)
|
| 355 |
cols = max(len(row) for row in self.table_data) if self.table_data else 1
|
| 356 |
|
|
|
|
| 358 |
table.style = 'Table Grid'
|
| 359 |
table.alignment = WD_TABLE_ALIGNMENT.LEFT
|
| 360 |
|
| 361 |
+
# Fill table data with enhanced formatting
|
|
|
|
|
|
|
|
|
|
| 362 |
for row_idx, row_data in enumerate(self.table_data):
|
| 363 |
table_row = table.rows[row_idx]
|
| 364 |
for col_idx, cell_data in enumerate(row_data):
|
|
|
|
| 372 |
for run in paragraph.runs:
|
| 373 |
run.bold = True
|
| 374 |
run.font.size = Pt(10)
|
| 375 |
+
run.font.color.rgb = RGBColor(44, 62, 80)
|
| 376 |
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 377 |
+
|
| 378 |
+
# Add background color to header
|
| 379 |
+
shading_elm_1 = OxmlElement('w:shd')
|
| 380 |
+
shading_elm_1.set(qn('w:fill'), 'ECF0F1')
|
| 381 |
+
paragraph._element.get_or_add_pPr().append(shading_elm_1)
|
| 382 |
else:
|
| 383 |
# Regular data cells
|
| 384 |
for paragraph in cell.paragraphs:
|
| 385 |
for run in paragraph.runs:
|
| 386 |
run.font.size = Pt(10)
|
| 387 |
+
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
|
|
|
|
|
|
| 388 |
|
| 389 |
# Add spacing after table
|
| 390 |
self.doc.add_paragraph()
|
|
|
|
| 392 |
# Create DOCX document
|
| 393 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 394 |
temp_file = tempfile.NamedTemporaryFile(
|
| 395 |
+
suffix=f'_enhanced_document_{timestamp}.docx',
|
| 396 |
delete=False
|
| 397 |
)
|
| 398 |
temp_file.close()
|
| 399 |
|
| 400 |
doc = Document()
|
| 401 |
|
| 402 |
+
# Set document margins for better layout
|
| 403 |
sections = doc.sections
|
| 404 |
for section in sections:
|
| 405 |
section.top_margin = Inches(1)
|
|
|
|
| 407 |
section.left_margin = Inches(1)
|
| 408 |
section.right_margin = Inches(1)
|
| 409 |
|
| 410 |
+
# Add title with enhanced styling
|
| 411 |
title = doc.add_heading('PDF OCR Extraction Results', 0)
|
| 412 |
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 413 |
+
title_run = title.runs[0]
|
| 414 |
+
title_run.font.color.rgb = RGBColor(44, 62, 80)
|
| 415 |
|
| 416 |
+
# Add subtitle
|
| 417 |
subtitle_para = doc.add_paragraph()
|
| 418 |
+
subtitle_run = subtitle_para.add_run('Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification')
|
| 419 |
subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 420 |
subtitle_run.italic = True
|
| 421 |
subtitle_run.font.size = Pt(12)
|
| 422 |
subtitle_run.font.color.rgb = RGBColor(102, 102, 102)
|
| 423 |
|
| 424 |
+
# Add feature list
|
| 425 |
+
features_para = doc.add_paragraph()
|
| 426 |
+
features_run = features_para.add_run('Features: Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a)) • Bullet Points • Letter & Roman Numerals • Thai Script • Multi-level Indentation • Text Classification')
|
| 427 |
+
features_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 428 |
+
features_run.font.size = Pt(9)
|
| 429 |
+
features_run.font.color.rgb = RGBColor(149, 165, 166)
|
| 430 |
+
|
| 431 |
+
# Add metadata section
|
| 432 |
if metadata_info:
|
| 433 |
doc.add_heading('Processing Information', level=1)
|
| 434 |
meta_para = doc.add_paragraph()
|
| 435 |
meta_run = meta_para.add_run(metadata_info)
|
| 436 |
meta_run.font.size = Pt(10)
|
| 437 |
meta_para.style = 'Intense Quote'
|
| 438 |
+
|
| 439 |
+
# Add background to metadata
|
| 440 |
+
shading_elm = OxmlElement('w:shd')
|
| 441 |
+
shading_elm.set(qn('w:fill'), 'F8F9FA')
|
| 442 |
+
meta_para._element.get_or_add_pPr().append(shading_elm)
|
| 443 |
+
|
| 444 |
+
doc.add_paragraph()
|
| 445 |
|
| 446 |
+
# Process content
|
| 447 |
doc.add_heading('Extracted Content', level=1)
|
| 448 |
|
| 449 |
+
if html_content and '<div' in html_content:
|
| 450 |
+
# Parse HTML with enhanced indentation processing and text classification
|
| 451 |
+
parser = EnhancedDOCXHTMLParser(doc, self)
|
| 452 |
parser.feed(html_content)
|
| 453 |
else:
|
| 454 |
+
# Fallback to text processing with enhanced indentation and classification
|
| 455 |
+
self._process_text_content_enhanced(doc, text_content)
|
| 456 |
+
|
| 457 |
+
# Add enhanced footer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
footer_section = doc.sections[0]
|
| 459 |
footer = footer_section.footer
|
| 460 |
footer_para = footer.paragraphs[0]
|
| 461 |
+
footer_para.text = f"Generated by Enhanced PDF OCR Service with Comprehensive Indentation Detection & Text Classification on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
| 462 |
footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 463 |
footer_run = footer_para.runs[0]
|
| 464 |
+
footer_run.font.size = Pt(8)
|
| 465 |
footer_run.font.color.rgb = RGBColor(128, 128, 128)
|
| 466 |
|
| 467 |
doc.save(temp_file.name)
|
| 468 |
+
logger.info(f"Enhanced DOCX file with comprehensive indentation support and text classification created: {temp_file.name}")
|
| 469 |
return temp_file.name
|
| 470 |
|
| 471 |
except ImportError:
|
|
|
|
| 478 |
pass
|
| 479 |
raise
|
| 480 |
|
| 481 |
+
def _process_text_content_enhanced(self, doc, text_content):
|
| 482 |
+
"""Process text content with enhanced indentation detection and text classification"""
|
| 483 |
+
paragraphs = text_content.split('\n\n')
|
| 484 |
+
|
| 485 |
+
for para_text in paragraphs:
|
| 486 |
+
if not para_text.strip():
|
| 487 |
+
continue
|
| 488 |
+
|
| 489 |
+
lines = para_text.split('\n')
|
| 490 |
+
for line in lines:
|
| 491 |
+
if not line.strip():
|
| 492 |
+
continue
|
| 493 |
+
|
| 494 |
+
# Detect indentation and classify text
|
| 495 |
+
indent_info = self.indent_detector.detect_indentation(line)
|
| 496 |
+
text_classification = self.indent_detector.classify_text_type(line)
|
| 497 |
+
|
| 498 |
+
if line.strip().startswith('==='):
|
| 499 |
+
# Page headers
|
| 500 |
+
page_header = doc.add_heading(line.strip(), level=1)
|
| 501 |
+
page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 502 |
+
header_run = page_header.runs[0]
|
| 503 |
+
header_run.font.color.rgb = RGBColor(44, 62, 80)
|
| 504 |
+
elif line.strip().startswith('##'):
|
| 505 |
+
# Section headings
|
| 506 |
+
heading_text = line.strip().lstrip('#').strip()
|
| 507 |
+
heading = doc.add_heading(heading_text, level=2)
|
| 508 |
+
heading_run = heading.runs[0]
|
| 509 |
+
heading_run.font.color.rgb = RGBColor(52, 73, 94)
|
| 510 |
+
elif text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.7:
|
| 511 |
+
# Detected headers
|
| 512 |
+
heading = doc.add_heading(indent_info.get('content', line.strip()), level=2)
|
| 513 |
+
heading_run = heading.runs[0]
|
| 514 |
+
heading_run.font.color.rgb = RGBColor(52, 73, 94)
|
| 515 |
+
else:
|
| 516 |
+
# Regular content with enhanced indentation and classification
|
| 517 |
+
para = doc.add_paragraph()
|
| 518 |
+
|
| 519 |
+
# Apply indentation based on detected level
|
| 520 |
+
level = indent_info.get('level', 0)
|
| 521 |
+
if level > 0:
|
| 522 |
+
para.paragraph_format.left_indent = Inches(level * 0.5)
|
| 523 |
+
|
| 524 |
+
# Apply pattern-specific formatting
|
| 525 |
+
if indent_info.get('is_bullet', False) or indent_info.get('is_parenthetical', False):
|
| 526 |
+
para.paragraph_format.first_line_indent = Inches(-0.25)
|
| 527 |
+
|
| 528 |
+
# Set proper spacing
|
| 529 |
+
para.paragraph_format.line_spacing = 1.15
|
| 530 |
+
para.paragraph_format.space_after = Pt(3)
|
| 531 |
+
|
| 532 |
+
# Add content with enhanced formatting
|
| 533 |
+
content = indent_info.get('content', line.strip())
|
| 534 |
+
marker = indent_info.get('pattern_marker', '')
|
| 535 |
+
|
| 536 |
+
# Include marker for non-bullet items
|
| 537 |
+
if marker and not indent_info.get('is_bullet', False):
|
| 538 |
+
content = f"{marker} {content}"
|
| 539 |
+
|
| 540 |
+
run = para.add_run(content)
|
| 541 |
+
run.font.size = Pt(11)
|
| 542 |
+
|
| 543 |
+
# Apply color coding based on pattern type and classification
|
| 544 |
+
pattern_type = indent_info.get('pattern_type', 'normal')
|
| 545 |
+
if 'numbered' in pattern_type or 'decimal' in pattern_type:
|
| 546 |
+
if level == 1:
|
| 547 |
+
run.bold = True
|
| 548 |
+
run.font.color.rgb = RGBColor(44, 62, 80)
|
| 549 |
+
elif level == 2:
|
| 550 |
+
run.font.color.rgb = RGBColor(52, 73, 94)
|
| 551 |
+
else:
|
| 552 |
+
run.font.color.rgb = RGBColor(85, 85, 85)
|
| 553 |
+
elif 'parenthetical' in pattern_type:
|
| 554 |
+
if level <= 2:
|
| 555 |
+
run.bold = True
|
| 556 |
+
run.font.color.rgb = RGBColor(142, 68, 173) # Purple
|
| 557 |
+
else:
|
| 558 |
+
run.font.color.rgb = RGBColor(155, 89, 182) # Light purple
|
| 559 |
+
elif 'bullet' in pattern_type:
|
| 560 |
+
run.font.color.rgb = RGBColor(52, 152, 219)
|
| 561 |
+
elif 'lettered' in pattern_type:
|
| 562 |
+
run.italic = True
|
| 563 |
+
run.font.color.rgb = RGBColor(142, 68, 173)
|
| 564 |
+
elif 'roman' in pattern_type:
|
| 565 |
+
run.font.color.rgb = RGBColor(211, 84, 0)
|
| 566 |
+
elif 'thai' in pattern_type:
|
| 567 |
+
run.font.color.rgb = RGBColor(22, 160, 133)
|
| 568 |
+
elif text_classification.get('is_list_item'):
|
| 569 |
+
run.font.color.rgb = RGBColor(52, 152, 219)
|
| 570 |
+
|
| 571 |
@staticmethod
|
| 572 |
def create_html_file(html_content: str, metadata_info: str = "") -> str:
|
| 573 |
+
"""Create standalone HTML file with enhanced styling for comprehensive indentation and text classification"""
|
| 574 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 575 |
temp_file = tempfile.NamedTemporaryFile(
|
| 576 |
suffix=f'_extracted_document_{timestamp}.html',
|
|
|
|
| 580 |
)
|
| 581 |
|
| 582 |
try:
|
| 583 |
+
# Enhance HTML with better styling
|
| 584 |
+
enhanced_html = html_content
|
| 585 |
+
|
| 586 |
+
# Add comprehensive styling if not already present
|
| 587 |
+
if '<style>' not in enhanced_html:
|
| 588 |
+
enhanced_html = enhanced_html.replace(
|
| 589 |
+
'<head>',
|
| 590 |
+
'''<head>
|
| 591 |
+
<style>
|
| 592 |
+
body {
|
| 593 |
+
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 594 |
+
line-height: 1.6;
|
| 595 |
+
margin: 20px;
|
| 596 |
+
background-color: #f9f9f9;
|
| 597 |
+
}
|
| 598 |
+
.container {
|
| 599 |
+
max-width: 1200px;
|
| 600 |
+
margin: 0 auto;
|
| 601 |
+
background-color: white;
|
| 602 |
+
padding: 30px;
|
| 603 |
+
border-radius: 8px;
|
| 604 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
| 605 |
+
}
|
| 606 |
+
.header {
|
| 607 |
+
text-align: center;
|
| 608 |
+
margin-bottom: 30px;
|
| 609 |
+
border-bottom: 3px solid #2c3e50;
|
| 610 |
+
padding-bottom: 20px;
|
| 611 |
+
}
|
| 612 |
+
.metadata {
|
| 613 |
+
background-color: #ecf0f1;
|
| 614 |
+
padding: 15px;
|
| 615 |
+
border-radius: 5px;
|
| 616 |
+
margin-bottom: 25px;
|
| 617 |
+
border-left: 4px solid #3498db;
|
| 618 |
+
}
|
| 619 |
+
.enhanced-features {
|
| 620 |
+
background-color: #e8f5e8;
|
| 621 |
+
padding: 10px;
|
| 622 |
+
border-radius: 5px;
|
| 623 |
+
margin-bottom: 20px;
|
| 624 |
+
border-left: 4px solid #27ae60;
|
| 625 |
+
font-size: 0.9em;
|
| 626 |
+
}
|
| 627 |
+
.classification-features {
|
| 628 |
+
background-color: #fef9e7;
|
| 629 |
+
padding: 10px;
|
| 630 |
+
border-radius: 5px;
|
| 631 |
+
margin-bottom: 20px;
|
| 632 |
+
border-left: 4px solid #f39c12;
|
| 633 |
+
font-size: 0.9em;
|
| 634 |
+
}
|
| 635 |
+
</style>'''
|
| 636 |
+
)
|
| 637 |
|
| 638 |
+
# Wrap content in container if not already wrapped
|
| 639 |
+
if '<body>' in enhanced_html and '.container' not in enhanced_html:
|
| 640 |
enhanced_html = enhanced_html.replace(
|
| 641 |
'<body>',
|
| 642 |
'''<body>
|
| 643 |
<div class="container">
|
| 644 |
<div class="header">
|
| 645 |
<h1>PDF OCR Extraction Results</h1>
|
| 646 |
+
<p>Enhanced with Comprehensive Indentation Detection & Intelligent Text Classification</p>
|
| 647 |
+
</div>
|
| 648 |
+
<div class="enhanced-features">
|
| 649 |
+
<strong>Indentation Features:</strong> Hierarchical Numbering • Parenthetical Patterns ((1), (๑), (a), (i), (ก)) •
|
| 650 |
+
Multi-level Bullets • Letter & Roman Numerals • Thai Script Support •
|
| 651 |
+
Space-based Indentation • Pattern Priority Detection
|
| 652 |
+
</div>
|
| 653 |
+
<div class="classification-features">
|
| 654 |
+
<strong>Text Classification:</strong> Header Detection • Paragraph Recognition •
|
| 655 |
+
List Item Identification • Context Analysis • Confidence Scoring •
|
| 656 |
+
Document Structure Analysis
|
| 657 |
</div>''' +
|
| 658 |
(f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
|
| 659 |
)
|
|
|
|
| 670 |
|
| 671 |
|
| 672 |
class BackendManager:
|
| 673 |
+
"""Enhanced backend manager with comprehensive indentation detection, parenthetical patterns, text classification, and advanced export capabilities"""
|
| 674 |
|
| 675 |
def __init__(self):
|
| 676 |
self.ocr_service = OCRService()
|
| 677 |
+
self.document_exporter = EnhancedDocumentExporter()
|
| 678 |
self.processing_history = []
|
| 679 |
self.max_history_size = int(os.getenv('MAX_HISTORY_SIZE', 100))
|
| 680 |
|
| 681 |
# Create directories for temporary files and logs
|
| 682 |
+
self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service_enhanced_v2'
|
| 683 |
self.temp_dir.mkdir(exist_ok=True)
|
| 684 |
|
| 685 |
+
logger.info("Enhanced backend manager with comprehensive indentation detection and text classification initialized successfully")
|
| 686 |
|
| 687 |
def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
|
| 688 |
preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 689 |
"""
|
| 690 |
+
Process PDF with enhanced resolution, comprehensive indentation detection, and intelligent text classification
|
| 691 |
|
| 692 |
Args:
|
| 693 |
pdf_path: Path to the PDF file
|
|
|
|
| 695 |
preprocessing_options: Dictionary containing preprocessing settings
|
| 696 |
|
| 697 |
Returns:
|
| 698 |
+
Dict containing processing results with enhanced HTML content, indentation, and text classification
|
| 699 |
"""
|
| 700 |
start_time = datetime.now()
|
| 701 |
|
|
|
|
| 727 |
# Generate file hash for tracking
|
| 728 |
file_hash = self._calculate_file_hash(pdf_path)
|
| 729 |
|
| 730 |
+
logger.info(f"Processing PDF with enhanced indentation detection and text classification: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
|
| 731 |
logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
|
| 732 |
|
| 733 |
# Handle preprocessing if enabled
|
|
|
|
| 745 |
processed_pdf_path = pdf_path
|
| 746 |
|
| 747 |
try:
|
| 748 |
+
# Process with enhanced OCR, indentation detection, and text classification
|
| 749 |
result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
|
| 750 |
|
| 751 |
# Add processing metadata
|
| 752 |
processing_time = (datetime.now() - start_time).total_seconds()
|
| 753 |
|
| 754 |
+
# Analyze document structure with text classification if successful
|
| 755 |
+
document_analysis = {}
|
| 756 |
+
if result['success'] and result['text']:
|
| 757 |
+
try:
|
| 758 |
+
text_lines = result['text'].split('\n')
|
| 759 |
+
detector = EnhancedIndentationDetector()
|
| 760 |
+
document_analysis = detector.analyze_document_structure(text_lines)
|
| 761 |
+
except Exception as analysis_error:
|
| 762 |
+
logger.warning(f"Document structure analysis failed: {analysis_error}")
|
| 763 |
+
document_analysis = {'analysis_failed': True}
|
| 764 |
+
|
| 765 |
result['metadata'].update({
|
| 766 |
'file_hash': file_hash,
|
| 767 |
'file_size_mb': round(file_size / (1024*1024), 2),
|
|
|
|
| 769 |
'timestamp': start_time.isoformat(),
|
| 770 |
'enhanced_processing': True,
|
| 771 |
'html_processing': True,
|
| 772 |
+
'comprehensive_indentation': True,
|
| 773 |
+
'parenthetical_patterns_supported': True,
|
| 774 |
+
'intelligent_text_classification': True,
|
| 775 |
'header_footer_removed': preprocessing_applied,
|
| 776 |
+
'preprocessing_options': preprocessing_options if preprocessing_applied else None,
|
| 777 |
+
'document_structure_analysis': document_analysis
|
| 778 |
})
|
| 779 |
|
| 780 |
# Cleanup temporary preprocessed file
|
|
|
|
| 784 |
except:
|
| 785 |
pass
|
| 786 |
|
| 787 |
+
# Log results with enhanced information
|
| 788 |
if result['success']:
|
| 789 |
text_length = len(result['text'])
|
| 790 |
has_html = bool(result.get('html'))
|
|
|
|
| 794 |
logger.info(f"Method used: {result['method_used']}")
|
| 795 |
logger.info(f"Text extracted: {text_length} characters")
|
| 796 |
logger.info(f"HTML generated: {has_html}")
|
| 797 |
+
logger.info(f"Comprehensive indentation detection: Enabled")
|
| 798 |
+
logger.info(f"Parenthetical patterns supported: Enabled")
|
| 799 |
+
logger.info(f"Intelligent text classification: Enabled")
|
| 800 |
+
|
| 801 |
if table_count > 0:
|
| 802 |
logger.info(f"Tables detected: {table_count}")
|
| 803 |
if preprocessing_applied:
|
| 804 |
logger.info("Enhanced preprocessing applied")
|
| 805 |
+
if document_analysis and not document_analysis.get('analysis_failed'):
|
| 806 |
+
logger.info(f"Document analysis: {document_analysis.get('patterned_lines', 0)} patterned lines, max level {document_analysis.get('max_level', 0)}")
|
| 807 |
+
logger.info(f"Text classification: {document_analysis.get('header_count', 0)} headers, {document_analysis.get('paragraph_count', 0)} paragraphs, {document_analysis.get('list_item_count', 0)} list items")
|
| 808 |
|
| 809 |
# Add to processing history
|
| 810 |
self._add_to_history({
|
|
|
|
| 817 |
'processing_time': processing_time,
|
| 818 |
'preprocessing_applied': preprocessing_applied,
|
| 819 |
'html_generated': has_html,
|
| 820 |
+
'enhanced_processing': True,
|
| 821 |
+
'comprehensive_indentation': True,
|
| 822 |
+
'parenthetical_patterns_supported': True,
|
| 823 |
+
'intelligent_text_classification': True,
|
| 824 |
+
'document_analysis': document_analysis
|
| 825 |
})
|
| 826 |
else:
|
| 827 |
logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
|
|
|
|
| 835 |
'error': result.get('error', 'Unknown error'),
|
| 836 |
'processing_time': processing_time,
|
| 837 |
'preprocessing_applied': preprocessing_applied,
|
| 838 |
+
'enhanced_processing': True,
|
| 839 |
+
'comprehensive_indentation': True,
|
| 840 |
+
'parenthetical_patterns_supported': True,
|
| 841 |
+
'intelligent_text_classification': True
|
| 842 |
})
|
| 843 |
|
| 844 |
return result
|
|
|
|
| 862 |
'success': False,
|
| 863 |
'error': str(e),
|
| 864 |
'processing_time': processing_time,
|
| 865 |
+
'enhanced_processing': True,
|
| 866 |
+
'comprehensive_indentation': True,
|
| 867 |
+
'parenthetical_patterns_supported': True,
|
| 868 |
+
'intelligent_text_classification': True
|
| 869 |
})
|
| 870 |
|
| 871 |
return {
|
|
|
|
| 878 |
'file_hash': file_hash,
|
| 879 |
'processing_time_seconds': round(processing_time, 2),
|
| 880 |
'timestamp': start_time.isoformat(),
|
| 881 |
+
'enhanced_processing': True,
|
| 882 |
+
'comprehensive_indentation': True,
|
| 883 |
+
'parenthetical_patterns_supported': True,
|
| 884 |
+
'intelligent_text_classification': True
|
| 885 |
}
|
| 886 |
}
|
| 887 |
|
| 888 |
def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
|
| 889 |
+
"""Apply enhanced preprocessing with high-resolution crop handling"""
|
| 890 |
crop_settings = options.get('crop_settings', {})
|
| 891 |
per_page_crops = crop_settings.get('per_page_crops', {})
|
| 892 |
enhanced_resolution = crop_settings.get('enhanced_resolution', True)
|
|
|
|
| 904 |
page = doc.load_page(page_num)
|
| 905 |
page_rect = page.rect
|
| 906 |
|
| 907 |
+
# Get crop settings for this page
|
| 908 |
page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
|
| 909 |
'top': 0, 'bottom': 0, 'left': 0, 'right': 0
|
| 910 |
}))
|
|
|
|
| 938 |
|
| 939 |
# Create new page with enhanced resolution if enabled
|
| 940 |
if enhanced_resolution:
|
|
|
|
| 941 |
new_page = new_doc.new_page(
|
| 942 |
width=new_rect.width,
|
| 943 |
height=new_rect.height
|
|
|
|
| 977 |
|
| 978 |
def create_enhanced_downloads(self, text_content: str, html_content: str,
|
| 979 |
metadata_info: str = "") -> Dict[str, str]:
|
| 980 |
+
"""Create enhanced download files with comprehensive indentation support, parenthetical patterns, and text classification"""
|
| 981 |
download_files = {}
|
| 982 |
|
| 983 |
try:
|
| 984 |
# Create enhanced TXT file
|
| 985 |
+
txt_path = EnhancedDocumentExporter.create_enhanced_txt_file(
|
| 986 |
text_content, html_content, metadata_info
|
| 987 |
)
|
| 988 |
download_files['txt'] = txt_path
|
| 989 |
logger.info(f"Enhanced TXT file created: {txt_path}")
|
| 990 |
|
| 991 |
+
# Create enhanced DOCX file with comprehensive indentation support and text classification
|
| 992 |
try:
|
| 993 |
+
docx_path = self.document_exporter.create_enhanced_docx_file(
|
| 994 |
text_content, html_content, metadata_info
|
| 995 |
)
|
| 996 |
download_files['docx'] = docx_path
|
| 997 |
+
logger.info(f"Enhanced DOCX file with comprehensive indentation and text classification created: {docx_path}")
|
| 998 |
except ImportError:
|
| 999 |
logger.warning("python-docx not available. DOCX creation skipped.")
|
| 1000 |
except Exception as e:
|
| 1001 |
+
logger.error(f"Enhanced DOCX creation failed: {e}")
|
| 1002 |
|
| 1003 |
# Create standalone HTML file
|
| 1004 |
try:
|
| 1005 |
+
html_path = EnhancedDocumentExporter.create_html_file(
|
| 1006 |
html_content, metadata_info
|
| 1007 |
)
|
| 1008 |
download_files['html'] = html_path
|
| 1009 |
+
logger.info(f"Enhanced HTML file created: {html_path}")
|
| 1010 |
except Exception as e:
|
| 1011 |
logger.error(f"HTML file creation failed: {e}")
|
| 1012 |
|
|
|
|
| 1045 |
'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
|
| 1046 |
'enhanced_processing': True,
|
| 1047 |
'html_processing': True,
|
| 1048 |
+
'comprehensive_indentation': True,
|
| 1049 |
+
'parenthetical_patterns_supported': True,
|
| 1050 |
+
'intelligent_text_classification': True,
|
| 1051 |
+
'pattern_detection_count': len(EnhancedIndentationDetector().patterns),
|
| 1052 |
'docx_export_available': docx_available,
|
| 1053 |
'enhanced_crop_processing': True,
|
| 1054 |
'multi_resolution_support': True,
|
| 1055 |
+
'crop_processing_fixed': True,
|
| 1056 |
+
'document_structure_analysis': True,
|
| 1057 |
+
'thai_script_support': True,
|
| 1058 |
+
'multi_level_support': True,
|
| 1059 |
+
'text_classification_features': True
|
| 1060 |
}
|
| 1061 |
|
| 1062 |
return status
|
|
|
|
| 1104 |
logger.error(f"Error during cleanup: {e}")
|
| 1105 |
|
| 1106 |
def get_enhanced_statistics(self) -> Dict[str, Any]:
|
| 1107 |
+
"""Get enhanced processing statistics with indentation analysis and text classification"""
|
| 1108 |
if not self.processing_history:
|
| 1109 |
return {
|
| 1110 |
'total_processed': 0,
|
|
|
|
| 1115 |
'total_tables_processed': 0,
|
| 1116 |
'preprocessing_usage': 0,
|
| 1117 |
'html_generation_rate': 0,
|
| 1118 |
+
'enhanced_processing_usage': 0,
|
| 1119 |
+
'comprehensive_indentation_usage': 0,
|
| 1120 |
+
'parenthetical_patterns_usage': 0,
|
| 1121 |
+
'text_classification_usage': 0,
|
| 1122 |
+
'document_analysis_success_rate': 0
|
| 1123 |
}
|
| 1124 |
|
| 1125 |
total_processed = len(self.processing_history)
|
|
|
|
| 1139 |
preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
|
| 1140 |
html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
|
| 1141 |
enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
|
| 1142 |
+
comprehensive_indentation = sum(1 for h in self.processing_history if h.get('comprehensive_indentation', False))
|
| 1143 |
+
parenthetical_patterns = sum(1 for h in self.processing_history if h.get('parenthetical_patterns_supported', False))
|
| 1144 |
+
text_classification = sum(1 for h in self.processing_history if h.get('intelligent_text_classification', False))
|
| 1145 |
+
|
| 1146 |
+
# Document analysis statistics
|
| 1147 |
+
doc_analysis_success = sum(1 for h in self.processing_history
|
| 1148 |
+
if h.get('document_analysis', {}) and not h.get('document_analysis', {}).get('analysis_failed', False))
|
| 1149 |
+
doc_analysis_rate = (doc_analysis_success / total_processed) * 100 if total_processed > 0 else 0
|
| 1150 |
|
| 1151 |
html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
|
| 1152 |
enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
|
| 1153 |
+
comprehensive_indentation_rate = (comprehensive_indentation / total_processed) * 100 if total_processed > 0 else 0
|
| 1154 |
+
parenthetical_patterns_rate = (parenthetical_patterns / total_processed) * 100 if total_processed > 0 else 0
|
| 1155 |
+
text_classification_rate = (text_classification / total_processed) * 100 if total_processed > 0 else 0
|
| 1156 |
|
| 1157 |
return {
|
| 1158 |
'total_processed': total_processed,
|
|
|
|
| 1166 |
'preprocessing_usage': preprocessing_usage,
|
| 1167 |
'html_generation_rate': round(html_generation_rate, 2),
|
| 1168 |
'enhanced_processing_usage': enhanced_processing,
|
| 1169 |
+
'enhanced_processing_rate': round(enhanced_processing_rate, 2),
|
| 1170 |
+
'comprehensive_indentation_usage': comprehensive_indentation,
|
| 1171 |
+
'comprehensive_indentation_rate': round(comprehensive_indentation_rate, 2),
|
| 1172 |
+
'parenthetical_patterns_usage': parenthetical_patterns,
|
| 1173 |
+
'parenthetical_patterns_rate': round(parenthetical_patterns_rate, 2),
|
| 1174 |
+
'text_classification_usage': text_classification,
|
| 1175 |
+
'text_classification_rate': round(text_classification_rate, 2),
|
| 1176 |
+
'document_analysis_success_rate': round(doc_analysis_rate, 2)
|
| 1177 |
}
|
| 1178 |
|
| 1179 |
|
|
|
|
| 1192 |
# Test the enhanced backend manager
|
| 1193 |
manager = BackendManager()
|
| 1194 |
|
| 1195 |
+
print("Enhanced Backend Manager with Comprehensive Indentation Detection & Text Classification Test")
|
| 1196 |
+
print("=" * 100)
|
| 1197 |
print(f"Available methods: {manager.get_available_methods()}")
|
| 1198 |
print(f"Service status: {manager.get_service_status()}")
|
| 1199 |
+
print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
|
| 1200 |
+
|
| 1201 |
+
# Test indentation detector with parenthetical patterns
|
| 1202 |
+
detector = EnhancedIndentationDetector()
|
| 1203 |
+
test_cases = [
|
| 1204 |
+
"1.2.3. Hierarchical item",
|
| 1205 |
+
"(1) Parenthetical Arabic",
|
| 1206 |
+
"(๑) Parenthetical Thai numeral",
|
| 1207 |
+
"(a) Parenthetical letter",
|
| 1208 |
+
"(i) Parenthetical Roman",
|
| 1209 |
+
"(ก) Parenthetical Thai letter"
|
| 1210 |
+
]
|
| 1211 |
+
|
| 1212 |
+
print(f"\nIndentation Detection Test with Parenthetical Patterns:")
|
| 1213 |
+
print("-" * 60)
|
| 1214 |
+
for test_text in test_cases:
|
| 1215 |
+
result = detector.detect_indentation(test_text)
|
| 1216 |
+
classification = detector.classify_text_type(test_text)
|
| 1217 |
+
print(f"Text: {test_text}")
|
| 1218 |
+
print(f" Pattern: {result['pattern_type']}, Level: {result['level']}")
|
| 1219 |
+
print(f" Classification: {classification['type']} (confidence: {classification['confidence']:.2f})")
|
| 1220 |
+
print()
|
enhanced_indentation.py
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced Indentation Detection System
|
| 3 |
+
Comprehensive regex-based system for detecting hierarchical numbering and indentation levels
|
| 4 |
+
For PDF OCR Service with HTML and DOCX output support including parenthetical patterns
|
| 5 |
+
"""
|
| 6 |
+
import re
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Dict, Tuple, Optional, List, Any
|
| 9 |
+
from collections import Counter
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
class EnhancedIndentationDetector:
    """Advanced indentation detection with comprehensive pattern matching.

    Recognizes hierarchical decimal numbering, legal/outline numbering,
    parenthetical markers (Arabic and Thai numerals, Latin and Thai letters,
    Roman numerals), bullet symbols and checkboxes, and combines the matched
    marker with leading-whitespace depth to derive an indentation level.
    Also provides a heuristic header / paragraph / list-item classifier and
    a whole-document structure analysis.
    """

    # Patterns whose regexes must ignore case (keyword prefixes such as
    # "Section" vs "SECTION").  Every other pattern distinguishes case on
    # purpose (e.g. 'lettered_lower' vs 'lettered_upper'); compiling them
    # with re.IGNORECASE would make the upper-case variants unreachable.
    # The previous revision applied re.IGNORECASE to ALL patterns at match
    # time, which was a bug: "A. Heading" matched 'lettered_lower'.
    _CASE_INSENSITIVE_PATTERNS = frozenset({'section_numbering'})

    def __init__(self):
        # Comprehensive marker patterns.  'priority' resolves conflicts when
        # several patterns match the same line (higher wins); 'level_func'
        # maps the captured marker to a nesting depth.
        self.patterns = {
            # Hierarchical decimal numbering (1.1.1.1.1...)
            'decimal_hierarchy': {
                'pattern': r'^\s*(\d+(?:\.\d+)*)\.\s+',
                'example': '1.2.3.4.5.',
                'level_func': self._calculate_decimal_level,
                'priority': 15
            },

            # Hierarchical numbering without final dot (1.1.1.1.1)
            'decimal_hierarchy_no_dot': {
                'pattern': r'^\s*(\d+(?:\.\d+)+)\s+',
                'example': '1.2.3.4.5',
                'level_func': self._calculate_decimal_level,
                'priority': 14
            },

            # Hierarchical numbering with closing parenthesis (1.1.1))
            'decimal_hierarchy_paren': {
                'pattern': r'^\s*(\d+(?:\.\d+)*)\)\s+',
                'example': '1.2.3.4)',
                'level_func': self._calculate_decimal_level,
                'priority': 13
            },

            # Mixed hierarchical (1.1.a.i.A...)
            'mixed_hierarchy': {
                'pattern': r'^\s*(\d+(?:\.(?:\d+|[a-z]+|[A-Z]+|[ivxlcdm]+))+)\.\s+',
                'example': '1.2.a.i.A.',
                'level_func': self._calculate_mixed_level,
                'priority': 12
            },

            # Legal numbering (1.1.1.1(a)(i))
            'legal_numbering': {
                'pattern': r'^\s*(\d+(?:\.\d+)*(?:\([a-z]+\))*(?:\([ivxlcdm]+\))*)\s+',
                'example': '1.1.1(a)(i)',
                'level_func': self._calculate_legal_level,
                'priority': 11
            },

            # Outline numbering (I.A.1.a.i.)
            'outline_numbering': {
                'pattern': r'^\s*([IVXLCDM]+(?:\.[A-Z]+)*(?:\.\d+)*(?:\.[a-z]+)*(?:\.[ivxlcdm]+)*)\.\s+',
                'example': 'I.A.1.a.i.',
                'level_func': self._calculate_outline_level,
                'priority': 10
            },

            # Section numbering (§1.1.1, Article 1.1.1) -- keyword prefix is
            # matched case-insensitively (see _CASE_INSENSITIVE_PATTERNS).
            'section_numbering': {
                'pattern': r'^\s*(?:§|Section|Article|Chapter|Part)\s*(\d+(?:\.\d+)*)\.\s+',
                'example': '§1.2.3.',
                'level_func': self._calculate_decimal_level,
                'priority': 9
            },

            # Thai section numbering (มาตรา, ข้อ, หมวด)
            'thai_section_numbering': {
                'pattern': r'^\s*(?:มาตรา|ข้อ|หมวด|ส่วน)\s*(\d+(?:\.\d+)*)\s+',
                'example': 'มาตรา 1.2.3',
                'level_func': self._calculate_decimal_level,
                'priority': 9
            },

            # Parenthetical numbering - Arabic numerals (1), (2), (3)
            'parenthetical_arabic': {
                'pattern': r'^\s*\((\d+)\)\s+',
                'example': '(1)',
                'level_func': lambda x: 2,
                'priority': 8
            },

            # Parenthetical numbering - Thai numerals (๑), (๒), (๓)
            'parenthetical_thai_numerals': {
                'pattern': r'^\s*\(([๐-๙]+)\)\s+',
                'example': '(๑)',
                'level_func': lambda x: 2,
                'priority': 8
            },

            # Parenthetical letters - lowercase (a), (b), (c)
            'parenthetical_letters_lower': {
                'pattern': r'^\s*\(([a-z]+)\)\s+',
                'example': '(a)',
                'level_func': lambda x: 3,
                'priority': 7
            },

            # Parenthetical letters - uppercase (A), (B), (C)
            'parenthetical_letters_upper': {
                'pattern': r'^\s*\(([A-Z]+)\)\s+',
                'example': '(A)',
                'level_func': lambda x: 2,
                'priority': 7
            },

            # Parenthetical Thai letters (ก), (ข), (ค)
            'parenthetical_thai_letters': {
                'pattern': r'^\s*\(([ก-ฮ]+)\)\s+',
                'example': '(ก)',
                'level_func': lambda x: 3,
                'priority': 7
            },

            # Parenthetical Roman numerals - lowercase (i), (ii), (iii)
            'parenthetical_roman_lower': {
                'pattern': r'^\s*\(([ivxlcdm]+)\)\s+',
                'example': '(i)',
                'level_func': lambda x: 4,
                'priority': 6
            },

            # Parenthetical Roman numerals - uppercase (I), (II), (III)
            'parenthetical_roman_upper': {
                'pattern': r'^\s*\(([IVXLCDM]+)\)\s+',
                'example': '(I)',
                'level_func': lambda x: 2,
                'priority': 6
            },

            # Simple numbered lists (1., 2., 3.)
            'simple_numbered': {
                'pattern': r'^\s*(\d+)\.\s+',
                'example': '1.',
                'level_func': lambda x: 1,
                'priority': 5
            },

            # Simple numbered with closing parenthesis (1), 2), 3))
            'simple_numbered_paren': {
                'pattern': r'^\s*(\d+)\)\s+',
                'example': '1)',
                'level_func': lambda x: 1,
                'priority': 5
            },

            # Lettered lists (a., b., c.) and (A., B., C.)
            'lettered_lower': {
                'pattern': r'^\s*([a-z]+)\.\s+',
                'example': 'a.',
                'level_func': lambda x: 2,
                'priority': 4
            },

            'lettered_upper': {
                'pattern': r'^\s*([A-Z]+)\.\s+',
                'example': 'A.',
                'level_func': lambda x: 1,
                'priority': 4
            },

            # Thai letters (ก., ข., ค.)
            'thai_lettered': {
                'pattern': r'^\s*([ก-ฮ]+)\.\s+',
                'example': 'ก.',
                'level_func': lambda x: 2,
                'priority': 4
            },

            # Roman numerals (i., ii., iii.) and (I., II., III.)
            'roman_lower': {
                'pattern': r'^\s*([ivxlcdm]+)\.\s+',
                'example': 'i.',
                'level_func': lambda x: 3,
                'priority': 3
            },

            'roman_upper': {
                'pattern': r'^\s*([IVXLCDM]+)\.\s+',
                'example': 'I.',
                'level_func': lambda x: 1,
                'priority': 3
            },

            # Bullet points with various symbols
            'bullet_symbols': {
                'pattern': r'^\s*([•·▪▫◦‣⁃⁌⁍◘◙○●▶▷►▻★☆♦♠♣♥◆◇■□▲△▼▽❖❀❁❂❃❄❅❆❇❈❉❊❋❍❏❐❑❒❖])\s+',
                'example': '•',
                'level_func': self._calculate_bullet_level,
                'priority': 2
            },

            # Dash and asterisk bullets
            'dash_bullets': {
                'pattern': r'^\s*([\-\*\+~=])\s+',
                'example': '-',
                'level_func': self._calculate_bullet_level,
                'priority': 2
            },

            # Arrow bullets
            'arrow_bullets': {
                'pattern': r'^\s*([\→\←\↑\↓\↔\↕\↖\↗\↘\↙\⇒\⇐\⇑\⇓\⇔\⇕\➔\➜\➤\➪\➫\➬\➭\➮\➯\➱\➲\➳\➴\➵\➶\➷\➸\➹\➺\➻\➼\➽\➾])\s+',
                'example': '→',
                'level_func': self._calculate_bullet_level,
                'priority': 2
            },

            # Checkbox items ([x], [ ], [✓])
            'checkbox': {
                'pattern': r'^\s*\[([x✓✗\s])\]\s+',
                'example': '[x]',
                'level_func': lambda x: 2,
                'priority': 1
            }
        }

        # Sort patterns by priority (higher priority first).  Python's sort
        # is stable, so insertion order breaks ties within a priority group.
        self.sorted_patterns = sorted(
            self.patterns.items(),
            key=lambda x: x[1]['priority'],
            reverse=True
        )

        # Pre-compile each regex once (hoisted out of the per-line loop)
        # with the appropriate flags -- see _CASE_INSENSITIVE_PATTERNS.
        self._compiled_patterns = {
            name: re.compile(
                info['pattern'],
                re.IGNORECASE if name in self._CASE_INSENSITIVE_PATTERNS else 0,
            )
            for name, info in self.patterns.items()
        }

        # Heuristic header cues used by classify_text_type; values are
        # either regex strings or predicates taking the stripped line.
        self.header_patterns = {
            'title_case': r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$',
            'all_caps': r'^[A-Z\s]+$',
            'numbered_header': r'^\d+\.\s*[A-Z]',
            'section_header': r'^(?:SECTION|CHAPTER|PART|ARTICLE)\s+',
            'thai_header': r'^(?:หมวด|บท|ส่วน|มาตรา)\s+',
            'short_line': lambda text: len(text.strip()) < 50 and not text.strip().endswith('.'),
            'ends_without_period': lambda text: not text.strip().endswith('.') and not text.strip().endswith(':'),
            'capitalized_words': lambda text: sum(1 for word in text.split() if word and word[0].isupper()) / max(len(text.split()), 1) > 0.5
        }

    def detect_indentation(self, text: str, base_margin: float = 0) -> Dict[str, Any]:
        """
        Detect indentation pattern and level for given text.

        Args:
            text: Text line to analyze
            base_margin: Base left margin for relative positioning
                (currently unused; kept for interface compatibility)

        Returns:
            Dict with pattern info, level, and formatting details
        """
        if not text or not text.strip():
            return self._create_empty_result(text)

        text_stripped = text.strip()

        # Leading whitespace contributes extra depth: 4 spaces = 1 level.
        leading_spaces = len(text) - len(text.lstrip())
        space_indent_level = leading_spaces // 4

        # Try each pattern in priority order; first match wins.
        for pattern_name, pattern_info in self.sorted_patterns:
            match = self._compiled_patterns[pattern_name].match(text)
            if match:
                # Extract the numbering/bullet marker.
                marker = match.group(1) if match.groups() else match.group(0)

                # Pattern-specific level; fall back to 1 on any marker the
                # level function cannot interpret.
                if callable(pattern_info['level_func']):
                    try:
                        pattern_level = pattern_info['level_func'](marker)
                    except Exception:
                        pattern_level = 1
                else:
                    pattern_level = pattern_info['level_func']

                # Combine marker depth with whitespace depth.
                total_level = max(pattern_level + space_indent_level, 1)

                # Content is everything after the matched marker.
                content = text[match.end():].strip()

                return {
                    'has_pattern': True,
                    'pattern_type': pattern_name,
                    'pattern_marker': marker,
                    'level': min(total_level, 10),  # Cap at level 10
                    'content': content,
                    'original_text': text,
                    'leading_spaces': leading_spaces,
                    'space_indent_level': space_indent_level,
                    'pattern_level': pattern_level,
                    'is_bullet': self._is_bullet_pattern(pattern_name),
                    'is_numbered': self._is_numbered_pattern(pattern_name),
                    'is_lettered': self._is_lettered_pattern(pattern_name),
                    'is_roman': self._is_roman_pattern(pattern_name),
                    'is_thai': self._is_thai_pattern(pattern_name),
                    'is_parenthetical': self._is_parenthetical_pattern(pattern_name),
                    'formatting_hint': self._get_formatting_hint(pattern_name, total_level),
                    'priority': pattern_info['priority']
                }

        # No marker pattern -- whitespace-only indentation.
        if leading_spaces > 0:
            return {
                'has_pattern': False,
                'pattern_type': 'space_indent',
                'pattern_marker': '',
                'level': max(space_indent_level, 1),
                'content': text_stripped,
                'original_text': text,
                'leading_spaces': leading_spaces,
                'space_indent_level': space_indent_level,
                'pattern_level': 0,
                'is_bullet': False,
                'is_numbered': False,
                'is_lettered': False,
                'is_roman': False,
                'is_thai': False,
                'is_parenthetical': False,
                'formatting_hint': 'indented_text',
                'priority': 0
            }

        # No indentation at all.
        return self._create_empty_result(text)

    def classify_text_type(self, text: str, context: Dict = None) -> Dict[str, Any]:
        """
        Classify text as header, paragraph, or list item.

        Args:
            text: Text to classify
            context: Optional dict of layout hints; recognized keys are
                'y_position', 'font_size' and 'is_bold'.

        Returns:
            Dict with classification results ('type', 'confidence',
            per-class 'scores', the 'indentation' analysis, and boolean
            convenience flags).
        """
        if not text or not text.strip():
            return {'type': 'empty', 'confidence': 1.0}

        text_stripped = text.strip()
        context = context or {}

        # Indentation analysis feeds the list-item score.
        indent_result = self.detect_indentation(text)

        scores = {
            'header': 0.0,
            'paragraph': 0.0,
            'list_item': 0.0
        }

        # List-item indicators.
        if indent_result['has_pattern']:
            scores['list_item'] += 0.8
            if indent_result['is_numbered'] or indent_result['is_lettered']:
                scores['list_item'] += 0.1
            if indent_result['is_bullet']:
                scores['list_item'] += 0.1

        # Header indicators: short lines score higher.
        if len(text_stripped) < 100:
            scores['header'] += 0.3

        if len(text_stripped) < 50:
            scores['header'] += 0.2

        # Textual header cues (regexes and predicates).
        for pattern_name, pattern in self.header_patterns.items():
            if callable(pattern):
                if pattern(text_stripped):
                    scores['header'] += 0.2
            else:
                if re.match(pattern, text_stripped):
                    scores['header'] += 0.2

        # Layout cues from context, when available.
        if context.get('y_position'):
            if context['y_position'] < 100:  # Top of page
                scores['header'] += 0.3

        if context.get('font_size'):
            if context['font_size'] > 12:  # Larger font
                scores['header'] += 0.2

        if context.get('is_bold'):
            scores['header'] += 0.2

        # Paragraph indicators: long, period-terminated, unpatterned prose.
        if len(text_stripped) > 100:
            scores['paragraph'] += 0.4

        if text_stripped.endswith('.'):
            scores['paragraph'] += 0.2

        if not indent_result['has_pattern'] and len(text_stripped) > 50:
            scores['paragraph'] += 0.3

        # Pick the highest-scoring class (dict order breaks ties).
        classification = max(scores.items(), key=lambda x: x[1])

        return {
            'type': classification[0],
            'confidence': classification[1],
            'scores': scores,
            'indentation': indent_result,
            'is_header': classification[0] == 'header',
            'is_paragraph': classification[0] == 'paragraph',
            'is_list_item': classification[0] == 'list_item'
        }

    def _create_empty_result(self, text: str) -> Dict[str, Any]:
        """Create result for text with no indentation pattern."""
        return {
            'has_pattern': False,
            'pattern_type': 'normal',
            'pattern_marker': '',
            'level': 0,
            'content': text.strip(),
            'original_text': text,
            'leading_spaces': 0,
            'space_indent_level': 0,
            'pattern_level': 0,
            'is_bullet': False,
            'is_numbered': False,
            'is_lettered': False,
            'is_roman': False,
            'is_thai': False,
            'is_parenthetical': False,
            'formatting_hint': 'normal_text',
            'priority': 0
        }

    def _calculate_decimal_level(self, marker: str) -> int:
        """Calculate level for decimal hierarchies (1.2.3.4 = level 4)."""
        return marker.count('.') + 1

    def _calculate_mixed_level(self, marker: str) -> int:
        """Calculate level for mixed hierarchies (1.2.a.i = level 4)."""
        parts = marker.split('.')
        return len([p for p in parts if p.strip()])

    def _calculate_legal_level(self, marker: str) -> int:
        """Calculate level for legal numbering (1.1.1(a)(i) = level 5)."""
        # Each dot and each parenthetical group adds one level.
        return marker.count('.') + marker.count('(') + 1

    def _calculate_outline_level(self, marker: str) -> int:
        """Calculate level for outline numbering (I.A.1.a.i = level 5)."""
        parts = marker.split('.')
        return len([p for p in parts if p.strip()])

    def _calculate_bullet_level(self, marker: str) -> int:
        """Calculate level for bullet points based on symbol complexity."""
        # These symbols conventionally appear at the second nesting level.
        complex_bullets = ['◦', '‣', '⁃', '▪', '▫', '◘', '◙']
        if marker in complex_bullets:
            return 2
        return 1

    def _is_bullet_pattern(self, pattern_type: str) -> bool:
        """Check if pattern is a bullet type."""
        return any(bullet_type in pattern_type for bullet_type in ['bullet', 'dash', 'arrow', 'checkbox'])

    def _is_numbered_pattern(self, pattern_type: str) -> bool:
        """Check if pattern is a numbered type."""
        return any(num_type in pattern_type for num_type in ['numbered', 'decimal', 'legal', 'section'])

    def _is_lettered_pattern(self, pattern_type: str) -> bool:
        """Check if pattern is a lettered type."""
        return 'lettered' in pattern_type

    def _is_roman_pattern(self, pattern_type: str) -> bool:
        """Check if pattern is a roman numeral type."""
        return 'roman' in pattern_type

    def _is_thai_pattern(self, pattern_type: str) -> bool:
        """Check if pattern is Thai-specific."""
        return 'thai' in pattern_type

    def _is_parenthetical_pattern(self, pattern_type: str) -> bool:
        """Check if pattern is parenthetical type."""
        return 'parenthetical' in pattern_type

    def _get_formatting_hint(self, pattern_type: str, level: int) -> str:
        """Get formatting hint for rendering, e.g. 'numbered_secondary'."""
        level_names = ['primary', 'secondary', 'tertiary', 'quaternary', 'quinary']
        level_name = level_names[min(level - 1, len(level_names) - 1)] if level > 0 else 'normal'

        if self._is_bullet_pattern(pattern_type):
            return f'bullet_{level_name}'
        elif self._is_numbered_pattern(pattern_type):
            return f'numbered_{level_name}'
        elif self._is_lettered_pattern(pattern_type):
            return f'lettered_{level_name}'
        elif self._is_roman_pattern(pattern_type):
            return f'roman_{level_name}'
        elif self._is_thai_pattern(pattern_type):
            return f'thai_{level_name}'
        elif self._is_parenthetical_pattern(pattern_type):
            return f'parenthetical_{level_name}'
        else:
            return f'indent_{level_name}'

    def analyze_document_structure(self, text_lines: List[str]) -> Dict[str, Any]:
        """
        Analyze entire document structure for consistent formatting.

        Args:
            text_lines: List of text lines to analyze (blank lines are
                counted in 'total_lines' but otherwise skipped)

        Returns:
            Dict with pattern/level/classification distributions,
            dominant patterns, and coverage statistics.
        """
        analysis = {
            'total_lines': len(text_lines),
            'patterned_lines': 0,
            'max_level': 0,
            'pattern_distribution': Counter(),
            'level_distribution': Counter(),
            'formatting_hints': Counter(),
            'text_classification': Counter(),
            'has_consistent_numbering': False,
            'dominant_patterns': [],
            'header_count': 0,
            'paragraph_count': 0,
            'list_item_count': 0
        }

        for line in text_lines:
            if not line.strip():
                continue

            # Per-line indentation analysis.
            indent_result = self.detect_indentation(line)

            # Per-line text classification.
            classification = self.classify_text_type(line)
            analysis['text_classification'][classification['type']] += 1

            if classification['type'] == 'header':
                analysis['header_count'] += 1
            elif classification['type'] == 'paragraph':
                analysis['paragraph_count'] += 1
            elif classification['type'] == 'list_item':
                analysis['list_item_count'] += 1

            if indent_result['has_pattern']:
                analysis['patterned_lines'] += 1
                analysis['pattern_distribution'][indent_result['pattern_type']] += 1
                analysis['level_distribution'][indent_result['level']] += 1
                analysis['formatting_hints'][indent_result['formatting_hint']] += 1
                analysis['max_level'] = max(analysis['max_level'], indent_result['level'])

        # Determine dominant patterns.
        if analysis['pattern_distribution']:
            analysis['dominant_patterns'] = analysis['pattern_distribution'].most_common(3)

        # A document "has consistent numbering" when any numbered/decimal
        # pattern family appears at all.
        numbered_patterns = [p for p in analysis['pattern_distribution'] if 'numbered' in p or 'decimal' in p]
        analysis['has_consistent_numbering'] = len(numbered_patterns) > 0

        analysis['coverage_percentage'] = (analysis['patterned_lines'] / analysis['total_lines'] * 100) if analysis['total_lines'] > 0 else 0

        return analysis
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
# Example usage and testing
|
| 585 |
+
# Example usage and testing
if __name__ == "__main__":
    detector = EnhancedIndentationDetector()

    test_cases = [
        "1. First level item",
        "  1.1. Second level item",
        "    1.1.1. Third level item",
        "      1.1.1.1. Fourth level item",
        "        1.1.1.1.1. Fifth level item",
        "(1) Parenthetical Arabic",
        "(๑) Parenthetical Thai numeral",
        "(a) Parenthetical lowercase letter",
        "(A) Parenthetical uppercase letter",
        "(ก) Parenthetical Thai letter",
        "(i) Parenthetical lowercase Roman",
        "(I) Parenthetical uppercase Roman",
        "2. Another first level",
        "  a. Letter sub-item",
        "    i. Roman sub-sub-item",
        "• Bullet point",
        "  ◦ Sub bullet",
        "    ▪ Sub-sub bullet",
        "- Dash item",
        "  * Asterisk sub-item",
        "    + Plus sub-sub-item",
        "§1.2.3. Section numbering",
        "Article 1.1.1. Article numbering",
        "มาตรา 1.2.3 Thai section",
        "ก. Thai letter",
        "[x] Checkbox item",
        "→ Arrow item",
        "I. Roman numeral",
        "  A. Capital letter",
        "    1. Number",
        "      a. Lowercase letter",
        "        i. Lowercase roman",
        "    Normal indented text without pattern",
        "CHAPTER 1: INTRODUCTION",
        "This is a regular paragraph with some text that should be classified as paragraph content.",
    ]

    print("Enhanced Indentation Detection Results with Parenthetical Patterns:")
    print("=" * 80)

    # Run both analyses over each sample and report them one block at a time.
    for idx, sample in enumerate(test_cases, start=1):
        indent_info = detector.detect_indentation(sample)
        text_class = detector.classify_text_type(sample)

        report = [
            f"{idx:2d}. Text: {sample!r}",
            f"    Pattern: {indent_info['pattern_type']}",
            f"    Level: {indent_info['level']}",
            f"    Marker: {indent_info['pattern_marker']!r}",
            f"    Content: {indent_info['content']!r}",
            f"    Hint: {indent_info['formatting_hint']}",
            f"    Priority: {indent_info['priority']}",
            f"    Classification: {text_class['type']} (confidence: {text_class['confidence']:.2f})",
        ]
        for line in report:
            print(line)
        print()

    # Test document analysis
    print("\nDocument Structure Analysis:")
    print("=" * 40)
    analysis = detector.analyze_document_structure(test_cases)
    for key, value in analysis.items():
        print(f"{key}: {value}")
|
ocr_service.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
OCR Service Module -
|
| 3 |
-
Handles PDF to text conversion with proper indentation, spacing,
|
| 4 |
"""
|
| 5 |
import re
|
| 6 |
import os
|
|
@@ -30,17 +30,25 @@ except ImportError:
|
|
| 30 |
|
| 31 |
import fitz # PyMuPDF
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
# Configure logging
|
| 34 |
logging.basicConfig(level=logging.INFO)
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
| 38 |
-
class
|
| 39 |
-
"""Process OCR results through HTML
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
@staticmethod
|
| 42 |
def create_html_from_azure_result(analysis_result) -> str:
|
| 43 |
-
"""Create structured HTML from Azure Document Intelligence result with
|
|
|
|
|
|
|
| 44 |
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
|
| 45 |
html_parts.append('<style>')
|
| 46 |
html_parts.append('''
|
|
@@ -71,12 +79,177 @@ class HTMLProcessor:
|
|
| 71 |
text-transform: uppercase;
|
| 72 |
letter-spacing: 1px;
|
| 73 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
.paragraph {
|
| 75 |
margin-bottom: 0.8em;
|
| 76 |
white-space: pre-wrap;
|
| 77 |
font-family: 'Consolas', 'Courier New', monospace;
|
| 78 |
line-height: 1.4;
|
| 79 |
}
|
|
|
|
| 80 |
.title {
|
| 81 |
font-size: 1.4em;
|
| 82 |
font-weight: bold;
|
|
@@ -124,24 +297,13 @@ class HTMLProcessor:
|
|
| 124 |
.table tr:nth-child(even) {
|
| 125 |
background-color: #f8f9fa;
|
| 126 |
}
|
| 127 |
-
.
|
| 128 |
-
|
| 129 |
-
|
| 130 |
}
|
| 131 |
-
.
|
| 132 |
-
|
| 133 |
-
padding-left:
|
| 134 |
-
margin-bottom: 0.3em;
|
| 135 |
-
}
|
| 136 |
-
.bullet-point:before {
|
| 137 |
-
content: "•";
|
| 138 |
-
position: absolute;
|
| 139 |
-
left: 0;
|
| 140 |
-
color: #3498db;
|
| 141 |
-
font-weight: bold;
|
| 142 |
-
}
|
| 143 |
-
.spaced {
|
| 144 |
-
margin-top: 10px;
|
| 145 |
}
|
| 146 |
.page-number {
|
| 147 |
position: relative;
|
|
@@ -164,48 +326,46 @@ class HTMLProcessor:
|
|
| 164 |
html_parts.append(f'<div class="page">')
|
| 165 |
html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
|
| 166 |
|
| 167 |
-
# Process content with
|
| 168 |
-
content_items =
|
| 169 |
content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
|
| 170 |
|
| 171 |
-
# Generate HTML for each content item with
|
| 172 |
for item in content_items:
|
| 173 |
if item['type'] == 'table':
|
| 174 |
-
html_parts.append(
|
| 175 |
else:
|
| 176 |
-
html_parts.append(
|
| 177 |
|
| 178 |
html_parts.append('</div>')
|
| 179 |
|
| 180 |
html_parts.append('</body></html>')
|
| 181 |
return '\n'.join(html_parts)
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
"""Extract and organize page content without losing text with proper spacing"""
|
| 186 |
content_items = []
|
| 187 |
|
| 188 |
-
#
|
| 189 |
page_tables = []
|
| 190 |
table_regions = []
|
| 191 |
|
| 192 |
if analysis_result.tables:
|
| 193 |
for table_idx, table in enumerate(analysis_result.tables):
|
| 194 |
-
if
|
| 195 |
page_tables.append((table_idx, table))
|
| 196 |
-
# Store table regions for overlap detection
|
| 197 |
if table.bounding_regions:
|
| 198 |
table_regions.append({
|
| 199 |
'polygon': table.bounding_regions[0].polygon,
|
| 200 |
'table_idx': table_idx
|
| 201 |
})
|
| 202 |
|
| 203 |
-
# Add
|
| 204 |
for table_idx, table in page_tables:
|
| 205 |
if table.bounding_regions and table.bounding_regions[0].polygon:
|
| 206 |
polygon = table.bounding_regions[0].polygon
|
| 207 |
-
y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])
|
| 208 |
-
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
|
| 209 |
|
| 210 |
content_items.append({
|
| 211 |
'type': 'table',
|
|
@@ -215,51 +375,192 @@ class HTMLProcessor:
|
|
| 215 |
'x_pos': x_pos
|
| 216 |
})
|
| 217 |
|
| 218 |
-
#
|
| 219 |
-
page_left_margin = HTMLProcessor._calculate_page_margins(page, analysis_result, page_num)
|
| 220 |
-
|
| 221 |
-
# Process text content - use paragraphs if available, otherwise lines
|
| 222 |
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
| 223 |
-
# Use paragraphs (better content grouping)
|
| 224 |
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 225 |
p.bounding_regions and
|
| 226 |
p.bounding_regions[0].page_number == page_num]
|
| 227 |
|
| 228 |
for para in page_paragraphs:
|
| 229 |
if para.content.strip():
|
| 230 |
-
# Check
|
| 231 |
-
overlap_ratio =
|
| 232 |
|
| 233 |
-
|
| 234 |
-
if overlap_ratio < 0.7:
|
| 235 |
polygon = para.bounding_regions[0].polygon
|
| 236 |
y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
|
| 237 |
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
|
| 238 |
|
| 239 |
-
#
|
| 240 |
-
indent_info =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
content_items.append({
|
| 243 |
'type': 'paragraph',
|
| 244 |
-
'content':
|
| 245 |
'role': getattr(para, 'role', 'paragraph'),
|
| 246 |
'y_pos': y_pos,
|
| 247 |
'x_pos': x_pos,
|
| 248 |
-
'
|
| 249 |
-
'
|
| 250 |
-
'is_bullet': indent_info['is_bullet'],
|
| 251 |
'preserve_spacing': True
|
| 252 |
})
|
| 253 |
|
| 254 |
elif page.lines:
|
| 255 |
-
#
|
| 256 |
-
processed_lines =
|
| 257 |
content_items.extend(processed_lines)
|
| 258 |
|
| 259 |
return content_items
|
| 260 |
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
"""Check if table belongs to the specified page"""
|
| 264 |
if not table.cells:
|
| 265 |
return False
|
|
@@ -270,9 +571,8 @@ class HTMLProcessor:
|
|
| 270 |
return True
|
| 271 |
return False
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
"""Calculate overlap ratio between content and tables (FIXED)"""
|
| 276 |
if not table_regions or not content_item.bounding_regions:
|
| 277 |
return 0.0
|
| 278 |
|
|
@@ -316,120 +616,7 @@ class HTMLProcessor:
|
|
| 316 |
|
| 317 |
return max_overlap_ratio
|
| 318 |
|
| 319 |
-
|
| 320 |
-
def _calculate_page_margins(page, analysis_result, page_num):
|
| 321 |
-
"""Calculate page margins to determine proper indentation baseline"""
|
| 322 |
-
left_positions = []
|
| 323 |
-
|
| 324 |
-
# Collect x positions from paragraphs if available
|
| 325 |
-
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
| 326 |
-
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 327 |
-
p.bounding_regions and
|
| 328 |
-
p.bounding_regions[0].page_number == page_num]
|
| 329 |
-
|
| 330 |
-
for para in page_paragraphs:
|
| 331 |
-
if para.bounding_regions and para.bounding_regions[0].polygon:
|
| 332 |
-
polygon = para.bounding_regions[0].polygon
|
| 333 |
-
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
|
| 334 |
-
left_positions.append(x_pos)
|
| 335 |
-
|
| 336 |
-
# Fallback to lines if no paragraphs
|
| 337 |
-
elif page.lines:
|
| 338 |
-
for line in page.lines:
|
| 339 |
-
if line.polygon:
|
| 340 |
-
x_pos = min(line.polygon[0], line.polygon[2], line.polygon[4], line.polygon[6])
|
| 341 |
-
left_positions.append(x_pos)
|
| 342 |
-
|
| 343 |
-
# Find the most common left margin (baseline)
|
| 344 |
-
if left_positions:
|
| 345 |
-
left_positions.sort()
|
| 346 |
-
# Take the most frequent left position as the main margin
|
| 347 |
-
from collections import Counter
|
| 348 |
-
position_counts = Counter([round(pos, -1) for pos in left_positions]) # Round to nearest 10
|
| 349 |
-
base_margin = position_counts.most_common(1)[0][0]
|
| 350 |
-
return base_margin
|
| 351 |
-
|
| 352 |
-
return 50 # Default margin if no content found
|
| 353 |
-
|
| 354 |
-
@staticmethod
|
| 355 |
-
def _calculate_precise_indentation(x_pos, base_margin, content):
|
| 356 |
-
"""Calculate precise indentation based on x position and content analysis"""
|
| 357 |
-
# Calculate indent distance from base margin
|
| 358 |
-
indent_distance = max(0, x_pos - base_margin)
|
| 359 |
-
|
| 360 |
-
# Define indentation levels based on distance
|
| 361 |
-
# Each level represents approximately 0.5 inch or 36 points
|
| 362 |
-
level_threshold = 30 # Reduced threshold for better sensitivity
|
| 363 |
-
indent_level = int(indent_distance / level_threshold)
|
| 364 |
-
|
| 365 |
-
# Detect bullet points or numbered lists
|
| 366 |
-
is_bullet = False
|
| 367 |
-
content_stripped = content.strip()
|
| 368 |
-
|
| 369 |
-
# Common bullet point patterns
|
| 370 |
-
bullet_patterns = [
|
| 371 |
-
r'^\s*[•·▪▫◦‣⁃]\s+', # Bullet symbols
|
| 372 |
-
r'^\s*[\-\*\+]\s+', # Dash, asterisk, plus
|
| 373 |
-
r'^\s*\d+[\.\)]\s+', # Numbered lists (1. or 1))
|
| 374 |
-
r'^\s*[a-zA-Z][\.\)]\s+', # Lettered lists (a. or a))
|
| 375 |
-
r'^\s*[ivxlcdm]+[\.\)]\s+', # Roman numerals
|
| 376 |
-
]
|
| 377 |
-
|
| 378 |
-
for pattern in bullet_patterns:
|
| 379 |
-
if re.match(pattern, content_stripped, re.IGNORECASE):
|
| 380 |
-
is_bullet = True
|
| 381 |
-
break
|
| 382 |
-
|
| 383 |
-
return {
|
| 384 |
-
'level': min(indent_level, 6), # Cap at level 6
|
| 385 |
-
'pixels': indent_distance,
|
| 386 |
-
'is_bullet': is_bullet
|
| 387 |
-
}
|
| 388 |
-
|
| 389 |
-
@staticmethod
|
| 390 |
-
def _process_lines_content_with_spacing(lines, table_regions, page_left_margin):
|
| 391 |
-
"""Process lines content with enhanced spacing preservation"""
|
| 392 |
-
content_items = []
|
| 393 |
-
processed_content = set()
|
| 394 |
-
|
| 395 |
-
for line in lines:
|
| 396 |
-
if not line.content.strip():
|
| 397 |
-
continue
|
| 398 |
-
|
| 399 |
-
# Avoid duplicates
|
| 400 |
-
content_key = line.content.strip().lower()
|
| 401 |
-
if content_key in processed_content:
|
| 402 |
-
continue
|
| 403 |
-
processed_content.add(content_key)
|
| 404 |
-
|
| 405 |
-
# Check table overlap
|
| 406 |
-
overlap_ratio = HTMLProcessor._calculate_line_table_overlap(line, table_regions)
|
| 407 |
-
|
| 408 |
-
# Only exclude if heavily overlapping with table
|
| 409 |
-
if overlap_ratio < 0.7:
|
| 410 |
-
polygon = line.polygon
|
| 411 |
-
y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
|
| 412 |
-
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
|
| 413 |
-
|
| 414 |
-
# Calculate precise indentation for lines
|
| 415 |
-
indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, line.content)
|
| 416 |
-
|
| 417 |
-
content_items.append({
|
| 418 |
-
'type': 'line',
|
| 419 |
-
'content': line.content.strip(),
|
| 420 |
-
'role': 'text',
|
| 421 |
-
'y_pos': y_pos,
|
| 422 |
-
'x_pos': x_pos,
|
| 423 |
-
'indent_level': indent_info['level'],
|
| 424 |
-
'indent_pixels': indent_info['pixels'],
|
| 425 |
-
'is_bullet': indent_info['is_bullet'],
|
| 426 |
-
'preserve_spacing': True
|
| 427 |
-
})
|
| 428 |
-
|
| 429 |
-
return content_items
|
| 430 |
-
|
| 431 |
-
@staticmethod
|
| 432 |
-
def _calculate_line_table_overlap(line, table_regions):
|
| 433 |
"""Calculate overlap between line and tables"""
|
| 434 |
if not table_regions or not line.polygon:
|
| 435 |
return 0.0
|
|
@@ -474,119 +661,37 @@ class HTMLProcessor:
|
|
| 474 |
return max_overlap
|
| 475 |
|
| 476 |
@staticmethod
|
| 477 |
-
def
|
| 478 |
-
"""Convert
|
| 479 |
-
content = item['content']
|
| 480 |
-
role = item.get('role', 'paragraph')
|
| 481 |
-
indent_level = item.get('indent_level', 0)
|
| 482 |
-
indent_pixels = item.get('indent_pixels', 0)
|
| 483 |
-
is_bullet = item.get('is_bullet', False)
|
| 484 |
-
preserve_spacing = item.get('preserve_spacing', False)
|
| 485 |
-
|
| 486 |
-
# Calculate CSS indentation
|
| 487 |
-
css_indent = max(0, indent_level)
|
| 488 |
-
|
| 489 |
-
# Build CSS classes and inline styles
|
| 490 |
-
css_classes = []
|
| 491 |
-
inline_styles = []
|
| 492 |
-
|
| 493 |
-
if css_indent > 0:
|
| 494 |
-
inline_styles.append(f"margin-left: {css_indent * 1.5}em")
|
| 495 |
-
css_classes.append("indented")
|
| 496 |
-
|
| 497 |
-
if is_bullet:
|
| 498 |
-
css_classes.append("bullet-point")
|
| 499 |
-
|
| 500 |
-
# Preserve internal spacing within content
|
| 501 |
-
if preserve_spacing:
|
| 502 |
-
# Replace multiple spaces with to preserve spacing
|
| 503 |
-
content = re.sub(r' +', lambda m: ' ' * len(m.group()), content)
|
| 504 |
-
# Preserve line breaks within content
|
| 505 |
-
content = content.replace('\n', '<br>')
|
| 506 |
-
|
| 507 |
-
# Combine CSS
|
| 508 |
-
class_str = f' class="{" ".join(css_classes)}"' if css_classes else ''
|
| 509 |
-
style_str = f' style="{"; ".join(inline_styles)}"' if inline_styles else ''
|
| 510 |
-
|
| 511 |
-
if role == 'title':
|
| 512 |
-
return f'<div class="title"{class_str}{style_str}>{content}</div>'
|
| 513 |
-
elif role == 'sectionHeading':
|
| 514 |
-
return f'<div class="section-heading"{class_str}{style_str}>{content}</div>'
|
| 515 |
-
else:
|
| 516 |
-
# Regular paragraphs with preserved formatting
|
| 517 |
-
return f'<div class="paragraph"{class_str}{style_str}>{content}</div>'
|
| 518 |
-
|
| 519 |
-
@staticmethod
|
| 520 |
-
def _table_to_html(table, table_idx):
|
| 521 |
-
"""Convert table to HTML with proper structure"""
|
| 522 |
-
if not table.cells:
|
| 523 |
-
return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
|
| 524 |
-
|
| 525 |
-
# Create table matrix
|
| 526 |
-
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 527 |
-
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 528 |
-
|
| 529 |
-
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 530 |
-
|
| 531 |
-
# Fill matrix
|
| 532 |
-
for cell in table.cells:
|
| 533 |
-
content = (cell.content or "").strip()
|
| 534 |
-
table_matrix[cell.row_index][cell.column_index] = content
|
| 535 |
-
|
| 536 |
-
# Generate HTML
|
| 537 |
-
html_parts = [f'<div class="table-container">']
|
| 538 |
-
html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
|
| 539 |
-
html_parts.append('<table class="table">')
|
| 540 |
-
|
| 541 |
-
for row_idx, row in enumerate(table_matrix):
|
| 542 |
-
if row_idx == 0 and any(cell.strip() for cell in row):
|
| 543 |
-
# Header row
|
| 544 |
-
html_parts.append('<tr>')
|
| 545 |
-
for cell in row:
|
| 546 |
-
html_parts.append(f'<th>{cell}</th>')
|
| 547 |
-
html_parts.append('</tr>')
|
| 548 |
-
else:
|
| 549 |
-
# Data row
|
| 550 |
-
if any(cell.strip() for cell in row): # Skip empty rows
|
| 551 |
-
html_parts.append('<tr>')
|
| 552 |
-
for cell in row:
|
| 553 |
-
html_parts.append(f'<td>{cell}</td>')
|
| 554 |
-
html_parts.append('</tr>')
|
| 555 |
-
|
| 556 |
-
html_parts.append('</table></div>')
|
| 557 |
-
return '\n'.join(html_parts)
|
| 558 |
-
|
| 559 |
-
@staticmethod
|
| 560 |
-
def html_to_formatted_text(html_content):
|
| 561 |
-
"""Convert HTML back to formatted text preserving structure, spacing, and adding page numbers"""
|
| 562 |
from html.parser import HTMLParser
|
| 563 |
|
| 564 |
-
class
|
| 565 |
def __init__(self):
|
| 566 |
super().__init__()
|
| 567 |
self.text_parts = []
|
|
|
|
| 568 |
self.in_title = False
|
| 569 |
self.in_section_heading = False
|
| 570 |
self.in_table = False
|
| 571 |
-
self.in_table_header = False
|
| 572 |
self.current_table_row = []
|
| 573 |
self.table_data = []
|
| 574 |
-
self.
|
| 575 |
-
self.
|
| 576 |
self.in_page_header = False
|
| 577 |
-
self.
|
|
|
|
| 578 |
|
| 579 |
def handle_starttag(self, tag, attrs):
|
| 580 |
attr_dict = dict(attrs)
|
| 581 |
class_attr = attr_dict.get('class', '')
|
| 582 |
-
|
| 583 |
|
| 584 |
if 'page-header' in class_attr:
|
| 585 |
self.in_page_header = True
|
| 586 |
-
# Add proper page separation with page number
|
| 587 |
if len(self.text_parts) > 0:
|
| 588 |
self.text_parts.append('\n\n' + '=' * 80 + '\n')
|
| 589 |
-
|
|
|
|
| 590 |
elif 'title' in class_attr:
|
| 591 |
self.in_title = True
|
| 592 |
elif 'section-heading' in class_attr:
|
|
@@ -594,32 +699,47 @@ class HTMLProcessor:
|
|
| 594 |
elif tag == 'table':
|
| 595 |
self.in_table = True
|
| 596 |
self.table_data = []
|
| 597 |
-
elif tag == 'th':
|
| 598 |
-
self.in_table_header = True
|
| 599 |
elif tag == 'tr':
|
| 600 |
self.current_table_row = []
|
| 601 |
elif tag == 'br':
|
| 602 |
self.text_parts.append('\n')
|
| 603 |
|
| 604 |
-
# Extract
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
else:
|
| 613 |
-
|
| 614 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
|
| 616 |
-
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
def handle_endtag(self, tag):
|
| 620 |
if tag == 'div' and self.in_page_header:
|
| 621 |
self.text_parts.append('\n' + '=' * 80 + '\n\n')
|
| 622 |
self.in_page_header = False
|
|
|
|
|
|
|
|
|
|
| 623 |
elif tag == 'div' and self.in_title:
|
| 624 |
self.text_parts.append('\n\n')
|
| 625 |
self.in_title = False
|
|
@@ -629,69 +749,83 @@ class HTMLProcessor:
|
|
| 629 |
elif tag == 'table':
|
| 630 |
self.in_table = False
|
| 631 |
self._format_table()
|
| 632 |
-
elif tag == 'th':
|
| 633 |
-
self.in_table_header = False
|
| 634 |
elif tag == 'tr' and self.current_table_row:
|
| 635 |
self.table_data.append(self.current_table_row[:])
|
| 636 |
-
elif tag == 'div' and not self.in_table
|
| 637 |
-
if not self.
|
| 638 |
self.text_parts.append('\n')
|
| 639 |
|
| 640 |
-
# Reset
|
| 641 |
if tag == 'div':
|
| 642 |
-
self.
|
| 643 |
-
self.
|
|
|
|
| 644 |
|
| 645 |
def handle_data(self, data):
|
| 646 |
if data.strip():
|
| 647 |
-
# Convert back to spaces for proper spacing
|
| 648 |
data = data.replace(' ', ' ')
|
| 649 |
|
| 650 |
if self.in_page_header:
|
| 651 |
-
# Extract page number and format properly
|
| 652 |
page_match = re.search(r'Page (\d+)', data)
|
| 653 |
if page_match:
|
| 654 |
-
|
| 655 |
-
page_header = f"PAGE {
|
| 656 |
self.text_parts.append(page_header.center(80))
|
|
|
|
|
|
|
|
|
|
| 657 |
elif self.in_title:
|
| 658 |
-
indent_str = " " * self.
|
| 659 |
self.text_parts.append(f'\n{indent_str}## {data.strip()}')
|
| 660 |
elif self.in_section_heading:
|
| 661 |
-
indent_str = " " * self.
|
| 662 |
self.text_parts.append(f'\n{indent_str}### {data.strip()}')
|
| 663 |
elif self.in_table:
|
| 664 |
-
|
| 665 |
-
self.current_table_row.append(data.strip())
|
| 666 |
else:
|
| 667 |
-
# Apply indentation
|
| 668 |
-
indent_str = " " * self.
|
| 669 |
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 676 |
|
| 677 |
-
# Handle bullet points specially
|
| 678 |
-
if 'bullet-point' in getattr(self, '_last_class', ''):
|
| 679 |
-
# Remove the bullet symbol that CSS adds and format properly
|
| 680 |
-
self.text_parts.append(f'{indent_str}• {formatted_data}')
|
| 681 |
else:
|
| 682 |
-
|
|
|
|
| 683 |
|
| 684 |
def _format_table(self):
|
|
|
|
| 685 |
if not self.table_data:
|
| 686 |
return
|
| 687 |
|
| 688 |
self.text_parts.append('\n\n')
|
| 689 |
|
| 690 |
-
# Calculate column widths for better formatting
|
| 691 |
if self.table_data:
|
| 692 |
max_cols = max(len(row) for row in self.table_data)
|
| 693 |
col_widths = [0] * max_cols
|
| 694 |
|
|
|
|
| 695 |
for row in self.table_data:
|
| 696 |
for i, cell in enumerate(row):
|
| 697 |
if i < max_cols:
|
|
@@ -721,7 +855,7 @@ class HTMLProcessor:
|
|
| 721 |
|
| 722 |
self.text_parts.append('\n')
|
| 723 |
|
| 724 |
-
extractor =
|
| 725 |
extractor.feed(html_content)
|
| 726 |
|
| 727 |
result = ''.join(extractor.text_parts)
|
|
@@ -736,7 +870,7 @@ class HTMLProcessor:
|
|
| 736 |
|
| 737 |
|
| 738 |
class OCRService:
|
| 739 |
-
"""Main OCR service with HTML processing and
|
| 740 |
|
| 741 |
def __init__(self):
|
| 742 |
self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
|
@@ -758,7 +892,7 @@ class OCRService:
|
|
| 758 |
|
| 759 |
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
|
| 760 |
"""
|
| 761 |
-
Convert PDF to text using specified method with HTML processing
|
| 762 |
|
| 763 |
Args:
|
| 764 |
pdf_path: Path to the PDF file
|
|
@@ -792,7 +926,7 @@ class OCRService:
|
|
| 792 |
# Try primary method
|
| 793 |
try:
|
| 794 |
if method == "azure" and self.azure_client:
|
| 795 |
-
result = self.
|
| 796 |
elif method == "tesseract":
|
| 797 |
result = self._tesseract_ocr(pdf_path)
|
| 798 |
elif method == "pymupdf":
|
|
@@ -811,13 +945,13 @@ class OCRService:
|
|
| 811 |
|
| 812 |
return result
|
| 813 |
|
| 814 |
-
def
|
| 815 |
-
"""Azure Document Intelligence OCR with HTML processing"""
|
| 816 |
result = {
|
| 817 |
'success': False,
|
| 818 |
'text': '',
|
| 819 |
'html': '',
|
| 820 |
-
'method_used': '
|
| 821 |
'metadata': {},
|
| 822 |
'error': None
|
| 823 |
}
|
|
@@ -848,11 +982,16 @@ class OCRService:
|
|
| 848 |
|
| 849 |
analysis_result = poller.result()
|
| 850 |
|
| 851 |
-
# Generate HTML
|
| 852 |
-
html_content =
|
| 853 |
|
| 854 |
-
# Convert HTML to formatted text with
|
| 855 |
-
formatted_text =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 856 |
|
| 857 |
result.update({
|
| 858 |
'success': True,
|
|
@@ -864,13 +1003,17 @@ class OCRService:
|
|
| 864 |
'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
|
| 865 |
'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
|
| 866 |
'html_generated': True,
|
| 867 |
-
'
|
|
|
|
|
|
|
| 868 |
'page_numbers_added': True,
|
| 869 |
-
'
|
|
|
|
|
|
|
| 870 |
}
|
| 871 |
})
|
| 872 |
|
| 873 |
-
logger.info("Azure OCR with
|
| 874 |
|
| 875 |
except Exception as e:
|
| 876 |
logger.error(f"Azure OCR error: {e}")
|
|
@@ -879,12 +1022,12 @@ class OCRService:
|
|
| 879 |
return result
|
| 880 |
|
| 881 |
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 882 |
-
"""Tesseract OCR with
|
| 883 |
result = {
|
| 884 |
'success': False,
|
| 885 |
'text': '',
|
| 886 |
'html': '',
|
| 887 |
-
'method_used': '
|
| 888 |
'metadata': {},
|
| 889 |
'error': None
|
| 890 |
}
|
|
@@ -899,11 +1042,19 @@ class OCRService:
|
|
| 899 |
page_count = len(pdf_document)
|
| 900 |
all_text = []
|
| 901 |
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 902 |
-
html_parts.append('
|
| 903 |
-
|
| 904 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 905 |
html_parts.append('</style></head><body>')
|
| 906 |
|
|
|
|
|
|
|
| 907 |
for page_num in range(page_count):
|
| 908 |
# Add page header to text
|
| 909 |
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
|
@@ -929,10 +1080,41 @@ class OCRService:
|
|
| 929 |
|
| 930 |
all_text.append(text)
|
| 931 |
|
| 932 |
-
# Add to HTML with
|
| 933 |
html_parts.append(f'<div class="page">')
|
| 934 |
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 935 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
|
| 937 |
finally:
|
| 938 |
if temp_img_path and os.path.exists(temp_img_path):
|
|
@@ -943,19 +1125,26 @@ class OCRService:
|
|
| 943 |
|
| 944 |
html_parts.append('</body></html>')
|
| 945 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
result.update({
|
| 947 |
'success': True,
|
| 948 |
-
'text':
|
| 949 |
-
'html':
|
| 950 |
'metadata': {
|
| 951 |
'pages': page_count,
|
| 952 |
'html_generated': True,
|
|
|
|
|
|
|
|
|
|
| 953 |
'page_numbers_added': True,
|
| 954 |
-
'
|
| 955 |
}
|
| 956 |
})
|
| 957 |
|
| 958 |
-
logger.info("Tesseract OCR with
|
| 959 |
|
| 960 |
except Exception as e:
|
| 961 |
logger.error(f"Tesseract OCR error: {e}")
|
|
@@ -970,12 +1159,12 @@ class OCRService:
|
|
| 970 |
return result
|
| 971 |
|
| 972 |
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 973 |
-
"""PyMuPDF text extraction with HTML generation and
|
| 974 |
result = {
|
| 975 |
'success': False,
|
| 976 |
'text': '',
|
| 977 |
'html': '',
|
| 978 |
-
'method_used': '
|
| 979 |
'metadata': {},
|
| 980 |
'error': None
|
| 981 |
}
|
|
@@ -986,11 +1175,19 @@ class OCRService:
|
|
| 986 |
page_count = len(pdf_document)
|
| 987 |
all_text = []
|
| 988 |
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 989 |
-
html_parts.append('
|
| 990 |
-
|
| 991 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 992 |
html_parts.append('</style></head><body>')
|
| 993 |
|
|
|
|
|
|
|
| 994 |
for page_num in range(page_count):
|
| 995 |
# Add page header to text
|
| 996 |
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
|
@@ -1001,27 +1198,64 @@ class OCRService:
|
|
| 1001 |
|
| 1002 |
all_text.append(text)
|
| 1003 |
|
| 1004 |
-
# Add to HTML with
|
| 1005 |
html_parts.append(f'<div class="page">')
|
| 1006 |
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 1007 |
-
|
| 1008 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1009 |
|
| 1010 |
html_parts.append('</body></html>')
|
| 1011 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1012 |
result.update({
|
| 1013 |
'success': True,
|
| 1014 |
-
'text':
|
| 1015 |
-
'html':
|
| 1016 |
'metadata': {
|
| 1017 |
'pages': page_count,
|
| 1018 |
'html_generated': True,
|
|
|
|
|
|
|
|
|
|
| 1019 |
'page_numbers_added': True,
|
| 1020 |
-
'
|
| 1021 |
}
|
| 1022 |
})
|
| 1023 |
|
| 1024 |
-
logger.info("PyMuPDF extraction with
|
| 1025 |
|
| 1026 |
except Exception as e:
|
| 1027 |
logger.error(f"PyMuPDF error: {e}")
|
|
@@ -1058,7 +1292,7 @@ class OCRService:
|
|
| 1058 |
logger.info(f"Trying fallback method: {method}")
|
| 1059 |
try:
|
| 1060 |
if method == "azure":
|
| 1061 |
-
result = self.
|
| 1062 |
elif method == "tesseract":
|
| 1063 |
result = self._tesseract_ocr(pdf_path)
|
| 1064 |
elif method == "pymupdf":
|
|
|
|
| 1 |
"""
|
| 2 |
+
OCR Service Module - ENHANCED VERSION with Comprehensive Indentation Detection and Intelligent Text Classification
|
| 3 |
+
Handles PDF to text conversion with proper indentation, spacing, page numbering, and intelligent text analysis
|
| 4 |
"""
|
| 5 |
import re
|
| 6 |
import os
|
|
|
|
| 30 |
|
| 31 |
import fitz # PyMuPDF
|
| 32 |
|
| 33 |
+
# Enhanced indentation detection
|
| 34 |
+
from enhanced_indentation import EnhancedIndentationDetector
|
| 35 |
+
|
| 36 |
# Configure logging
|
| 37 |
logging.basicConfig(level=logging.INFO)
|
| 38 |
logger = logging.getLogger(__name__)
|
| 39 |
|
| 40 |
|
| 41 |
+
class EnhancedHTMLProcessor:
|
| 42 |
+
"""Process OCR results through HTML with comprehensive indentation detection and intelligent text classification"""
|
| 43 |
+
|
| 44 |
+
def __init__(self):
|
| 45 |
+
self.indent_detector = EnhancedIndentationDetector()
|
| 46 |
|
| 47 |
@staticmethod
|
| 48 |
def create_html_from_azure_result(analysis_result) -> str:
|
| 49 |
+
"""Create structured HTML from Azure Document Intelligence result with enhanced indentation and text classification"""
|
| 50 |
+
processor = EnhancedHTMLProcessor()
|
| 51 |
+
|
| 52 |
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
|
| 53 |
html_parts.append('<style>')
|
| 54 |
html_parts.append('''
|
|
|
|
| 79 |
text-transform: uppercase;
|
| 80 |
letter-spacing: 1px;
|
| 81 |
}
|
| 82 |
+
|
| 83 |
+
/* Enhanced indentation levels */
|
| 84 |
+
.indent-level-0 { margin-left: 0em; }
|
| 85 |
+
.indent-level-1 { margin-left: 1.5em; }
|
| 86 |
+
.indent-level-2 { margin-left: 3.0em; }
|
| 87 |
+
.indent-level-3 { margin-left: 4.5em; }
|
| 88 |
+
.indent-level-4 { margin-left: 6.0em; }
|
| 89 |
+
.indent-level-5 { margin-left: 7.5em; }
|
| 90 |
+
.indent-level-6 { margin-left: 9.0em; }
|
| 91 |
+
.indent-level-7 { margin-left: 10.5em; }
|
| 92 |
+
.indent-level-8 { margin-left: 12.0em; }
|
| 93 |
+
.indent-level-9 { margin-left: 13.5em; }
|
| 94 |
+
.indent-level-10 { margin-left: 15.0em; }
|
| 95 |
+
|
| 96 |
+
/* Text classification styles */
|
| 97 |
+
.content-header {
|
| 98 |
+
font-weight: bold;
|
| 99 |
+
color: #2c3e50;
|
| 100 |
+
font-size: 1.1em;
|
| 101 |
+
margin: 15px 0 8px 0;
|
| 102 |
+
border-left: 4px solid #3498db;
|
| 103 |
+
padding-left: 10px;
|
| 104 |
+
background-color: #f8f9fa;
|
| 105 |
+
}
|
| 106 |
+
.content-paragraph {
|
| 107 |
+
color: #333;
|
| 108 |
+
margin-bottom: 1em;
|
| 109 |
+
line-height: 1.5;
|
| 110 |
+
}
|
| 111 |
+
.content-list-item {
|
| 112 |
+
margin-bottom: 0.5em;
|
| 113 |
+
line-height: 1.4;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/* Pattern-specific styles */
|
| 117 |
+
.numbered-primary {
|
| 118 |
+
font-weight: bold;
|
| 119 |
+
color: #2c3e50;
|
| 120 |
+
border-left: 4px solid #3498db;
|
| 121 |
+
padding-left: 8px;
|
| 122 |
+
margin-bottom: 0.5em;
|
| 123 |
+
background-color: #f8f9fa;
|
| 124 |
+
}
|
| 125 |
+
.numbered-secondary {
|
| 126 |
+
font-weight: 600;
|
| 127 |
+
color: #34495e;
|
| 128 |
+
border-left: 3px solid #95a5a6;
|
| 129 |
+
padding-left: 6px;
|
| 130 |
+
margin-bottom: 0.4em;
|
| 131 |
+
background-color: #f9f9f9;
|
| 132 |
+
}
|
| 133 |
+
.numbered-tertiary {
|
| 134 |
+
color: #555;
|
| 135 |
+
border-left: 2px solid #bdc3c7;
|
| 136 |
+
padding-left: 4px;
|
| 137 |
+
margin-bottom: 0.3em;
|
| 138 |
+
}
|
| 139 |
+
.numbered-quaternary {
|
| 140 |
+
color: #666;
|
| 141 |
+
border-left: 1px solid #dee2e6;
|
| 142 |
+
padding-left: 3px;
|
| 143 |
+
margin-bottom: 0.2em;
|
| 144 |
+
}
|
| 145 |
+
.numbered-quinary {
|
| 146 |
+
color: #777;
|
| 147 |
+
padding-left: 2px;
|
| 148 |
+
margin-bottom: 0.2em;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
/* Parenthetical styles */
|
| 152 |
+
.parenthetical-primary {
|
| 153 |
+
font-weight: 600;
|
| 154 |
+
color: #8e44ad;
|
| 155 |
+
border-left: 3px solid #9b59b6;
|
| 156 |
+
padding-left: 6px;
|
| 157 |
+
margin-bottom: 0.4em;
|
| 158 |
+
}
|
| 159 |
+
.parenthetical-secondary {
|
| 160 |
+
color: #9b59b6;
|
| 161 |
+
border-left: 2px solid #af7ac5;
|
| 162 |
+
padding-left: 4px;
|
| 163 |
+
margin-bottom: 0.3em;
|
| 164 |
+
}
|
| 165 |
+
.parenthetical-tertiary {
|
| 166 |
+
color: #af7ac5;
|
| 167 |
+
padding-left: 3px;
|
| 168 |
+
margin-bottom: 0.2em;
|
| 169 |
+
}
|
| 170 |
+
.parenthetical-quaternary {
|
| 171 |
+
color: #c39bd3;
|
| 172 |
+
padding-left: 2px;
|
| 173 |
+
margin-bottom: 0.2em;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.bullet-primary {
|
| 177 |
+
position: relative;
|
| 178 |
+
padding-left: 1.2em;
|
| 179 |
+
}
|
| 180 |
+
.bullet-primary::before {
|
| 181 |
+
content: "•";
|
| 182 |
+
position: absolute;
|
| 183 |
+
left: 0;
|
| 184 |
+
color: #3498db;
|
| 185 |
+
font-weight: bold;
|
| 186 |
+
}
|
| 187 |
+
.bullet-secondary {
|
| 188 |
+
position: relative;
|
| 189 |
+
padding-left: 1.2em;
|
| 190 |
+
}
|
| 191 |
+
.bullet-secondary::before {
|
| 192 |
+
content: "◦";
|
| 193 |
+
position: absolute;
|
| 194 |
+
left: 0;
|
| 195 |
+
color: #95a5a6;
|
| 196 |
+
}
|
| 197 |
+
.bullet-tertiary {
|
| 198 |
+
position: relative;
|
| 199 |
+
padding-left: 1.2em;
|
| 200 |
+
}
|
| 201 |
+
.bullet-tertiary::before {
|
| 202 |
+
content: "▪";
|
| 203 |
+
position: absolute;
|
| 204 |
+
left: 0;
|
| 205 |
+
color: #bdc3c7;
|
| 206 |
+
}
|
| 207 |
+
.bullet-quaternary {
|
| 208 |
+
position: relative;
|
| 209 |
+
padding-left: 1.2em;
|
| 210 |
+
}
|
| 211 |
+
.bullet-quaternary::before {
|
| 212 |
+
content: "‣";
|
| 213 |
+
position: absolute;
|
| 214 |
+
left: 0;
|
| 215 |
+
color: #dee2e6;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.lettered-primary {
|
| 219 |
+
font-style: italic;
|
| 220 |
+
color: #8e44ad;
|
| 221 |
+
font-weight: 600;
|
| 222 |
+
}
|
| 223 |
+
.lettered-secondary {
|
| 224 |
+
color: #9b59b6;
|
| 225 |
+
font-style: italic;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
.roman-primary {
|
| 229 |
+
font-variant: small-caps;
|
| 230 |
+
color: #d35400;
|
| 231 |
+
font-weight: bold;
|
| 232 |
+
}
|
| 233 |
+
.roman-secondary {
|
| 234 |
+
color: #e67e22;
|
| 235 |
+
font-variant: small-caps;
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
.thai-primary {
|
| 239 |
+
color: #16a085;
|
| 240 |
+
font-weight: bold;
|
| 241 |
+
}
|
| 242 |
+
.thai-secondary {
|
| 243 |
+
color: #1abc9c;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
.paragraph {
|
| 247 |
margin-bottom: 0.8em;
|
| 248 |
white-space: pre-wrap;
|
| 249 |
font-family: 'Consolas', 'Courier New', monospace;
|
| 250 |
line-height: 1.4;
|
| 251 |
}
|
| 252 |
+
|
| 253 |
.title {
|
| 254 |
font-size: 1.4em;
|
| 255 |
font-weight: bold;
|
|
|
|
| 297 |
.table tr:nth-child(even) {
|
| 298 |
background-color: #f8f9fa;
|
| 299 |
}
|
| 300 |
+
.indented_text {
|
| 301 |
+
color: #555;
|
| 302 |
+
font-style: italic;
|
| 303 |
}
|
| 304 |
+
.space-indent {
|
| 305 |
+
border-left: 1px dotted #ccc;
|
| 306 |
+
padding-left: 5px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
}
|
| 308 |
.page-number {
|
| 309 |
position: relative;
|
|
|
|
| 326 |
html_parts.append(f'<div class="page">')
|
| 327 |
html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')
|
| 328 |
|
| 329 |
+
# Process content with enhanced indentation detection and text classification
|
| 330 |
+
content_items = processor._extract_page_content_enhanced(page, analysis_result, page_num)
|
| 331 |
content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))
|
| 332 |
|
| 333 |
+
# Generate HTML for each content item with enhanced formatting and classification
|
| 334 |
for item in content_items:
|
| 335 |
if item['type'] == 'table':
|
| 336 |
+
html_parts.append(processor._table_to_html(item['content'], item['table_idx']))
|
| 337 |
else:
|
| 338 |
+
html_parts.append(processor._text_to_html_enhanced(item))
|
| 339 |
|
| 340 |
html_parts.append('</div>')
|
| 341 |
|
| 342 |
html_parts.append('</body></html>')
|
| 343 |
return '\n'.join(html_parts)
|
| 344 |
|
| 345 |
+
def _extract_page_content_enhanced(self, page, analysis_result, page_num):
|
| 346 |
+
"""Extract page content with enhanced indentation detection and intelligent text classification"""
|
|
|
|
| 347 |
content_items = []
|
| 348 |
|
| 349 |
+
# Handle tables (existing logic)
|
| 350 |
page_tables = []
|
| 351 |
table_regions = []
|
| 352 |
|
| 353 |
if analysis_result.tables:
|
| 354 |
for table_idx, table in enumerate(analysis_result.tables):
|
| 355 |
+
if self._is_table_on_page(table, page_num):
|
| 356 |
page_tables.append((table_idx, table))
|
|
|
|
| 357 |
if table.bounding_regions:
|
| 358 |
table_regions.append({
|
| 359 |
'polygon': table.bounding_regions[0].polygon,
|
| 360 |
'table_idx': table_idx
|
| 361 |
})
|
| 362 |
|
| 363 |
+
# Add tables to content
|
| 364 |
for table_idx, table in page_tables:
|
| 365 |
if table.bounding_regions and table.bounding_regions[0].polygon:
|
| 366 |
polygon = table.bounding_regions[0].polygon
|
| 367 |
+
y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])
|
| 368 |
+
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
|
| 369 |
|
| 370 |
content_items.append({
|
| 371 |
'type': 'table',
|
|
|
|
| 375 |
'x_pos': x_pos
|
| 376 |
})
|
| 377 |
|
| 378 |
+
# Process text content with enhanced indentation detection and text classification
|
|
|
|
|
|
|
|
|
|
| 379 |
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
|
|
|
| 380 |
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 381 |
p.bounding_regions and
|
| 382 |
p.bounding_regions[0].page_number == page_num]
|
| 383 |
|
| 384 |
for para in page_paragraphs:
|
| 385 |
if para.content.strip():
|
| 386 |
+
# Check table overlap
|
| 387 |
+
overlap_ratio = self._calculate_table_overlap(para, table_regions)
|
| 388 |
|
| 389 |
+
if overlap_ratio < 0.7: # Not heavily overlapping with table
|
|
|
|
| 390 |
polygon = para.bounding_regions[0].polygon
|
| 391 |
y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
|
| 392 |
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
|
| 393 |
|
| 394 |
+
# Enhanced indentation detection
|
| 395 |
+
indent_info = self.indent_detector.detect_indentation(para.content)
|
| 396 |
+
|
| 397 |
+
# Intelligent text classification with context
|
| 398 |
+
context = {
|
| 399 |
+
'y_position': y_pos,
|
| 400 |
+
'x_position': x_pos,
|
| 401 |
+
'font_size': getattr(para, 'font_size', None),
|
| 402 |
+
'is_bold': getattr(para, 'is_bold', False),
|
| 403 |
+
'page_number': page_num
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
text_classification = self.indent_detector.classify_text_type(para.content, context)
|
| 407 |
|
| 408 |
content_items.append({
|
| 409 |
'type': 'paragraph',
|
| 410 |
+
'content': indent_info['content'],
|
| 411 |
'role': getattr(para, 'role', 'paragraph'),
|
| 412 |
'y_pos': y_pos,
|
| 413 |
'x_pos': x_pos,
|
| 414 |
+
'indent_info': indent_info,
|
| 415 |
+
'text_classification': text_classification,
|
|
|
|
| 416 |
'preserve_spacing': True
|
| 417 |
})
|
| 418 |
|
| 419 |
elif page.lines:
|
| 420 |
+
# Process lines with enhanced indentation detection and classification
|
| 421 |
+
processed_lines = self._process_lines_enhanced(page.lines, table_regions)
|
| 422 |
content_items.extend(processed_lines)
|
| 423 |
|
| 424 |
return content_items
|
| 425 |
|
| 426 |
+
def _process_lines_enhanced(self, lines, table_regions):
|
| 427 |
+
"""Process lines with enhanced indentation detection and text classification"""
|
| 428 |
+
content_items = []
|
| 429 |
+
processed_content = set()
|
| 430 |
+
|
| 431 |
+
for line in lines:
|
| 432 |
+
if not line.content.strip():
|
| 433 |
+
continue
|
| 434 |
+
|
| 435 |
+
content_key = line.content.strip().lower()
|
| 436 |
+
if content_key in processed_content:
|
| 437 |
+
continue
|
| 438 |
+
processed_content.add(content_key)
|
| 439 |
+
|
| 440 |
+
# Check table overlap
|
| 441 |
+
overlap_ratio = self._calculate_line_table_overlap(line, table_regions)
|
| 442 |
+
|
| 443 |
+
if overlap_ratio < 0.7:
|
| 444 |
+
polygon = line.polygon
|
| 445 |
+
y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
|
| 446 |
+
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0
|
| 447 |
+
|
| 448 |
+
# Enhanced indentation detection
|
| 449 |
+
indent_info = self.indent_detector.detect_indentation(line.content)
|
| 450 |
+
|
| 451 |
+
# Text classification with context
|
| 452 |
+
context = {
|
| 453 |
+
'y_position': y_pos,
|
| 454 |
+
'x_position': x_pos
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
text_classification = self.indent_detector.classify_text_type(line.content, context)
|
| 458 |
+
|
| 459 |
+
content_items.append({
|
| 460 |
+
'type': 'line',
|
| 461 |
+
'content': indent_info['content'],
|
| 462 |
+
'role': 'text',
|
| 463 |
+
'y_pos': y_pos,
|
| 464 |
+
'x_pos': x_pos,
|
| 465 |
+
'indent_info': indent_info,
|
| 466 |
+
'text_classification': text_classification,
|
| 467 |
+
'preserve_spacing': True
|
| 468 |
+
})
|
| 469 |
+
|
| 470 |
+
return content_items
|
| 471 |
+
|
| 472 |
+
def _text_to_html_enhanced(self, item):
|
| 473 |
+
"""Convert text item to HTML with enhanced indentation formatting and intelligent classification"""
|
| 474 |
+
content = item['content']
|
| 475 |
+
role = item.get('role', 'paragraph')
|
| 476 |
+
indent_info = item.get('indent_info', {})
|
| 477 |
+
text_classification = item.get('text_classification', {})
|
| 478 |
+
preserve_spacing = item.get('preserve_spacing', False)
|
| 479 |
+
|
| 480 |
+
# Build CSS classes based on indentation info and text classification
|
| 481 |
+
css_classes = ['paragraph']
|
| 482 |
+
|
| 483 |
+
# Add text classification class
|
| 484 |
+
if text_classification.get('type'):
|
| 485 |
+
css_classes.append(f"content-{text_classification['type']}")
|
| 486 |
+
|
| 487 |
+
# Add indentation level class
|
| 488 |
+
level = indent_info.get('level', 0)
|
| 489 |
+
css_classes.append(f'indent-level-{min(level, 10)}')
|
| 490 |
+
|
| 491 |
+
# Add pattern-specific formatting
|
| 492 |
+
formatting_hint = indent_info.get('formatting_hint', 'normal_text')
|
| 493 |
+
if formatting_hint != 'normal_text':
|
| 494 |
+
css_classes.append(formatting_hint)
|
| 495 |
+
|
| 496 |
+
# Add space indent class if needed
|
| 497 |
+
if indent_info.get('pattern_type') == 'space_indent':
|
| 498 |
+
css_classes.append('space-indent')
|
| 499 |
+
|
| 500 |
+
# Preserve internal spacing
|
| 501 |
+
if preserve_spacing:
|
| 502 |
+
content = re.sub(r' +', lambda m: ' ' * len(m.group()), content)
|
| 503 |
+
content = content.replace('\n', '<br>')
|
| 504 |
+
|
| 505 |
+
# Add pattern marker if needed (but not for bullets as CSS handles them)
|
| 506 |
+
pattern_marker = indent_info.get('pattern_marker', '')
|
| 507 |
+
if pattern_marker and not indent_info.get('is_bullet', False):
|
| 508 |
+
# For numbered/lettered items, include the marker
|
| 509 |
+
content = f"{pattern_marker} {content}"
|
| 510 |
+
|
| 511 |
+
# Build final HTML with enhanced classification
|
| 512 |
+
class_str = f' class="{" ".join(css_classes)}"'
|
| 513 |
+
|
| 514 |
+
# Use text classification to determine HTML structure
|
| 515 |
+
if text_classification.get('is_header') and text_classification.get('confidence', 0) > 0.6:
|
| 516 |
+
return f'<div class="content-header"{class_str}>{content}</div>'
|
| 517 |
+
elif role == 'title':
|
| 518 |
+
return f'<div class="title"{class_str}>{content}</div>'
|
| 519 |
+
elif role == 'sectionHeading':
|
| 520 |
+
return f'<div class="section-heading"{class_str}>{content}</div>'
|
| 521 |
+
else:
|
| 522 |
+
return f'<div{class_str}>{content}</div>'
|
| 523 |
+
|
| 524 |
+
def _table_to_html(self, table, table_idx):
|
| 525 |
+
"""Convert table to HTML with proper structure"""
|
| 526 |
+
if not table.cells:
|
| 527 |
+
return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
|
| 528 |
+
|
| 529 |
+
# Create table matrix
|
| 530 |
+
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 531 |
+
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 532 |
+
|
| 533 |
+
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 534 |
+
|
| 535 |
+
# Fill matrix
|
| 536 |
+
for cell in table.cells:
|
| 537 |
+
content = (cell.content or "").strip()
|
| 538 |
+
table_matrix[cell.row_index][cell.column_index] = content
|
| 539 |
+
|
| 540 |
+
# Generate HTML
|
| 541 |
+
html_parts = [f'<div class="table-container">']
|
| 542 |
+
html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
|
| 543 |
+
html_parts.append('<table class="table">')
|
| 544 |
+
|
| 545 |
+
for row_idx, row in enumerate(table_matrix):
|
| 546 |
+
if row_idx == 0 and any(cell.strip() for cell in row):
|
| 547 |
+
# Header row
|
| 548 |
+
html_parts.append('<tr>')
|
| 549 |
+
for cell in row:
|
| 550 |
+
html_parts.append(f'<th>{cell}</th>')
|
| 551 |
+
html_parts.append('</tr>')
|
| 552 |
+
else:
|
| 553 |
+
# Data row
|
| 554 |
+
if any(cell.strip() for cell in row): # Skip empty rows
|
| 555 |
+
html_parts.append('<tr>')
|
| 556 |
+
for cell in row:
|
| 557 |
+
html_parts.append(f'<td>{cell}</td>')
|
| 558 |
+
html_parts.append('</tr>')
|
| 559 |
+
|
| 560 |
+
html_parts.append('</table></div>')
|
| 561 |
+
return '\n'.join(html_parts)
|
| 562 |
+
|
| 563 |
+
def _is_table_on_page(self, table, page_num):
|
| 564 |
"""Check if table belongs to the specified page"""
|
| 565 |
if not table.cells:
|
| 566 |
return False
|
|
|
|
| 571 |
return True
|
| 572 |
return False
|
| 573 |
|
| 574 |
+
def _calculate_table_overlap(self, content_item, table_regions):
|
| 575 |
+
"""Calculate overlap ratio between content and tables"""
|
|
|
|
| 576 |
if not table_regions or not content_item.bounding_regions:
|
| 577 |
return 0.0
|
| 578 |
|
|
|
|
| 616 |
|
| 617 |
return max_overlap_ratio
|
| 618 |
|
| 619 |
+
def _calculate_line_table_overlap(self, line, table_regions):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
"""Calculate overlap between line and tables"""
|
| 621 |
if not table_regions or not line.polygon:
|
| 622 |
return 0.0
|
|
|
|
| 661 |
return max_overlap
|
| 662 |
|
| 663 |
@staticmethod
|
| 664 |
+
def html_to_formatted_text_enhanced(html_content):
|
| 665 |
+
"""Convert HTML back to formatted text with enhanced indentation preservation and text classification"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
from html.parser import HTMLParser
|
| 667 |
|
| 668 |
+
class EnhancedTextExtractor(HTMLParser):
|
| 669 |
def __init__(self):
|
| 670 |
super().__init__()
|
| 671 |
self.text_parts = []
|
| 672 |
+
self.indent_detector = EnhancedIndentationDetector()
|
| 673 |
self.in_title = False
|
| 674 |
self.in_section_heading = False
|
| 675 |
self.in_table = False
|
|
|
|
| 676 |
self.current_table_row = []
|
| 677 |
self.table_data = []
|
| 678 |
+
self.current_indent_level = 0
|
| 679 |
+
self.current_formatting_hint = 'normal_text'
|
| 680 |
self.in_page_header = False
|
| 681 |
+
self.current_classes = []
|
| 682 |
+
self.in_content_header = False
|
| 683 |
|
| 684 |
def handle_starttag(self, tag, attrs):
|
| 685 |
attr_dict = dict(attrs)
|
| 686 |
class_attr = attr_dict.get('class', '')
|
| 687 |
+
self.current_classes = class_attr.split()
|
| 688 |
|
| 689 |
if 'page-header' in class_attr:
|
| 690 |
self.in_page_header = True
|
|
|
|
| 691 |
if len(self.text_parts) > 0:
|
| 692 |
self.text_parts.append('\n\n' + '=' * 80 + '\n')
|
| 693 |
+
elif 'content-header' in class_attr:
|
| 694 |
+
self.in_content_header = True
|
| 695 |
elif 'title' in class_attr:
|
| 696 |
self.in_title = True
|
| 697 |
elif 'section-heading' in class_attr:
|
|
|
|
| 699 |
elif tag == 'table':
|
| 700 |
self.in_table = True
|
| 701 |
self.table_data = []
|
|
|
|
|
|
|
| 702 |
elif tag == 'tr':
|
| 703 |
self.current_table_row = []
|
| 704 |
elif tag == 'br':
|
| 705 |
self.text_parts.append('\n')
|
| 706 |
|
| 707 |
+
# Extract indent level from class
|
| 708 |
+
for cls in self.current_classes:
|
| 709 |
+
if cls.startswith('indent-level-'):
|
| 710 |
+
try:
|
| 711 |
+
self.current_indent_level = int(cls.split('-')[-1])
|
| 712 |
+
except ValueError:
|
| 713 |
+
self.current_indent_level = 0
|
| 714 |
+
break
|
| 715 |
else:
|
| 716 |
+
self.current_indent_level = 0
|
| 717 |
+
|
| 718 |
+
# Extract formatting hint
|
| 719 |
+
formatting_hints = [
|
| 720 |
+
'numbered-primary', 'numbered-secondary', 'numbered-tertiary', 'numbered-quaternary', 'numbered-quinary',
|
| 721 |
+
'parenthetical-primary', 'parenthetical-secondary', 'parenthetical-tertiary', 'parenthetical-quaternary',
|
| 722 |
+
'bullet-primary', 'bullet-secondary', 'bullet-tertiary', 'bullet-quaternary',
|
| 723 |
+
'lettered-primary', 'lettered-secondary',
|
| 724 |
+
'roman-primary', 'roman-secondary',
|
| 725 |
+
'thai-primary', 'thai-secondary',
|
| 726 |
+
'indented_text', 'space-indent'
|
| 727 |
+
]
|
| 728 |
|
| 729 |
+
for hint in formatting_hints:
|
| 730 |
+
if hint in self.current_classes:
|
| 731 |
+
self.current_formatting_hint = hint
|
| 732 |
+
break
|
| 733 |
+
else:
|
| 734 |
+
self.current_formatting_hint = 'normal_text'
|
| 735 |
|
| 736 |
def handle_endtag(self, tag):
|
| 737 |
if tag == 'div' and self.in_page_header:
|
| 738 |
self.text_parts.append('\n' + '=' * 80 + '\n\n')
|
| 739 |
self.in_page_header = False
|
| 740 |
+
elif tag == 'div' and self.in_content_header:
|
| 741 |
+
self.text_parts.append('\n\n')
|
| 742 |
+
self.in_content_header = False
|
| 743 |
elif tag == 'div' and self.in_title:
|
| 744 |
self.text_parts.append('\n\n')
|
| 745 |
self.in_title = False
|
|
|
|
| 749 |
elif tag == 'table':
|
| 750 |
self.in_table = False
|
| 751 |
self._format_table()
|
|
|
|
|
|
|
| 752 |
elif tag == 'tr' and self.current_table_row:
|
| 753 |
self.table_data.append(self.current_table_row[:])
|
| 754 |
+
elif tag == 'div' and not self.in_table:
|
| 755 |
+
if not self.in_title and not self.in_section_heading and not self.in_page_header and not self.in_content_header:
|
| 756 |
self.text_parts.append('\n')
|
| 757 |
|
| 758 |
+
# Reset state
|
| 759 |
if tag == 'div':
|
| 760 |
+
self.current_indent_level = 0
|
| 761 |
+
self.current_formatting_hint = 'normal_text'
|
| 762 |
+
self.current_classes = []
|
| 763 |
|
| 764 |
def handle_data(self, data):
|
| 765 |
if data.strip():
|
|
|
|
| 766 |
data = data.replace(' ', ' ')
|
| 767 |
|
| 768 |
if self.in_page_header:
|
|
|
|
| 769 |
page_match = re.search(r'Page (\d+)', data)
|
| 770 |
if page_match:
|
| 771 |
+
page_num = int(page_match.group(1))
|
| 772 |
+
page_header = f"PAGE {page_num}"
|
| 773 |
self.text_parts.append(page_header.center(80))
|
| 774 |
+
elif self.in_content_header:
|
| 775 |
+
indent_str = " " * self.current_indent_level
|
| 776 |
+
self.text_parts.append(f'\n{indent_str}# {data.strip()}')
|
| 777 |
elif self.in_title:
|
| 778 |
+
indent_str = " " * self.current_indent_level
|
| 779 |
self.text_parts.append(f'\n{indent_str}## {data.strip()}')
|
| 780 |
elif self.in_section_heading:
|
| 781 |
+
indent_str = " " * self.current_indent_level
|
| 782 |
self.text_parts.append(f'\n{indent_str}### {data.strip()}')
|
| 783 |
elif self.in_table:
|
| 784 |
+
self.current_table_row.append(data.strip())
|
|
|
|
| 785 |
else:
|
| 786 |
+
# Apply enhanced indentation formatting
|
| 787 |
+
indent_str = " " * self.current_indent_level
|
| 788 |
|
| 789 |
+
# Handle different formatting hints including parenthetical
|
| 790 |
+
if 'bullet' in self.current_formatting_hint:
|
| 791 |
+
# Use appropriate bullet symbol based on level
|
| 792 |
+
if 'primary' in self.current_formatting_hint:
|
| 793 |
+
bullet = '•'
|
| 794 |
+
elif 'secondary' in self.current_formatting_hint:
|
| 795 |
+
bullet = '◦'
|
| 796 |
+
elif 'tertiary' in self.current_formatting_hint:
|
| 797 |
+
bullet = '▪'
|
| 798 |
+
elif 'quaternary' in self.current_formatting_hint:
|
| 799 |
+
bullet = '‣'
|
| 800 |
+
else:
|
| 801 |
+
bullet = '•'
|
| 802 |
+
|
| 803 |
+
self.text_parts.append(f'{indent_str}{bullet} {data.strip()}')
|
| 804 |
+
|
| 805 |
+
elif any(pattern in self.current_formatting_hint for pattern in ['numbered', 'lettered', 'roman', 'thai', 'parenthetical']):
|
| 806 |
+
# For numbered/lettered/parenthetical items, the marker should already be in the text
|
| 807 |
+
self.text_parts.append(f'{indent_str}{data.strip()}')
|
| 808 |
+
|
| 809 |
+
elif 'space-indent' in self.current_formatting_hint:
|
| 810 |
+
# Simple indented text
|
| 811 |
+
self.text_parts.append(f'{indent_str}{data.strip()}')
|
| 812 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 813 |
else:
|
| 814 |
+
# Regular text with indentation
|
| 815 |
+
self.text_parts.append(f'{indent_str}{data.strip()}')
|
| 816 |
|
| 817 |
def _format_table(self):
|
| 818 |
+
"""Format table with proper alignment"""
|
| 819 |
if not self.table_data:
|
| 820 |
return
|
| 821 |
|
| 822 |
self.text_parts.append('\n\n')
|
| 823 |
|
|
|
|
| 824 |
if self.table_data:
|
| 825 |
max_cols = max(len(row) for row in self.table_data)
|
| 826 |
col_widths = [0] * max_cols
|
| 827 |
|
| 828 |
+
# Calculate column widths
|
| 829 |
for row in self.table_data:
|
| 830 |
for i, cell in enumerate(row):
|
| 831 |
if i < max_cols:
|
|
|
|
| 855 |
|
| 856 |
self.text_parts.append('\n')
|
| 857 |
|
| 858 |
+
extractor = EnhancedTextExtractor()
|
| 859 |
extractor.feed(html_content)
|
| 860 |
|
| 861 |
result = ''.join(extractor.text_parts)
|
|
|
|
| 870 |
|
| 871 |
|
| 872 |
class OCRService:
|
| 873 |
+
"""Main OCR service with enhanced HTML processing, comprehensive indentation detection, and intelligent text classification"""
|
| 874 |
|
| 875 |
def __init__(self):
|
| 876 |
self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
|
|
|
| 892 |
|
| 893 |
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
|
| 894 |
"""
|
| 895 |
+
Convert PDF to text using specified method with enhanced HTML processing and intelligent text classification
|
| 896 |
|
| 897 |
Args:
|
| 898 |
pdf_path: Path to the PDF file
|
|
|
|
| 926 |
# Try primary method
|
| 927 |
try:
|
| 928 |
if method == "azure" and self.azure_client:
|
| 929 |
+
result = self._azure_ocr_with_enhanced_html(pdf_path)
|
| 930 |
elif method == "tesseract":
|
| 931 |
result = self._tesseract_ocr(pdf_path)
|
| 932 |
elif method == "pymupdf":
|
|
|
|
| 945 |
|
| 946 |
return result
|
| 947 |
|
| 948 |
+
def _azure_ocr_with_enhanced_html(self, pdf_path: str) -> Dict[str, Any]:
|
| 949 |
+
"""Azure Document Intelligence OCR with enhanced HTML processing, indentation detection, and intelligent text classification"""
|
| 950 |
result = {
|
| 951 |
'success': False,
|
| 952 |
'text': '',
|
| 953 |
'html': '',
|
| 954 |
+
'method_used': 'azure_document_intelligence_enhanced_v2',
|
| 955 |
'metadata': {},
|
| 956 |
'error': None
|
| 957 |
}
|
|
|
|
| 982 |
|
| 983 |
analysis_result = poller.result()
|
| 984 |
|
| 985 |
+
# Generate HTML with enhanced indentation processing and text classification
|
| 986 |
+
html_content = EnhancedHTMLProcessor.create_html_from_azure_result(analysis_result)
|
| 987 |
|
| 988 |
+
# Convert HTML to formatted text with enhanced indentation preservation and classification
|
| 989 |
+
formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
|
| 990 |
+
|
| 991 |
+
# Analyze document structure with text classification
|
| 992 |
+
detector = EnhancedIndentationDetector()
|
| 993 |
+
text_lines = formatted_text.split('\n')
|
| 994 |
+
document_analysis = detector.analyze_document_structure(text_lines)
|
| 995 |
|
| 996 |
result.update({
|
| 997 |
'success': True,
|
|
|
|
| 1003 |
'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
|
| 1004 |
'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
|
| 1005 |
'html_generated': True,
|
| 1006 |
+
'enhanced_indentation': True,
|
| 1007 |
+
'intelligent_text_classification': True,
|
| 1008 |
+
'parenthetical_patterns_supported': True,
|
| 1009 |
'page_numbers_added': True,
|
| 1010 |
+
'comprehensive_formatting': True,
|
| 1011 |
+
'azure_analysis': analysis_result,
|
| 1012 |
+
'document_structure_analysis': document_analysis
|
| 1013 |
}
|
| 1014 |
})
|
| 1015 |
|
| 1016 |
+
logger.info("Azure OCR with enhanced indentation processing and intelligent text classification completed successfully")
|
| 1017 |
|
| 1018 |
except Exception as e:
|
| 1019 |
logger.error(f"Azure OCR error: {e}")
|
|
|
|
| 1022 |
return result
|
| 1023 |
|
| 1024 |
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 1025 |
+
"""Tesseract OCR with enhanced HTML generation, indentation detection, and text classification"""
|
| 1026 |
result = {
|
| 1027 |
'success': False,
|
| 1028 |
'text': '',
|
| 1029 |
'html': '',
|
| 1030 |
+
'method_used': 'tesseract_enhanced_v2',
|
| 1031 |
'metadata': {},
|
| 1032 |
'error': None
|
| 1033 |
}
|
|
|
|
| 1042 |
page_count = len(pdf_document)
|
| 1043 |
all_text = []
|
| 1044 |
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 1045 |
+
html_parts.append('''
|
| 1046 |
+
body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }
|
| 1047 |
+
.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
|
| 1048 |
+
.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
|
| 1049 |
+
.paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
|
| 1050 |
+
.content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
|
| 1051 |
+
.content-paragraph { margin-bottom: 1em; }
|
| 1052 |
+
.content-list-item { margin-bottom: 0.5em; }
|
| 1053 |
+
''')
|
| 1054 |
html_parts.append('</style></head><body>')
|
| 1055 |
|
| 1056 |
+
indent_detector = EnhancedIndentationDetector()
|
| 1057 |
+
|
| 1058 |
for page_num in range(page_count):
|
| 1059 |
# Add page header to text
|
| 1060 |
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
|
|
|
| 1080 |
|
| 1081 |
all_text.append(text)
|
| 1082 |
|
| 1083 |
+
# Add to HTML with enhanced indentation processing and text classification
|
| 1084 |
html_parts.append(f'<div class="page">')
|
| 1085 |
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 1086 |
+
|
| 1087 |
+
# Process each line for indentation and classification
|
| 1088 |
+
lines = text.split('\n')
|
| 1089 |
+
for line in lines:
|
| 1090 |
+
if line.strip():
|
| 1091 |
+
indent_info = indent_detector.detect_indentation(line)
|
| 1092 |
+
text_classification = indent_detector.classify_text_type(line)
|
| 1093 |
+
|
| 1094 |
+
level = indent_info.get('level', 0)
|
| 1095 |
+
formatting_hint = indent_info.get('formatting_hint', 'normal_text')
|
| 1096 |
+
|
| 1097 |
+
css_classes = [f'indent-level-{min(level, 10)}']
|
| 1098 |
+
if formatting_hint != 'normal_text':
|
| 1099 |
+
css_classes.append(formatting_hint)
|
| 1100 |
+
|
| 1101 |
+
# Add text classification class
|
| 1102 |
+
if text_classification.get('type'):
|
| 1103 |
+
css_classes.append(f"content-{text_classification['type']}")
|
| 1104 |
+
|
| 1105 |
+
class_str = f' class="paragraph {" ".join(css_classes)}"'
|
| 1106 |
+
content = indent_info.get('content', line.strip())
|
| 1107 |
+
|
| 1108 |
+
# Add marker for non-bullet items
|
| 1109 |
+
marker = indent_info.get('pattern_marker', '')
|
| 1110 |
+
if marker and not indent_info.get('is_bullet', False):
|
| 1111 |
+
content = f"{marker} {content}"
|
| 1112 |
+
|
| 1113 |
+
html_parts.append(f'<div{class_str}>{content}</div>')
|
| 1114 |
+
else:
|
| 1115 |
+
html_parts.append('<div class="paragraph"><br></div>')
|
| 1116 |
+
|
| 1117 |
+
html_parts.append('</div>')
|
| 1118 |
|
| 1119 |
finally:
|
| 1120 |
if temp_img_path and os.path.exists(temp_img_path):
|
|
|
|
| 1125 |
|
| 1126 |
html_parts.append('</body></html>')
|
| 1127 |
|
| 1128 |
+
# Convert HTML back to formatted text
|
| 1129 |
+
html_content = '\n'.join(html_parts)
|
| 1130 |
+
formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
|
| 1131 |
+
|
| 1132 |
result.update({
|
| 1133 |
'success': True,
|
| 1134 |
+
'text': formatted_text,
|
| 1135 |
+
'html': html_content,
|
| 1136 |
'metadata': {
|
| 1137 |
'pages': page_count,
|
| 1138 |
'html_generated': True,
|
| 1139 |
+
'enhanced_indentation': True,
|
| 1140 |
+
'intelligent_text_classification': True,
|
| 1141 |
+
'parenthetical_patterns_supported': True,
|
| 1142 |
'page_numbers_added': True,
|
| 1143 |
+
'comprehensive_formatting': True
|
| 1144 |
}
|
| 1145 |
})
|
| 1146 |
|
| 1147 |
+
logger.info("Tesseract OCR with enhanced indentation processing and text classification completed successfully")
|
| 1148 |
|
| 1149 |
except Exception as e:
|
| 1150 |
logger.error(f"Tesseract OCR error: {e}")
|
|
|
|
| 1159 |
return result
|
| 1160 |
|
| 1161 |
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 1162 |
+
"""PyMuPDF text extraction with enhanced HTML generation, indentation detection, and text classification"""
|
| 1163 |
result = {
|
| 1164 |
'success': False,
|
| 1165 |
'text': '',
|
| 1166 |
'html': '',
|
| 1167 |
+
'method_used': 'pymupdf_enhanced_v2',
|
| 1168 |
'metadata': {},
|
| 1169 |
'error': None
|
| 1170 |
}
|
|
|
|
| 1175 |
page_count = len(pdf_document)
|
| 1176 |
all_text = []
|
| 1177 |
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 1178 |
+
html_parts.append('''
|
| 1179 |
+
body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }
|
| 1180 |
+
.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }
|
| 1181 |
+
.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }
|
| 1182 |
+
.paragraph { margin-bottom: 0.8em; white-space: pre-wrap; }
|
| 1183 |
+
.content-header { font-weight: bold; color: #2c3e50; margin: 10px 0; }
|
| 1184 |
+
.content-paragraph { margin-bottom: 1em; }
|
| 1185 |
+
.content-list-item { margin-bottom: 0.5em; }
|
| 1186 |
+
''')
|
| 1187 |
html_parts.append('</style></head><body>')
|
| 1188 |
|
| 1189 |
+
indent_detector = EnhancedIndentationDetector()
|
| 1190 |
+
|
| 1191 |
for page_num in range(page_count):
|
| 1192 |
# Add page header to text
|
| 1193 |
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
|
|
|
| 1198 |
|
| 1199 |
all_text.append(text)
|
| 1200 |
|
| 1201 |
+
# Add to HTML with enhanced indentation processing and text classification
|
| 1202 |
html_parts.append(f'<div class="page">')
|
| 1203 |
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 1204 |
+
|
| 1205 |
+
# Process each line for indentation and classification
|
| 1206 |
+
lines = text.split('\n')
|
| 1207 |
+
for line in lines:
|
| 1208 |
+
if line.strip():
|
| 1209 |
+
indent_info = indent_detector.detect_indentation(line)
|
| 1210 |
+
text_classification = indent_detector.classify_text_type(line)
|
| 1211 |
+
|
| 1212 |
+
level = indent_info.get('level', 0)
|
| 1213 |
+
formatting_hint = indent_info.get('formatting_hint', 'normal_text')
|
| 1214 |
+
|
| 1215 |
+
css_classes = [f'indent-level-{min(level, 10)}']
|
| 1216 |
+
if formatting_hint != 'normal_text':
|
| 1217 |
+
css_classes.append(formatting_hint)
|
| 1218 |
+
|
| 1219 |
+
# Add text classification class
|
| 1220 |
+
if text_classification.get('type'):
|
| 1221 |
+
css_classes.append(f"content-{text_classification['type']}")
|
| 1222 |
+
|
| 1223 |
+
class_str = f' class="paragraph {" ".join(css_classes)}"'
|
| 1224 |
+
content = indent_info.get('content', line.strip())
|
| 1225 |
+
|
| 1226 |
+
# Add marker for non-bullet items
|
| 1227 |
+
marker = indent_info.get('pattern_marker', '')
|
| 1228 |
+
if marker and not indent_info.get('is_bullet', False):
|
| 1229 |
+
content = f"{marker} {content}"
|
| 1230 |
+
|
| 1231 |
+
html_parts.append(f'<div{class_str}>{content}</div>')
|
| 1232 |
+
else:
|
| 1233 |
+
html_parts.append('<div class="paragraph"><br></div>')
|
| 1234 |
+
|
| 1235 |
+
html_parts.append('</div>')
|
| 1236 |
|
| 1237 |
html_parts.append('</body></html>')
|
| 1238 |
|
| 1239 |
+
# Convert HTML back to formatted text
|
| 1240 |
+
html_content = '\n'.join(html_parts)
|
| 1241 |
+
formatted_text = EnhancedHTMLProcessor.html_to_formatted_text_enhanced(html_content)
|
| 1242 |
+
|
| 1243 |
result.update({
|
| 1244 |
'success': True,
|
| 1245 |
+
'text': formatted_text,
|
| 1246 |
+
'html': html_content,
|
| 1247 |
'metadata': {
|
| 1248 |
'pages': page_count,
|
| 1249 |
'html_generated': True,
|
| 1250 |
+
'enhanced_indentation': True,
|
| 1251 |
+
'intelligent_text_classification': True,
|
| 1252 |
+
'parenthetical_patterns_supported': True,
|
| 1253 |
'page_numbers_added': True,
|
| 1254 |
+
'comprehensive_formatting': True
|
| 1255 |
}
|
| 1256 |
})
|
| 1257 |
|
| 1258 |
+
logger.info("PyMuPDF extraction with enhanced indentation processing and text classification completed successfully")
|
| 1259 |
|
| 1260 |
except Exception as e:
|
| 1261 |
logger.error(f"PyMuPDF error: {e}")
|
|
|
|
| 1292 |
logger.info(f"Trying fallback method: {method}")
|
| 1293 |
try:
|
| 1294 |
if method == "azure":
|
| 1295 |
+
result = self._azure_ocr_with_enhanced_html(pdf_path)
|
| 1296 |
elif method == "tesseract":
|
| 1297 |
result = self._tesseract_ocr(pdf_path)
|
| 1298 |
elif method == "pymupdf":
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# PDF OCR Service Requirements - Enhanced Version with
|
| 2 |
|
| 3 |
# Core web framework and UI
|
| 4 |
gradio>=4.0.0
|
|
@@ -22,10 +22,21 @@ PyMuPDF>=1.23.0
|
|
| 22 |
# Document export formats (ENHANCED)
|
| 23 |
python-docx>=0.8.11
|
| 24 |
|
| 25 |
-
# HTML processing and parsing
|
| 26 |
beautifulsoup4>=4.12.0
|
| 27 |
lxml>=4.9.0
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Additional dependencies for enhanced preprocessing
|
| 30 |
matplotlib>=3.7.0 # For image visualization in development
|
| 31 |
scikit-image>=0.21.0 # Advanced image processing (optional)
|
|
@@ -34,45 +45,297 @@ scikit-image>=0.21.0 # Advanced image processing (optional)
|
|
| 34 |
tqdm>=4.65.0 # Progress bars for long operations
|
| 35 |
requests>=2.31.0 # HTTP requests for external services
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# System dependencies information (install separately):
|
| 38 |
#
|
| 39 |
# For Ubuntu/Debian:
|
| 40 |
# sudo apt-get update
|
| 41 |
-
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 42 |
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 43 |
# sudo apt-get install -y python3-opencv # Alternative OpenCV installation
|
| 44 |
# sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
|
|
|
|
|
|
|
|
|
|
| 45 |
#
|
| 46 |
# For CentOS/RHEL:
|
| 47 |
-
# sudo yum install -y tesseract tesseract-langpack-eng
|
| 48 |
# sudo yum install -y opencv-python
|
| 49 |
# sudo yum install -y libxml2-devel libxslt-devel
|
|
|
|
| 50 |
#
|
| 51 |
# For macOS:
|
| 52 |
# brew install tesseract
|
|
|
|
| 53 |
# brew install opencv
|
| 54 |
# brew install libxml2
|
|
|
|
| 55 |
#
|
| 56 |
# For Windows:
|
| 57 |
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
|
|
|
|
|
|
| 58 |
# Add Tesseract to PATH environment variable
|
| 59 |
# OpenCV and other packages should install automatically with pip
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# Development and testing (optional)
|
| 62 |
pytest>=7.0.0
|
| 63 |
pytest-cov>=4.0.0
|
|
|
|
|
|
|
| 64 |
black>=23.0.0 # Code formatting
|
| 65 |
flake8>=6.0.0 # Code linting
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Performance monitoring (optional)
|
| 68 |
memory-profiler>=0.60.0
|
| 69 |
psutil>=5.9.0 # System monitoring
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
#
|
| 72 |
-
#
|
| 73 |
-
#
|
| 74 |
-
|
| 75 |
-
#
|
| 76 |
-
#
|
| 77 |
-
#
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF OCR Service Requirements - Enhanced Version with Comprehensive Indentation Detection & Text Classification
|
| 2 |
|
| 3 |
# Core web framework and UI
|
| 4 |
gradio>=4.0.0
|
|
|
|
| 22 |
# Document export formats (ENHANCED)
|
| 23 |
python-docx>=0.8.11
|
| 24 |
|
| 25 |
+
# HTML processing and parsing
|
| 26 |
beautifulsoup4>=4.12.0
|
| 27 |
lxml>=4.9.0
|
| 28 |
|
| 29 |
+
# Enhanced text processing and pattern detection
|
| 30 |
+
regex>=2023.10.3 # For advanced regex patterns including parenthetical detection
|
| 31 |
+
|
| 32 |
+
# Data handling and analysis
|
| 33 |
+
pandas>=2.0.0 # For document structure analysis
|
| 34 |
+
collections-extended>=2.0.2 # For enhanced counter operations
|
| 35 |
+
|
| 36 |
+
# Text classification and analysis
|
| 37 |
+
scikit-learn>=1.3.0 # For advanced text classification algorithms (optional)
|
| 38 |
+
nltk>=3.8 # Natural language processing toolkit (optional)
|
| 39 |
+
|
| 40 |
# Additional dependencies for enhanced preprocessing
|
| 41 |
matplotlib>=3.7.0 # For image visualization in development
|
| 42 |
scikit-image>=0.21.0 # Advanced image processing (optional)
|
|
|
|
| 45 |
tqdm>=4.65.0 # Progress bars for long operations
|
| 46 |
requests>=2.31.0 # HTTP requests for external services
|
| 47 |
|
| 48 |
+
# Logging and monitoring
|
| 49 |
+
colorlog>=6.7.0 # Enhanced logging with colors
|
| 50 |
+
structlog>=23.1.0 # Structured logging for better debugging
|
| 51 |
+
|
| 52 |
+
# File handling and temporary file management
|
| 53 |
+
pathlib2>=2.3.7 # Enhanced path operations
|
| 54 |
+
tempfile-plus>=1.2.0 # Advanced temporary file handling
|
| 55 |
+
|
| 56 |
+
# Date and time handling
|
| 57 |
+
python-dateutil>=2.8.2 # Enhanced date parsing
|
| 58 |
+
|
| 59 |
+
# Enhanced Unicode and text processing
|
| 60 |
+
unicodedata2>=15.0.0 # Enhanced Unicode support for Thai and other scripts
|
| 61 |
+
ftfy>=6.1.1 # Text fixing and encoding repair
|
| 62 |
+
|
| 63 |
+
# Configuration and validation
|
| 64 |
+
pydantic>=2.0.0 # Data validation and settings management
|
| 65 |
+
confuse>=2.0.0 # Configuration file handling
|
| 66 |
+
|
| 67 |
# System dependencies information (install separately):
|
| 68 |
#
|
| 69 |
# For Ubuntu/Debian:
|
| 70 |
# sudo apt-get update
|
| 71 |
+
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-tha
|
| 72 |
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 73 |
# sudo apt-get install -y python3-opencv # Alternative OpenCV installation
|
| 74 |
# sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
|
| 75 |
+
# sudo apt-get install -y fonts-thai-tlwg fonts-thai-tlwg-otf # Thai font support
|
| 76 |
+
# sudo apt-get install -y language-pack-th # Thai language support
|
| 77 |
+
# sudo apt-get install -y fonts-noto fonts-noto-cjk # Unicode font support
|
| 78 |
#
|
| 79 |
# For CentOS/RHEL:
|
| 80 |
+
# sudo yum install -y tesseract tesseract-langpack-eng tesseract-langpack-tha
|
| 81 |
# sudo yum install -y opencv-python
|
| 82 |
# sudo yum install -y libxml2-devel libxslt-devel
|
| 83 |
+
# sudo yum install -y thai-scalable-fonts google-noto-fonts
|
| 84 |
#
|
| 85 |
# For macOS:
|
| 86 |
# brew install tesseract
|
| 87 |
+
# brew install tesseract-lang # Includes Thai support
|
| 88 |
# brew install opencv
|
| 89 |
# brew install libxml2
|
| 90 |
+
# brew install font-thai-fonts font-noto
|
| 91 |
#
|
| 92 |
# For Windows:
|
| 93 |
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 94 |
+
# Download Thai language data from: https://github.com/tesseract-ocr/tessdata
|
| 95 |
+
# Download Thai numerals training data if available
|
| 96 |
# Add Tesseract to PATH environment variable
|
| 97 |
# OpenCV and other packages should install automatically with pip
|
| 98 |
+
# Install Thai fonts from Windows Language Settings
|
| 99 |
+
# Install Unicode fonts (Noto fonts recommended)
|
| 100 |
|
| 101 |
# Development and testing (optional)
|
| 102 |
pytest>=7.0.0
|
| 103 |
pytest-cov>=4.0.0
|
| 104 |
+
pytest-asyncio>=0.21.0 # For async testing
|
| 105 |
+
pytest-mock>=3.11.0 # For mocking in tests
|
| 106 |
black>=23.0.0 # Code formatting
|
| 107 |
flake8>=6.0.0 # Code linting
|
| 108 |
+
mypy>=1.5.0 # Type checking
|
| 109 |
+
isort>=5.12.0 # Import sorting
|
| 110 |
|
| 111 |
# Performance monitoring (optional)
|
| 112 |
memory-profiler>=0.60.0
|
| 113 |
psutil>=5.9.0 # System monitoring
|
| 114 |
+
py-spy>=0.3.14 # Performance profiling
|
| 115 |
+
|
| 116 |
+
# Enhanced error handling and debugging
|
| 117 |
+
rich>=13.0.0 # Rich console output for debugging
|
| 118 |
+
icecream>=2.1.3 # Enhanced debugging print statements
|
| 119 |
|
| 120 |
+
# Enhanced file type detection
|
| 121 |
+
python-magic>=0.4.27 # File type detection
|
| 122 |
+
filetype>=1.2.0 # Alternative file type detection
|
| 123 |
+
|
| 124 |
+
# Additional text processing utilities
|
| 125 |
+
Unidecode>=1.3.6 # ASCII transliteration for Unicode text
|
| 126 |
+
langdetect>=1.0.9 # Language detection for multi-language documents
|
| 127 |
+
|
| 128 |
+
# Note: The enhanced version includes comprehensive features:
|
| 129 |
+
#
|
| 130 |
+
# COMPREHENSIVE INDENTATION DETECTION FEATURES:
|
| 131 |
+
# ===============================================
|
| 132 |
+
#
|
| 133 |
+
# 1. HIERARCHICAL NUMBERING PATTERNS:
|
| 134 |
+
# - Decimal hierarchy: 1.1.1.1.1... (unlimited depth)
|
| 135 |
+
# - Mixed hierarchy: 1.2.a.i.A... (numbers, letters, Roman mixed)
|
| 136 |
+
# - Legal numbering: 1.1.1(a)(i) (with parenthetical sub-sections)
|
| 137 |
+
# - Outline numbering: I.A.1.a.i. (formal document structure)
|
| 138 |
+
# - Section numbering: §1.2.3, Article 1.1.1, Chapter 1.2
|
| 139 |
+
#
|
| 140 |
+
# 2. PARENTHETICAL PATTERNS (NEW):
|
| 141 |
+
# - Arabic numerals: (1), (2), (3), (10), (25)...
|
| 142 |
+
# - Thai numerals: (๑), (๒), (๓), (๑๐), (๒๕)...
|
| 143 |
+
# - Lowercase letters: (a), (b), (c)... (z), (aa), (bb)...
|
| 144 |
+
# - Uppercase letters: (A), (B), (C)... (Z), (AA), (BB)...
|
| 145 |
+
# - Thai letters: (ก), (ข), (ค)... (ฮ)
|
| 146 |
+
# - Lowercase Roman: (i), (ii), (iii), (iv), (v)...
|
| 147 |
+
# - Uppercase Roman: (I), (II), (III), (IV), (V)...
|
| 148 |
+
#
|
| 149 |
+
# 3. TRADITIONAL PATTERNS:
|
| 150 |
+
# - Simple numbered lists: 1., 2., 3.
|
| 151 |
+
# - Simple numbered with parens: 1), 2), 3)
|
| 152 |
+
# - Letter lists: a., b., c. and A., B., C.
|
| 153 |
+
# - Thai letters: ก., ข., ค.
|
| 154 |
+
# - Roman numerals: i., ii., iii. and I., II., III.
|
| 155 |
+
# - Multiple bullet styles: •◦▪→ and 20+ more symbols
|
| 156 |
+
# - Checkbox items: [x], [ ], [✓], [✗]
|
| 157 |
+
# - Arrow bullets: →, ←, ↑, ↓, ⇒, ➔ and more
|
| 158 |
+
# - Dash bullets: -, *, +, ~, =
|
| 159 |
+
#
|
| 160 |
+
# 4. MULTI-LANGUAGE SUPPORT:
|
| 161 |
+
# - Thai script: มาตรา, ข้อ, หมวด, ส่วน
|
| 162 |
+
# - Thai numerals: ๐๑๒๓๔๕๖๗๘๙
|
| 163 |
+
# - Thai letters: ก-ฮ (44 consonants)
|
| 164 |
+
# - Unicode symbols: Full range of bullet and arrow characters
|
| 165 |
+
# - Mixed language documents: English + Thai seamlessly
|
| 166 |
+
#
|
| 167 |
+
# 5. SPACE-BASED INDENTATION:
|
| 168 |
+
# - Automatic detection of space-based indentation levels
|
| 169 |
+
# - 4-space = 1 level standard
|
| 170 |
+
# - Combining space indentation with pattern indentation
|
| 171 |
+
# - Up to 10 indentation levels supported
|
| 172 |
+
#
|
| 173 |
+
# 6. PRIORITY-BASED PATTERN MATCHING:
|
| 174 |
+
# - Hierarchical patterns get higher priority
|
| 175 |
+
# - Parenthetical patterns prioritized appropriately
|
| 176 |
+
# - Prevents false positives in pattern detection
|
| 177 |
+
# - Smart disambiguation between similar patterns
|
| 178 |
+
#
|
| 179 |
+
# INTELLIGENT TEXT CLASSIFICATION FEATURES:
|
| 180 |
+
# =========================================
|
| 181 |
+
#
|
| 182 |
+
# 1. HEADER DETECTION:
|
| 183 |
+
# - Title case detection: "Chapter One Introduction"
|
| 184 |
+
# - All caps detection: "SECTION A: OVERVIEW"
|
| 185 |
+
# - Numbered headers: "1. INTRODUCTION"
|
| 186 |
+
# - Section headers: "SECTION 1.2.3", "CHAPTER IV"
|
| 187 |
+
# - Thai headers: "หมวด ๑", "บท ก"
|
| 188 |
+
# - Short line detection: Lines under 50 characters
|
| 189 |
+
# - Position-based detection: Top of page content
|
| 190 |
+
# - Font size consideration: Larger fonts = likely headers
|
| 191 |
+
#
|
| 192 |
+
# 2. PARAGRAPH CLASSIFICATION:
|
| 193 |
+
# - Long text detection: Over 100 characters
|
| 194 |
+
# - Proper punctuation: Ends with periods
|
| 195 |
+
# - Context analysis: Position and formatting
|
| 196 |
+
# - Multi-sentence detection
|
| 197 |
+
# - Normal text flow patterns
|
| 198 |
+
#
|
| 199 |
+
# 3. LIST ITEM RECOGNITION:
|
| 200 |
+
# - Pattern-based identification
|
| 201 |
+
# - Numbered list items
|
| 202 |
+
# - Bulleted list items
|
| 203 |
+
# - Lettered list items
|
| 204 |
+
# - Roman numeral lists
|
| 205 |
+
# - Parenthetical lists
|
| 206 |
+
# - Checkbox lists
|
| 207 |
+
#
|
| 208 |
+
# 4. CONFIDENCE SCORING:
|
| 209 |
+
# - 0.0 to 1.0 confidence levels
|
| 210 |
+
# - Multiple factors considered
|
| 211 |
+
# - Context-aware scoring
|
| 212 |
+
# - Threshold-based classification
|
| 213 |
+
#
|
| 214 |
+
# 5. DOCUMENT STRUCTURE ANALYSIS:
|
| 215 |
+
# - Overall document statistics
|
| 216 |
+
# - Pattern distribution analysis
|
| 217 |
+
# - Coverage percentage calculation
|
| 218 |
+
# - Dominant pattern identification
|
| 219 |
+
# - Text type distribution
|
| 220 |
+
#
|
| 221 |
+
# ENHANCED PROCESSING FEATURES:
|
| 222 |
+
# =============================
|
| 223 |
+
#
|
| 224 |
+
# 1. HTML INTERMEDIATE PROCESSING:
|
| 225 |
+
# - Better structure preservation
|
| 226 |
+
# - CSS-based indentation levels
|
| 227 |
+
# - Pattern-specific styling
|
| 228 |
+
# - Text classification styling
|
| 229 |
+
# - Responsive design
|
| 230 |
+
#
|
| 231 |
+
# 2. TABLE HANDLING:
|
| 232 |
+
# - Smart overlap detection (70% threshold)
|
| 233 |
+
# - Prevents text loss in tables
|
| 234 |
+
# - Improved coordinate calculations
|
| 235 |
+
# - Better boundary detection
|
| 236 |
+
#
|
| 237 |
+
# 3. EXPORT CAPABILITIES:
|
| 238 |
+
# - Enhanced TXT: Preserved indentation and structure
|
| 239 |
+
# - Enhanced DOCX: Color-coded formatting, proper indentation
|
| 240 |
+
# - Enhanced HTML: CSS styling, responsive design
|
| 241 |
+
# - All formats preserve pattern recognition results
|
| 242 |
+
#
|
| 243 |
+
# 4. CROP PROCESSING:
|
| 244 |
+
# - High-resolution processing (2x scale)
|
| 245 |
+
# - Per-page customization
|
| 246 |
+
# - Real-time preview
|
| 247 |
+
# - Enhanced coordinate handling
|
| 248 |
+
#
|
| 249 |
+
# 5. PERFORMANCE MONITORING:
|
| 250 |
+
# - Processing time tracking
|
| 251 |
+
# - Success rate monitoring
|
| 252 |
+
# - Pattern usage statistics
|
| 253 |
+
# - Document analysis metrics
|
| 254 |
+
#
|
| 255 |
+
# TECHNICAL IMPROVEMENTS:
|
| 256 |
+
# ======================
|
| 257 |
+
#
|
| 258 |
+
# 1. ADVANCED REGEX PATTERNS:
|
| 259 |
+
# - Unicode-aware pattern matching
|
| 260 |
+
# - Thai script support
|
| 261 |
+
# - Complex parenthetical detection
|
| 262 |
+
# - Priority-based matching system
|
| 263 |
+
#
|
| 264 |
+
# 2. ERROR HANDLING:
|
| 265 |
+
# - Comprehensive error catching
|
| 266 |
+
# - Graceful degradation
|
| 267 |
+
# - Detailed logging
|
| 268 |
+
# - Recovery mechanisms
|
| 269 |
+
#
|
| 270 |
+
# 3. TESTING CAPABILITIES:
|
| 271 |
+
# - Unit tests for pattern detection
|
| 272 |
+
# - Integration tests for OCR
|
| 273 |
+
# - Performance benchmarking
|
| 274 |
+
# - Coverage reporting
|
| 275 |
+
#
|
| 276 |
+
# 4. DEBUGGING SUPPORT:
|
| 277 |
+
# - Rich console output
|
| 278 |
+
# - Structured logging
|
| 279 |
+
# - Pattern detection debugging
|
| 280 |
+
# - Classification confidence display
|
| 281 |
+
#
|
| 282 |
+
# INSTALLATION NOTES:
|
| 283 |
+
# ==================
|
| 284 |
+
#
|
| 285 |
+
# 1. SYSTEM DEPENDENCIES:
|
| 286 |
+
# Install system dependencies first (see comments above)
|
| 287 |
+
# Ensure Thai language support is installed
|
| 288 |
+
# Install Unicode fonts for proper display
|
| 289 |
+
#
|
| 290 |
+
# 2. PYTHON DEPENDENCIES:
|
| 291 |
+
# Run: pip install -r requirements.txt
|
| 292 |
+
# Consider using virtual environment
|
| 293 |
+
# Update pip before installation: pip install --upgrade pip
|
| 294 |
+
#
|
| 295 |
+
# 3. AZURE CONFIGURATION (OPTIONAL):
|
| 296 |
+
# Set environment variables:
|
| 297 |
+
# - AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
|
| 298 |
+
# - AZURE_DOCUMENT_INTELLIGENCE_KEY
|
| 299 |
+
#
|
| 300 |
+
# 4. TESTING:
|
| 301 |
+
# Test with sample documents containing:
|
| 302 |
+
# - Various indentation patterns
|
| 303 |
+
# - Parenthetical numbering
|
| 304 |
+
# - Mixed languages (English + Thai)
|
| 305 |
+
# - Complex document structures
|
| 306 |
+
# - Tables and lists
|
| 307 |
+
#
|
| 308 |
+
# 5. PERFORMANCE OPTIMIZATION:
|
| 309 |
+
# For high-volume processing:
|
| 310 |
+
# - Consider increasing system memory
|
| 311 |
+
# - Use SSD storage for temporary files
|
| 312 |
+
# - Monitor CPU usage during processing
|
| 313 |
+
# - Configure appropriate log levels
|
| 314 |
+
#
|
| 315 |
+
# SUPPORTED LANGUAGES AND SCRIPTS:
|
| 316 |
+
# ================================
|
| 317 |
+
#
|
| 318 |
+
# - English: Full comprehensive support
|
| 319 |
+
# - Thai: Complete support including numerals and letters
|
| 320 |
+
# - Arabic numerals: 0-9 in all contexts
|
| 321 |
+
# - Roman numerals: I, V, X, L, C, D, M and combinations
|
| 322 |
+
# - Unicode symbols: Full range of bullets, arrows, and marks
|
| 323 |
+
# - Mixed documents: Seamless handling of multi-language content
|
| 324 |
+
# - International conventions: Support for various numbering systems
|
| 325 |
+
#
|
| 326 |
+
# VERSION COMPATIBILITY:
|
| 327 |
+
# =====================
|
| 328 |
+
#
|
| 329 |
+
# - Python: 3.8+ required, 3.10+ recommended
|
| 330 |
+
# - Operating Systems: Windows, macOS, Linux
|
| 331 |
+
# - Memory: 4GB+ recommended for large documents
|
| 332 |
+
# - Storage: 1GB+ free space for temporary files
|
| 333 |
+
# - Network: Required for Azure Document Intelligence (optional)
|
| 334 |
+
#
|
| 335 |
+
# This enhanced version provides the most comprehensive indentation detection
|
| 336 |
+
# and text classification system available, with particular strength in:
|
| 337 |
+
# - Parenthetical pattern recognition ((1), (๑), (a), (i), (ก))
|
| 338 |
+
# - Thai language and script support
|
| 339 |
+
# - Intelligent document structure analysis
|
| 340 |
+
# - Multi-format export with preserved formatting
|
| 341 |
+
# - Real-time pattern demonstration and analysis
|