Spaces:
Running
Running
Upload 10 files
Browse files- .gitattributes +35 -35
- app.py +635 -783
- backend.py +556 -413
- ocr_service.py +790 -324
- readme.md +196 -157
- requirements.txt +24 -8
.gitattributes
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -1,18 +1,16 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Gradio UI for PDF OCR Service - Enhanced with Header/Footer Removal
|
| 3 |
-
User interface for PDF to text conversion with multiple OCR providers and preprocessing options
|
| 4 |
-
"""
|
| 5 |
import re
|
| 6 |
import gradio as gr
|
| 7 |
import os
|
| 8 |
import tempfile
|
| 9 |
import logging
|
|
|
|
| 10 |
from pathlib import Path
|
| 11 |
from datetime import datetime
|
| 12 |
import cv2
|
| 13 |
import numpy as np
|
| 14 |
from PIL import Image
|
| 15 |
import fitz # PyMuPDF
|
|
|
|
| 16 |
|
| 17 |
# Load environment variables
|
| 18 |
from dotenv import load_dotenv
|
|
@@ -28,179 +26,330 @@ logger = logging.getLogger(__name__)
|
|
| 28 |
backend_manager = BackendManager()
|
| 29 |
|
| 30 |
# Check if python-docx is available
|
| 31 |
-
from docx.shared import Pt
|
| 32 |
-
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 33 |
try:
|
| 34 |
from docx import Document
|
| 35 |
-
from docx.shared import Inches
|
|
|
|
| 36 |
HAS_DOCX_SUPPORT = True
|
| 37 |
logger.info("DOCX export available")
|
| 38 |
except ImportError:
|
| 39 |
HAS_DOCX_SUPPORT = False
|
| 40 |
logger.info("DOCX export not available - install python-docx to enable")
|
| 41 |
|
| 42 |
-
# Global variables for crop
|
| 43 |
-
|
| 44 |
-
'
|
| 45 |
-
'
|
| 46 |
-
'
|
| 47 |
-
'
|
|
|
|
| 48 |
}
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
def
|
| 52 |
-
"""
|
| 53 |
if pdf_file is None:
|
| 54 |
-
return None
|
| 55 |
|
| 56 |
try:
|
| 57 |
-
|
| 58 |
-
doc = fitz.open(pdf_path)
|
| 59 |
-
|
| 60 |
-
if page_num >= len(doc):
|
| 61 |
-
page_num = 0
|
| 62 |
-
|
| 63 |
-
page = doc.load_page(page_num)
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
return img_array
|
| 78 |
except Exception as e:
|
| 79 |
-
logger.error(f"Error
|
| 80 |
return None
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
if pdf_file is None:
|
| 86 |
return None
|
| 87 |
|
| 88 |
try:
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
#
|
| 94 |
-
|
| 95 |
-
# RGBA to RGB
|
| 96 |
-
img_array = img_array[:, :, :3]
|
| 97 |
-
|
| 98 |
-
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
|
| 99 |
-
height, width = img_bgr.shape[:2]
|
| 100 |
-
|
| 101 |
-
# Calculate crop areas
|
| 102 |
-
top_px = int(height * top_crop / 100)
|
| 103 |
-
bottom_px = int(height * bottom_crop / 100)
|
| 104 |
-
left_px = int(width * left_crop / 100)
|
| 105 |
-
right_px = int(width * right_crop / 100)
|
| 106 |
-
|
| 107 |
-
# Store current settings
|
| 108 |
-
current_crop_settings.update({
|
| 109 |
-
'top': top_px,
|
| 110 |
-
'bottom': bottom_px,
|
| 111 |
-
'left': left_px,
|
| 112 |
-
'right': right_px
|
| 113 |
-
})
|
| 114 |
-
|
| 115 |
-
# Create overlay
|
| 116 |
-
overlay = img_bgr.copy()
|
| 117 |
-
|
| 118 |
-
# Draw crop areas in red (areas to be removed)
|
| 119 |
-
if top_px > 0:
|
| 120 |
-
cv2.rectangle(overlay, (0, 0), (width, top_px), (0, 0, 255), -1)
|
| 121 |
-
if bottom_px > 0:
|
| 122 |
-
cv2.rectangle(overlay, (0, height - bottom_px), (width, height), (0, 0, 255), -1)
|
| 123 |
-
if left_px > 0:
|
| 124 |
-
cv2.rectangle(overlay, (0, 0), (left_px, height), (0, 0, 255), -1)
|
| 125 |
-
if right_px > 0:
|
| 126 |
-
cv2.rectangle(overlay, (width - right_px, 0), (width, height), (0, 0, 255), -1)
|
| 127 |
-
|
| 128 |
-
# Draw content area outline in green
|
| 129 |
-
content_top = top_px
|
| 130 |
-
content_bottom = height - bottom_px
|
| 131 |
-
content_left = left_px
|
| 132 |
-
content_right = width - right_px
|
| 133 |
-
|
| 134 |
-
if content_right > content_left and content_bottom > content_top:
|
| 135 |
-
cv2.rectangle(overlay, (content_left, content_top), (content_right, content_bottom), (0, 255, 0), 3)
|
| 136 |
-
|
| 137 |
-
# Blend overlay with original
|
| 138 |
-
result = cv2.addWeighted(img_bgr, 0.7, overlay, 0.3, 0)
|
| 139 |
-
|
| 140 |
-
# Add text annotations
|
| 141 |
-
cv2.putText(result, "RED: Areas to remove", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
|
| 142 |
-
cv2.putText(result, "GREEN: Content area", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
|
| 143 |
-
|
| 144 |
-
# Convert back to RGB for display
|
| 145 |
-
result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
|
| 146 |
-
|
| 147 |
-
return result_rgb
|
| 148 |
|
| 149 |
except Exception as e:
|
| 150 |
logger.error(f"Error updating crop preview: {e}")
|
| 151 |
return None
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
Process
|
| 158 |
-
"""
|
| 159 |
if pdf_file is None:
|
| 160 |
-
return "No file uploaded.", "", "
|
| 161 |
|
| 162 |
-
temp_file_path = None
|
| 163 |
try:
|
| 164 |
-
progress(0.1, desc="Initializing...")
|
| 165 |
-
|
| 166 |
-
# Handle Gradio file object
|
| 167 |
-
temp_file_path = pdf_file.name
|
| 168 |
|
| 169 |
-
# Prepare preprocessing options
|
| 170 |
preprocessing_options = {
|
| 171 |
'enable_header_footer_removal': enable_header_footer_removal,
|
| 172 |
-
'
|
| 173 |
-
|
| 174 |
-
'bottom': crop_bottom,
|
| 175 |
-
'left': crop_left,
|
| 176 |
-
'right': crop_right
|
| 177 |
-
}
|
| 178 |
}
|
| 179 |
|
| 180 |
-
progress(0.3, desc="Processing
|
| 181 |
|
| 182 |
-
# Process the PDF with
|
| 183 |
-
result = backend_manager.
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
progress(0.9, desc="Finalizing...")
|
| 186 |
progress(1.0, desc="Complete!")
|
| 187 |
|
| 188 |
if result['success']:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
else:
|
| 194 |
error_msg = result.get('error', 'Unknown error occurred')
|
| 195 |
-
return f"Error: {error_msg}", "",
|
| 196 |
|
| 197 |
except Exception as e:
|
| 198 |
-
logger.error(f"
|
| 199 |
-
return f"Error: {str(e)}", "",
|
| 200 |
-
|
| 201 |
|
| 202 |
-
def
|
| 203 |
-
"""
|
| 204 |
if not metadata:
|
| 205 |
return f"Method used: {method_used}"
|
| 206 |
|
|
@@ -209,750 +358,453 @@ def format_metadata(metadata, method_used):
|
|
| 209 |
if 'pages' in metadata:
|
| 210 |
info_lines.append(f"Pages processed: {metadata['pages']}")
|
| 211 |
|
| 212 |
-
if '
|
| 213 |
-
info_lines.append(
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
-
if '
|
| 216 |
-
|
| 217 |
-
info_lines.append(f"Handwritten content: {handwritten_status}")
|
| 218 |
|
| 219 |
-
if '
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
| 222 |
|
| 223 |
if 'processing_time_seconds' in metadata:
|
| 224 |
info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
|
| 225 |
|
| 226 |
return "\n".join(info_lines)
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
encoding='utf-8'
|
| 237 |
)
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
temp_file.write("PDF OCR Extraction Results\n")
|
| 242 |
-
temp_file.write("=" * 50 + "\n\n")
|
| 243 |
-
|
| 244 |
-
# Add metadata
|
| 245 |
-
if metadata_info:
|
| 246 |
-
temp_file.write("Processing Information:\n")
|
| 247 |
-
temp_file.write("-" * 25 + "\n")
|
| 248 |
-
temp_file.write(metadata_info + "\n\n")
|
| 249 |
-
|
| 250 |
-
# Add timestamp
|
| 251 |
-
temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 252 |
-
temp_file.write("=" * 50 + "\n\n")
|
| 253 |
-
|
| 254 |
-
# Add main content with clean table processing
|
| 255 |
-
temp_file.write("Extracted Text:\n")
|
| 256 |
-
temp_file.write("-" * 15 + "\n\n")
|
| 257 |
-
|
| 258 |
-
# Process content to clean up table duplications
|
| 259 |
-
cleaned_content = _clean_text_content_for_txt(text_content)
|
| 260 |
-
temp_file.write(cleaned_content)
|
| 261 |
-
|
| 262 |
-
temp_file.close()
|
| 263 |
-
return temp_file.name
|
| 264 |
-
|
| 265 |
-
except Exception as e:
|
| 266 |
-
logger.error(f"Error creating TXT file: {e}")
|
| 267 |
-
temp_file.close()
|
| 268 |
-
raise
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
def _clean_text_content_for_txt(content):
|
| 272 |
-
"""Clean text content for TXT export, removing table duplications"""
|
| 273 |
-
if not content.strip():
|
| 274 |
-
return content
|
| 275 |
-
|
| 276 |
-
# Split by pages first
|
| 277 |
-
if '=== PAGE ' in content:
|
| 278 |
-
pages = content.split('=== PAGE ')
|
| 279 |
-
cleaned_pages = []
|
| 280 |
-
|
| 281 |
-
for i, page_content in enumerate(pages):
|
| 282 |
-
if i == 0 and not page_content.strip():
|
| 283 |
-
continue
|
| 284 |
-
|
| 285 |
-
if i > 0:
|
| 286 |
-
# Add page header
|
| 287 |
-
page_num = page_content.split(' ===')[0] if ' ===' in page_content else str(i)
|
| 288 |
-
cleaned_pages.append(f"\n--- Page {page_num} ---\n")
|
| 289 |
-
|
| 290 |
-
# Get content after page header
|
| 291 |
-
content_part = page_content.split('===\n', 1)[-1] if '===\n' in page_content else page_content
|
| 292 |
-
else:
|
| 293 |
-
content_part = page_content
|
| 294 |
-
|
| 295 |
-
# Clean this page's content
|
| 296 |
-
cleaned_page = _clean_page_content_for_txt(content_part)
|
| 297 |
-
if cleaned_page.strip():
|
| 298 |
-
cleaned_pages.append(cleaned_page)
|
| 299 |
-
|
| 300 |
-
return '\n'.join(cleaned_pages)
|
| 301 |
-
else:
|
| 302 |
-
# No page structure, clean as single content
|
| 303 |
-
return _clean_page_content_for_txt(content)
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
def _clean_page_content_for_txt(content):
|
| 307 |
-
"""Clean a single page's content for TXT export"""
|
| 308 |
-
if not content.strip():
|
| 309 |
-
return ""
|
| 310 |
-
|
| 311 |
-
import re
|
| 312 |
-
|
| 313 |
-
# Split content by table markers
|
| 314 |
-
parts = re.split(r'\n?--- TABLE \d+ ---\n?', content)
|
| 315 |
-
|
| 316 |
-
cleaned_parts = []
|
| 317 |
-
table_count = 0
|
| 318 |
-
|
| 319 |
-
# Find all table sections
|
| 320 |
-
table_matches = re.finditer(r'\n?--- TABLE (\d+) ---\n?(.*?)(?=\n?--- TABLE \d+ ---|$)', content, re.DOTALL)
|
| 321 |
-
table_contents = {}
|
| 322 |
-
|
| 323 |
-
for match in table_matches:
|
| 324 |
-
table_num = match.group(1)
|
| 325 |
-
table_content = match.group(2).strip()
|
| 326 |
-
table_contents[int(table_num)] = table_content
|
| 327 |
-
|
| 328 |
-
# Process each part
|
| 329 |
-
for i, part in enumerate(parts):
|
| 330 |
-
if part.strip():
|
| 331 |
-
# Clean the text part
|
| 332 |
-
cleaned_part = _clean_text_part(part)
|
| 333 |
-
if cleaned_part.strip():
|
| 334 |
-
cleaned_parts.append(cleaned_part)
|
| 335 |
-
|
| 336 |
-
# Add table if this part was followed by one
|
| 337 |
-
if i < len(parts) - 1: # Not the last part
|
| 338 |
-
table_count += 1
|
| 339 |
-
if table_count in table_contents:
|
| 340 |
-
table_header = f"\n--- TABLE {table_count} ---\n"
|
| 341 |
-
table_text = _format_table_for_txt(table_contents[table_count])
|
| 342 |
-
cleaned_parts.append(table_header + table_text)
|
| 343 |
-
|
| 344 |
-
return '\n'.join(cleaned_parts)
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
def _clean_text_part(text_part):
|
| 348 |
-
"""Clean a text part of any remaining table content"""
|
| 349 |
-
if not text_part.strip():
|
| 350 |
-
return ""
|
| 351 |
-
|
| 352 |
-
import re
|
| 353 |
-
|
| 354 |
-
# Remove any stray table markers
|
| 355 |
-
cleaned = re.sub(r'\n?--- TABLE \d+ ---\n?', '', text_part)
|
| 356 |
-
cleaned = re.sub(r'\n?--- Table \d+ ---\n?', '', cleaned)
|
| 357 |
-
|
| 358 |
-
# Split into lines and filter out table-like content
|
| 359 |
-
lines = cleaned.split('\n')
|
| 360 |
-
filtered_lines = []
|
| 361 |
-
|
| 362 |
-
for line in lines:
|
| 363 |
-
line = line.strip()
|
| 364 |
-
if not line:
|
| 365 |
-
filtered_lines.append('') # Keep empty lines for spacing
|
| 366 |
-
continue
|
| 367 |
-
|
| 368 |
-
# Skip lines that look like table content (multiple | separators)
|
| 369 |
-
if line.count('|') >= 2:
|
| 370 |
-
continue
|
| 371 |
-
|
| 372 |
-
# Skip separator lines
|
| 373 |
-
if line.replace('-', '').replace(' ', '').replace('|', '') == '':
|
| 374 |
-
continue
|
| 375 |
-
|
| 376 |
-
filtered_lines.append(line)
|
| 377 |
-
|
| 378 |
-
# Remove excessive empty lines
|
| 379 |
-
result_lines = []
|
| 380 |
-
prev_empty = False
|
| 381 |
-
|
| 382 |
-
for line in filtered_lines:
|
| 383 |
-
if line == '':
|
| 384 |
-
if not prev_empty:
|
| 385 |
-
result_lines.append(line)
|
| 386 |
-
prev_empty = True
|
| 387 |
-
else:
|
| 388 |
-
result_lines.append(line)
|
| 389 |
-
prev_empty = False
|
| 390 |
-
|
| 391 |
-
return '\n'.join(result_lines)
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
def _format_table_for_txt(table_content):
|
| 395 |
-
"""Format table content nicely for TXT output"""
|
| 396 |
-
if not table_content.strip():
|
| 397 |
-
return ""
|
| 398 |
-
|
| 399 |
-
lines = [line.strip() for line in table_content.split('\n') if line.strip()]
|
| 400 |
-
|
| 401 |
-
# Look for table structure
|
| 402 |
-
table_lines = []
|
| 403 |
-
for line in lines:
|
| 404 |
-
if '|' in line:
|
| 405 |
-
# Clean up the table line
|
| 406 |
-
cells = [cell.strip() for cell in line.split('|')]
|
| 407 |
-
# Remove empty cells at start/end
|
| 408 |
-
while cells and not cells[0]:
|
| 409 |
-
cells.pop(0)
|
| 410 |
-
while cells and not cells[-1]:
|
| 411 |
-
cells.pop()
|
| 412 |
-
if cells:
|
| 413 |
-
table_lines.append(cells)
|
| 414 |
-
|
| 415 |
-
if not table_lines:
|
| 416 |
-
return table_content # Return as is if no table structure found
|
| 417 |
-
|
| 418 |
-
# Calculate column widths
|
| 419 |
-
if table_lines:
|
| 420 |
-
max_cols = max(len(row) for row in table_lines)
|
| 421 |
-
col_widths = [0] * max_cols
|
| 422 |
-
|
| 423 |
-
for row in table_lines:
|
| 424 |
-
for i in range(min(len(row), max_cols)):
|
| 425 |
-
col_widths[i] = max(col_widths[i], len(row[i]) if i < len(row) else 0)
|
| 426 |
-
|
| 427 |
-
# Format table with proper alignment
|
| 428 |
-
formatted_lines = []
|
| 429 |
-
for i, row in enumerate(table_lines):
|
| 430 |
-
formatted_row = []
|
| 431 |
-
for j in range(max_cols):
|
| 432 |
-
cell_content = row[j] if j < len(row) else ""
|
| 433 |
-
width = max(col_widths[j], 3)
|
| 434 |
-
formatted_row.append(cell_content.ljust(width))
|
| 435 |
-
|
| 436 |
-
formatted_lines.append(" | ".join(formatted_row))
|
| 437 |
-
|
| 438 |
-
# Add separator after header row
|
| 439 |
-
if i == 0 and len(table_lines) > 1:
|
| 440 |
-
separator = " | ".join(["-" * max(col_widths[k], 3) for k in range(max_cols)])
|
| 441 |
-
formatted_lines.append(separator)
|
| 442 |
-
|
| 443 |
-
return '\n'.join(formatted_lines)
|
| 444 |
-
|
| 445 |
-
return table_content
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
def create_docx_file(text_content, metadata_info=""):
|
| 449 |
-
"""Create DOCX file with enhanced table handling - NO separator rows"""
|
| 450 |
-
if not HAS_DOCX_SUPPORT:
|
| 451 |
-
raise ImportError("python-docx not installed. Cannot create DOCX files.")
|
| 452 |
-
|
| 453 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 454 |
-
temp_file = tempfile.NamedTemporaryFile(
|
| 455 |
-
suffix=f'_extracted_text_{timestamp}.docx',
|
| 456 |
-
delete=False
|
| 457 |
-
)
|
| 458 |
-
temp_file.close()
|
| 459 |
-
|
| 460 |
-
try:
|
| 461 |
-
from docx import Document
|
| 462 |
-
from docx.shared import Inches, Pt
|
| 463 |
-
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
| 464 |
-
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 465 |
-
|
| 466 |
-
doc = Document()
|
| 467 |
-
|
| 468 |
-
# Set margins
|
| 469 |
-
sections = doc.sections
|
| 470 |
-
for section in sections:
|
| 471 |
-
section.top_margin = Inches(1)
|
| 472 |
-
section.bottom_margin = Inches(1)
|
| 473 |
-
section.left_margin = Inches(1)
|
| 474 |
-
section.right_margin = Inches(1)
|
| 475 |
-
|
| 476 |
-
# Title
|
| 477 |
-
title = doc.add_heading('PDF OCR Extraction Results', 0)
|
| 478 |
-
title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
| 479 |
-
|
| 480 |
-
# Metadata
|
| 481 |
-
if metadata_info:
|
| 482 |
-
doc.add_heading('Processing Information', level=1)
|
| 483 |
-
metadata_para = doc.add_paragraph(metadata_info)
|
| 484 |
-
metadata_para.style = 'Intense Quote'
|
| 485 |
-
doc.add_page_break()
|
| 486 |
-
|
| 487 |
-
# Enhanced content processing
|
| 488 |
-
_add_enhanced_content_to_docx(doc, text_content)
|
| 489 |
-
|
| 490 |
-
# Footer
|
| 491 |
-
footer_section = doc.sections[0]
|
| 492 |
-
footer = footer_section.footer
|
| 493 |
-
footer_para = footer.paragraphs[0]
|
| 494 |
-
footer_para.text = f"Generated by PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
| 495 |
-
|
| 496 |
-
doc.save(temp_file.name)
|
| 497 |
-
logger.info(f"Enhanced DOCX file created: {temp_file.name}")
|
| 498 |
-
return temp_file.name
|
| 499 |
-
|
| 500 |
-
except Exception as e:
|
| 501 |
-
logger.error(f"Error creating DOCX file: {e}")
|
| 502 |
try:
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
if line.startswith('=== PAGE '):
|
| 523 |
-
if current_table_content:
|
| 524 |
-
_add_enhanced_table(doc, current_table_content)
|
| 525 |
-
current_table_content = []
|
| 526 |
-
in_table = False
|
| 527 |
-
|
| 528 |
-
page_num = line.replace('=== PAGE ', '').replace(' ===', '')
|
| 529 |
-
doc.add_heading(f'Page {page_num}', level=1)
|
| 530 |
-
continue
|
| 531 |
-
|
| 532 |
-
# Handle table start
|
| 533 |
-
if line.startswith('--- TABLE '):
|
| 534 |
-
if current_table_content:
|
| 535 |
-
_add_enhanced_table(doc, current_table_content)
|
| 536 |
-
|
| 537 |
-
current_table_content = []
|
| 538 |
-
in_table = True
|
| 539 |
-
table_num = line.replace('--- TABLE ', '').replace(' ---', '')
|
| 540 |
-
current_table_content.append(f"Table {table_num}")
|
| 541 |
-
continue
|
| 542 |
-
|
| 543 |
-
# Handle content
|
| 544 |
-
if in_table:
|
| 545 |
-
if line and not line.startswith('==='):
|
| 546 |
-
current_table_content.append(line)
|
| 547 |
-
else:
|
| 548 |
-
# Regular text
|
| 549 |
-
if line:
|
| 550 |
-
if line.startswith('# '):
|
| 551 |
-
doc.add_heading(line[2:], level=1)
|
| 552 |
-
elif line.startswith('## '):
|
| 553 |
-
doc.add_heading(line[3:], level=2)
|
| 554 |
-
elif line.startswith('### '):
|
| 555 |
-
doc.add_heading(line[4:], level=3)
|
| 556 |
-
else:
|
| 557 |
-
doc.add_paragraph(line)
|
| 558 |
-
else:
|
| 559 |
-
# Empty line - add small space
|
| 560 |
-
doc.add_paragraph("")
|
| 561 |
-
|
| 562 |
-
# Handle any remaining table
|
| 563 |
-
if current_table_content:
|
| 564 |
-
_add_enhanced_table(doc, current_table_content)
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
def _add_enhanced_table(doc, table_content):
|
| 568 |
-
"""Add table with enhanced processing - REMOVES separator rows"""
|
| 569 |
-
if not table_content:
|
| 570 |
-
return
|
| 571 |
-
|
| 572 |
-
# First line should be table title
|
| 573 |
-
if table_content:
|
| 574 |
-
doc.add_heading(table_content[0], level=3)
|
| 575 |
-
table_lines = table_content[1:]
|
| 576 |
else:
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
# Find lines that contain pipes (table rows) and FILTER OUT separator rows
|
| 583 |
-
table_rows = []
|
| 584 |
-
for line in table_lines:
|
| 585 |
-
if '|' in line and line.strip():
|
| 586 |
-
# CRITICAL: Skip separator rows (lines that are mostly dashes)
|
| 587 |
-
line_content = line.replace('|', '').replace(' ', '')
|
| 588 |
-
if line_content.replace('-', '') == '':
|
| 589 |
-
continue # Skip this separator row
|
| 590 |
-
|
| 591 |
-
# Split and clean
|
| 592 |
-
cells = [cell.strip() for cell in line.split('|')]
|
| 593 |
-
# Remove empty cells at edges
|
| 594 |
-
while cells and not cells[0]:
|
| 595 |
-
cells.pop(0)
|
| 596 |
-
while cells and not cells[-1]:
|
| 597 |
-
cells.pop()
|
| 598 |
-
if cells:
|
| 599 |
-
table_rows.append(cells)
|
| 600 |
-
|
| 601 |
-
if not table_rows:
|
| 602 |
-
# No table structure, add as text
|
| 603 |
-
for line in table_lines:
|
| 604 |
-
if line.strip():
|
| 605 |
-
doc.add_paragraph(line)
|
| 606 |
-
return
|
| 607 |
-
|
| 608 |
-
# Create table
|
| 609 |
-
max_cols = max(len(row) for row in table_rows)
|
| 610 |
-
table = doc.add_table(rows=len(table_rows), cols=max_cols)
|
| 611 |
-
table.style = 'Table Grid'
|
| 612 |
-
|
| 613 |
-
# Fill table
|
| 614 |
-
for row_idx, row_data in enumerate(table_rows):
|
| 615 |
-
table_row = table.rows[row_idx]
|
| 616 |
-
for col_idx in range(max_cols):
|
| 617 |
-
cell = table_row.cells[col_idx]
|
| 618 |
-
if col_idx < len(row_data):
|
| 619 |
-
cell.text = row_data[col_idx]
|
| 620 |
-
|
| 621 |
-
# Bold first row
|
| 622 |
-
if row_idx == 0:
|
| 623 |
-
for paragraph in cell.paragraphs:
|
| 624 |
-
for run in paragraph.runs:
|
| 625 |
-
run.bold = True
|
| 626 |
-
|
| 627 |
-
doc.add_paragraph("") # Space after table
|
| 628 |
-
|
| 629 |
|
| 630 |
-
def
|
| 631 |
-
"""Get information about selected OCR method"""
|
| 632 |
method_descriptions = {
|
| 633 |
-
"auto": "
|
| 634 |
-
"azure": "
|
| 635 |
-
"tesseract": "
|
| 636 |
-
"pymupdf": "
|
| 637 |
}
|
| 638 |
|
| 639 |
return method_descriptions.get(method, "Select a method to see details.")
|
| 640 |
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
"""Check and display service status"""
|
| 644 |
available_methods = backend_manager.get_available_methods()
|
| 645 |
|
| 646 |
-
status_lines = ["**Available OCR Methods:**"]
|
| 647 |
|
| 648 |
if "azure" in available_methods:
|
| 649 |
-
status_lines.append("
|
| 650 |
else:
|
| 651 |
-
status_lines.append("
|
| 652 |
|
| 653 |
if "tesseract" in available_methods:
|
| 654 |
-
status_lines.append("
|
| 655 |
else:
|
| 656 |
-
status_lines.append("
|
| 657 |
|
| 658 |
if "pymupdf" in available_methods:
|
| 659 |
-
status_lines.append("
|
| 660 |
else:
|
| 661 |
-
status_lines.append("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
|
| 663 |
-
# Add DOCX support status
|
| 664 |
if HAS_DOCX_SUPPORT:
|
| 665 |
-
status_lines.append("
|
| 666 |
else:
|
| 667 |
-
status_lines.append("
|
| 668 |
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
def process_and_prepare_downloads(pdf_file, method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right):
|
| 672 |
-
"""Process PDF and prepare both TXT and DOCX downloads if successful"""
|
| 673 |
-
text, metadata, status = process_pdf_file(pdf_file, method, enable_header_footer_removal, crop_top, crop_bottom, crop_left, crop_right)
|
| 674 |
|
| 675 |
-
|
| 676 |
-
if text and not text.startswith("Error:") and not text.startswith("No file"):
|
| 677 |
-
try:
|
| 678 |
-
# Create TXT file
|
| 679 |
-
txt_path = create_txt_file(text, metadata)
|
| 680 |
-
|
| 681 |
-
# Create DOCX file if support is available
|
| 682 |
-
if HAS_DOCX_SUPPORT:
|
| 683 |
-
try:
|
| 684 |
-
docx_path = create_docx_file(text, metadata)
|
| 685 |
-
return (text, metadata, status,
|
| 686 |
-
gr.update(visible=True, value=txt_path),
|
| 687 |
-
gr.update(visible=True, value=docx_path))
|
| 688 |
-
except Exception as docx_error:
|
| 689 |
-
logger.warning(f"DOCX creation failed: {docx_error}")
|
| 690 |
-
return (text, metadata, status,
|
| 691 |
-
gr.update(visible=True, value=txt_path),
|
| 692 |
-
gr.update(visible=False))
|
| 693 |
-
else:
|
| 694 |
-
return (text, metadata, status,
|
| 695 |
-
gr.update(visible=True, value=txt_path),
|
| 696 |
-
gr.update(visible=False))
|
| 697 |
-
|
| 698 |
-
except Exception as file_error:
|
| 699 |
-
logger.error(f"File creation error: {file_error}")
|
| 700 |
-
return (text, metadata, status,
|
| 701 |
-
gr.update(visible=False),
|
| 702 |
-
gr.update(visible=False))
|
| 703 |
-
else:
|
| 704 |
-
return (text, metadata, status,
|
| 705 |
-
gr.update(visible=False),
|
| 706 |
-
gr.update(visible=False))
|
| 707 |
-
|
| 708 |
|
| 709 |
-
def
|
| 710 |
-
"""Create
|
| 711 |
|
| 712 |
with gr.Blocks(
|
| 713 |
-
title="PDF OCR Service - Enhanced",
|
| 714 |
theme=gr.themes.Soft(),
|
| 715 |
css="""
|
| 716 |
.main-header { text-align: center; margin-bottom: 2rem; }
|
| 717 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
.status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
|
| 719 |
-
.preprocessing-box { border: 2px solid #28a745; padding: 1rem; border-radius: 0.5rem; background-color: #f8fff8; }
|
| 720 |
"""
|
| 721 |
) as interface:
|
| 722 |
|
| 723 |
gr.HTML("""
|
| 724 |
<div class="main-header">
|
| 725 |
-
<h1>
|
| 726 |
-
<p>Convert PDF documents to text using
|
| 727 |
</div>
|
| 728 |
""")
|
| 729 |
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 758 |
|
| 759 |
enable_header_footer_removal = gr.Checkbox(
|
| 760 |
-
label="Enable Header/Footer Removal",
|
| 761 |
value=False,
|
| 762 |
-
info="Remove headers and footers
|
| 763 |
)
|
| 764 |
|
| 765 |
-
#
|
| 766 |
with gr.Group(visible=False) as crop_controls:
|
| 767 |
-
gr.HTML("<h5>
|
| 768 |
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
|
| 778 |
-
|
| 779 |
-
minimum=0,
|
| 780 |
-
maximum=30,
|
| 781 |
-
value=5,
|
| 782 |
-
step=0.5,
|
| 783 |
-
label="Bottom Crop %",
|
| 784 |
-
info="Percentage of page height to remove from bottom"
|
| 785 |
-
)
|
| 786 |
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
)
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
size="lg"
|
| 812 |
-
)
|
| 813 |
-
|
| 814 |
-
# Service status
|
| 815 |
-
gr.HTML("<h4>🔧 Service Status</h4>")
|
| 816 |
-
service_status = gr.Markdown(
|
| 817 |
-
value=check_service_status(),
|
| 818 |
-
elem_classes=["status-box"]
|
| 819 |
-
)
|
| 820 |
-
|
| 821 |
-
# Refresh status button
|
| 822 |
-
refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
|
| 823 |
|
| 824 |
with gr.Column(scale=2):
|
| 825 |
-
gr.
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
text_output = gr.Textbox(
|
| 844 |
-
label="Extracted Text",
|
| 845 |
-
placeholder="Processed text will appear here...",
|
| 846 |
-
lines=20,
|
| 847 |
-
max_lines=30,
|
| 848 |
-
interactive=False,
|
| 849 |
-
show_copy_button=True
|
| 850 |
-
)
|
| 851 |
-
|
| 852 |
-
# Metadata information
|
| 853 |
-
metadata_output = gr.Textbox(
|
| 854 |
-
label="Processing Information",
|
| 855 |
-
interactive=False,
|
| 856 |
-
lines=4
|
| 857 |
-
)
|
| 858 |
-
|
| 859 |
-
# Download buttons
|
| 860 |
-
with gr.Row():
|
| 861 |
-
download_txt_btn = gr.DownloadButton(
|
| 862 |
-
"📄 Download TXT",
|
| 863 |
-
visible=False,
|
| 864 |
-
variant="secondary"
|
| 865 |
)
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
|
|
|
|
|
|
| 870 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
|
| 872 |
-
#
|
| 873 |
-
gr.HTML("<h3>💡 Tips & Features</h3>")
|
| 874 |
-
|
| 875 |
-
# Create tips content based on available features
|
| 876 |
-
download_info = "Get results as formatted TXT files"
|
| 877 |
-
if HAS_DOCX_SUPPORT:
|
| 878 |
-
download_info += " and structured DOCX files with clean table formatting"
|
| 879 |
-
else:
|
| 880 |
-
download_info += " (install python-docx for DOCX export)"
|
| 881 |
-
|
| 882 |
-
tips_html = f"""
|
| 883 |
-
<div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem; margin: 1rem 0;">
|
| 884 |
-
<ul>
|
| 885 |
-
<li><strong>Auto method</strong> is recommended for most users - intelligently selects the best OCR method</li>
|
| 886 |
-
<li><strong>Header/Footer Removal:</strong> Clean up scanned documents by removing headers and footers</li>
|
| 887 |
-
<li><strong>Fixed Removal:</strong> Remove specific pixel amounts from top/bottom of each page</li>
|
| 888 |
-
<li><strong>Smart Crop:</strong> Use visual preview to set exact crop areas</li>
|
| 889 |
-
<li><strong>Table Processing:</strong> Enhanced table detection with clean formatting (no separator lines)</li>
|
| 890 |
-
<li><strong>Download Options:</strong> {download_info}</li>
|
| 891 |
-
<li><strong>Azure Document Intelligence</strong> provides the best quality for complex documents</li>
|
| 892 |
-
<li>Larger files may take longer to process - progress bar shows current status</li>
|
| 893 |
-
<li>Supported file types: PDF documents (up to 50MB by default)</li>
|
| 894 |
-
</ul>
|
| 895 |
-
</div>
|
| 896 |
-
"""
|
| 897 |
|
| 898 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 899 |
|
| 900 |
-
#
|
| 901 |
method_choice.change(
|
| 902 |
-
fn=
|
| 903 |
inputs=[method_choice],
|
| 904 |
outputs=[method_info]
|
| 905 |
)
|
| 906 |
|
|
|
|
| 907 |
enable_header_footer_removal.change(
|
| 908 |
-
fn=lambda enabled:
|
| 909 |
gr.update(visible=enabled),
|
| 910 |
-
gr.update(visible=enabled
|
| 911 |
-
|
| 912 |
-
gr.update(visible=enabled and "crop")
|
| 913 |
-
),
|
| 914 |
inputs=[enable_header_footer_removal],
|
| 915 |
-
outputs=[crop_controls,
|
| 916 |
)
|
| 917 |
-
|
| 918 |
-
#
|
| 919 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
crop_input.change(
|
| 921 |
-
fn=
|
| 922 |
-
inputs=[
|
| 923 |
outputs=[crop_preview]
|
| 924 |
)
|
| 925 |
|
| 926 |
-
#
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 931 |
)
|
| 932 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
refresh_btn.click(
|
| 934 |
-
fn=
|
| 935 |
outputs=[service_status]
|
| 936 |
)
|
| 937 |
|
|
|
|
| 938 |
process_btn.click(
|
| 939 |
-
fn=
|
| 940 |
-
inputs=[pdf_input, method_choice, enable_header_footer_removal,
|
| 941 |
-
|
|
|
|
|
|
|
|
|
|
| 942 |
)
|
| 943 |
|
| 944 |
return interface
|
| 945 |
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
|
|
|
|
|
|
|
|
|
| 956 |
|
| 957 |
if __name__ == "__main__":
|
| 958 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import gradio as gr
|
| 3 |
import os
|
| 4 |
import tempfile
|
| 5 |
import logging
|
| 6 |
+
import json
|
| 7 |
from pathlib import Path
|
| 8 |
from datetime import datetime
|
| 9 |
import cv2
|
| 10 |
import numpy as np
|
| 11 |
from PIL import Image
|
| 12 |
import fitz # PyMuPDF
|
| 13 |
+
from typing import Dict, List, Tuple, Optional
|
| 14 |
|
| 15 |
# Load environment variables
|
| 16 |
from dotenv import load_dotenv
|
|
|
|
| 26 |
backend_manager = BackendManager()
|
| 27 |
|
| 28 |
# Check if python-docx is available
|
|
|
|
|
|
|
| 29 |
try:
|
| 30 |
from docx import Document
|
| 31 |
+
from docx.shared import Inches, Pt
|
| 32 |
+
from docx.enum.table import WD_TABLE_ALIGNMENT
|
| 33 |
HAS_DOCX_SUPPORT = True
|
| 34 |
logger.info("DOCX export available")
|
| 35 |
except ImportError:
|
| 36 |
HAS_DOCX_SUPPORT = False
|
| 37 |
logger.info("DOCX export not available - install python-docx to enable")
|
| 38 |
|
| 39 |
+
# Global variables for enhanced crop management
|
| 40 |
+
current_pdf_data = {
|
| 41 |
+
'path': None,
|
| 42 |
+
'page_count': 0,
|
| 43 |
+
'page_images': {},
|
| 44 |
+
'crop_settings': {},
|
| 45 |
+
'default_crop_all': True
|
| 46 |
}
|
| 47 |
|
| 48 |
+
class PDFPageManager:
|
| 49 |
+
"""Manages PDF page previews and crop settings with enhanced resolution - FIXED VERSION"""
|
| 50 |
+
|
| 51 |
+
def __init__(self):
|
| 52 |
+
self.pdf_doc = None
|
| 53 |
+
self.page_images = {}
|
| 54 |
+
self.crop_settings = {}
|
| 55 |
+
self.current_page = 0
|
| 56 |
+
self.high_res_scale = 2.0 # Reduced from 3.0 for better performance
|
| 57 |
+
|
| 58 |
+
def load_pdf(self, pdf_path: str) -> Dict:
|
| 59 |
+
"""Load PDF and generate high-resolution page previews - FIXED"""
|
| 60 |
+
try:
|
| 61 |
+
if self.pdf_doc:
|
| 62 |
+
self.pdf_doc.close()
|
| 63 |
+
|
| 64 |
+
self.pdf_doc = fitz.open(pdf_path)
|
| 65 |
+
page_count = len(self.pdf_doc)
|
| 66 |
+
|
| 67 |
+
# Generate high-resolution previews for all pages
|
| 68 |
+
self.page_images = {}
|
| 69 |
+
for page_num in range(page_count):
|
| 70 |
+
self.page_images[page_num] = self._generate_high_res_preview(page_num)
|
| 71 |
+
|
| 72 |
+
# Initialize default crop settings for all pages
|
| 73 |
+
self.crop_settings = {
|
| 74 |
+
i: {'top': 0, 'bottom': 0, 'left': 0, 'right': 0, 'custom': False}
|
| 75 |
+
for i in range(page_count)
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
logger.info(f"PDF loaded successfully: {page_count} pages")
|
| 79 |
+
|
| 80 |
+
return {
|
| 81 |
+
'success': True,
|
| 82 |
+
'page_count': page_count,
|
| 83 |
+
'pages': list(range(page_count))
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
except Exception as e:
|
| 87 |
+
logger.error(f"Error loading PDF: {e}")
|
| 88 |
+
return {'success': False, 'error': str(e)}
|
| 89 |
+
|
| 90 |
+
def _generate_high_res_preview(self, page_num: int) -> np.ndarray:
|
| 91 |
+
"""Generate high-resolution preview for better crop visualization - FIXED"""
|
| 92 |
+
try:
|
| 93 |
+
if not self.pdf_doc:
|
| 94 |
+
return None
|
| 95 |
+
|
| 96 |
+
page = self.pdf_doc.load_page(page_num)
|
| 97 |
+
|
| 98 |
+
# Use high resolution matrix for better quality
|
| 99 |
+
mat = fitz.Matrix(self.high_res_scale, self.high_res_scale)
|
| 100 |
+
pix = page.get_pixmap(matrix=mat)
|
| 101 |
+
img_data = pix.tobytes("png")
|
| 102 |
+
|
| 103 |
+
# Convert to PIL Image and then to numpy array
|
| 104 |
+
import io
|
| 105 |
+
pil_image = Image.open(io.BytesIO(img_data))
|
| 106 |
+
img_array = np.array(pil_image)
|
| 107 |
+
|
| 108 |
+
# Convert RGBA to RGB if needed
|
| 109 |
+
if len(img_array.shape) == 3 and img_array.shape[2] == 4:
|
| 110 |
+
img_array = img_array[:, :, :3]
|
| 111 |
+
|
| 112 |
+
return img_array
|
| 113 |
+
|
| 114 |
+
except Exception as e:
|
| 115 |
+
logger.error(f"Error generating preview for page {page_num}: {e}")
|
| 116 |
+
return None
|
| 117 |
+
|
| 118 |
+
def update_crop_visualization(self, page_num: int, crop_coords: Dict) -> np.ndarray:
|
| 119 |
+
"""Update crop visualization with enhanced preview - FIXED"""
|
| 120 |
+
if page_num not in self.page_images or self.page_images[page_num] is None:
|
| 121 |
+
logger.warning(f"No image available for page {page_num}")
|
| 122 |
+
return None
|
| 123 |
+
|
| 124 |
+
try:
|
| 125 |
+
img_array = self.page_images[page_num].copy()
|
| 126 |
+
height, width = img_array.shape[:2]
|
| 127 |
+
|
| 128 |
+
# Convert coordinates from percentages to pixels
|
| 129 |
+
x1 = int(crop_coords.get('left', 0) * width / 100)
|
| 130 |
+
y1 = int(crop_coords.get('top', 0) * height / 100)
|
| 131 |
+
x2 = width - int(crop_coords.get('right', 0) * width / 100)
|
| 132 |
+
y2 = height - int(crop_coords.get('bottom', 0) * height / 100)
|
| 133 |
+
|
| 134 |
+
# Ensure coordinates are valid
|
| 135 |
+
x1 = max(0, min(x1, width))
|
| 136 |
+
x2 = max(0, min(x2, width))
|
| 137 |
+
y1 = max(0, min(y1, height))
|
| 138 |
+
y2 = max(0, min(y2, height))
|
| 139 |
+
|
| 140 |
+
# Create overlay
|
| 141 |
+
overlay = img_array.copy()
|
| 142 |
+
|
| 143 |
+
# Draw crop areas in semi-transparent red (areas to be removed)
|
| 144 |
+
alpha = 0.3
|
| 145 |
+
if crop_coords.get('top', 0) > 0 and y1 > 0:
|
| 146 |
+
cv2.rectangle(overlay, (0, 0), (width, y1), (255, 0, 0), -1)
|
| 147 |
+
if crop_coords.get('bottom', 0) > 0 and y2 < height:
|
| 148 |
+
cv2.rectangle(overlay, (0, y2), (width, height), (255, 0, 0), -1)
|
| 149 |
+
if crop_coords.get('left', 0) > 0 and x1 > 0:
|
| 150 |
+
cv2.rectangle(overlay, (0, 0), (x1, height), (255, 0, 0), -1)
|
| 151 |
+
if crop_coords.get('right', 0) > 0 and x2 < width:
|
| 152 |
+
cv2.rectangle(overlay, (x2, 0), (width, height), (255, 0, 0), -1)
|
| 153 |
+
|
| 154 |
+
# Draw content area outline in green
|
| 155 |
+
if x2 > x1 and y2 > y1:
|
| 156 |
+
thickness = max(2, int(self.high_res_scale * 2))
|
| 157 |
+
cv2.rectangle(overlay, (x1, y1), (x2, y2), (0, 255, 0), thickness)
|
| 158 |
+
|
| 159 |
+
# Blend overlay with original
|
| 160 |
+
result = cv2.addWeighted(img_array, 1-alpha, overlay, alpha, 0)
|
| 161 |
+
|
| 162 |
+
# Add informative text with better scaling
|
| 163 |
+
font_scale = max(0.8, self.high_res_scale / 3)
|
| 164 |
+
thickness = max(1, int(self.high_res_scale))
|
| 165 |
+
text_color = (255, 255, 255)
|
| 166 |
+
background_color = (0, 0, 0)
|
| 167 |
+
|
| 168 |
+
# Add text with background for better visibility
|
| 169 |
+
texts = [
|
| 170 |
+
f"Page {page_num + 1}",
|
| 171 |
+
"RED: Remove areas",
|
| 172 |
+
"GREEN: Content area",
|
| 173 |
+
f"Crop: T{crop_coords.get('top', 0):.1f}% B{crop_coords.get('bottom', 0):.1f}% L{crop_coords.get('left', 0):.1f}% R{crop_coords.get('right', 0):.1f}%"
|
| 174 |
+
]
|
| 175 |
+
|
| 176 |
+
y_offset = 30
|
| 177 |
+
for i, text in enumerate(texts):
|
| 178 |
+
y_pos = y_offset + (i * 30)
|
| 179 |
+
# Add background rectangle for text
|
| 180 |
+
(text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
|
| 181 |
+
cv2.rectangle(result, (10, y_pos - text_height - 5), (text_width + 20, y_pos + 5), background_color, -1)
|
| 182 |
+
cv2.putText(result, text, (15, y_pos), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, thickness)
|
| 183 |
+
|
| 184 |
+
return result
|
| 185 |
+
|
| 186 |
+
except Exception as e:
|
| 187 |
+
logger.error(f"Error updating crop visualization: {e}")
|
| 188 |
+
return self.page_images[page_num] if page_num in self.page_images else None
|
| 189 |
+
|
| 190 |
+
def set_crop_for_page(self, page_num: int, crop_coords: Dict):
|
| 191 |
+
"""Set crop coordinates for specific page - FIXED"""
|
| 192 |
+
if page_num in self.crop_settings:
|
| 193 |
+
self.crop_settings[page_num].update(crop_coords)
|
| 194 |
+
self.crop_settings[page_num]['custom'] = True
|
| 195 |
+
logger.info(f"Set crop for page {page_num}: {crop_coords}")
|
| 196 |
+
|
| 197 |
+
def set_crop_for_all_pages(self, crop_coords: Dict):
|
| 198 |
+
"""Apply same crop settings to all pages - FIXED"""
|
| 199 |
+
for page_num in self.crop_settings:
|
| 200 |
+
if not self.crop_settings[page_num].get('custom', False):
|
| 201 |
+
self.crop_settings[page_num].update(crop_coords)
|
| 202 |
+
logger.info(f"Applied crop to all non-custom pages: {crop_coords}")
|
| 203 |
+
|
| 204 |
+
def get_crop_settings_for_processing(self) -> Dict:
|
| 205 |
+
"""Get crop settings in format expected by backend - FIXED"""
|
| 206 |
+
return {
|
| 207 |
+
'per_page_crops': self.crop_settings,
|
| 208 |
+
'has_custom_crops': any(page.get('custom', False) for page in self.crop_settings.values()),
|
| 209 |
+
'enhanced_resolution': True,
|
| 210 |
+
'resolution_scale': self.high_res_scale
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
def close(self):
|
| 214 |
+
"""Clean up resources"""
|
| 215 |
+
if self.pdf_doc:
|
| 216 |
+
self.pdf_doc.close()
|
| 217 |
+
self.pdf_doc = None
|
| 218 |
+
self.page_images.clear()
|
| 219 |
+
self.crop_settings.clear()
|
| 220 |
+
|
| 221 |
+
# Global page manager instance
|
| 222 |
+
pdf_manager = PDFPageManager()
|
| 223 |
|
| 224 |
+
def load_pdf_for_preview(pdf_file):
|
| 225 |
+
"""Load PDF and return page thumbnails for selection - FIXED"""
|
| 226 |
if pdf_file is None:
|
| 227 |
+
return None, gr.update(choices=[], value=None), gr.update(visible=False), "No PDF loaded"
|
| 228 |
|
| 229 |
try:
|
| 230 |
+
result = pdf_manager.load_pdf(pdf_file.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
+
if result['success']:
|
| 233 |
+
# Create page choices for dropdown
|
| 234 |
+
page_choices = [f"Page {i+1}" for i in range(result['page_count'])]
|
| 235 |
+
|
| 236 |
+
# Get first page preview with default crop
|
| 237 |
+
first_page_preview = pdf_manager.update_crop_visualization(0, {
|
| 238 |
+
'top': 0, 'bottom': 0, 'left': 0, 'right': 0
|
| 239 |
+
}) if 0 in pdf_manager.page_images else None
|
| 240 |
+
|
| 241 |
+
status = f"PDF loaded successfully: {result['page_count']} pages"
|
| 242 |
+
|
| 243 |
+
return (first_page_preview,
|
| 244 |
+
gr.update(choices=page_choices, value=page_choices[0] if page_choices else None, visible=True),
|
| 245 |
+
gr.update(visible=True),
|
| 246 |
+
status)
|
| 247 |
+
else:
|
| 248 |
+
return None, gr.update(choices=[], value=None, visible=False), gr.update(visible=False), f"Error: {result['error']}"
|
| 249 |
+
|
| 250 |
+
except Exception as e:
|
| 251 |
+
logger.error(f"Error in load_pdf_for_preview: {e}")
|
| 252 |
+
return None, gr.update(choices=[], value=None, visible=False), gr.update(visible=False), f"Error loading PDF: {str(e)}"
|
| 253 |
+
|
| 254 |
+
def change_preview_page(page_selection, crop_top, crop_bottom, crop_left, crop_right):
|
| 255 |
+
"""Change preview to selected page with current crop settings - FIXED"""
|
| 256 |
+
if not page_selection:
|
| 257 |
+
return None
|
| 258 |
+
|
| 259 |
+
try:
|
| 260 |
+
page_num = int(page_selection.split()[1]) - 1 # Extract page number
|
| 261 |
+
|
| 262 |
+
# Get current crop settings for this page
|
| 263 |
+
crop_coords = {
|
| 264 |
+
'top': crop_top,
|
| 265 |
+
'bottom': crop_bottom,
|
| 266 |
+
'left': crop_left,
|
| 267 |
+
'right': crop_right
|
| 268 |
+
}
|
| 269 |
|
| 270 |
+
# Update visualization
|
| 271 |
+
preview_image = pdf_manager.update_crop_visualization(page_num, crop_coords)
|
| 272 |
+
return preview_image
|
| 273 |
|
|
|
|
| 274 |
except Exception as e:
|
| 275 |
+
logger.error(f"Error changing preview page: {e}")
|
| 276 |
return None
|
| 277 |
|
| 278 |
+
def update_crop_preview_interactive(page_selection, crop_top, crop_bottom, crop_left, crop_right, apply_to_all):
|
| 279 |
+
"""Update crop preview with interactive feedback - FIXED"""
|
| 280 |
+
if not page_selection or not pdf_manager.pdf_doc:
|
|
|
|
| 281 |
return None
|
| 282 |
|
| 283 |
try:
|
| 284 |
+
page_num = int(page_selection.split()[1]) - 1
|
| 285 |
+
|
| 286 |
+
crop_coords = {
|
| 287 |
+
'top': crop_top,
|
| 288 |
+
'bottom': crop_bottom,
|
| 289 |
+
'left': crop_left,
|
| 290 |
+
'right': crop_right
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
# Apply to current page or all pages based on setting
|
| 294 |
+
if apply_to_all:
|
| 295 |
+
pdf_manager.set_crop_for_all_pages(crop_coords)
|
| 296 |
+
else:
|
| 297 |
+
pdf_manager.set_crop_for_page(page_num, crop_coords)
|
| 298 |
|
| 299 |
+
# Return updated preview
|
| 300 |
+
return pdf_manager.update_crop_visualization(page_num, crop_coords)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
except Exception as e:
|
| 303 |
logger.error(f"Error updating crop preview: {e}")
|
| 304 |
return None
|
| 305 |
|
| 306 |
+
def process_pdf_with_html_enhancement(pdf_file, ocr_method, enable_header_footer_removal,
|
| 307 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 308 |
+
apply_to_all_pages, current_page_selection,
|
| 309 |
+
progress=gr.Progress()):
|
| 310 |
+
"""Process PDF with HTML enhancement and improved table handling - FIXED"""
|
|
|
|
| 311 |
if pdf_file is None:
|
| 312 |
+
return "No file uploaded.", "", "", "Error: No file selected"
|
| 313 |
|
|
|
|
| 314 |
try:
|
| 315 |
+
progress(0.1, desc="Initializing HTML-enhanced processing...")
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
+
# Prepare enhanced preprocessing options
|
| 318 |
preprocessing_options = {
|
| 319 |
'enable_header_footer_removal': enable_header_footer_removal,
|
| 320 |
+
'enhanced_crop_processing': True,
|
| 321 |
+
'crop_settings': pdf_manager.get_crop_settings_for_processing() if enable_header_footer_removal else None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
}
|
| 323 |
|
| 324 |
+
progress(0.3, desc="Processing with HTML enhancement...")
|
| 325 |
|
| 326 |
+
# Process the PDF with enhanced preprocessing
|
| 327 |
+
result = backend_manager.process_pdf_with_enhanced_resolution(
|
| 328 |
+
pdf_file.name, ocr_method, preprocessing_options
|
| 329 |
+
)
|
| 330 |
|
| 331 |
+
progress(0.9, desc="Finalizing HTML processing...")
|
| 332 |
progress(1.0, desc="Complete!")
|
| 333 |
|
| 334 |
if result['success']:
|
| 335 |
+
metadata_info = format_enhanced_metadata(result['metadata'], result['method_used'])
|
| 336 |
+
status = f"Success: Processed using {result['method_used']} with HTML enhancement"
|
| 337 |
+
|
| 338 |
+
# Return text, HTML, metadata, and status
|
| 339 |
+
return (result['text'],
|
| 340 |
+
result.get('html', ''),
|
| 341 |
+
metadata_info,
|
| 342 |
+
status)
|
| 343 |
else:
|
| 344 |
error_msg = result.get('error', 'Unknown error occurred')
|
| 345 |
+
return f"Error: {error_msg}", "", "", f"Processing failed: {error_msg}"
|
| 346 |
|
| 347 |
except Exception as e:
|
| 348 |
+
logger.error(f"HTML-enhanced processing error: {e}")
|
| 349 |
+
return f"Error: {str(e)}", "", "", f"Unexpected error: {str(e)}"
|
|
|
|
| 350 |
|
| 351 |
+
def format_enhanced_metadata(metadata, method_used):
|
| 352 |
+
"""Enhanced metadata formatting with HTML processing info"""
|
| 353 |
if not metadata:
|
| 354 |
return f"Method used: {method_used}"
|
| 355 |
|
|
|
|
| 358 |
if 'pages' in metadata:
|
| 359 |
info_lines.append(f"Pages processed: {metadata['pages']}")
|
| 360 |
|
| 361 |
+
if metadata.get('enhanced_processing', False):
|
| 362 |
+
info_lines.append("Enhanced processing: Enabled")
|
| 363 |
+
|
| 364 |
+
if metadata.get('html_processing', False):
|
| 365 |
+
info_lines.append("HTML generation: Enabled")
|
| 366 |
|
| 367 |
+
if metadata.get('enhanced_resolution', False) and 'resolution_scale' in metadata:
|
| 368 |
+
info_lines.append(f"Enhanced resolution: {metadata.get('resolution_scale', 'N/A')}x")
|
|
|
|
| 369 |
|
| 370 |
+
if 'custom_crops_applied' in metadata:
|
| 371 |
+
info_lines.append(f"Custom crops per page: {metadata['custom_crops_applied']}")
|
| 372 |
+
|
| 373 |
+
if 'tables' in metadata:
|
| 374 |
+
info_lines.append(f"Tables detected: {metadata['tables']}")
|
| 375 |
|
| 376 |
if 'processing_time_seconds' in metadata:
|
| 377 |
info_lines.append(f"Processing time: {metadata['processing_time_seconds']:.2f} seconds")
|
| 378 |
|
| 379 |
return "\n".join(info_lines)
|
| 380 |
|
| 381 |
+
def prepare_enhanced_downloads(pdf_file, method, enable_header_footer_removal,
|
| 382 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 383 |
+
apply_to_all_pages, current_page_selection):
|
| 384 |
+
"""Prepare enhanced downloads with HTML processing"""
|
| 385 |
+
text, html, metadata, status = process_pdf_with_html_enhancement(
|
| 386 |
+
pdf_file, method, enable_header_footer_removal,
|
| 387 |
+
crop_top, crop_bottom, crop_left, crop_right,
|
| 388 |
+
apply_to_all_pages, current_page_selection
|
|
|
|
| 389 |
)
|
| 390 |
|
| 391 |
+
# Prepare downloads if processing was successful
|
| 392 |
+
if text and not text.startswith("Error:") and not text.startswith("No file"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
try:
|
| 394 |
+
# Create enhanced download files
|
| 395 |
+
download_files = backend_manager.create_enhanced_downloads(text, html, metadata)
|
| 396 |
+
|
| 397 |
+
# Prepare gradio updates for download buttons
|
| 398 |
+
updates = [
|
| 399 |
+
text, metadata, status, # Display outputs
|
| 400 |
+
gr.update(visible=True, value=download_files.get('txt')) if 'txt' in download_files else gr.update(visible=False),
|
| 401 |
+
gr.update(visible=True, value=download_files.get('docx')) if 'docx' in download_files else gr.update(visible=False),
|
| 402 |
+
gr.update(visible=True, value=download_files.get('html')) if 'html' in download_files else gr.update(visible=False)
|
| 403 |
+
]
|
| 404 |
+
|
| 405 |
+
return tuple(updates)
|
| 406 |
+
|
| 407 |
+
except Exception as file_error:
|
| 408 |
+
logger.error(f"Enhanced file creation error: {file_error}")
|
| 409 |
+
return (text, metadata, status,
|
| 410 |
+
gr.update(visible=False),
|
| 411 |
+
gr.update(visible=False),
|
| 412 |
+
gr.update(visible=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
else:
|
| 414 |
+
return (text, metadata, status,
|
| 415 |
+
gr.update(visible=False),
|
| 416 |
+
gr.update(visible=False),
|
| 417 |
+
gr.update(visible=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
|
| 419 |
+
def get_enhanced_method_info(method):
|
| 420 |
+
"""Get information about selected OCR method with HTML processing"""
|
| 421 |
method_descriptions = {
|
| 422 |
+
"auto": "**Auto Selection**: Automatically chooses the best available method with HTML processing and enhanced table handling.",
|
| 423 |
+
"azure": "**Azure Document Intelligence**: Advanced cloud-based OCR with HTML generation, layout preservation, and smart table detection.",
|
| 424 |
+
"tesseract": "**Tesseract OCR**: Open-source OCR with HTML output, enhanced image preprocessing, and resolution scaling.",
|
| 425 |
+
"pymupdf": "**PyMuPDF**: Fast extraction enhanced with HTML processing and improved formatting preservation."
|
| 426 |
}
|
| 427 |
|
| 428 |
return method_descriptions.get(method, "Select a method to see details.")
|
| 429 |
|
| 430 |
+
def check_enhanced_service_status():
|
| 431 |
+
"""Check and display enhanced service status"""
|
|
|
|
| 432 |
available_methods = backend_manager.get_available_methods()
|
| 433 |
|
| 434 |
+
status_lines = ["**Available OCR Methods (Enhanced with HTML Processing):**"]
|
| 435 |
|
| 436 |
if "azure" in available_methods:
|
| 437 |
+
status_lines.append("✓ Azure Document Intelligence - Ready (HTML + Tables)")
|
| 438 |
else:
|
| 439 |
+
status_lines.append("✗ Azure Document Intelligence - Not configured")
|
| 440 |
|
| 441 |
if "tesseract" in available_methods:
|
| 442 |
+
status_lines.append("✓ Tesseract OCR - Ready (HTML Enhanced)")
|
| 443 |
else:
|
| 444 |
+
status_lines.append("✗ Tesseract OCR - Not available")
|
| 445 |
|
| 446 |
if "pymupdf" in available_methods:
|
| 447 |
+
status_lines.append("✓ PyMuPDF - Ready (HTML Enhanced)")
|
| 448 |
else:
|
| 449 |
+
status_lines.append("✗ PyMuPDF - Not available")
|
| 450 |
+
|
| 451 |
+
# Add enhanced features status
|
| 452 |
+
status_lines.append("✓ HTML Processing - Available")
|
| 453 |
+
status_lines.append("✓ Enhanced Table Handling - Available")
|
| 454 |
+
status_lines.append("✓ Smart Text Preservation - Available")
|
| 455 |
+
status_lines.append("✓ Multi-Page Crop Preview - Available")
|
| 456 |
+
status_lines.append("✓ Per-Page Crop Customization - Available")
|
| 457 |
|
|
|
|
| 458 |
if HAS_DOCX_SUPPORT:
|
| 459 |
+
status_lines.append("✓ Enhanced DOCX Export - Available")
|
| 460 |
else:
|
| 461 |
+
status_lines.append("✗ Enhanced DOCX Export - Install python-docx to enable")
|
| 462 |
|
| 463 |
+
status_lines.append("✓ HTML File Export - Available")
|
| 464 |
+
status_lines.append("✓ Enhanced Text Export - Available")
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
+
return "\n".join(status_lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
+
def create_enhanced_interface():
    """Create enhanced Gradio interface with improved layout and HTML processing.

    Builds the full gr.Blocks app: instructions, configuration panel (upload,
    OCR method, crop controls), results/downloads area, service status, and
    all event wiring. Returns the Blocks instance (not yet launched).

    NOTE: component creation order defines the on-screen layout — do not
    reorder the `with` blocks below.
    """
    with gr.Blocks(
        title="PDF OCR Service - Enhanced with HTML Processing",
        theme=gr.themes.Soft(),
        css="""
        .main-header { text-align: center; margin-bottom: 2rem; }
        .config-panel { border: 2px solid #007bff; padding: 1.5rem; border-radius: 0.8rem; background-color: #f8f9fa; margin-bottom: 1rem; }
        .instructions-panel { border: 2px solid #28a745; padding: 1.5rem; border-radius: 0.8rem; background-color: #f0fff0; margin-bottom: 1rem; }
        .crop-controls { border: 2px solid #ffc107; padding: 1rem; border-radius: 0.5rem; background-color: #fffef7; }
        .page-preview { border: 2px solid #17a2b8; padding: 1rem; border-radius: 0.5rem; background-color: #f0f8ff; }
        .results-panel { border: 2px solid #6f42c1; padding: 1rem; border-radius: 0.5rem; background-color: #f8f5ff; }
        .status-box { border-left: 4px solid #007bff; padding: 1rem; background-color: #f8f9fa; }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">
            <h1>PDF OCR Service - Enhanced with HTML Processing</h1>
            <p>Convert PDF documents to text using enhanced OCR with HTML intermediate processing, smart table handling, and format preservation</p>
        </div>
        """)

        # Instructions at the top
        with gr.Group(elem_classes=["instructions-panel"]):
            gr.HTML("<h3>Instructions & Features</h3>")
            gr.HTML("""
            <div style="background-color: #e7f3ff; padding: 1rem; border-radius: 0.5rem;">
            <h4>How to Use:</h4>
            <ol>
            <li><strong>Upload PDF:</strong> Select your PDF file in the configuration panel below</li>
            <li><strong>Choose Method:</strong> Select OCR method (Auto recommended for best results)</li>
            <li><strong>Configure Crop (Optional):</strong> Enable header/footer removal and adjust crop settings</li>
            <li><strong>Process:</strong> Click the process button to extract text with HTML enhancement</li>
            <li><strong>Download:</strong> Get results in TXT, DOCX, or HTML format</li>
            </ol>

            <h4>Enhanced Features:</h4>
            <ul>
            <li><strong>Smart Table Detection:</strong> 70% overlap threshold prevents text loss</li>
            <li><strong>HTML Processing:</strong> Better structure and formatting preservation</li>
            <li><strong>Multi-format Export:</strong> TXT, DOCX, and HTML downloads</li>
            <li><strong>Advanced Crop Control:</strong> Per-page customization with real-time preview</li>
            <li><strong>Enhanced Resolution:</strong> High-quality processing for better accuracy</li>
            <li><strong>Page Numbers:</strong> Automatic page numbering in extracted content</li>
            <li><strong>Proper Indentation:</strong> Preserved spacing and formatting</li>
            </ul>
            </div>
            """)

        # Configuration Panel - Top Left
        with gr.Group(elem_classes=["config-panel"]):
            gr.HTML("<h3>Configuration Panel</h3>")

            with gr.Row():
                with gr.Column(scale=1):
                    # File upload
                    pdf_input = gr.File(
                        label="Upload PDF File",
                        file_types=[".pdf"],
                        file_count="single"
                    )

                    # PDF loading status
                    pdf_load_status = gr.Textbox(
                        label="PDF Status",
                        interactive=False,
                        lines=1,
                        value="No PDF loaded"
                    )

                with gr.Column(scale=1):
                    # OCR method selection
                    method_choice = gr.Dropdown(
                        choices=["auto", "azure", "tesseract", "pymupdf"],
                        value="auto",
                        label="OCR Method",
                        info="Choose OCR method (all enhanced with HTML processing)"
                    )

                    # Method information display
                    method_info = gr.Markdown(
                        value=get_enhanced_method_info("auto"),
                        elem_classes=["method-info"]
                    )

            # Enhanced Header/Footer Removal Section
            with gr.Group(elem_classes=["crop-controls"]):
                gr.HTML("<h4>Header/Footer Removal & Crop Settings</h4>")

                enable_header_footer_removal = gr.Checkbox(
                    label="Enable Enhanced Header/Footer Removal",
                    value=False,
                    info="Remove headers and footers with high-resolution processing"
                )

                # Multi-page controls (hidden until the checkbox above is enabled)
                with gr.Group(visible=False) as crop_controls:
                    gr.HTML("<h5>Multi-Page Crop Control</h5>")

                    with gr.Row():
                        # Page selection
                        page_selector = gr.Dropdown(
                            label="Select Page for Preview",
                            choices=[],
                            value=None,
                            info="Choose page to preview and customize crop settings",
                            visible=False
                        )

                        # Apply to all pages toggle
                        apply_to_all_pages = gr.Checkbox(
                            label="Apply crop settings to all pages",
                            value=True,
                            info="When enabled, changes apply to all pages"
                        )

                    gr.HTML("<h5>Crop Areas (% of page)</h5>")

                    with gr.Row():
                        crop_top = gr.Slider(
                            minimum=0,
                            maximum=40,
                            value=8,
                            step=0.5,
                            label="Top Crop %"
                        )

                        crop_bottom = gr.Slider(
                            minimum=0,
                            maximum=40,
                            value=8,
                            step=0.5,
                            label="Bottom Crop %"
                        )

                    with gr.Row():
                        crop_left = gr.Slider(
                            minimum=0,
                            maximum=30,
                            value=3,
                            step=0.5,
                            label="Left Crop %"
                        )

                        crop_right = gr.Slider(
                            minimum=0,
                            maximum=30,
                            value=3,
                            step=0.5,
                            label="Right Crop %"
                        )

                    # Quick preset buttons
                    with gr.Row():
                        preset_light = gr.Button("Light Crop (5%)", size="sm")
                        preset_medium = gr.Button("Medium Crop (10%)", size="sm")
                        preset_heavy = gr.Button("Heavy Crop (15%)", size="sm")
                        preset_reset = gr.Button("Reset", size="sm")

            # Process button
            process_btn = gr.Button(
                "Process PDF with HTML Enhancement",
                variant="primary",
                size="lg"
            )

        # Results and Preview Section
        with gr.Row():
            with gr.Column(scale=1):
                # Enhanced crop preview with multi-page support
                with gr.Group(visible=False, elem_classes=["page-preview"]) as preview_group:
                    gr.HTML("<h4>Page Preview with Crop Visualization</h4>")
                    crop_preview = gr.Image(
                        label="High-Resolution Page Preview",
                        interactive=False,
                        height=500,
                        show_label=False
                    )

                    gr.HTML("""
                    <p style="font-size: 0.9em; color: #666; text-align: center;">
                    <strong>Red areas:</strong> Will be removed | <strong>Green outline:</strong> Content area |
                    <strong>Enhanced:</strong> 2x resolution processing
                    </p>
                    """)

            with gr.Column(scale=2):
                with gr.Group(elem_classes=["results-panel"]):
                    gr.HTML("<h3>Results & Downloads</h3>")

                    # Processing status
                    processing_status = gr.Textbox(
                        label="Processing Status",
                        interactive=False,
                        lines=1
                    )

                    # Extracted text output
                    text_output = gr.Textbox(
                        label="Extracted Text (Enhanced with Proper Formatting and Page Numbers)",
                        placeholder="Processed text with HTML enhancement and preserved formatting will appear here...",
                        lines=20,
                        max_lines=30,
                        interactive=False,
                        show_copy_button=True
                    )

                    # Metadata information
                    metadata_output = gr.Textbox(
                        label="Processing Information",
                        interactive=False,
                        lines=4
                    )

                    # Enhanced download buttons (revealed once processing succeeds)
                    with gr.Row():
                        download_txt_btn = gr.DownloadButton(
                            "Download Enhanced TXT",
                            visible=False,
                            variant="secondary"
                        )
                        download_docx_btn = gr.DownloadButton(
                            "Download Enhanced DOCX",
                            visible=False,
                            variant="secondary"
                        )
                        download_html_btn = gr.DownloadButton(
                            "Download HTML File",
                            visible=False,
                            variant="secondary"
                        )

        # Service Status at the bottom
        with gr.Group(elem_classes=["status-box"]):
            gr.HTML("<h4>Service Status</h4>")
            service_status = gr.Markdown(
                value=check_enhanced_service_status()
            )

            # Refresh status button
            refresh_btn = gr.Button("Refresh Status", size="sm")

        # Event handlers with enhanced functionality

        # PDF upload handler
        pdf_input.change(
            fn=load_pdf_for_preview,
            inputs=[pdf_input],
            outputs=[crop_preview, page_selector, crop_controls, pdf_load_status]
        )

        # Method info handler
        method_choice.change(
            fn=get_enhanced_method_info,
            inputs=[method_choice],
            outputs=[method_info]
        )

        # Header/footer removal handler: toggles both the crop controls and preview
        enable_header_footer_removal.change(
            fn=lambda enabled: [
                gr.update(visible=enabled),
                gr.update(visible=enabled)
            ],
            inputs=[enable_header_footer_removal],
            outputs=[crop_controls, preview_group]
        )

        # Page selection handler
        page_selector.change(
            fn=change_preview_page,
            inputs=[page_selector, crop_top, crop_bottom, crop_left, crop_right],
            outputs=[crop_preview]
        )

        # Crop parameter handlers - update preview in real-time
        for crop_input in [crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages]:
            crop_input.change(
                fn=update_crop_preview_interactive,
                inputs=[page_selector, crop_top, crop_bottom, crop_left, crop_right, apply_to_all_pages],
                outputs=[crop_preview]
            )

        # Preset button handlers
        def apply_preset(top, bottom, left, right):
            # Identity mapping: returns one value per crop slider, in output order.
            return top, bottom, left, right

        preset_light.click(
            fn=lambda: apply_preset(5, 5, 2, 2),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        preset_medium.click(
            fn=lambda: apply_preset(10, 10, 5, 5),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        preset_heavy.click(
            fn=lambda: apply_preset(15, 15, 8, 8),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        preset_reset.click(
            fn=lambda: apply_preset(0, 0, 0, 0),
            outputs=[crop_top, crop_bottom, crop_left, crop_right]
        )

        # Status refresh handler
        refresh_btn.click(
            fn=check_enhanced_service_status,
            outputs=[service_status]
        )

        # Main processing handler with enhanced downloads
        process_btn.click(
            fn=prepare_enhanced_downloads,
            inputs=[pdf_input, method_choice, enable_header_footer_removal,
                    crop_top, crop_bottom, crop_left, crop_right,
                    apply_to_all_pages, page_selector],
            outputs=[text_output, metadata_output, processing_status,
                     download_txt_btn, download_docx_btn, download_html_btn]
        )

    return interface
|
| 794 |
|
| 795 |
+
def launch_enhanced_ui():
    """Build and serve the enhanced Gradio UI, releasing PDF resources on exit.

    Blocks until the server stops; `pdf_manager` is always closed afterwards,
    even when construction or launch raises.
    """
    try:
        app = create_enhanced_interface()
        launch_kwargs = {
            "server_name": "0.0.0.0",
            "server_port": 7860,
            "share": False,
            "show_error": True,
        }
        app.launch(**launch_kwargs)
    finally:
        # Clean up cached PDF handles no matter how the launch ends.
        pdf_manager.close()
|
| 808 |
|
| 809 |
if __name__ == "__main__":
    # Script entry point: start the Gradio server on 0.0.0.0:7860.
    launch_enhanced_ui()
|
backend.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Backend Management Module -
|
| 3 |
Coordinates between UI and OCR services, handles file management and preprocessing
|
| 4 |
"""
|
| 5 |
import re
|
|
@@ -26,8 +26,385 @@ logging.basicConfig(level=logging.INFO)
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
class BackendManager:
|
| 30 |
-
"""
|
| 31 |
|
| 32 |
def __init__(self):
|
| 33 |
self.ocr_service = OCRService()
|
|
@@ -38,12 +415,12 @@ class BackendManager:
|
|
| 38 |
self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
|
| 39 |
self.temp_dir.mkdir(exist_ok=True)
|
| 40 |
|
| 41 |
-
logger.info("Enhanced backend manager initialized successfully")
|
| 42 |
|
| 43 |
-
def
|
| 44 |
-
|
| 45 |
"""
|
| 46 |
-
Process PDF
|
| 47 |
|
| 48 |
Args:
|
| 49 |
pdf_path: Path to the PDF file
|
|
@@ -51,7 +428,7 @@ class BackendManager:
|
|
| 51 |
preprocessing_options: Dictionary containing preprocessing settings
|
| 52 |
|
| 53 |
Returns:
|
| 54 |
-
Dict containing processing results
|
| 55 |
"""
|
| 56 |
start_time = datetime.now()
|
| 57 |
|
|
@@ -61,11 +438,12 @@ class BackendManager:
|
|
| 61 |
'success': False,
|
| 62 |
'error': f"File not found: {pdf_path}",
|
| 63 |
'text': '',
|
|
|
|
| 64 |
'method_used': '',
|
| 65 |
'metadata': {}
|
| 66 |
}
|
| 67 |
|
| 68 |
-
# Check file size
|
| 69 |
max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
|
| 70 |
file_size = os.path.getsize(pdf_path)
|
| 71 |
|
|
@@ -74,14 +452,15 @@ class BackendManager:
|
|
| 74 |
'success': False,
|
| 75 |
'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
|
| 76 |
'text': '',
|
|
|
|
| 77 |
'method_used': '',
|
| 78 |
'metadata': {}
|
| 79 |
}
|
| 80 |
|
| 81 |
-
# Generate file hash for
|
| 82 |
file_hash = self._calculate_file_hash(pdf_path)
|
| 83 |
|
| 84 |
-
logger.info(f"Processing PDF: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
|
| 85 |
logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
|
| 86 |
|
| 87 |
# Handle preprocessing if enabled
|
|
@@ -89,18 +468,17 @@ class BackendManager:
|
|
| 89 |
preprocessing_applied = False
|
| 90 |
|
| 91 |
if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
|
| 92 |
-
logger.info("Applying
|
| 93 |
try:
|
| 94 |
-
processed_pdf_path = self.
|
| 95 |
preprocessing_applied = True
|
| 96 |
-
logger.info("
|
| 97 |
except Exception as e:
|
| 98 |
logger.error(f"Preprocessing failed: {e}")
|
| 99 |
-
# Continue with original file if preprocessing fails
|
| 100 |
processed_pdf_path = pdf_path
|
| 101 |
|
| 102 |
try:
|
| 103 |
-
# Process
|
| 104 |
result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
|
| 105 |
|
| 106 |
# Add processing metadata
|
|
@@ -111,15 +489,12 @@ class BackendManager:
|
|
| 111 |
'file_size_mb': round(file_size / (1024*1024), 2),
|
| 112 |
'processing_time_seconds': round(processing_time, 2),
|
| 113 |
'timestamp': start_time.isoformat(),
|
|
|
|
|
|
|
| 114 |
'header_footer_removed': preprocessing_applied,
|
| 115 |
'preprocessing_options': preprocessing_options if preprocessing_applied else None
|
| 116 |
})
|
| 117 |
|
| 118 |
-
# Post-process for better table handling if needed
|
| 119 |
-
if result['success'] and result['text']:
|
| 120 |
-
result['text'] = self._post_process_extracted_text(result['text'])
|
| 121 |
-
result['metadata']['post_processed'] = True
|
| 122 |
-
|
| 123 |
# Cleanup temporary preprocessed file
|
| 124 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 125 |
try:
|
|
@@ -130,14 +505,17 @@ class BackendManager:
|
|
| 130 |
# Log results
|
| 131 |
if result['success']:
|
| 132 |
text_length = len(result['text'])
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
logger.info(f"Method used: {result['method_used']}")
|
| 136 |
logger.info(f"Text extracted: {text_length} characters")
|
|
|
|
| 137 |
if table_count > 0:
|
| 138 |
logger.info(f"Tables detected: {table_count}")
|
| 139 |
if preprocessing_applied:
|
| 140 |
-
logger.info("
|
| 141 |
|
| 142 |
# Add to processing history
|
| 143 |
self._add_to_history({
|
|
@@ -148,10 +526,12 @@ class BackendManager:
|
|
| 148 |
'text_length': text_length,
|
| 149 |
'table_count': table_count,
|
| 150 |
'processing_time': processing_time,
|
| 151 |
-
'preprocessing_applied': preprocessing_applied
|
|
|
|
|
|
|
| 152 |
})
|
| 153 |
else:
|
| 154 |
-
logger.error(f"
|
| 155 |
|
| 156 |
# Add to processing history
|
| 157 |
self._add_to_history({
|
|
@@ -161,15 +541,16 @@ class BackendManager:
|
|
| 161 |
'success': False,
|
| 162 |
'error': result.get('error', 'Unknown error'),
|
| 163 |
'processing_time': processing_time,
|
| 164 |
-
'preprocessing_applied': preprocessing_applied
|
|
|
|
| 165 |
})
|
| 166 |
|
| 167 |
return result
|
| 168 |
|
| 169 |
except Exception as e:
|
| 170 |
-
logger.error(f"Unexpected error during processing: {e}")
|
| 171 |
|
| 172 |
-
# Cleanup
|
| 173 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 174 |
try:
|
| 175 |
os.unlink(processed_pdf_path)
|
|
@@ -184,57 +565,36 @@ class BackendManager:
|
|
| 184 |
'method_requested': method,
|
| 185 |
'success': False,
|
| 186 |
'error': str(e),
|
| 187 |
-
'processing_time': processing_time
|
|
|
|
| 188 |
})
|
| 189 |
|
| 190 |
return {
|
| 191 |
'success': False,
|
| 192 |
-
'error': f"
|
| 193 |
'text': '',
|
|
|
|
| 194 |
'method_used': '',
|
| 195 |
'metadata': {
|
| 196 |
'file_hash': file_hash,
|
| 197 |
'processing_time_seconds': round(processing_time, 2),
|
| 198 |
-
'timestamp': start_time.isoformat()
|
|
|
|
| 199 |
}
|
| 200 |
}
|
| 201 |
|
| 202 |
-
def
|
| 203 |
-
"""
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
options: Preprocessing options
|
| 209 |
-
|
| 210 |
-
Returns:
|
| 211 |
-
Path to preprocessed PDF file
|
| 212 |
-
"""
|
| 213 |
-
removal_method = options.get('removal_method', 'fixed')
|
| 214 |
|
| 215 |
# Create temporary file for processed PDF
|
| 216 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 217 |
-
temp_pdf_path = self.temp_dir / f"
|
| 218 |
-
|
| 219 |
-
if removal_method == 'fixed':
|
| 220 |
-
return self._apply_fixed_removal(pdf_path, str(temp_pdf_path), options)
|
| 221 |
-
elif removal_method == 'crop':
|
| 222 |
-
return self._apply_crop_removal(pdf_path, str(temp_pdf_path), options)
|
| 223 |
-
else:
|
| 224 |
-
raise ValueError(f"Unknown removal method: {removal_method}")
|
| 225 |
-
|
| 226 |
-
def _apply_crop_removal(self, input_pdf: str, output_pdf: str, options: Dict[str, Any]) -> str:
|
| 227 |
-
"""Apply percentage-based crop removal"""
|
| 228 |
-
crop_settings = options.get('crop_settings', {})
|
| 229 |
-
top_percent = crop_settings.get('top', 0)
|
| 230 |
-
bottom_percent = crop_settings.get('bottom', 0)
|
| 231 |
-
left_percent = crop_settings.get('left', 0)
|
| 232 |
-
right_percent = crop_settings.get('right', 0)
|
| 233 |
|
| 234 |
-
|
| 235 |
-
return input_pdf # No processing needed
|
| 236 |
-
|
| 237 |
-
doc = fitz.open(input_pdf)
|
| 238 |
new_doc = fitz.open()
|
| 239 |
|
| 240 |
try:
|
|
@@ -242,7 +602,17 @@ class BackendManager:
|
|
| 242 |
page = doc.load_page(page_num)
|
| 243 |
page_rect = page.rect
|
| 244 |
|
| 245 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
width = page_rect.width
|
| 247 |
height = page_rect.height
|
| 248 |
|
|
@@ -259,319 +629,109 @@ class BackendManager:
|
|
| 259 |
page_rect.y1 - crop_bottom
|
| 260 |
)
|
| 261 |
|
| 262 |
-
#
|
| 263 |
-
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
#
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
-
new_doc.save(
|
| 274 |
-
logger.info(f"
|
| 275 |
|
|
|
|
|
|
|
|
|
|
| 276 |
finally:
|
| 277 |
doc.close()
|
| 278 |
new_doc.close()
|
| 279 |
|
| 280 |
-
return
|
| 281 |
|
| 282 |
-
def
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
Args:
|
| 287 |
-
text: Raw extracted text
|
| 288 |
-
|
| 289 |
-
Returns:
|
| 290 |
-
Cleaned and formatted text with zero duplicates
|
| 291 |
-
"""
|
| 292 |
-
if not text or not text.strip():
|
| 293 |
-
return text
|
| 294 |
-
|
| 295 |
-
import re
|
| 296 |
-
|
| 297 |
-
# Step 1: Split by page markers first to handle each page individually
|
| 298 |
-
if '=== PAGE ' in text:
|
| 299 |
-
pages = re.split(r'(=== PAGE \d+ ===)', text)
|
| 300 |
-
processed_pages = []
|
| 301 |
-
|
| 302 |
-
for i, page_part in enumerate(pages):
|
| 303 |
-
if not page_part.strip():
|
| 304 |
-
continue
|
| 305 |
-
|
| 306 |
-
if page_part.startswith('=== PAGE '):
|
| 307 |
-
# This is a page marker, keep it
|
| 308 |
-
processed_pages.append(page_part)
|
| 309 |
-
else:
|
| 310 |
-
# This is page content, process it
|
| 311 |
-
cleaned_content = self._clean_page_content(page_part)
|
| 312 |
-
if cleaned_content.strip():
|
| 313 |
-
processed_pages.append(cleaned_content)
|
| 314 |
-
|
| 315 |
-
return '\n'.join(processed_pages)
|
| 316 |
-
else:
|
| 317 |
-
# Single page content
|
| 318 |
-
return self._clean_page_content(text)
|
| 319 |
-
|
| 320 |
-
    def _clean_page_content(self, content: str) -> str:
        """Clean individual page content removing all duplicates and artifacts.

        Pipeline: (1) locate `--- TABLE n ---` sections, (2) swap them for
        placeholders so text cleanup cannot touch table rows, (3) normalize
        whitespace per line while preserving indentation, (4) collapse runs of
        blank lines to at most one, (5) restore each table (cleaned via
        _clean_table_content) in original order.
        """
        if not content.strip():
            return content

        import re

        # Step 1: Identify and preserve table sections
        # Non-greedy body ends at the next table marker, a page marker, or EOF.
        table_pattern = r'(--- TABLE \d+ ---\n.*?)(?=\n--- TABLE \d+ ---|\n=== PAGE |\Z)'
        table_sections = {}
        table_positions = []

        for match in re.finditer(table_pattern, content, re.DOTALL):
            start_pos = match.start()
            end_pos = match.end()
            table_content = match.group(1)
            # Keyed by start offset so restoration below keeps document order.
            table_sections[start_pos] = table_content
            table_positions.append((start_pos, end_pos))

        # Step 2: Extract pure text content (excluding table regions)
        text_content = content

        # Remove table sections from text processing.
        # Reverse order so earlier offsets stay valid while splicing.
        for start_pos, end_pos in sorted(table_positions, reverse=True):
            text_content = text_content[:start_pos] + '\n<<<TABLE_PLACEHOLDER>>>\n' + text_content[end_pos:]

        # Step 3: Clean the text content
        lines = text_content.split('\n')
        cleaned_lines = []

        for line in lines:
            if line.strip() == '<<<TABLE_PLACEHOLDER>>>':
                cleaned_lines.append(line)  # Preserve placeholder
                continue

            # Remove excessive whitespace but preserve structure
            if line.strip():
                # Clean up multiple spaces but preserve indentation
                leading_spaces = len(line) - len(line.lstrip())
                content_part = re.sub(r'\s+', ' ', line.strip())
                cleaned_line = ' ' * leading_spaces + content_part
                cleaned_lines.append(cleaned_line)
            else:
                cleaned_lines.append('')

        # Step 4: Remove excessive empty lines
        result_lines = []
        empty_count = 0

        for line in cleaned_lines:
            if not line.strip() and line != '<<<TABLE_PLACEHOLDER>>>':
                empty_count += 1
                if empty_count <= 1:  # Allow max 1 empty line between content
                    result_lines.append('')
            else:
                empty_count = 0
                result_lines.append(line)

        # Step 5: Restore table sections with enhanced cleaning
        processed_text = '\n'.join(result_lines)

        # Replace placeholders with cleaned table content.
        # NOTE(review): the replace target requires a newline on both sides of
        # the placeholder; a table at the very start/end of the page may leave
        # its placeholder unreplaced — TODO confirm against real inputs.
        for start_pos in sorted(table_sections.keys()):
            table_content = table_sections[start_pos]
            # ENHANCED: Clean table content to remove separator rows
            cleaned_table_content = self._clean_table_content(table_content)
            processed_text = processed_text.replace('\n<<<TABLE_PLACEHOLDER>>>\n', f'\n{cleaned_table_content}\n', 1)

        return processed_text
|
| 389 |
-
|
| 390 |
-
def _clean_table_content(self, table_content: str) -> str:
|
| 391 |
-
"""Clean table content removing separator rows and duplicates"""
|
| 392 |
-
lines = table_content.split('\n')
|
| 393 |
-
cleaned_lines = []
|
| 394 |
-
|
| 395 |
-
for line in lines:
|
| 396 |
-
line_stripped = line.strip()
|
| 397 |
-
|
| 398 |
-
# Keep table headers
|
| 399 |
-
if line_stripped.startswith('--- TABLE '):
|
| 400 |
-
cleaned_lines.append(line_stripped)
|
| 401 |
-
continue
|
| 402 |
-
|
| 403 |
-
# CRITICAL: Skip separator rows (lines that are mostly dashes and pipes)
|
| 404 |
-
if line_stripped:
|
| 405 |
-
# Remove pipes and spaces, check if remaining content is just dashes
|
| 406 |
-
content_check = line_stripped.replace('|', '').replace(' ', '')
|
| 407 |
-
if content_check.replace('-', '') == '':
|
| 408 |
-
# This is a separator row, skip it
|
| 409 |
-
continue
|
| 410 |
-
|
| 411 |
-
# Keep actual content rows
|
| 412 |
-
cleaned_lines.append(line_stripped)
|
| 413 |
-
|
| 414 |
-
return '\n'.join(cleaned_lines)
|
| 415 |
-
|
| 416 |
-
def extract_table_data(self, text: str) -> Dict[str, Any]:
|
| 417 |
-
"""
|
| 418 |
-
Extract structured table data from processed text - NO duplicates
|
| 419 |
-
|
| 420 |
-
Args:
|
| 421 |
-
text: Processed text containing table markers
|
| 422 |
-
|
| 423 |
-
Returns:
|
| 424 |
-
Dictionary containing extracted table information
|
| 425 |
-
"""
|
| 426 |
-
import re
|
| 427 |
-
|
| 428 |
-
tables = {}
|
| 429 |
-
|
| 430 |
-
# More precise pattern to avoid overlapping matches
|
| 431 |
-
table_pattern = r'--- TABLE (\d+) ---\n(.*?)(?=\n--- TABLE \d+ ---|$|\n=== PAGE)'
|
| 432 |
-
|
| 433 |
-
matches = re.finditer(table_pattern, text, re.DOTALL)
|
| 434 |
-
|
| 435 |
-
for match in matches:
|
| 436 |
-
table_num = int(match.group(1))
|
| 437 |
-
table_content = match.group(2).strip()
|
| 438 |
-
|
| 439 |
-
# Only process if we haven't seen this table number before
|
| 440 |
-
if table_num not in tables:
|
| 441 |
-
table_data = self._parse_table_content(table_content)
|
| 442 |
-
tables[table_num] = table_data
|
| 443 |
-
|
| 444 |
-
return {
|
| 445 |
-
'table_count': len(tables),
|
| 446 |
-
'tables': tables,
|
| 447 |
-
'has_tables': len(tables) > 0
|
| 448 |
-
}
|
| 449 |
-
|
| 450 |
-
def _parse_table_content(self, content: str) -> Dict[str, Any]:
|
| 451 |
-
"""Parse individual table content into structured data - improved with separator filtering"""
|
| 452 |
-
lines = [line.strip() for line in content.split('\n') if line.strip()]
|
| 453 |
-
|
| 454 |
-
table_data = {
|
| 455 |
-
'rows': [],
|
| 456 |
-
'columns': 0,
|
| 457 |
-
'has_header': False
|
| 458 |
-
}
|
| 459 |
-
|
| 460 |
-
seen_rows = set() # Track seen row content to avoid duplicates
|
| 461 |
-
|
| 462 |
-
for i, line in enumerate(lines):
|
| 463 |
-
# ENHANCED: Skip separator lines more comprehensively
|
| 464 |
-
line_content = line.replace('|', '').replace(' ', '')
|
| 465 |
-
if line_content.replace('-', '') == '':
|
| 466 |
-
continue # Skip separator rows
|
| 467 |
-
|
| 468 |
-
if '|' in line:
|
| 469 |
-
# Split by | and clean up cells
|
| 470 |
-
cells = [cell.strip() for cell in line.split('|')]
|
| 471 |
-
# Remove empty cells at start/end
|
| 472 |
-
while cells and not cells[0]:
|
| 473 |
-
cells.pop(0)
|
| 474 |
-
while cells and not cells[-1]:
|
| 475 |
-
cells.pop()
|
| 476 |
-
|
| 477 |
-
if cells:
|
| 478 |
-
# Create a key for duplicate detection
|
| 479 |
-
row_key = '|'.join(cells).lower().strip()
|
| 480 |
-
|
| 481 |
-
# Only add if we haven't seen this exact row before
|
| 482 |
-
if row_key not in seen_rows:
|
| 483 |
-
table_data['rows'].append(cells)
|
| 484 |
-
table_data['columns'] = max(table_data['columns'], len(cells))
|
| 485 |
-
seen_rows.add(row_key)
|
| 486 |
-
|
| 487 |
-
# Assume first unique row is header
|
| 488 |
-
if len(table_data['rows']) == 1:
|
| 489 |
-
table_data['has_header'] = True
|
| 490 |
-
|
| 491 |
-
return table_data
|
| 492 |
-
|
| 493 |
-
def validate_pdf_file(self, file_path: str) -> Dict[str, Any]:
|
| 494 |
-
"""
|
| 495 |
-
Validate PDF file before processing - enhanced validation
|
| 496 |
-
|
| 497 |
-
Args:
|
| 498 |
-
file_path: Path to the PDF file
|
| 499 |
-
|
| 500 |
-
Returns:
|
| 501 |
-
Dict with validation results
|
| 502 |
-
"""
|
| 503 |
-
validation_result = {
|
| 504 |
-
'valid': False,
|
| 505 |
-
'error': None,
|
| 506 |
-
'warnings': [],
|
| 507 |
-
'file_info': {}
|
| 508 |
-
}
|
| 509 |
|
| 510 |
try:
|
| 511 |
-
#
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
# Check file size
|
| 521 |
-
file_size = os.path.getsize(file_path)
|
| 522 |
-
max_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
|
| 523 |
-
|
| 524 |
-
if file_size > max_size:
|
| 525 |
-
validation_result['error'] = f"File too large ({file_size/(1024*1024):.1f}MB > {max_size/(1024*1024)}MB)"
|
| 526 |
-
return validation_result
|
| 527 |
-
|
| 528 |
-
if file_size == 0:
|
| 529 |
-
validation_result['error'] = "File is empty"
|
| 530 |
-
return validation_result
|
| 531 |
-
|
| 532 |
-
# Try to open with PyMuPDF to check if it's a valid PDF
|
| 533 |
try:
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
doc.close()
|
| 545 |
-
|
| 546 |
-
if page_count == 0:
|
| 547 |
-
validation_result['warnings'].append("PDF contains no pages")
|
| 548 |
-
|
| 549 |
-
validation_result['file_info'] = {
|
| 550 |
-
'size_mb': round(file_size / (1024*1024), 2),
|
| 551 |
-
'pages': page_count
|
| 552 |
-
}
|
| 553 |
-
|
| 554 |
-
except Exception as pdf_error:
|
| 555 |
-
validation_result['error'] = f"Invalid or corrupted PDF file: {str(pdf_error)}"
|
| 556 |
-
return validation_result
|
| 557 |
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
| 560 |
except Exception as e:
|
| 561 |
-
|
|
|
|
| 562 |
|
| 563 |
-
return
|
| 564 |
|
| 565 |
def get_available_methods(self) -> List[str]:
|
| 566 |
"""Get list of available OCR methods"""
|
| 567 |
methods = self.ocr_service.get_available_methods()
|
| 568 |
-
logger.info(f"Available OCR methods: {methods}")
|
| 569 |
return methods
|
| 570 |
|
| 571 |
def get_service_status(self) -> Dict[str, Any]:
|
| 572 |
-
"""Get comprehensive service status"""
|
| 573 |
available_methods = self.get_available_methods()
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
status = {
|
| 576 |
'service_healthy': True,
|
| 577 |
'available_methods': available_methods,
|
|
@@ -582,37 +742,16 @@ class BackendManager:
|
|
| 582 |
'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
|
| 583 |
'temp_dir': str(self.temp_dir),
|
| 584 |
'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
|
| 585 |
-
'
|
| 586 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
}
|
| 588 |
|
| 589 |
return status
|
| 590 |
|
| 591 |
-
def get_processing_history(self, limit: int = 10) -> List[Dict[str, Any]]:
|
| 592 |
-
"""Get recent processing history"""
|
| 593 |
-
return self.processing_history[-limit:]
|
| 594 |
-
|
| 595 |
-
def cleanup_temp_files(self):
|
| 596 |
-
"""Clean up temporary files"""
|
| 597 |
-
try:
|
| 598 |
-
temp_files = list(self.temp_dir.glob('*'))
|
| 599 |
-
cleaned_count = 0
|
| 600 |
-
|
| 601 |
-
for temp_file in temp_files:
|
| 602 |
-
try:
|
| 603 |
-
# Remove files older than 1 hour
|
| 604 |
-
if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600):
|
| 605 |
-
temp_file.unlink()
|
| 606 |
-
cleaned_count += 1
|
| 607 |
-
except Exception as e:
|
| 608 |
-
logger.warning(f"Could not remove temp file {temp_file}: {e}")
|
| 609 |
-
|
| 610 |
-
if cleaned_count > 0:
|
| 611 |
-
logger.info(f"Cleaned up {cleaned_count} temporary files")
|
| 612 |
-
|
| 613 |
-
except Exception as e:
|
| 614 |
-
logger.error(f"Error during cleanup: {e}")
|
| 615 |
-
|
| 616 |
def _calculate_file_hash(self, file_path: str) -> str:
|
| 617 |
"""Calculate SHA-256 hash of file"""
|
| 618 |
sha256_hash = hashlib.sha256()
|
|
@@ -634,32 +773,29 @@ class BackendManager:
|
|
| 634 |
if len(self.processing_history) > self.max_history_size:
|
| 635 |
self.processing_history = self.processing_history[-self.max_history_size:]
|
| 636 |
|
| 637 |
-
def
|
| 638 |
-
"""
|
| 639 |
-
if file_path is None:
|
| 640 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 641 |
-
file_path = self.temp_dir / f"processing_history_{timestamp}.json"
|
| 642 |
-
|
| 643 |
try:
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
'total_entries': len(self.processing_history),
|
| 647 |
-
'service_status': self.get_service_status(),
|
| 648 |
-
'history': self.processing_history
|
| 649 |
-
}
|
| 650 |
-
|
| 651 |
-
with open(file_path, 'w') as f:
|
| 652 |
-
json.dump(history_data, f, indent=2)
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
|
|
|
|
|
|
|
|
|
|
| 657 |
except Exception as e:
|
| 658 |
-
logger.error(f"Error
|
| 659 |
-
raise
|
| 660 |
|
| 661 |
-
def
|
| 662 |
-
"""Get processing statistics"""
|
| 663 |
if not self.processing_history:
|
| 664 |
return {
|
| 665 |
'total_processed': 0,
|
|
@@ -668,27 +804,31 @@ class BackendManager:
|
|
| 668 |
'most_used_method': 'N/A',
|
| 669 |
'total_text_extracted': 0,
|
| 670 |
'total_tables_processed': 0,
|
| 671 |
-
'preprocessing_usage': 0
|
|
|
|
|
|
|
| 672 |
}
|
| 673 |
|
| 674 |
total_processed = len(self.processing_history)
|
| 675 |
successful = [h for h in self.processing_history if h.get('success', False)]
|
| 676 |
success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0
|
| 677 |
|
| 678 |
-
# Calculate
|
| 679 |
processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
|
| 680 |
avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
|
| 681 |
|
| 682 |
-
# Find most used method
|
| 683 |
methods = [h.get('method_used', 'unknown') for h in successful]
|
| 684 |
most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'
|
| 685 |
|
| 686 |
-
# Calculate total text and tables extracted
|
| 687 |
total_text = sum(h.get('text_length', 0) for h in successful)
|
| 688 |
total_tables = sum(h.get('table_count', 0) for h in successful)
|
| 689 |
|
| 690 |
-
# Calculate preprocessing usage
|
| 691 |
preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 692 |
|
| 693 |
return {
|
| 694 |
'total_processed': total_processed,
|
|
@@ -699,15 +839,18 @@ class BackendManager:
|
|
| 699 |
'total_tables_processed': total_tables,
|
| 700 |
'successful_processes': len(successful),
|
| 701 |
'failed_processes': total_processed - len(successful),
|
| 702 |
-
'preprocessing_usage': preprocessing_usage
|
|
|
|
|
|
|
|
|
|
| 703 |
}
|
| 704 |
|
| 705 |
|
| 706 |
-
#
|
| 707 |
_backend_manager = None
|
| 708 |
|
| 709 |
def get_backend_manager() -> BackendManager:
|
| 710 |
-
"""Get global backend manager instance"""
|
| 711 |
global _backend_manager
|
| 712 |
if _backend_manager is None:
|
| 713 |
_backend_manager = BackendManager()
|
|
@@ -715,11 +858,11 @@ def get_backend_manager() -> BackendManager:
|
|
| 715 |
|
| 716 |
|
| 717 |
if __name__ == "__main__":
|
| 718 |
-
# Test the backend manager
|
| 719 |
manager = BackendManager()
|
| 720 |
|
| 721 |
-
print("Enhanced Backend Manager Test")
|
| 722 |
-
print("=
|
| 723 |
print(f"Available methods: {manager.get_available_methods()}")
|
| 724 |
print(f"Service status: {manager.get_service_status()}")
|
| 725 |
-
print(f"
|
|
|
|
| 1 |
"""
|
| 2 |
+
Backend Management Module - FIXED VERSION with Corrected Crop Processing
|
| 3 |
Coordinates between UI and OCR services, handles file management and preprocessing
|
| 4 |
"""
|
| 5 |
import re
|
|
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
|
| 29 |
+
class DocumentExporter:
    """Advanced document export with HTML-based formatting.

    Stateless collection of static factory methods; each writes a
    timestamped temporary file (``delete=False``) and returns its path.
    Callers are responsible for removing the files when done.
    """

    @staticmethod
    def create_enhanced_txt_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
        """Write extracted text to a UTF-8 ``.txt`` temp file and return its path.

        Args:
            text_content: The formatted plain text to embed.
            html_content: Accepted for signature parity with the other
                exporters; not used by the TXT writer.
            metadata_info: Optional processing summary placed in the header.

        Returns:
            Path of the created temporary file.

        Raises:
            OSError (or any write error): re-raised after the handle is closed.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_file = tempfile.NamedTemporaryFile(
            suffix=f'_extracted_text_{timestamp}.txt',
            delete=False,
            mode='w',
            encoding='utf-8'
        )

        try:
            # Header banner
            temp_file.write("PDF OCR Extraction Results - Enhanced with HTML Processing\n")
            temp_file.write("=" * 70 + "\n\n")

            # Optional processing metadata
            if metadata_info:
                temp_file.write("Processing Information:\n")
                temp_file.write("-" * 25 + "\n")
                temp_file.write(metadata_info + "\n\n")

            # Generation timestamp
            temp_file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            temp_file.write("=" * 70 + "\n\n")

            # Main body
            temp_file.write("Extracted Text (Formatted):\n")
            temp_file.write("-" * 30 + "\n\n")
            temp_file.write(text_content)

            temp_file.close()
            return temp_file.name

        except Exception as e:
            logger.error(f"Error creating enhanced TXT file: {e}")
            temp_file.close()
            raise

    @staticmethod
    def create_enhanced_docx_file(text_content: str, html_content: str, metadata_info: str = "") -> str:
        """Create an enhanced DOCX file from HTML content with spacing/indentation.

        If ``html_content`` contains a ``<table>``, the HTML is parsed and
        converted structurally; otherwise ``text_content`` is rendered with
        heuristic heading/indent handling.

        Returns:
            Path of the created ``.docx`` temporary file.

        Raises:
            ImportError: if python-docx is not installed.
            Exception: any document-build failure (temp file is removed first).
        """
        try:
            from docx import Document
            from docx.shared import Inches, Pt, RGBColor
            from docx.enum.text import WD_ALIGN_PARAGRAPH
            from docx.enum.table import WD_TABLE_ALIGNMENT
            from html.parser import HTMLParser

            class EnhancedDOCXHTMLParser(HTMLParser):
                """Streams the service's HTML into DOCX paragraphs/tables,
                preserving indentation encoded as inline margin-left styles."""

                def __init__(self, doc):
                    super().__init__()
                    self.doc = doc
                    self.current_paragraph = None
                    self.in_table = False
                    self.table_data = []
                    self.current_table_row = []
                    self.is_title = False
                    self.is_heading = False
                    self.is_bullet_point = False

                def handle_starttag(self, tag, attrs):
                    attr_dict = dict(attrs)
                    class_attr = attr_dict.get('class', '')
                    style_attr = attr_dict.get('style', '')

                    # NOTE: 'page-header' must be tested BEFORE the plain
                    # 'page' class — 'page' is a substring of 'page-header',
                    # so the previous ordering shadowed this branch and page
                    # headers were rendered as blank separators instead.
                    if tag == 'div' and 'page-header' in class_attr:
                        self.current_paragraph = self.doc.add_heading(level=1)
                        self.current_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                    elif tag == 'div' and 'page' in class_attr:
                        # Minimal page separation (paragraph spacing only,
                        # no hard page break)
                        if hasattr(self, 'has_content'):
                            self.doc.add_paragraph()
                            self.doc.add_paragraph()
                        self.has_content = True

                    elif tag == 'div' and 'title' in class_attr:
                        self.current_paragraph = self.doc.add_heading(level=1)
                        self.is_title = True
                        self._apply_spacing_from_style(style_attr)

                    elif tag == 'div' and 'section-heading' in class_attr:
                        self.current_paragraph = self.doc.add_heading(level=2)
                        self.is_heading = True
                        self._apply_spacing_from_style(style_attr)

                    elif tag == 'div' and 'paragraph' in class_attr:
                        self.current_paragraph = self.doc.add_paragraph()
                        self.is_bullet_point = 'bullet-point' in class_attr
                        self._apply_spacing_from_style(style_attr)

                    elif tag == 'table':
                        self.in_table = True
                        self.table_data = []

                    elif tag == 'tr':
                        self.current_table_row = []

                    elif tag in ('th', 'td'):
                        pass  # Cell text is collected in handle_data

                    elif tag == 'br':
                        if self.current_paragraph:
                            self.current_paragraph.add_run().add_break()

                def _apply_spacing_from_style(self, style_attr):
                    """Translate inline margin-left (em) into DOCX left indent."""
                    if not self.current_paragraph:
                        return

                    margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
                    if margin_match:
                        em_value = float(margin_match.group(1))
                        # Convert em to inches (1em ≈ 12pt, 72pt = 1 inch)
                        indent_inches = (em_value * 12) / 72
                        self.current_paragraph.paragraph_format.left_indent = Inches(indent_inches)

                    # Hanging indent so bullet glyphs outdent from the text
                    if self.is_bullet_point:
                        self.current_paragraph.paragraph_format.first_line_indent = Inches(-0.25)

                    # Readability defaults
                    self.current_paragraph.paragraph_format.line_spacing = 1.15
                    self.current_paragraph.paragraph_format.space_after = Pt(6)

                def handle_endtag(self, tag):
                    if tag == 'div' and (self.is_title or self.is_heading):
                        self.is_title = False
                        self.is_heading = False
                        self.current_paragraph = None

                    elif tag == 'div' and self.current_paragraph and not self.in_table:
                        self.is_bullet_point = False
                        self.current_paragraph = None

                    elif tag == 'table':
                        self.in_table = False
                        self._create_enhanced_docx_table()

                    elif tag == 'tr' and self.current_table_row:
                        self.table_data.append(self.current_table_row[:])
                        self.current_table_row = []

                def handle_data(self, data):
                    if data.strip():
                        # Convert non-breaking spaces (used by the HTML
                        # renderer to encode indentation) back to regular
                        # spaces. The previous replace(' ', ' ') was a
                        # mangled no-op.
                        data = data.replace('\u00a0', ' ')

                        if self.in_table:
                            self.current_table_row.append(data.strip())
                        elif self.current_paragraph is not None:
                            run = self.current_paragraph.add_run(data)
                            if self.is_title:
                                run.bold = True
                                run.font.size = Pt(16)
                            elif self.is_heading:
                                run.bold = True
                                run.font.size = Pt(14)
                            else:
                                run.font.size = Pt(11)

                def _create_enhanced_docx_table(self):
                    """Flush collected rows into a bordered DOCX table."""
                    if not self.table_data:
                        return

                    rows = len(self.table_data)
                    cols = max(len(row) for row in self.table_data) if self.table_data else 1

                    table = self.doc.add_table(rows=rows, cols=cols)
                    table.style = 'Table Grid'
                    table.alignment = WD_TABLE_ALIGNMENT.LEFT
                    table.autofit = False

                    for row_idx, row_data in enumerate(self.table_data):
                        table_row = table.rows[row_idx]
                        for col_idx, cell_data in enumerate(row_data):
                            if col_idx < len(table_row.cells):
                                cell = table_row.cells[col_idx]
                                cell.text = str(cell_data)

                                if row_idx == 0:
                                    # Bold, centered header row
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.bold = True
                                            run.font.size = Pt(10)
                                        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                                else:
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.font.size = Pt(10)

                                # NOTE(review): vertical_alignment expects a
                                # WD_CELL_VERTICAL_ALIGNMENT member; LEFT (0)
                                # happens to coincide with TOP (0). Kept as-is
                                # to preserve behavior — confirm intent.
                                cell.vertical_alignment = WD_ALIGN_PARAGRAPH.LEFT

                    # Spacing after the table
                    self.doc.add_paragraph()

            # --- Build the document ---
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            temp_file = tempfile.NamedTemporaryFile(
                suffix=f'_extracted_document_{timestamp}.docx',
                delete=False
            )
            temp_file.close()

            doc = Document()

            # Page margins
            for section in doc.sections:
                section.top_margin = Inches(1)
                section.bottom_margin = Inches(1)
                section.left_margin = Inches(1)
                section.right_margin = Inches(1)

            # Title
            title = doc.add_heading('PDF OCR Extraction Results', 0)
            title.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Subtitle
            subtitle_para = doc.add_paragraph()
            subtitle_run = subtitle_para.add_run('Enhanced with HTML Processing and Preserved Formatting')
            subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            subtitle_run.italic = True
            subtitle_run.font.size = Pt(12)
            subtitle_run.font.color.rgb = RGBColor(102, 102, 102)

            # Metadata section
            if metadata_info:
                doc.add_heading('Processing Information', level=1)
                meta_para = doc.add_paragraph()
                meta_run = meta_para.add_run(metadata_info)
                meta_run.font.size = Pt(10)
                meta_para.style = 'Intense Quote'
                doc.add_paragraph()  # spacing

            doc.add_heading('Extracted Content', level=1)

            if html_content and '<table' in html_content:
                # Structured path: parse HTML and convert to DOCX
                parser = EnhancedDOCXHTMLParser(doc)
                parser.feed(html_content)
            else:
                # Fallback: render plain text with heading heuristics.
                for para in text_content.split('\n\n'):
                    if not para.strip():
                        continue
                    stripped = para.strip()
                    if stripped.startswith('==='):
                        # Page headers with minimal separation
                        page_header = doc.add_heading(stripped, level=1)
                        page_header.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    elif stripped.startswith('##'):
                        # Section headings — must be tested BEFORE '#', or
                        # this branch is unreachable (original bug).
                        doc.add_heading(stripped.lstrip('#').strip(), level=2)
                    elif stripped.startswith('#'):
                        # Titles
                        doc.add_heading(stripped.lstrip('#').strip(), level=1)
                    else:
                        # Regular paragraphs, preserving leading-space indent
                        for line in para.split('\n'):
                            if line.strip():
                                para_element = doc.add_paragraph()

                                leading_spaces = len(line) - len(line.lstrip())
                                if leading_spaces > 0:
                                    indent_level = leading_spaces // 2  # 2 spaces = 1 level
                                    para_element.paragraph_format.left_indent = Inches(0.5 * indent_level)

                                run = para_element.add_run(line.strip())
                                run.font.size = Pt(11)

                                para_element.paragraph_format.line_spacing = 1.15
                                para_element.paragraph_format.space_after = Pt(3)

            # Footer
            footer_para = doc.sections[0].footer.paragraphs[0]
            footer_para.text = f"Generated by Enhanced PDF OCR Service on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            footer_run = footer_para.runs[0]
            footer_run.font.size = Pt(9)
            footer_run.font.color.rgb = RGBColor(128, 128, 128)

            doc.save(temp_file.name)
            logger.info(f"Enhanced DOCX file with proper spacing created: {temp_file.name}")
            return temp_file.name

        except ImportError:
            raise ImportError("python-docx not installed. Cannot create DOCX files.")
        except Exception as e:
            logger.error(f"Error creating enhanced DOCX file: {e}")
            try:
                os.unlink(temp_file.name)
            except Exception:
                pass
            raise

    @staticmethod
    def create_html_file(html_content: str, metadata_info: str = "") -> str:
        """Create a standalone, styled HTML temp file and return its path.

        Injects extra CSS after the first ``<style>`` tag and wraps the body
        in a header/container layout, optionally embedding ``metadata_info``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_file = tempfile.NamedTemporaryFile(
            suffix=f'_extracted_document_{timestamp}.html',
            delete=False,
            mode='w',
            encoding='utf-8'
        )

        try:
            # Augment the document stylesheet
            enhanced_html = html_content.replace(
                '<style>',
                '''<style>
                body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; margin: 20px; background-color: #f9f9f9; }
                .container { max-width: 1200px; margin: 0 auto; background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
                .header { text-align: center; margin-bottom: 30px; border-bottom: 3px solid #2c3e50; padding-bottom: 20px; }
                .metadata { background-color: #ecf0f1; padding: 15px; border-radius: 5px; margin-bottom: 25px; border-left: 4px solid #3498db; }
                '''
            )

            # Wrap the body content in a container with a page header
            if '<body>' in enhanced_html:
                enhanced_html = enhanced_html.replace(
                    '<body>',
                    '''<body>
                    <div class="container">
                        <div class="header">
                            <h1>PDF OCR Extraction Results</h1>
                            <p>Enhanced with HTML Processing and Format Preservation</p>
                        </div>''' +
                    (f'<div class="metadata"><h3>Processing Information</h3><pre>{metadata_info}</pre></div>' if metadata_info else '')
                )
                enhanced_html = enhanced_html.replace('</body>', '</div></body>')

            temp_file.write(enhanced_html)
            temp_file.close()
            return temp_file.name

        except Exception as e:
            logger.error(f"Error creating HTML file: {e}")
            temp_file.close()
            raise
|
| 404 |
+
|
| 405 |
+
|
| 406 |
class BackendManager:
|
| 407 |
+
"""Enhanced backend manager with FIXED crop processing and advanced export capabilities"""
|
| 408 |
|
| 409 |
def __init__(self):
|
| 410 |
self.ocr_service = OCRService()
|
|
|
|
| 415 |
self.temp_dir = Path(tempfile.gettempdir()) / 'pdf_ocr_service'
|
| 416 |
self.temp_dir.mkdir(exist_ok=True)
|
| 417 |
|
| 418 |
+
logger.info("Enhanced backend manager with fixed crop processing initialized successfully")
|
| 419 |
|
| 420 |
+
def process_pdf_with_enhanced_resolution(self, pdf_path: str, method: str = "auto",
|
| 421 |
+
preprocessing_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 422 |
"""
|
| 423 |
+
Process PDF with enhanced resolution and HTML generation
|
| 424 |
|
| 425 |
Args:
|
| 426 |
pdf_path: Path to the PDF file
|
|
|
|
| 428 |
preprocessing_options: Dictionary containing preprocessing settings
|
| 429 |
|
| 430 |
Returns:
|
| 431 |
+
Dict containing processing results with HTML content
|
| 432 |
"""
|
| 433 |
start_time = datetime.now()
|
| 434 |
|
|
|
|
| 438 |
'success': False,
|
| 439 |
'error': f"File not found: {pdf_path}",
|
| 440 |
'text': '',
|
| 441 |
+
'html': '',
|
| 442 |
'method_used': '',
|
| 443 |
'metadata': {}
|
| 444 |
}
|
| 445 |
|
| 446 |
+
# Check file size
|
| 447 |
max_file_size = int(os.getenv('MAX_FILE_SIZE_MB', 50)) * 1024 * 1024
|
| 448 |
file_size = os.path.getsize(pdf_path)
|
| 449 |
|
|
|
|
| 452 |
'success': False,
|
| 453 |
'error': f"File too large. Maximum size: {max_file_size // (1024*1024)}MB",
|
| 454 |
'text': '',
|
| 455 |
+
'html': '',
|
| 456 |
'method_used': '',
|
| 457 |
'metadata': {}
|
| 458 |
}
|
| 459 |
|
| 460 |
+
# Generate file hash for tracking
|
| 461 |
file_hash = self._calculate_file_hash(pdf_path)
|
| 462 |
|
| 463 |
+
logger.info(f"Processing PDF with enhanced resolution: {os.path.basename(pdf_path)} (Hash: {file_hash[:8]}...)")
|
| 464 |
logger.info(f"File size: {file_size / (1024*1024):.2f}MB, Method: {method}")
|
| 465 |
|
| 466 |
# Handle preprocessing if enabled
|
|
|
|
| 468 |
preprocessing_applied = False
|
| 469 |
|
| 470 |
if preprocessing_options and preprocessing_options.get('enable_header_footer_removal', False):
|
| 471 |
+
logger.info("Applying enhanced preprocessing...")
|
| 472 |
try:
|
| 473 |
+
processed_pdf_path = self._apply_enhanced_preprocessing(pdf_path, preprocessing_options)
|
| 474 |
preprocessing_applied = True
|
| 475 |
+
logger.info("Enhanced preprocessing completed successfully")
|
| 476 |
except Exception as e:
|
| 477 |
logger.error(f"Preprocessing failed: {e}")
|
|
|
|
| 478 |
processed_pdf_path = pdf_path
|
| 479 |
|
| 480 |
try:
|
| 481 |
+
# Process with enhanced OCR
|
| 482 |
result = self.ocr_service.convert_pdf_to_text(processed_pdf_path, method)
|
| 483 |
|
| 484 |
# Add processing metadata
|
|
|
|
| 489 |
'file_size_mb': round(file_size / (1024*1024), 2),
|
| 490 |
'processing_time_seconds': round(processing_time, 2),
|
| 491 |
'timestamp': start_time.isoformat(),
|
| 492 |
+
'enhanced_processing': True,
|
| 493 |
+
'html_processing': True,
|
| 494 |
'header_footer_removed': preprocessing_applied,
|
| 495 |
'preprocessing_options': preprocessing_options if preprocessing_applied else None
|
| 496 |
})
|
| 497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
# Cleanup temporary preprocessed file
|
| 499 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 500 |
try:
|
|
|
|
| 505 |
# Log results
|
| 506 |
if result['success']:
|
| 507 |
text_length = len(result['text'])
|
| 508 |
+
has_html = bool(result.get('html'))
|
| 509 |
+
table_count = result['text'].count('Table ') if 'Table ' in result['text'] else 0
|
| 510 |
+
|
| 511 |
+
logger.info(f"Enhanced processing completed successfully in {processing_time:.2f}s")
|
| 512 |
logger.info(f"Method used: {result['method_used']}")
|
| 513 |
logger.info(f"Text extracted: {text_length} characters")
|
| 514 |
+
logger.info(f"HTML generated: {has_html}")
|
| 515 |
if table_count > 0:
|
| 516 |
logger.info(f"Tables detected: {table_count}")
|
| 517 |
if preprocessing_applied:
|
| 518 |
+
logger.info("Enhanced preprocessing applied")
|
| 519 |
|
| 520 |
# Add to processing history
|
| 521 |
self._add_to_history({
|
|
|
|
| 526 |
'text_length': text_length,
|
| 527 |
'table_count': table_count,
|
| 528 |
'processing_time': processing_time,
|
| 529 |
+
'preprocessing_applied': preprocessing_applied,
|
| 530 |
+
'html_generated': has_html,
|
| 531 |
+
'enhanced_processing': True
|
| 532 |
})
|
| 533 |
else:
|
| 534 |
+
logger.error(f"Enhanced processing failed: {result.get('error', 'Unknown error')}")
|
| 535 |
|
| 536 |
# Add to processing history
|
| 537 |
self._add_to_history({
|
|
|
|
| 541 |
'success': False,
|
| 542 |
'error': result.get('error', 'Unknown error'),
|
| 543 |
'processing_time': processing_time,
|
| 544 |
+
'preprocessing_applied': preprocessing_applied,
|
| 545 |
+
'enhanced_processing': True
|
| 546 |
})
|
| 547 |
|
| 548 |
return result
|
| 549 |
|
| 550 |
except Exception as e:
|
| 551 |
+
logger.error(f"Unexpected error during enhanced processing: {e}")
|
| 552 |
|
| 553 |
+
# Cleanup
|
| 554 |
if preprocessing_applied and processed_pdf_path != pdf_path:
|
| 555 |
try:
|
| 556 |
os.unlink(processed_pdf_path)
|
|
|
|
| 565 |
'method_requested': method,
|
| 566 |
'success': False,
|
| 567 |
'error': str(e),
|
| 568 |
+
'processing_time': processing_time,
|
| 569 |
+
'enhanced_processing': True
|
| 570 |
})
|
| 571 |
|
| 572 |
return {
|
| 573 |
'success': False,
|
| 574 |
+
'error': f"Enhanced processing error: {str(e)}",
|
| 575 |
'text': '',
|
| 576 |
+
'html': '',
|
| 577 |
'method_used': '',
|
| 578 |
'metadata': {
|
| 579 |
'file_hash': file_hash,
|
| 580 |
'processing_time_seconds': round(processing_time, 2),
|
| 581 |
+
'timestamp': start_time.isoformat(),
|
| 582 |
+
'enhanced_processing': True
|
| 583 |
}
|
| 584 |
}
|
| 585 |
|
| 586 |
+
def _apply_enhanced_preprocessing(self, pdf_path: str, options: Dict[str, Any]) -> str:
|
| 587 |
+
"""Apply enhanced preprocessing with high-resolution crop handling - FIXED"""
|
| 588 |
+
crop_settings = options.get('crop_settings', {})
|
| 589 |
+
per_page_crops = crop_settings.get('per_page_crops', {})
|
| 590 |
+
enhanced_resolution = crop_settings.get('enhanced_resolution', True)
|
| 591 |
+
resolution_scale = crop_settings.get('resolution_scale', 2.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
# Create temporary file for processed PDF
|
| 594 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 595 |
+
temp_pdf_path = self.temp_dir / f"enhanced_preprocessed_{timestamp}.pdf"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
+
doc = fitz.open(pdf_path)
|
|
|
|
|
|
|
|
|
|
| 598 |
new_doc = fitz.open()
|
| 599 |
|
| 600 |
try:
|
|
|
|
| 602 |
page = doc.load_page(page_num)
|
| 603 |
page_rect = page.rect
|
| 604 |
|
| 605 |
+
# Get crop settings for this page - FIXED indexing
|
| 606 |
+
page_crop = per_page_crops.get(page_num, per_page_crops.get(0, {
|
| 607 |
+
'top': 0, 'bottom': 0, 'left': 0, 'right': 0
|
| 608 |
+
}))
|
| 609 |
+
|
| 610 |
+
top_percent = page_crop.get('top', 0)
|
| 611 |
+
bottom_percent = page_crop.get('bottom', 0)
|
| 612 |
+
left_percent = page_crop.get('left', 0)
|
| 613 |
+
right_percent = page_crop.get('right', 0)
|
| 614 |
+
|
| 615 |
+
# Calculate crop amounts
|
| 616 |
width = page_rect.width
|
| 617 |
height = page_rect.height
|
| 618 |
|
|
|
|
| 629 |
page_rect.y1 - crop_bottom
|
| 630 |
)
|
| 631 |
|
| 632 |
+
# Ensure the rectangle is valid
|
| 633 |
+
if new_rect.width <= 0 or new_rect.height <= 0:
|
| 634 |
+
logger.warning(f"Invalid crop rectangle for page {page_num}, using original page")
|
| 635 |
+
new_rect = page_rect
|
| 636 |
|
| 637 |
+
# Create new page with enhanced resolution if enabled
|
| 638 |
+
if enhanced_resolution:
|
| 639 |
+
# Use high resolution for better quality
|
| 640 |
+
new_page = new_doc.new_page(
|
| 641 |
+
width=new_rect.width,
|
| 642 |
+
height=new_rect.height
|
| 643 |
+
)
|
| 644 |
+
|
| 645 |
+
# Copy content with proper transformation
|
| 646 |
+
mat = fitz.Matrix(1, 1).prescale(resolution_scale, resolution_scale)
|
| 647 |
+
new_page.show_pdf_page(
|
| 648 |
+
new_page.rect,
|
| 649 |
+
doc,
|
| 650 |
+
page_num,
|
| 651 |
+
clip=new_rect
|
| 652 |
+
)
|
| 653 |
+
else:
|
| 654 |
+
# Standard resolution
|
| 655 |
+
new_page = new_doc.new_page(width=new_rect.width, height=new_rect.height)
|
| 656 |
+
new_page.show_pdf_page(
|
| 657 |
+
new_page.rect,
|
| 658 |
+
doc,
|
| 659 |
+
page_num,
|
| 660 |
+
clip=new_rect
|
| 661 |
+
)
|
| 662 |
+
|
| 663 |
+
logger.debug(f"Page {page_num}: Applied crop T{top_percent}% B{bottom_percent}% L{left_percent}% R{right_percent}%")
|
| 664 |
|
| 665 |
+
new_doc.save(str(temp_pdf_path))
|
| 666 |
+
logger.info(f"Enhanced preprocessing applied with {resolution_scale}x resolution to {len(doc)} pages")
|
| 667 |
|
| 668 |
+
except Exception as e:
|
| 669 |
+
logger.error(f"Error in enhanced preprocessing: {e}")
|
| 670 |
+
raise
|
| 671 |
finally:
|
| 672 |
doc.close()
|
| 673 |
new_doc.close()
|
| 674 |
|
| 675 |
+
return str(temp_pdf_path)
|
| 676 |
|
| 677 |
+
def create_enhanced_downloads(self, text_content: str, html_content: str,
|
| 678 |
+
metadata_info: str = "") -> Dict[str, str]:
|
| 679 |
+
"""Create enhanced download files with HTML processing"""
|
| 680 |
+
download_files = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
|
| 682 |
try:
|
| 683 |
+
# Create enhanced TXT file
|
| 684 |
+
txt_path = DocumentExporter.create_enhanced_txt_file(
|
| 685 |
+
text_content, html_content, metadata_info
|
| 686 |
+
)
|
| 687 |
+
download_files['txt'] = txt_path
|
| 688 |
+
logger.info(f"Enhanced TXT file created: {txt_path}")
|
| 689 |
+
|
| 690 |
+
# Create enhanced DOCX file if possible
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
try:
|
| 692 |
+
docx_path = DocumentExporter.create_enhanced_docx_file(
|
| 693 |
+
text_content, html_content, metadata_info
|
| 694 |
+
)
|
| 695 |
+
download_files['docx'] = docx_path
|
| 696 |
+
logger.info(f"Enhanced DOCX file created: {docx_path}")
|
| 697 |
+
except ImportError:
|
| 698 |
+
logger.warning("python-docx not available. DOCX creation skipped.")
|
| 699 |
+
except Exception as e:
|
| 700 |
+
logger.error(f"DOCX creation failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
|
| 702 |
+
# Create standalone HTML file
|
| 703 |
+
try:
|
| 704 |
+
html_path = DocumentExporter.create_html_file(
|
| 705 |
+
html_content, metadata_info
|
| 706 |
+
)
|
| 707 |
+
download_files['html'] = html_path
|
| 708 |
+
logger.info(f"HTML file created: {html_path}")
|
| 709 |
+
except Exception as e:
|
| 710 |
+
logger.error(f"HTML file creation failed: {e}")
|
| 711 |
|
| 712 |
except Exception as e:
|
| 713 |
+
logger.error(f"Error creating enhanced downloads: {e}")
|
| 714 |
+
raise
|
| 715 |
|
| 716 |
+
return download_files
|
| 717 |
|
| 718 |
def get_available_methods(self) -> List[str]:
|
| 719 |
"""Get list of available OCR methods"""
|
| 720 |
methods = self.ocr_service.get_available_methods()
|
| 721 |
+
logger.info(f"Available enhanced OCR methods: {methods}")
|
| 722 |
return methods
|
| 723 |
|
| 724 |
def get_service_status(self) -> Dict[str, Any]:
|
| 725 |
+
"""Get comprehensive service status with enhanced features"""
|
| 726 |
available_methods = self.get_available_methods()
|
| 727 |
|
| 728 |
+
# Check DOCX support
|
| 729 |
+
try:
|
| 730 |
+
import docx
|
| 731 |
+
docx_available = True
|
| 732 |
+
except ImportError:
|
| 733 |
+
docx_available = False
|
| 734 |
+
|
| 735 |
status = {
|
| 736 |
'service_healthy': True,
|
| 737 |
'available_methods': available_methods,
|
|
|
|
| 742 |
'successful_processes': sum(1 for h in self.processing_history if h.get('success', False)),
|
| 743 |
'temp_dir': str(self.temp_dir),
|
| 744 |
'max_file_size_mb': int(os.getenv('MAX_FILE_SIZE_MB', 50)),
|
| 745 |
+
'enhanced_processing': True,
|
| 746 |
+
'html_processing': True,
|
| 747 |
+
'docx_export_available': docx_available,
|
| 748 |
+
'enhanced_crop_processing': True,
|
| 749 |
+
'multi_resolution_support': True,
|
| 750 |
+
'crop_processing_fixed': True
|
| 751 |
}
|
| 752 |
|
| 753 |
return status
|
| 754 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 755 |
def _calculate_file_hash(self, file_path: str) -> str:
|
| 756 |
"""Calculate SHA-256 hash of file"""
|
| 757 |
sha256_hash = hashlib.sha256()
|
|
|
|
| 773 |
if len(self.processing_history) > self.max_history_size:
|
| 774 |
self.processing_history = self.processing_history[-self.max_history_size:]
|
| 775 |
|
| 776 |
+
def cleanup_temp_files(self):
|
| 777 |
+
"""Clean up temporary files"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
try:
|
| 779 |
+
temp_files = list(self.temp_dir.glob('*'))
|
| 780 |
+
cleaned_count = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
|
| 782 |
+
for temp_file in temp_files:
|
| 783 |
+
try:
|
| 784 |
+
# Remove files older than 1 hour
|
| 785 |
+
if temp_file.is_file() and temp_file.stat().st_mtime < (datetime.now().timestamp() - 3600):
|
| 786 |
+
temp_file.unlink()
|
| 787 |
+
cleaned_count += 1
|
| 788 |
+
except Exception as e:
|
| 789 |
+
logger.warning(f"Could not remove temp file {temp_file}: {e}")
|
| 790 |
|
| 791 |
+
if cleaned_count > 0:
|
| 792 |
+
logger.info(f"Cleaned up {cleaned_count} temporary files")
|
| 793 |
+
|
| 794 |
except Exception as e:
|
| 795 |
+
logger.error(f"Error during cleanup: {e}")
|
|
|
|
| 796 |
|
| 797 |
+
def get_enhanced_statistics(self) -> Dict[str, Any]:
|
| 798 |
+
"""Get enhanced processing statistics"""
|
| 799 |
if not self.processing_history:
|
| 800 |
return {
|
| 801 |
'total_processed': 0,
|
|
|
|
| 804 |
'most_used_method': 'N/A',
|
| 805 |
'total_text_extracted': 0,
|
| 806 |
'total_tables_processed': 0,
|
| 807 |
+
'preprocessing_usage': 0,
|
| 808 |
+
'html_generation_rate': 0,
|
| 809 |
+
'enhanced_processing_usage': 0
|
| 810 |
}
|
| 811 |
|
| 812 |
total_processed = len(self.processing_history)
|
| 813 |
successful = [h for h in self.processing_history if h.get('success', False)]
|
| 814 |
success_rate = (len(successful) / total_processed) * 100 if total_processed > 0 else 0
|
| 815 |
|
| 816 |
+
# Calculate statistics
|
| 817 |
processing_times = [h.get('processing_time', 0) for h in self.processing_history if 'processing_time' in h]
|
| 818 |
avg_processing_time = sum(processing_times) / len(processing_times) if processing_times else 0
|
| 819 |
|
|
|
|
| 820 |
methods = [h.get('method_used', 'unknown') for h in successful]
|
| 821 |
most_used_method = max(set(methods), key=methods.count) if methods else 'N/A'
|
| 822 |
|
|
|
|
| 823 |
total_text = sum(h.get('text_length', 0) for h in successful)
|
| 824 |
total_tables = sum(h.get('table_count', 0) for h in successful)
|
| 825 |
|
|
|
|
| 826 |
preprocessing_usage = sum(1 for h in self.processing_history if h.get('preprocessing_applied', False))
|
| 827 |
+
html_generated = sum(1 for h in self.processing_history if h.get('html_generated', False))
|
| 828 |
+
enhanced_processing = sum(1 for h in self.processing_history if h.get('enhanced_processing', False))
|
| 829 |
+
|
| 830 |
+
html_generation_rate = (html_generated / total_processed) * 100 if total_processed > 0 else 0
|
| 831 |
+
enhanced_processing_rate = (enhanced_processing / total_processed) * 100 if total_processed > 0 else 0
|
| 832 |
|
| 833 |
return {
|
| 834 |
'total_processed': total_processed,
|
|
|
|
| 839 |
'total_tables_processed': total_tables,
|
| 840 |
'successful_processes': len(successful),
|
| 841 |
'failed_processes': total_processed - len(successful),
|
| 842 |
+
'preprocessing_usage': preprocessing_usage,
|
| 843 |
+
'html_generation_rate': round(html_generation_rate, 2),
|
| 844 |
+
'enhanced_processing_usage': enhanced_processing,
|
| 845 |
+
'enhanced_processing_rate': round(enhanced_processing_rate, 2)
|
| 846 |
}
|
| 847 |
|
| 848 |
|
| 849 |
+
# Global backend manager instance
|
| 850 |
_backend_manager = None
|
| 851 |
|
| 852 |
def get_backend_manager() -> BackendManager:
|
| 853 |
+
"""Get global enhanced backend manager instance"""
|
| 854 |
global _backend_manager
|
| 855 |
if _backend_manager is None:
|
| 856 |
_backend_manager = BackendManager()
|
|
|
|
| 858 |
|
| 859 |
|
| 860 |
if __name__ == "__main__":
|
| 861 |
+
# Test the enhanced backend manager
|
| 862 |
manager = BackendManager()
|
| 863 |
|
| 864 |
+
print("Enhanced Backend Manager with Fixed Crop Processing Test")
|
| 865 |
+
print("=" * 60)
|
| 866 |
print(f"Available methods: {manager.get_available_methods()}")
|
| 867 |
print(f"Service status: {manager.get_service_status()}")
|
| 868 |
+
print(f"Enhanced statistics: {manager.get_enhanced_statistics()}")
|
ocr_service.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
-
OCR Service Module - FIXED VERSION
|
| 3 |
-
Handles PDF to text conversion
|
| 4 |
"""
|
| 5 |
import re
|
| 6 |
import os
|
| 7 |
import logging
|
| 8 |
-
from typing import Optional, Dict, Any, Tuple
|
| 9 |
import tempfile
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
@@ -35,8 +35,708 @@ logging.basicConfig(level=logging.INFO)
|
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
class OCRService:
|
| 39 |
-
"""Main OCR service with
|
| 40 |
|
| 41 |
def __init__(self):
|
| 42 |
self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
|
@@ -58,18 +758,19 @@ class OCRService:
|
|
| 58 |
|
| 59 |
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
|
| 60 |
"""
|
| 61 |
-
Convert PDF to text using specified method
|
| 62 |
|
| 63 |
Args:
|
| 64 |
pdf_path: Path to the PDF file
|
| 65 |
method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
-
Dict containing text content, metadata, and processing info
|
| 69 |
"""
|
| 70 |
result = {
|
| 71 |
'success': False,
|
| 72 |
'text': '',
|
|
|
|
| 73 |
'method_used': '',
|
| 74 |
'metadata': {},
|
| 75 |
'error': None
|
|
@@ -91,7 +792,7 @@ class OCRService:
|
|
| 91 |
# Try primary method
|
| 92 |
try:
|
| 93 |
if method == "azure" and self.azure_client:
|
| 94 |
-
result = self.
|
| 95 |
elif method == "tesseract":
|
| 96 |
result = self._tesseract_ocr(pdf_path)
|
| 97 |
elif method == "pymupdf":
|
|
@@ -110,11 +811,12 @@ class OCRService:
|
|
| 110 |
|
| 111 |
return result
|
| 112 |
|
| 113 |
-
def
|
| 114 |
-
"""Azure Document Intelligence OCR with
|
| 115 |
result = {
|
| 116 |
'success': False,
|
| 117 |
'text': '',
|
|
|
|
| 118 |
'method_used': 'azure_document_intelligence',
|
| 119 |
'metadata': {},
|
| 120 |
'error': None
|
|
@@ -124,9 +826,8 @@ class OCRService:
|
|
| 124 |
with open(pdf_path, 'rb') as pdf_file:
|
| 125 |
file_content = pdf_file.read()
|
| 126 |
|
| 127 |
-
# Try different API call patterns
|
| 128 |
try:
|
| 129 |
-
# Pattern 1: body + content_type (most common for current SDK)
|
| 130 |
poller = self.azure_client.begin_analyze_document(
|
| 131 |
"prebuilt-layout",
|
| 132 |
body=file_content,
|
|
@@ -134,13 +835,11 @@ class OCRService:
|
|
| 134 |
)
|
| 135 |
except TypeError:
|
| 136 |
try:
|
| 137 |
-
# Pattern 2: model_id + body
|
| 138 |
poller = self.azure_client.begin_analyze_document(
|
| 139 |
model_id="prebuilt-layout",
|
| 140 |
body=file_content
|
| 141 |
)
|
| 142 |
except TypeError:
|
| 143 |
-
# Pattern 3: document parameter (older SDK)
|
| 144 |
pdf_file.seek(0)
|
| 145 |
poller = self.azure_client.begin_analyze_document(
|
| 146 |
"prebuilt-layout",
|
|
@@ -149,22 +848,29 @@ class OCRService:
|
|
| 149 |
|
| 150 |
analysis_result = poller.result()
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
result.update({
|
| 156 |
'success': True,
|
| 157 |
'text': formatted_text,
|
|
|
|
| 158 |
'metadata': {
|
| 159 |
'pages': len(analysis_result.pages) if analysis_result.pages else 0,
|
| 160 |
'tables': len(analysis_result.tables) if analysis_result.tables else 0,
|
| 161 |
'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
|
| 162 |
'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
|
| 163 |
-
'
|
|
|
|
|
|
|
|
|
|
| 164 |
}
|
| 165 |
})
|
| 166 |
|
| 167 |
-
logger.info("Azure OCR
|
| 168 |
|
| 169 |
except Exception as e:
|
| 170 |
logger.error(f"Azure OCR error: {e}")
|
|
@@ -172,281 +878,12 @@ class OCRService:
|
|
| 172 |
|
| 173 |
return result
|
| 174 |
|
| 175 |
-
def _format_azure_result_enhanced(self, analysis_result) -> str:
|
| 176 |
-
"""FIXED: Enhanced formatting that eliminates ALL duplication at the source"""
|
| 177 |
-
formatted_parts = []
|
| 178 |
-
|
| 179 |
-
if not analysis_result.pages:
|
| 180 |
-
return ""
|
| 181 |
-
|
| 182 |
-
for page_num, page in enumerate(analysis_result.pages, 1):
|
| 183 |
-
formatted_parts.append(f"\n=== PAGE {page_num} ===\n")
|
| 184 |
-
|
| 185 |
-
# Get all tables for this page first
|
| 186 |
-
page_tables = []
|
| 187 |
-
table_regions = []
|
| 188 |
-
|
| 189 |
-
if analysis_result.tables:
|
| 190 |
-
for table_idx, table in enumerate(analysis_result.tables):
|
| 191 |
-
if any(cell.bounding_regions and
|
| 192 |
-
cell.bounding_regions[0].page_number == page_num
|
| 193 |
-
for cell in table.cells):
|
| 194 |
-
page_tables.append((table_idx, table))
|
| 195 |
-
|
| 196 |
-
# Calculate table bounding region
|
| 197 |
-
if table.bounding_regions:
|
| 198 |
-
table_regions.append(table.bounding_regions[0])
|
| 199 |
-
|
| 200 |
-
# CRITICAL FIX: Use ONLY paragraphs OR lines, never both
|
| 201 |
-
content_items = []
|
| 202 |
-
|
| 203 |
-
# Priority 1: Use paragraphs if available (they contain consolidated content)
|
| 204 |
-
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
| 205 |
-
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 206 |
-
p.bounding_regions and
|
| 207 |
-
p.bounding_regions[0].page_number == page_num]
|
| 208 |
-
|
| 209 |
-
# Use paragraph content ONLY - don't use lines at all
|
| 210 |
-
for para in page_paragraphs:
|
| 211 |
-
if para.content.strip() and not self._is_content_in_table(para, table_regions):
|
| 212 |
-
y_pos = para.bounding_regions[0].polygon[1] if para.bounding_regions[0].polygon else 0
|
| 213 |
-
content_items.append({
|
| 214 |
-
'type': 'paragraph',
|
| 215 |
-
'content': para.content.strip(),
|
| 216 |
-
'y_pos': y_pos,
|
| 217 |
-
'role': getattr(para, 'role', 'paragraph')
|
| 218 |
-
})
|
| 219 |
-
|
| 220 |
-
# Priority 2: Only if NO paragraphs available, use lines
|
| 221 |
-
elif page.lines:
|
| 222 |
-
# Deduplicate lines first - group by approximate position
|
| 223 |
-
unique_lines = []
|
| 224 |
-
seen_content = set()
|
| 225 |
-
|
| 226 |
-
for line in page.lines:
|
| 227 |
-
line_content = line.content.strip().lower()
|
| 228 |
-
if (line_content and
|
| 229 |
-
line_content not in seen_content and
|
| 230 |
-
not self._is_content_in_table_by_line(line, table_regions)):
|
| 231 |
-
|
| 232 |
-
seen_content.add(line_content)
|
| 233 |
-
y_pos = line.polygon[1] if line.polygon else 0
|
| 234 |
-
unique_lines.append({
|
| 235 |
-
'type': 'line',
|
| 236 |
-
'content': line.content.strip(),
|
| 237 |
-
'y_pos': y_pos,
|
| 238 |
-
'role': 'text'
|
| 239 |
-
})
|
| 240 |
-
|
| 241 |
-
content_items.extend(unique_lines)
|
| 242 |
-
|
| 243 |
-
# Add table positions to content items
|
| 244 |
-
for table_idx, table in page_tables:
|
| 245 |
-
if table.bounding_regions:
|
| 246 |
-
table_y_pos = table.bounding_regions[0].polygon[1] if table.bounding_regions[0].polygon else 9999
|
| 247 |
-
content_items.append({
|
| 248 |
-
'type': 'table',
|
| 249 |
-
'content': table,
|
| 250 |
-
'y_pos': table_y_pos,
|
| 251 |
-
'table_idx': table_idx
|
| 252 |
-
})
|
| 253 |
-
|
| 254 |
-
# Sort all content by vertical position
|
| 255 |
-
content_items.sort(key=lambda x: x['y_pos'])
|
| 256 |
-
|
| 257 |
-
# FINAL DEDUPLICATION: Remove content that appears multiple times
|
| 258 |
-
seen_text_content = set()
|
| 259 |
-
final_content = []
|
| 260 |
-
|
| 261 |
-
for item in content_items:
|
| 262 |
-
if item['type'] == 'table':
|
| 263 |
-
final_content.append(item)
|
| 264 |
-
else:
|
| 265 |
-
# Check for text duplication
|
| 266 |
-
text_key = item['content'].lower().strip()
|
| 267 |
-
if text_key not in seen_text_content:
|
| 268 |
-
seen_text_content.add(text_key)
|
| 269 |
-
final_content.append(item)
|
| 270 |
-
|
| 271 |
-
# Add formatted content
|
| 272 |
-
for item in final_content:
|
| 273 |
-
if item['type'] == 'table':
|
| 274 |
-
formatted_parts.append(f"\n--- TABLE {item['table_idx'] + 1} ---")
|
| 275 |
-
table_text = self._format_table_enhanced(item['content'])
|
| 276 |
-
formatted_parts.append(table_text)
|
| 277 |
-
formatted_parts.append("")
|
| 278 |
-
else:
|
| 279 |
-
# Add text content
|
| 280 |
-
if item['role'] == 'title':
|
| 281 |
-
formatted_parts.append(f"\n# {item['content']}\n")
|
| 282 |
-
elif item['role'] == 'sectionHeading':
|
| 283 |
-
formatted_parts.append(f"\n## {item['content']}\n")
|
| 284 |
-
else:
|
| 285 |
-
formatted_parts.append(item['content'])
|
| 286 |
-
|
| 287 |
-
# Clean up excessive empty lines
|
| 288 |
-
result = '\n'.join(formatted_parts)
|
| 289 |
-
result = re.sub(r'\n{3,}', '\n\n', result) # Max 2 consecutive newlines
|
| 290 |
-
return result
|
| 291 |
-
|
| 292 |
-
def _is_content_in_table(self, content_item, table_regions):
|
| 293 |
-
"""Check if content overlaps with any table region"""
|
| 294 |
-
if not table_regions or not content_item.bounding_regions:
|
| 295 |
-
return False
|
| 296 |
-
|
| 297 |
-
content_region = content_item.bounding_regions[0]
|
| 298 |
-
if not content_region.polygon:
|
| 299 |
-
return False
|
| 300 |
-
|
| 301 |
-
content_y1 = content_region.polygon[1] # Top Y
|
| 302 |
-
content_y2 = content_region.polygon[5] # Bottom Y
|
| 303 |
-
content_x1 = content_region.polygon[0] # Left X
|
| 304 |
-
content_x2 = content_region.polygon[2] # Right X
|
| 305 |
-
|
| 306 |
-
for table_region in table_regions:
|
| 307 |
-
if not table_region.polygon:
|
| 308 |
-
continue
|
| 309 |
-
|
| 310 |
-
table_y1 = table_region.polygon[1] # Top Y
|
| 311 |
-
table_y2 = table_region.polygon[5] # Bottom Y
|
| 312 |
-
table_x1 = table_region.polygon[0] # Left X
|
| 313 |
-
table_x2 = table_region.polygon[2] # Right X
|
| 314 |
-
|
| 315 |
-
# Check for overlap with some tolerance
|
| 316 |
-
y_overlap = not (content_y2 < table_y1 - 10 or content_y1 > table_y2 + 10)
|
| 317 |
-
x_overlap = not (content_x2 < table_x1 - 10 or content_x1 > table_x2 + 10)
|
| 318 |
-
|
| 319 |
-
if y_overlap and x_overlap:
|
| 320 |
-
return True
|
| 321 |
-
|
| 322 |
-
return False
|
| 323 |
-
|
| 324 |
-
def _is_content_in_table_by_line(self, line, table_regions):
|
| 325 |
-
"""Check if line content overlaps with any table region"""
|
| 326 |
-
if not table_regions or not line.polygon:
|
| 327 |
-
return False
|
| 328 |
-
|
| 329 |
-
line_y1 = line.polygon[1] # Top Y
|
| 330 |
-
line_y2 = line.polygon[5] # Bottom Y
|
| 331 |
-
line_x1 = line.polygon[0] # Left X
|
| 332 |
-
line_x2 = line.polygon[2] # Right X
|
| 333 |
-
|
| 334 |
-
for table_region in table_regions:
|
| 335 |
-
if not table_region.polygon:
|
| 336 |
-
continue
|
| 337 |
-
|
| 338 |
-
table_y1 = table_region.polygon[1] # Top Y
|
| 339 |
-
table_y2 = table_region.polygon[5] # Bottom Y
|
| 340 |
-
table_x1 = table_region.polygon[0] # Left X
|
| 341 |
-
table_x2 = table_region.polygon[2] # Right X
|
| 342 |
-
|
| 343 |
-
# Check for overlap with tolerance
|
| 344 |
-
y_overlap = not (line_y2 < table_y1 - 10 or line_y1 > table_y2 + 10)
|
| 345 |
-
x_overlap = not (line_x2 < table_x1 - 10 or line_x1 > table_x2 + 10)
|
| 346 |
-
|
| 347 |
-
if y_overlap and x_overlap:
|
| 348 |
-
return True
|
| 349 |
-
|
| 350 |
-
return False
|
| 351 |
-
|
| 352 |
-
def _format_table_enhanced(self, table) -> str:
|
| 353 |
-
"""Enhanced table formatting with better structure"""
|
| 354 |
-
if not table.cells:
|
| 355 |
-
return ""
|
| 356 |
-
|
| 357 |
-
# Create matrix
|
| 358 |
-
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 359 |
-
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 360 |
-
|
| 361 |
-
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 362 |
-
|
| 363 |
-
# Fill matrix with cell content
|
| 364 |
-
for cell in table.cells:
|
| 365 |
-
content = (cell.content or "").strip()
|
| 366 |
-
table_matrix[cell.row_index][cell.column_index] = content
|
| 367 |
-
|
| 368 |
-
# Calculate column widths
|
| 369 |
-
col_widths = [0] * max_col
|
| 370 |
-
for row in table_matrix:
|
| 371 |
-
for col_idx, cell in enumerate(row):
|
| 372 |
-
col_widths[col_idx] = max(col_widths[col_idx], len(cell))
|
| 373 |
-
|
| 374 |
-
# Format as aligned table
|
| 375 |
-
formatted_rows = []
|
| 376 |
-
for row_idx, row in enumerate(table_matrix):
|
| 377 |
-
formatted_cells = []
|
| 378 |
-
for col_idx, cell in enumerate(row):
|
| 379 |
-
width = max(col_widths[col_idx], 3) # Minimum width
|
| 380 |
-
formatted_cells.append(cell.ljust(width))
|
| 381 |
-
|
| 382 |
-
formatted_row = " | ".join(formatted_cells)
|
| 383 |
-
formatted_rows.append(formatted_row)
|
| 384 |
-
|
| 385 |
-
# Add separator after header row
|
| 386 |
-
if row_idx == 0 and max_row > 1:
|
| 387 |
-
separator = " | ".join(["-" * max(col_widths[i], 3) for i in range(max_col)])
|
| 388 |
-
formatted_rows.append(separator)
|
| 389 |
-
|
| 390 |
-
return "\n".join(formatted_rows)
|
| 391 |
-
|
| 392 |
-
def _format_azure_result(self, analysis_result) -> str:
|
| 393 |
-
"""Format Azure Document Intelligence result preserving layout"""
|
| 394 |
-
formatted_text = []
|
| 395 |
-
|
| 396 |
-
if analysis_result.pages:
|
| 397 |
-
for page_num, page in enumerate(analysis_result.pages, 1):
|
| 398 |
-
formatted_text.append(f"\n--- Page {page_num} ---\n")
|
| 399 |
-
|
| 400 |
-
# Sort lines by vertical position for better reading order
|
| 401 |
-
if page.lines:
|
| 402 |
-
sorted_lines = sorted(page.lines, key=lambda line: (
|
| 403 |
-
line.polygon[1] if line.polygon else 0, # Y coordinate
|
| 404 |
-
line.polygon[0] if line.polygon else 0 # X coordinate
|
| 405 |
-
))
|
| 406 |
-
|
| 407 |
-
for line in sorted_lines:
|
| 408 |
-
formatted_text.append(line.content)
|
| 409 |
-
|
| 410 |
-
# Add tables if present
|
| 411 |
-
if analysis_result.tables:
|
| 412 |
-
page_tables = [t for t in analysis_result.tables if any(
|
| 413 |
-
cell.bounding_regions and
|
| 414 |
-
cell.bounding_regions[0].page_number == page_num
|
| 415 |
-
for cell in t.cells
|
| 416 |
-
)]
|
| 417 |
-
|
| 418 |
-
for table_idx, table in enumerate(page_tables):
|
| 419 |
-
formatted_text.append(f"\n--- Table {table_idx + 1} ---")
|
| 420 |
-
formatted_text.append(self._format_table(table))
|
| 421 |
-
|
| 422 |
-
return '\n'.join(formatted_text)
|
| 423 |
-
|
| 424 |
-
def _format_table(self, table) -> str:
|
| 425 |
-
"""Format table from Azure Document Intelligence"""
|
| 426 |
-
if not table.cells:
|
| 427 |
-
return ""
|
| 428 |
-
|
| 429 |
-
# Create matrix
|
| 430 |
-
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 431 |
-
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 432 |
-
|
| 433 |
-
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 434 |
-
|
| 435 |
-
for cell in table.cells:
|
| 436 |
-
table_matrix[cell.row_index][cell.column_index] = cell.content or ""
|
| 437 |
-
|
| 438 |
-
# Format as text table
|
| 439 |
-
formatted_rows = []
|
| 440 |
-
for row in table_matrix:
|
| 441 |
-
formatted_rows.append(" | ".join(row))
|
| 442 |
-
|
| 443 |
-
return "\n".join(formatted_rows)
|
| 444 |
-
|
| 445 |
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 446 |
-
"""Tesseract OCR with
|
| 447 |
result = {
|
| 448 |
'success': False,
|
| 449 |
'text': '',
|
|
|
|
| 450 |
'method_used': 'tesseract',
|
| 451 |
'metadata': {},
|
| 452 |
'error': None
|
|
@@ -458,57 +895,72 @@ class OCRService:
|
|
| 458 |
|
| 459 |
pdf_document = None
|
| 460 |
try:
|
| 461 |
-
# Convert PDF to images
|
| 462 |
pdf_document = fitz.open(pdf_path)
|
| 463 |
-
page_count = len(pdf_document)
|
| 464 |
all_text = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
for page_num in range(page_count):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
page = pdf_document.load_page(page_num)
|
| 468 |
|
| 469 |
# Render page to image
|
| 470 |
-
mat = fitz.Matrix(2.0, 2.0)
|
| 471 |
pix = page.get_pixmap(matrix=mat)
|
| 472 |
img_data = pix.tobytes("png")
|
| 473 |
|
| 474 |
-
# Convert to PIL Image
|
| 475 |
temp_img_path = None
|
| 476 |
try:
|
| 477 |
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
|
| 478 |
temp_img.write(img_data)
|
| 479 |
temp_img_path = temp_img.name
|
| 480 |
|
| 481 |
-
# Preprocess image for better OCR
|
| 482 |
processed_img = self._preprocess_image(temp_img_path)
|
| 483 |
|
| 484 |
-
# OCR with custom config
|
| 485 |
custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
|
| 486 |
text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
|
| 487 |
|
| 488 |
-
all_text.append(f"\n--- Page {page_num + 1} ---\n")
|
| 489 |
all_text.append(text)
|
| 490 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
finally:
|
| 492 |
-
# Clean up temp image file
|
| 493 |
if temp_img_path and os.path.exists(temp_img_path):
|
| 494 |
try:
|
| 495 |
os.unlink(temp_img_path)
|
| 496 |
except:
|
| 497 |
pass
|
| 498 |
|
|
|
|
|
|
|
| 499 |
result.update({
|
| 500 |
'success': True,
|
| 501 |
'text': '\n'.join(all_text),
|
| 502 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
})
|
| 504 |
|
| 505 |
-
logger.info("Tesseract OCR completed successfully")
|
| 506 |
|
| 507 |
except Exception as e:
|
| 508 |
logger.error(f"Tesseract OCR error: {e}")
|
| 509 |
result['error'] = f"Tesseract OCR error: {e}"
|
| 510 |
finally:
|
| 511 |
-
# FIXED: Ensure document is properly closed
|
| 512 |
if pdf_document is not None:
|
| 513 |
try:
|
| 514 |
pdf_document.close()
|
|
@@ -517,27 +969,12 @@ class OCRService:
|
|
| 517 |
|
| 518 |
return result
|
| 519 |
|
| 520 |
-
def _preprocess_image(self, image_path: str) -> np.ndarray:
|
| 521 |
-
"""Preprocess image for better OCR accuracy"""
|
| 522 |
-
# Read image
|
| 523 |
-
img = cv2.imread(image_path)
|
| 524 |
-
|
| 525 |
-
# Convert to grayscale
|
| 526 |
-
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
| 527 |
-
|
| 528 |
-
# Noise removal
|
| 529 |
-
denoised = cv2.medianBlur(gray, 3)
|
| 530 |
-
|
| 531 |
-
# Threshold to get binary image
|
| 532 |
-
_, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 533 |
-
|
| 534 |
-
return binary
|
| 535 |
-
|
| 536 |
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 537 |
-
"""PyMuPDF text extraction
|
| 538 |
result = {
|
| 539 |
'success': False,
|
| 540 |
'text': '',
|
|
|
|
| 541 |
'method_used': 'pymupdf',
|
| 542 |
'metadata': {},
|
| 543 |
'error': None
|
|
@@ -546,29 +983,50 @@ class OCRService:
|
|
| 546 |
pdf_document = None
|
| 547 |
try:
|
| 548 |
pdf_document = fitz.open(pdf_path)
|
| 549 |
-
page_count = len(pdf_document)
|
| 550 |
all_text = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
for page_num in range(page_count):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
page = pdf_document.load_page(page_num)
|
| 554 |
text = page.get_text()
|
| 555 |
|
| 556 |
-
all_text.append(f"\n--- Page {page_num + 1} ---\n")
|
| 557 |
all_text.append(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
result.update({
|
| 560 |
'success': True,
|
| 561 |
'text': '\n'.join(all_text),
|
| 562 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
})
|
| 564 |
|
| 565 |
-
logger.info("PyMuPDF extraction completed successfully")
|
| 566 |
|
| 567 |
except Exception as e:
|
| 568 |
logger.error(f"PyMuPDF error: {e}")
|
| 569 |
result['error'] = f"PyMuPDF error: {e}"
|
| 570 |
finally:
|
| 571 |
-
# FIXED: Ensure document is properly closed
|
| 572 |
if pdf_document is not None:
|
| 573 |
try:
|
| 574 |
pdf_document.close()
|
|
@@ -577,11 +1035,18 @@ class OCRService:
|
|
| 577 |
|
| 578 |
return result
|
| 579 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
|
| 581 |
"""Try fallback OCR methods"""
|
| 582 |
fallback_methods = []
|
| 583 |
|
| 584 |
-
# Order of fallback preference
|
| 585 |
if exclude_method != "azure" and self.azure_client:
|
| 586 |
fallback_methods.append("azure")
|
| 587 |
if exclude_method != "tesseract" and self._check_tesseract_available():
|
|
@@ -593,7 +1058,7 @@ class OCRService:
|
|
| 593 |
logger.info(f"Trying fallback method: {method}")
|
| 594 |
try:
|
| 595 |
if method == "azure":
|
| 596 |
-
result = self.
|
| 597 |
elif method == "tesseract":
|
| 598 |
result = self._tesseract_ocr(pdf_path)
|
| 599 |
elif method == "pymupdf":
|
|
@@ -610,6 +1075,7 @@ class OCRService:
|
|
| 610 |
return {
|
| 611 |
'success': False,
|
| 612 |
'text': '',
|
|
|
|
| 613 |
'method_used': 'all_methods_failed',
|
| 614 |
'metadata': {},
|
| 615 |
'error': 'All OCR methods failed'
|
|
@@ -633,6 +1099,6 @@ class OCRService:
|
|
| 633 |
methods.append("azure")
|
| 634 |
if self._check_tesseract_available():
|
| 635 |
methods.append("tesseract")
|
| 636 |
-
methods.append("pymupdf")
|
| 637 |
|
| 638 |
return methods
|
|
|
|
| 1 |
"""
|
| 2 |
+
OCR Service Module - FIXED VERSION with Improved Text Formatting and Page Numbers
|
| 3 |
+
Handles PDF to text conversion with proper indentation, spacing, and page numbering
|
| 4 |
"""
|
| 5 |
import re
|
| 6 |
import os
|
| 7 |
import logging
|
| 8 |
+
from typing import Optional, Dict, Any, Tuple, List
|
| 9 |
import tempfile
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
|
|
| 35 |
logger = logging.getLogger(__name__)
|
| 36 |
|
| 37 |
|
| 38 |
+
class HTMLProcessor:
|
| 39 |
+
"""Process OCR results through HTML for better formatting preservation - FIXED VERSION"""
|
| 40 |
+
|
| 41 |
+
@staticmethod
def create_html_from_azure_result(analysis_result) -> str:
    """Create structured HTML from Azure Document Intelligence result with proper spacing and page numbers.

    The HTML is an intermediate representation: it is converted back to
    plain text later by html_to_formatted_text(), so the CSS below only
    matters when the HTML itself is viewed directly.

    Args:
        analysis_result: Azure Document Intelligence analyze result;
            reads .pages, .tables and (optionally) .paragraphs.

    Returns:
        A complete HTML document as a single string, one ``.page`` div
        per PDF page, content ordered top-to-bottom then left-to-right.
    """
    html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8">']
    html_parts.append('<style>')
    # Monospace + pre-wrap so the OCR'd spacing survives in the HTML view.
    html_parts.append('''
        body {
            font-family: 'Consolas', 'Courier New', monospace;
            line-height: 1.6;
            margin: 20px;
            white-space: pre-wrap;
            font-size: 11pt;
            background-color: #fafafa;
        }
        .page {
            margin-bottom: 30px;
            border: 1px solid #ddd;
            padding: 20px;
            background-color: white;
            border-radius: 5px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }
        .page-header {
            font-weight: bold;
            color: #2c3e50;
            margin-bottom: 15px;
            text-align: center;
            border-bottom: 2px solid #3498db;
            padding-bottom: 8px;
            font-size: 14pt;
            text-transform: uppercase;
            letter-spacing: 1px;
        }
        .paragraph {
            margin-bottom: 0.8em;
            white-space: pre-wrap;
            font-family: 'Consolas', 'Courier New', monospace;
            line-height: 1.4;
        }
        .title {
            font-size: 1.4em;
            font-weight: bold;
            margin: 15px 0 12px 0;
            color: #2c3e50;
            border-left: 4px solid #3498db;
            padding-left: 10px;
        }
        .section-heading {
            font-size: 1.2em;
            font-weight: bold;
            margin: 12px 0 8px 0;
            color: #34495e;
            border-left: 3px solid #95a5a6;
            padding-left: 8px;
        }
        .table-container {
            margin: 15px 0;
            font-family: 'Consolas', 'Courier New', monospace;
            background-color: #f8f9fa;
            padding: 10px;
            border-radius: 5px;
            border: 1px solid #dee2e6;
        }
        .table {
            border-collapse: collapse;
            width: 100%;
            margin: 8px 0;
            font-family: 'Consolas', 'Courier New', monospace;
            font-size: 10pt;
            background-color: white;
        }
        .table th, .table td {
            border: 1px solid #bdc3c7;
            padding: 6px 10px;
            text-align: left;
            white-space: pre-wrap;
            vertical-align: top;
        }
        .table th {
            background-color: #ecf0f1;
            font-weight: bold;
            color: #2c3e50;
        }
        .table tr:nth-child(even) {
            background-color: #f8f9fa;
        }
        .indented {
            display: inline-block;
            white-space: pre-wrap;
        }
        .bullet-point {
            position: relative;
            padding-left: 1.2em;
            margin-bottom: 0.3em;
        }
        .bullet-point:before {
            content: "•";
            position: absolute;
            left: 0;
            color: #3498db;
            font-weight: bold;
        }
        .spaced {
            margin-top: 10px;
        }
        .page-number {
            position: relative;
            float: right;
            background-color: #3498db;
            color: white;
            padding: 2px 8px;
            border-radius: 3px;
            font-size: 9pt;
            margin-top: -5px;
        }
    ''')
    html_parts.append('</style></head><body>')

    # No pages at all -> emit a minimal, still well-formed document.
    if not analysis_result.pages:
        html_parts.append('<p>No content found</p></body></html>')
        return '\n'.join(html_parts)

    for page_num, page in enumerate(analysis_result.pages, 1):
        html_parts.append(f'<div class="page">')
        html_parts.append(f'<div class="page-header">Page {page_num} <span class="page-number">{page_num}</span></div>')

        # Process content with proper ordering and spacing preservation.
        # Sorting by (y, x) approximates natural reading order.
        content_items = HTMLProcessor._extract_page_content(page, analysis_result, page_num)
        content_items.sort(key=lambda x: (x['y_pos'], x['x_pos']))

        # Generate HTML for each content item with preserved spacing
        for item in content_items:
            if item['type'] == 'table':
                html_parts.append(HTMLProcessor._table_to_html(item['content'], item['table_idx']))
            else:
                html_parts.append(HTMLProcessor._text_to_html(item))

        html_parts.append('</div>')

    html_parts.append('</body></html>')
    return '\n'.join(html_parts)
|
| 182 |
+
|
| 183 |
+
@staticmethod
def _extract_page_content(page, analysis_result, page_num):
    """Extract and organize page content without losing text with proper spacing.

    Collects, for one page: table items (positioned by their bounding
    polygon) plus text items taken from ``analysis_result.paragraphs``
    when present, otherwise from ``page.lines``. Text that overlaps a
    table region by 70% or more is dropped, since the table renderer
    already emits that content.

    Returns a list of dicts with at least 'type', 'y_pos' and 'x_pos'
    keys, ready to be sorted into reading order by the caller.
    """
    content_items = []

    # First, collect all tables for this page
    page_tables = []
    table_regions = []

    if analysis_result.tables:
        for table_idx, table in enumerate(analysis_result.tables):
            if HTMLProcessor._is_table_on_page(table, page_num):
                page_tables.append((table_idx, table))
                # Store table regions for overlap detection
                if table.bounding_regions:
                    table_regions.append({
                        'polygon': table.bounding_regions[0].polygon,
                        'table_idx': table_idx
                    })

    # Add table items to content
    for table_idx, table in page_tables:
        if table.bounding_regions and table.bounding_regions[0].polygon:
            polygon = table.bounding_regions[0].polygon
            y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7])  # Top Y
            x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])  # Left X

            content_items.append({
                'type': 'table',
                'content': table,
                'table_idx': table_idx,
                'y_pos': y_pos,
                'x_pos': x_pos
            })

    # Calculate page margins for proper indentation detection
    page_left_margin = HTMLProcessor._calculate_page_margins(page, analysis_result, page_num)

    # Process text content - use paragraphs if available, otherwise lines
    if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
        # Use paragraphs (better content grouping)
        page_paragraphs = [p for p in analysis_result.paragraphs if
                           p.bounding_regions and
                           p.bounding_regions[0].page_number == page_num]

        for para in page_paragraphs:
            if para.content.strip():
                # Check if this paragraph overlaps significantly with any table
                overlap_ratio = HTMLProcessor._calculate_table_overlap(para, table_regions)

                # Only exclude if heavily overlapping (>70%) with a table
                if overlap_ratio < 0.7:
                    polygon = para.bounding_regions[0].polygon
                    y_pos = min(polygon[1], polygon[3], polygon[5], polygon[7]) if polygon else 0
                    x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6]) if polygon else 0

                    # Calculate proper indentation based on page margins
                    indent_info = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, para.content)

                    content_items.append({
                        'type': 'paragraph',
                        'content': para.content.strip(),
                        'role': getattr(para, 'role', 'paragraph'),
                        'y_pos': y_pos,
                        'x_pos': x_pos,
                        'indent_level': indent_info['level'],
                        'indent_pixels': indent_info['pixels'],
                        'is_bullet': indent_info['is_bullet'],
                        'preserve_spacing': True
                    })

    elif page.lines:
        # Use lines as fallback with enhanced spacing preservation
        processed_lines = HTMLProcessor._process_lines_content_with_spacing(page.lines, table_regions, page_left_margin)
        content_items.extend(processed_lines)

    return content_items
|
| 260 |
+
|
| 261 |
+
@staticmethod
|
| 262 |
+
def _is_table_on_page(table, page_num):
|
| 263 |
+
"""Check if table belongs to the specified page"""
|
| 264 |
+
if not table.cells:
|
| 265 |
+
return False
|
| 266 |
+
|
| 267 |
+
for cell in table.cells:
|
| 268 |
+
if (cell.bounding_regions and
|
| 269 |
+
cell.bounding_regions[0].page_number == page_num):
|
| 270 |
+
return True
|
| 271 |
+
return False
|
| 272 |
+
|
| 273 |
+
@staticmethod
|
| 274 |
+
def _calculate_table_overlap(content_item, table_regions):
|
| 275 |
+
"""Calculate overlap ratio between content and tables (FIXED)"""
|
| 276 |
+
if not table_regions or not content_item.bounding_regions:
|
| 277 |
+
return 0.0
|
| 278 |
+
|
| 279 |
+
content_polygon = content_item.bounding_regions[0].polygon
|
| 280 |
+
if not content_polygon or len(content_polygon) < 8:
|
| 281 |
+
return 0.0
|
| 282 |
+
|
| 283 |
+
# Content bounding box
|
| 284 |
+
content_x1 = min(content_polygon[0], content_polygon[2], content_polygon[4], content_polygon[6])
|
| 285 |
+
content_x2 = max(content_polygon[0], content_polygon[2], content_polygon[4], content_polygon[6])
|
| 286 |
+
content_y1 = min(content_polygon[1], content_polygon[3], content_polygon[5], content_polygon[7])
|
| 287 |
+
content_y2 = max(content_polygon[1], content_polygon[3], content_polygon[5], content_polygon[7])
|
| 288 |
+
|
| 289 |
+
content_area = (content_x2 - content_x1) * (content_y2 - content_y1)
|
| 290 |
+
if content_area <= 0:
|
| 291 |
+
return 0.0
|
| 292 |
+
|
| 293 |
+
max_overlap_ratio = 0.0
|
| 294 |
+
|
| 295 |
+
for table_region in table_regions:
|
| 296 |
+
table_polygon = table_region['polygon']
|
| 297 |
+
if not table_polygon or len(table_polygon) < 8:
|
| 298 |
+
continue
|
| 299 |
+
|
| 300 |
+
# Table bounding box
|
| 301 |
+
table_x1 = min(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 302 |
+
table_x2 = max(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 303 |
+
table_y1 = min(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 304 |
+
table_y2 = max(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 305 |
+
|
| 306 |
+
# Calculate intersection
|
| 307 |
+
intersect_x1 = max(content_x1, table_x1)
|
| 308 |
+
intersect_x2 = min(content_x2, table_x2)
|
| 309 |
+
intersect_y1 = max(content_y1, table_y1)
|
| 310 |
+
intersect_y2 = min(content_y2, table_y2)
|
| 311 |
+
|
| 312 |
+
if intersect_x2 > intersect_x1 and intersect_y2 > intersect_y1:
|
| 313 |
+
intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
|
| 314 |
+
overlap_ratio = intersect_area / content_area
|
| 315 |
+
max_overlap_ratio = max(max_overlap_ratio, overlap_ratio)
|
| 316 |
+
|
| 317 |
+
return max_overlap_ratio
|
| 318 |
+
|
| 319 |
+
@staticmethod
|
| 320 |
+
def _calculate_page_margins(page, analysis_result, page_num):
|
| 321 |
+
"""Calculate page margins to determine proper indentation baseline"""
|
| 322 |
+
left_positions = []
|
| 323 |
+
|
| 324 |
+
# Collect x positions from paragraphs if available
|
| 325 |
+
if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs:
|
| 326 |
+
page_paragraphs = [p for p in analysis_result.paragraphs if
|
| 327 |
+
p.bounding_regions and
|
| 328 |
+
p.bounding_regions[0].page_number == page_num]
|
| 329 |
+
|
| 330 |
+
for para in page_paragraphs:
|
| 331 |
+
if para.bounding_regions and para.bounding_regions[0].polygon:
|
| 332 |
+
polygon = para.bounding_regions[0].polygon
|
| 333 |
+
x_pos = min(polygon[0], polygon[2], polygon[4], polygon[6])
|
| 334 |
+
left_positions.append(x_pos)
|
| 335 |
+
|
| 336 |
+
# Fallback to lines if no paragraphs
|
| 337 |
+
elif page.lines:
|
| 338 |
+
for line in page.lines:
|
| 339 |
+
if line.polygon:
|
| 340 |
+
x_pos = min(line.polygon[0], line.polygon[2], line.polygon[4], line.polygon[6])
|
| 341 |
+
left_positions.append(x_pos)
|
| 342 |
+
|
| 343 |
+
# Find the most common left margin (baseline)
|
| 344 |
+
if left_positions:
|
| 345 |
+
left_positions.sort()
|
| 346 |
+
# Take the most frequent left position as the main margin
|
| 347 |
+
from collections import Counter
|
| 348 |
+
position_counts = Counter([round(pos, -1) for pos in left_positions]) # Round to nearest 10
|
| 349 |
+
base_margin = position_counts.most_common(1)[0][0]
|
| 350 |
+
return base_margin
|
| 351 |
+
|
| 352 |
+
return 50 # Default margin if no content found
|
| 353 |
+
|
| 354 |
+
@staticmethod
|
| 355 |
+
def _calculate_precise_indentation(x_pos, base_margin, content):
|
| 356 |
+
"""Calculate precise indentation based on x position and content analysis"""
|
| 357 |
+
# Calculate indent distance from base margin
|
| 358 |
+
indent_distance = max(0, x_pos - base_margin)
|
| 359 |
+
|
| 360 |
+
# Define indentation levels based on distance
|
| 361 |
+
# Each level represents approximately 0.5 inch or 36 points
|
| 362 |
+
level_threshold = 30 # Reduced threshold for better sensitivity
|
| 363 |
+
indent_level = int(indent_distance / level_threshold)
|
| 364 |
+
|
| 365 |
+
# Detect bullet points or numbered lists
|
| 366 |
+
is_bullet = False
|
| 367 |
+
content_stripped = content.strip()
|
| 368 |
+
|
| 369 |
+
# Common bullet point patterns
|
| 370 |
+
bullet_patterns = [
|
| 371 |
+
r'^\s*[•·▪▫◦‣⁃]\s+', # Bullet symbols
|
| 372 |
+
r'^\s*[\-\*\+]\s+', # Dash, asterisk, plus
|
| 373 |
+
r'^\s*\d+[\.\)]\s+', # Numbered lists (1. or 1))
|
| 374 |
+
r'^\s*[a-zA-Z][\.\)]\s+', # Lettered lists (a. or a))
|
| 375 |
+
r'^\s*[ivxlcdm]+[\.\)]\s+', # Roman numerals
|
| 376 |
+
]
|
| 377 |
+
|
| 378 |
+
for pattern in bullet_patterns:
|
| 379 |
+
if re.match(pattern, content_stripped, re.IGNORECASE):
|
| 380 |
+
is_bullet = True
|
| 381 |
+
break
|
| 382 |
+
|
| 383 |
+
return {
|
| 384 |
+
'level': min(indent_level, 6), # Cap at level 6
|
| 385 |
+
'pixels': indent_distance,
|
| 386 |
+
'is_bullet': is_bullet
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
@staticmethod
def _process_lines_content_with_spacing(lines, table_regions, page_left_margin):
    """Build positioned content items from raw OCR lines.

    De-duplicates repeated lines (case-insensitive) and drops lines that
    sit mostly (>= 70%) inside a detected table region, since tables are
    rendered separately.
    """
    items = []
    seen = set()

    for line in lines:
        text = line.content.strip()
        if not text:
            continue

        # Skip exact duplicates (case-insensitive).
        key = text.lower()
        if key in seen:
            continue
        seen.add(key)

        # Lines heavily covered by a table are emitted by the table renderer.
        if HTMLProcessor._calculate_line_table_overlap(line, table_regions) >= 0.7:
            continue

        poly = line.polygon
        if poly:
            y_pos = min(poly[1], poly[3], poly[5], poly[7])
            x_pos = min(poly[0], poly[2], poly[4], poly[6])
        else:
            y_pos = x_pos = 0

        indent = HTMLProcessor._calculate_precise_indentation(x_pos, page_left_margin, line.content)

        items.append({
            'type': 'line',
            'content': text,
            'role': 'text',
            'y_pos': y_pos,
            'x_pos': x_pos,
            'indent_level': indent['level'],
            'indent_pixels': indent['pixels'],
            'is_bullet': indent['is_bullet'],
            'preserve_spacing': True,
        })

    return items
|
| 430 |
+
|
| 431 |
+
@staticmethod
|
| 432 |
+
def _calculate_line_table_overlap(line, table_regions):
|
| 433 |
+
"""Calculate overlap between line and tables"""
|
| 434 |
+
if not table_regions or not line.polygon:
|
| 435 |
+
return 0.0
|
| 436 |
+
|
| 437 |
+
line_polygon = line.polygon
|
| 438 |
+
if len(line_polygon) < 8:
|
| 439 |
+
return 0.0
|
| 440 |
+
|
| 441 |
+
# Line bounding box
|
| 442 |
+
line_x1 = min(line_polygon[0], line_polygon[2], line_polygon[4], line_polygon[6])
|
| 443 |
+
line_x2 = max(line_polygon[0], line_polygon[2], line_polygon[4], line_polygon[6])
|
| 444 |
+
line_y1 = min(line_polygon[1], line_polygon[3], line_polygon[5], line_polygon[7])
|
| 445 |
+
line_y2 = max(line_polygon[1], line_polygon[3], line_polygon[5], line_polygon[7])
|
| 446 |
+
|
| 447 |
+
line_area = (line_x2 - line_x1) * (line_y2 - line_y1)
|
| 448 |
+
if line_area <= 0:
|
| 449 |
+
return 0.0
|
| 450 |
+
|
| 451 |
+
max_overlap = 0.0
|
| 452 |
+
|
| 453 |
+
for table_region in table_regions:
|
| 454 |
+
table_polygon = table_region['polygon']
|
| 455 |
+
if not table_polygon or len(table_polygon) < 8:
|
| 456 |
+
continue
|
| 457 |
+
|
| 458 |
+
table_x1 = min(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 459 |
+
table_x2 = max(table_polygon[0], table_polygon[2], table_polygon[4], table_polygon[6])
|
| 460 |
+
table_y1 = min(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 461 |
+
table_y2 = max(table_polygon[1], table_polygon[3], table_polygon[5], table_polygon[7])
|
| 462 |
+
|
| 463 |
+
# Calculate intersection
|
| 464 |
+
intersect_x1 = max(line_x1, table_x1)
|
| 465 |
+
intersect_x2 = min(line_x2, table_x2)
|
| 466 |
+
intersect_y1 = max(line_y1, table_y1)
|
| 467 |
+
intersect_y2 = min(line_y2, table_y2)
|
| 468 |
+
|
| 469 |
+
if intersect_x2 > intersect_x1 and intersect_y2 > intersect_y1:
|
| 470 |
+
intersect_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
|
| 471 |
+
overlap_ratio = intersect_area / line_area
|
| 472 |
+
max_overlap = max(max_overlap, overlap_ratio)
|
| 473 |
+
|
| 474 |
+
return max_overlap
|
| 475 |
+
|
| 476 |
+
@staticmethod
|
| 477 |
+
def _text_to_html(item):
|
| 478 |
+
"""Convert text item to HTML with proper formatting and preserved spacing"""
|
| 479 |
+
content = item['content']
|
| 480 |
+
role = item.get('role', 'paragraph')
|
| 481 |
+
indent_level = item.get('indent_level', 0)
|
| 482 |
+
indent_pixels = item.get('indent_pixels', 0)
|
| 483 |
+
is_bullet = item.get('is_bullet', False)
|
| 484 |
+
preserve_spacing = item.get('preserve_spacing', False)
|
| 485 |
+
|
| 486 |
+
# Calculate CSS indentation
|
| 487 |
+
css_indent = max(0, indent_level)
|
| 488 |
+
|
| 489 |
+
# Build CSS classes and inline styles
|
| 490 |
+
css_classes = []
|
| 491 |
+
inline_styles = []
|
| 492 |
+
|
| 493 |
+
if css_indent > 0:
|
| 494 |
+
inline_styles.append(f"margin-left: {css_indent * 1.5}em")
|
| 495 |
+
css_classes.append("indented")
|
| 496 |
+
|
| 497 |
+
if is_bullet:
|
| 498 |
+
css_classes.append("bullet-point")
|
| 499 |
+
|
| 500 |
+
# Preserve internal spacing within content
|
| 501 |
+
if preserve_spacing:
|
| 502 |
+
# Replace multiple spaces with to preserve spacing
|
| 503 |
+
content = re.sub(r' +', lambda m: ' ' * len(m.group()), content)
|
| 504 |
+
# Preserve line breaks within content
|
| 505 |
+
content = content.replace('\n', '<br>')
|
| 506 |
+
|
| 507 |
+
# Combine CSS
|
| 508 |
+
class_str = f' class="{" ".join(css_classes)}"' if css_classes else ''
|
| 509 |
+
style_str = f' style="{"; ".join(inline_styles)}"' if inline_styles else ''
|
| 510 |
+
|
| 511 |
+
if role == 'title':
|
| 512 |
+
return f'<div class="title"{class_str}{style_str}>{content}</div>'
|
| 513 |
+
elif role == 'sectionHeading':
|
| 514 |
+
return f'<div class="section-heading"{class_str}{style_str}>{content}</div>'
|
| 515 |
+
else:
|
| 516 |
+
# Regular paragraphs with preserved formatting
|
| 517 |
+
return f'<div class="paragraph"{class_str}{style_str}>{content}</div>'
|
| 518 |
+
|
| 519 |
+
@staticmethod
|
| 520 |
+
def _table_to_html(table, table_idx):
|
| 521 |
+
"""Convert table to HTML with proper structure"""
|
| 522 |
+
if not table.cells:
|
| 523 |
+
return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
|
| 524 |
+
|
| 525 |
+
# Create table matrix
|
| 526 |
+
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 527 |
+
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 528 |
+
|
| 529 |
+
table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
|
| 530 |
+
|
| 531 |
+
# Fill matrix
|
| 532 |
+
for cell in table.cells:
|
| 533 |
+
content = (cell.content or "").strip()
|
| 534 |
+
table_matrix[cell.row_index][cell.column_index] = content
|
| 535 |
+
|
| 536 |
+
# Generate HTML
|
| 537 |
+
html_parts = [f'<div class="table-container">']
|
| 538 |
+
html_parts.append(f'<h4>Table {table_idx + 1}</h4>')
|
| 539 |
+
html_parts.append('<table class="table">')
|
| 540 |
+
|
| 541 |
+
for row_idx, row in enumerate(table_matrix):
|
| 542 |
+
if row_idx == 0 and any(cell.strip() for cell in row):
|
| 543 |
+
# Header row
|
| 544 |
+
html_parts.append('<tr>')
|
| 545 |
+
for cell in row:
|
| 546 |
+
html_parts.append(f'<th>{cell}</th>')
|
| 547 |
+
html_parts.append('</tr>')
|
| 548 |
+
else:
|
| 549 |
+
# Data row
|
| 550 |
+
if any(cell.strip() for cell in row): # Skip empty rows
|
| 551 |
+
html_parts.append('<tr>')
|
| 552 |
+
for cell in row:
|
| 553 |
+
html_parts.append(f'<td>{cell}</td>')
|
| 554 |
+
html_parts.append('</tr>')
|
| 555 |
+
|
| 556 |
+
html_parts.append('</table></div>')
|
| 557 |
+
return '\n'.join(html_parts)
|
| 558 |
+
|
| 559 |
+
@staticmethod
def html_to_formatted_text(html_content):
    """Convert HTML back to formatted text preserving structure, spacing, and adding page numbers.

    Walks the intermediate HTML produced by create_html_from_azure_result()
    with a stateful HTMLParser subclass and emits plain text: page banners
    ("=" rules with a centered PAGE N line), '##'/'###' prefixes for titles
    and section headings, indentation recovered from margin-left styles,
    and tables re-rendered as aligned ' | '-separated columns.
    """
    from html.parser import HTMLParser

    class FixedSpacingTextExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            # Accumulated output fragments, joined at the end.
            self.text_parts = []
            # Parser state flags for the element currently being read.
            self.in_title = False
            self.in_section_heading = False
            self.in_table = False
            self.in_table_header = False
            self.current_table_row = []
            self.table_data = []
            self.current_indent = 0
            self.preserve_spacing = False
            self.in_page_header = False
            self.current_page_num = 0

        def handle_starttag(self, tag, attrs):
            attr_dict = dict(attrs)
            class_attr = attr_dict.get('class', '')
            style_attr = attr_dict.get('style', '')

            if 'page-header' in class_attr:
                self.in_page_header = True
                # Add proper page separation with page number
                if len(self.text_parts) > 0:
                    self.text_parts.append('\n\n' + '=' * 80 + '\n')

            elif 'title' in class_attr:
                self.in_title = True
            elif 'section-heading' in class_attr:
                self.in_section_heading = True
            elif tag == 'table':
                self.in_table = True
                self.table_data = []
            elif tag == 'th':
                self.in_table_header = True
            elif tag == 'tr':
                self.current_table_row = []
            elif tag == 'br':
                self.text_parts.append('\n')

            # Extract indentation from style (margin-left in em units,
            # as written by _text_to_html).
            if 'margin-left' in style_attr:
                import re
                margin_match = re.search(r'margin-left:\s*(\d+(?:\.\d+)?)em', style_attr)
                if margin_match:
                    self.current_indent = int(float(margin_match.group(1)))
                else:
                    self.current_indent = 0
            else:
                # Count indented classes as fallback
                self.current_indent = class_attr.count('indented')

            # Check if we should preserve spacing
            self.preserve_spacing = 'paragraph' in class_attr or 'bullet-point' in class_attr

        def handle_endtag(self, tag):
            if tag == 'div' and self.in_page_header:
                self.text_parts.append('\n' + '=' * 80 + '\n\n')
                self.in_page_header = False
            elif tag == 'div' and self.in_title:
                self.text_parts.append('\n\n')
                self.in_title = False
            elif tag == 'div' and self.in_section_heading:
                self.text_parts.append('\n\n')
                self.in_section_heading = False
            elif tag == 'table':
                self.in_table = False
                self._format_table()
            elif tag == 'th':
                self.in_table_header = False
            elif tag == 'tr' and self.current_table_row:
                # Copy the row: the list object is reused for the next <tr>.
                self.table_data.append(self.current_table_row[:])
            elif tag == 'div' and not self.in_table and not self.in_title and not self.in_section_heading and not self.in_page_header:
                if not self.preserve_spacing:
                    self.text_parts.append('\n')

            # Reset indentation when closing div
            if tag == 'div':
                self.current_indent = 0
                self.preserve_spacing = False

        def handle_data(self, data):
            if data.strip():
                # NOTE(review): this replace is a no-op as written
                # (space -> space). Presumably it once mapped non-breaking
                # spaces back to plain spaces — confirm against the HTML
                # generator before changing it.
                data = data.replace(' ', ' ')

                if self.in_page_header:
                    # Extract page number and format properly
                    page_match = re.search(r'Page (\d+)', data)
                    if page_match:
                        self.current_page_num = int(page_match.group(1))
                        page_header = f"PAGE {self.current_page_num}"
                        self.text_parts.append(page_header.center(80))
                elif self.in_title:
                    indent_str = " " * self.current_indent
                    self.text_parts.append(f'\n{indent_str}## {data.strip()}')
                elif self.in_section_heading:
                    indent_str = " " * self.current_indent
                    self.text_parts.append(f'\n{indent_str}### {data.strip()}')
                elif self.in_table:
                    # current_table_row is initialized to a list and never
                    # set to None, so every table cell's text is collected.
                    if self.in_table_header or self.current_table_row is not None:
                        self.current_table_row.append(data.strip())
                else:
                    # Apply indentation and preserve internal spacing
                    indent_str = " " * self.current_indent

                    if self.preserve_spacing:
                        # Keep the exact spacing from the data
                        formatted_data = data
                    else:
                        # Clean up spacing for non-preserved content
                        formatted_data = re.sub(r'\s+', ' ', data).strip()

                    # NOTE(review): _last_class is never assigned in this
                    # parser, so this branch can never fire — confirm
                    # whether bullet re-prefixing is still wanted.
                    if 'bullet-point' in getattr(self, '_last_class', ''):
                        self.text_parts.append(f'{indent_str}• {formatted_data}')
                    else:
                        self.text_parts.append(f'{indent_str}{formatted_data}')

        def _format_table(self):
            # Emit the collected table_data as aligned text columns.
            if not self.table_data:
                return

            self.text_parts.append('\n\n')

            # Calculate column widths for better formatting
            if self.table_data:
                max_cols = max(len(row) for row in self.table_data)
                col_widths = [0] * max_cols

                for row in self.table_data:
                    for i, cell in enumerate(row):
                        if i < max_cols:
                            col_widths[i] = max(col_widths[i], len(str(cell)))

                # Ensure minimum column width
                col_widths = [max(width, 8) for width in col_widths]

                # Format rows with proper alignment
                for row_idx, row in enumerate(self.table_data):
                    formatted_cells = []
                    for i, cell in enumerate(row):
                        if i < max_cols:
                            width = col_widths[i]
                            formatted_cells.append(str(cell).ljust(width))

                    row_text = ' | '.join(formatted_cells)
                    self.text_parts.append(row_text)

                    # Add separator after header
                    if row_idx == 0 and len(self.table_data) > 1:
                        separator_cells = ['-' * col_widths[i] for i in range(max_cols)]
                        separator_text = ' | '.join(separator_cells)
                        self.text_parts.append(separator_text)

                    self.text_parts.append('\n')

            self.text_parts.append('\n')

    extractor = FixedSpacingTextExtractor()
    extractor.feed(html_content)

    result = ''.join(extractor.text_parts)

    # Clean up excessive newlines while preserving intentional spacing
    result = re.sub(r'\n{4,}', '\n\n\n', result)  # Max 3 consecutive newlines

    # Ensure proper spacing around page headers
    result = re.sub(r'(={80})\n*([A-Z ]+)\n*(={80})', r'\1\n\2\n\3', result)

    return result.strip()
|
| 736 |
+
|
| 737 |
+
|
| 738 |
class OCRService:
|
| 739 |
+
"""Main OCR service with HTML processing and improved table handling"""
|
| 740 |
|
| 741 |
def __init__(self):
|
| 742 |
self.azure_endpoint = os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
|
|
|
| 758 |
|
| 759 |
def convert_pdf_to_text(self, pdf_path: str, method: str = "auto") -> Dict[str, Any]:
|
| 760 |
"""
|
| 761 |
+
Convert PDF to text using specified method with HTML processing
|
| 762 |
|
| 763 |
Args:
|
| 764 |
pdf_path: Path to the PDF file
|
| 765 |
method: OCR method ('azure', 'tesseract', 'pymupdf', 'auto')
|
| 766 |
|
| 767 |
Returns:
|
| 768 |
+
Dict containing text content, HTML, metadata, and processing info
|
| 769 |
"""
|
| 770 |
result = {
|
| 771 |
'success': False,
|
| 772 |
'text': '',
|
| 773 |
+
'html': '',
|
| 774 |
'method_used': '',
|
| 775 |
'metadata': {},
|
| 776 |
'error': None
|
|
|
|
| 792 |
# Try primary method
|
| 793 |
try:
|
| 794 |
if method == "azure" and self.azure_client:
|
| 795 |
+
result = self._azure_ocr_with_html(pdf_path)
|
| 796 |
elif method == "tesseract":
|
| 797 |
result = self._tesseract_ocr(pdf_path)
|
| 798 |
elif method == "pymupdf":
|
|
|
|
| 811 |
|
| 812 |
return result
|
| 813 |
|
| 814 |
+
def _azure_ocr_with_html(self, pdf_path: str) -> Dict[str, Any]:
|
| 815 |
+
"""Azure Document Intelligence OCR with HTML processing"""
|
| 816 |
result = {
|
| 817 |
'success': False,
|
| 818 |
'text': '',
|
| 819 |
+
'html': '',
|
| 820 |
'method_used': 'azure_document_intelligence',
|
| 821 |
'metadata': {},
|
| 822 |
'error': None
|
|
|
|
| 826 |
with open(pdf_path, 'rb') as pdf_file:
|
| 827 |
file_content = pdf_file.read()
|
| 828 |
|
| 829 |
+
# Try different API call patterns
|
| 830 |
try:
|
|
|
|
| 831 |
poller = self.azure_client.begin_analyze_document(
|
| 832 |
"prebuilt-layout",
|
| 833 |
body=file_content,
|
|
|
|
| 835 |
)
|
| 836 |
except TypeError:
|
| 837 |
try:
|
|
|
|
| 838 |
poller = self.azure_client.begin_analyze_document(
|
| 839 |
model_id="prebuilt-layout",
|
| 840 |
body=file_content
|
| 841 |
)
|
| 842 |
except TypeError:
|
|
|
|
| 843 |
pdf_file.seek(0)
|
| 844 |
poller = self.azure_client.begin_analyze_document(
|
| 845 |
"prebuilt-layout",
|
|
|
|
| 848 |
|
| 849 |
analysis_result = poller.result()
|
| 850 |
|
| 851 |
+
# Generate HTML first
|
| 852 |
+
html_content = HTMLProcessor.create_html_from_azure_result(analysis_result)
|
| 853 |
+
|
| 854 |
+
# Convert HTML to formatted text with proper page numbers and spacing
|
| 855 |
+
formatted_text = HTMLProcessor.html_to_formatted_text(html_content)
|
| 856 |
|
| 857 |
result.update({
|
| 858 |
'success': True,
|
| 859 |
'text': formatted_text,
|
| 860 |
+
'html': html_content,
|
| 861 |
'metadata': {
|
| 862 |
'pages': len(analysis_result.pages) if analysis_result.pages else 0,
|
| 863 |
'tables': len(analysis_result.tables) if analysis_result.tables else 0,
|
| 864 |
'paragraphs': len(analysis_result.paragraphs) if hasattr(analysis_result, 'paragraphs') and analysis_result.paragraphs else 0,
|
| 865 |
'has_handwritten': any(style.is_handwritten for style in analysis_result.styles) if analysis_result.styles else False,
|
| 866 |
+
'html_generated': True,
|
| 867 |
+
'improved_formatting': True,
|
| 868 |
+
'page_numbers_added': True,
|
| 869 |
+
'azure_analysis': analysis_result
|
| 870 |
}
|
| 871 |
})
|
| 872 |
|
| 873 |
+
logger.info("Azure OCR with improved HTML processing completed successfully")
|
| 874 |
|
| 875 |
except Exception as e:
|
| 876 |
logger.error(f"Azure OCR error: {e}")
|
|
|
|
| 878 |
|
| 879 |
return result
|
| 880 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
def _tesseract_ocr(self, pdf_path: str) -> Dict[str, Any]:
|
| 882 |
+
"""Tesseract OCR with basic HTML generation and page numbers"""
|
| 883 |
result = {
|
| 884 |
'success': False,
|
| 885 |
'text': '',
|
| 886 |
+
'html': '',
|
| 887 |
'method_used': 'tesseract',
|
| 888 |
'metadata': {},
|
| 889 |
'error': None
|
|
|
|
| 895 |
|
| 896 |
pdf_document = None
|
| 897 |
try:
|
|
|
|
| 898 |
pdf_document = fitz.open(pdf_path)
|
| 899 |
+
page_count = len(pdf_document)
|
| 900 |
all_text = []
|
| 901 |
+
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 902 |
+
html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
|
| 903 |
+
html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
|
| 904 |
+
html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
|
| 905 |
+
html_parts.append('</style></head><body>')
|
| 906 |
|
| 907 |
for page_num in range(page_count):
|
| 908 |
+
# Add page header to text
|
| 909 |
+
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
| 910 |
+
all_text.append(page_header)
|
| 911 |
+
|
| 912 |
page = pdf_document.load_page(page_num)
|
| 913 |
|
| 914 |
# Render page to image
|
| 915 |
+
mat = fitz.Matrix(2.0, 2.0)
|
| 916 |
pix = page.get_pixmap(matrix=mat)
|
| 917 |
img_data = pix.tobytes("png")
|
| 918 |
|
|
|
|
| 919 |
temp_img_path = None
|
| 920 |
try:
|
| 921 |
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
|
| 922 |
temp_img.write(img_data)
|
| 923 |
temp_img_path = temp_img.name
|
| 924 |
|
|
|
|
| 925 |
processed_img = self._preprocess_image(temp_img_path)
|
| 926 |
|
|
|
|
| 927 |
custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
|
| 928 |
text = pytesseract.image_to_string(processed_img, config=custom_config, lang='eng')
|
| 929 |
|
|
|
|
| 930 |
all_text.append(text)
|
| 931 |
|
| 932 |
+
# Add to HTML with page number
|
| 933 |
+
html_parts.append(f'<div class="page">')
|
| 934 |
+
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 935 |
+
html_parts.append(f'<pre>{text}</pre></div>')
|
| 936 |
+
|
| 937 |
finally:
|
|
|
|
| 938 |
if temp_img_path and os.path.exists(temp_img_path):
|
| 939 |
try:
|
| 940 |
os.unlink(temp_img_path)
|
| 941 |
except:
|
| 942 |
pass
|
| 943 |
|
| 944 |
+
html_parts.append('</body></html>')
|
| 945 |
+
|
| 946 |
result.update({
|
| 947 |
'success': True,
|
| 948 |
'text': '\n'.join(all_text),
|
| 949 |
+
'html': '\n'.join(html_parts),
|
| 950 |
+
'metadata': {
|
| 951 |
+
'pages': page_count,
|
| 952 |
+
'html_generated': True,
|
| 953 |
+
'page_numbers_added': True,
|
| 954 |
+
'improved_formatting': True
|
| 955 |
+
}
|
| 956 |
})
|
| 957 |
|
| 958 |
+
logger.info("Tesseract OCR with improved formatting completed successfully")
|
| 959 |
|
| 960 |
except Exception as e:
|
| 961 |
logger.error(f"Tesseract OCR error: {e}")
|
| 962 |
result['error'] = f"Tesseract OCR error: {e}"
|
| 963 |
finally:
|
|
|
|
| 964 |
if pdf_document is not None:
|
| 965 |
try:
|
| 966 |
pdf_document.close()
|
|
|
|
| 969 |
|
| 970 |
return result
|
| 971 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
def _pymupdf_extract(self, pdf_path: str) -> Dict[str, Any]:
|
| 973 |
+
"""PyMuPDF text extraction with HTML generation and page numbers"""
|
| 974 |
result = {
|
| 975 |
'success': False,
|
| 976 |
'text': '',
|
| 977 |
+
'html': '',
|
| 978 |
'method_used': 'pymupdf',
|
| 979 |
'metadata': {},
|
| 980 |
'error': None
|
|
|
|
| 983 |
pdf_document = None
|
| 984 |
try:
|
| 985 |
pdf_document = fitz.open(pdf_path)
|
| 986 |
+
page_count = len(pdf_document)
|
| 987 |
all_text = []
|
| 988 |
+
html_parts = ['<!DOCTYPE html><html><head><meta charset="UTF-8"><style>']
|
| 989 |
+
html_parts.append('body { font-family: "Consolas", monospace; line-height: 1.6; margin: 20px; }')
|
| 990 |
+
html_parts.append('.page { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; }')
|
| 991 |
+
html_parts.append('.page-header { font-weight: bold; text-align: center; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 15px; }')
|
| 992 |
+
html_parts.append('</style></head><body>')
|
| 993 |
|
| 994 |
for page_num in range(page_count):
|
| 995 |
+
# Add page header to text
|
| 996 |
+
page_header = f"\n{'=' * 80}\n{'PAGE ' + str(page_num + 1).center(74)}\n{'=' * 80}\n\n"
|
| 997 |
+
all_text.append(page_header)
|
| 998 |
+
|
| 999 |
page = pdf_document.load_page(page_num)
|
| 1000 |
text = page.get_text()
|
| 1001 |
|
|
|
|
| 1002 |
all_text.append(text)
|
| 1003 |
+
|
| 1004 |
+
# Add to HTML with better formatting and page numbers
|
| 1005 |
+
html_parts.append(f'<div class="page">')
|
| 1006 |
+
html_parts.append(f'<div class="page-header">Page {page_num + 1}</div>')
|
| 1007 |
+
formatted_text = text.replace('\n', '<br>')
|
| 1008 |
+
html_parts.append(f'<div>{formatted_text}</div></div>')
|
| 1009 |
+
|
| 1010 |
+
html_parts.append('</body></html>')
|
| 1011 |
|
| 1012 |
result.update({
|
| 1013 |
'success': True,
|
| 1014 |
'text': '\n'.join(all_text),
|
| 1015 |
+
'html': '\n'.join(html_parts),
|
| 1016 |
+
'metadata': {
|
| 1017 |
+
'pages': page_count,
|
| 1018 |
+
'html_generated': True,
|
| 1019 |
+
'page_numbers_added': True,
|
| 1020 |
+
'improved_formatting': True
|
| 1021 |
+
}
|
| 1022 |
})
|
| 1023 |
|
| 1024 |
+
logger.info("PyMuPDF extraction with improved formatting completed successfully")
|
| 1025 |
|
| 1026 |
except Exception as e:
|
| 1027 |
logger.error(f"PyMuPDF error: {e}")
|
| 1028 |
result['error'] = f"PyMuPDF error: {e}"
|
| 1029 |
finally:
|
|
|
|
| 1030 |
if pdf_document is not None:
|
| 1031 |
try:
|
| 1032 |
pdf_document.close()
|
|
|
|
| 1035 |
|
| 1036 |
return result
|
| 1037 |
|
| 1038 |
+
def _preprocess_image(self, image_path: str) -> np.ndarray:
    """Preprocess a page image to improve OCR accuracy.

    Pipeline: load -> grayscale -> median denoise -> Otsu binarization.
    The binary output gives Tesseract high-contrast glyphs, and the
    3x3 median blur removes salt-and-pepper scan noise without
    blurring character edges.

    Args:
        image_path: Path to the image file rendered from a PDF page.

    Returns:
        A 2-D uint8 numpy array containing the binarized image
        (pixel values 0 or 255).

    Raises:
        ValueError: If the image cannot be read (missing file or
            unsupported format). Previously this surfaced as a cryptic
            cv2.error from cvtColor when imread returned None; callers
            already catch Exception, so this remains backward compatible.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image for preprocessing: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Median filter is robust to scanner speckle noise.
    denoised = cv2.medianBlur(gray, 3)
    # Otsu's method picks the binarization threshold automatically.
    _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
|
| 1045 |
+
|
| 1046 |
def _try_fallback_methods(self, pdf_path: str, exclude_method: str = None) -> Dict[str, Any]:
|
| 1047 |
"""Try fallback OCR methods"""
|
| 1048 |
fallback_methods = []
|
| 1049 |
|
|
|
|
| 1050 |
if exclude_method != "azure" and self.azure_client:
|
| 1051 |
fallback_methods.append("azure")
|
| 1052 |
if exclude_method != "tesseract" and self._check_tesseract_available():
|
|
|
|
| 1058 |
logger.info(f"Trying fallback method: {method}")
|
| 1059 |
try:
|
| 1060 |
if method == "azure":
|
| 1061 |
+
result = self._azure_ocr_with_html(pdf_path)
|
| 1062 |
elif method == "tesseract":
|
| 1063 |
result = self._tesseract_ocr(pdf_path)
|
| 1064 |
elif method == "pymupdf":
|
|
|
|
| 1075 |
return {
|
| 1076 |
'success': False,
|
| 1077 |
'text': '',
|
| 1078 |
+
'html': '',
|
| 1079 |
'method_used': 'all_methods_failed',
|
| 1080 |
'metadata': {},
|
| 1081 |
'error': 'All OCR methods failed'
|
|
|
|
| 1099 |
methods.append("azure")
|
| 1100 |
if self._check_tesseract_available():
|
| 1101 |
methods.append("tesseract")
|
| 1102 |
+
methods.append("pymupdf")
|
| 1103 |
|
| 1104 |
return methods
|
readme.md
CHANGED
|
@@ -1,231 +1,270 @@
|
|
| 1 |
# PDF OCR Service
|
| 2 |
|
| 3 |
-
A comprehensive PDF to text
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
-
-
|
| 8 |
-
-
|
| 9 |
-
-
|
| 10 |
-
-
|
| 11 |
-
-
|
| 12 |
-
-
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
##
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
3. **`ui.py`** - Gradio web interface for user interaction
|
| 21 |
-
|
| 22 |
-
## Quick Start
|
| 23 |
-
|
| 24 |
-
### 1. Install Dependencies
|
| 25 |
|
|
|
|
| 26 |
```bash
|
| 27 |
-
# Install Python dependencies
|
| 28 |
-
pip install -r requirements.txt
|
| 29 |
-
|
| 30 |
-
# Install system dependencies (Ubuntu/Debian)
|
| 31 |
sudo apt-get update
|
| 32 |
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 33 |
sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
#
|
|
|
|
| 36 |
brew install tesseract
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
# Download Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 40 |
-
# Add to PATH environment variable
|
| 41 |
```
|
| 42 |
|
| 43 |
-
###
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
```bash
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
```
|
| 52 |
|
| 53 |
-
|
| 54 |
-
- Set Azure Document Intelligence endpoint and key (for best quality)
|
| 55 |
-
- Adjust file size limits and server settings as needed
|
| 56 |
|
| 57 |
-
###
|
| 58 |
|
| 59 |
```bash
|
| 60 |
-
# Start the web interface
|
| 61 |
python app.py
|
| 62 |
-
|
| 63 |
-
# Or run individual components
|
| 64 |
-
python backend.py # Test backend functionality
|
| 65 |
-
python ocr_service.py # Test OCR service
|
| 66 |
```
|
| 67 |
|
| 68 |
-
The service will
|
| 69 |
|
| 70 |
-
##
|
| 71 |
|
| 72 |
-
1. **
|
| 73 |
-
|
| 74 |
-
-
|
| 75 |
-
-
|
| 76 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
- Navigate to "Keys and Endpoint" section
|
| 80 |
-
- Copy the endpoint URL and API key
|
| 81 |
-
- Add to your `.env` file:
|
| 82 |
-
```bash
|
| 83 |
-
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
| 84 |
-
AZURE_DOCUMENT_INTELLIGENCE_KEY=your-api-key-here
|
| 85 |
-
```
|
| 86 |
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
-
- **
|
| 91 |
-
- **
|
| 92 |
-
- **
|
| 93 |
-
- **Requirements**: Azure subscription and API key
|
| 94 |
|
| 95 |
-
###
|
| 96 |
-
- **
|
| 97 |
-
- **
|
| 98 |
-
- **
|
| 99 |
-
- **
|
| 100 |
|
| 101 |
-
###
|
| 102 |
-
- **Fast Processing**: Direct text extraction from digital PDFs
|
| 103 |
-
- **Features**: Fastest processing, embedded text extraction
|
| 104 |
-
- **Use Case**: Digital PDFs with embedded text
|
| 105 |
-
- **Requirements**: No additional setup needed
|
| 106 |
|
| 107 |
-
|
| 108 |
|
| 109 |
-
### Web Interface
|
| 110 |
-
1. Open `http://localhost:7860` in your browser
|
| 111 |
-
2. Upload a PDF file
|
| 112 |
-
3. Select OCR method (or use "auto")
|
| 113 |
-
4. Click "Process PDF"
|
| 114 |
-
5. Download extracted text
|
| 115 |
-
|
| 116 |
-
### Python API
|
| 117 |
```python
|
| 118 |
from backend import BackendManager
|
| 119 |
|
| 120 |
# Initialize backend
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
# Process PDF
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
if result['success']:
|
| 127 |
-
print("Extracted
|
| 128 |
-
print(result['
|
| 129 |
-
print(f"Method used: {result['method_used']}")
|
| 130 |
-
print(f"Pages: {result['metadata']['pages']}")
|
| 131 |
else:
|
| 132 |
-
print(
|
| 133 |
```
|
| 134 |
|
| 135 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
##
|
| 138 |
-
- `MAX_FILE_SIZE_MB`: Maximum file size (default: 50MB)
|
| 139 |
-
- `PROCESSING_TIMEOUT`: Processing timeout in seconds
|
| 140 |
-
- `MAX_CONCURRENT_TASKS`: Concurrent processing limit
|
| 141 |
|
| 142 |
-
###
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
| 146 |
|
| 147 |
-
###
|
| 148 |
-
-
|
| 149 |
-
-
|
| 150 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
## Troubleshooting
|
| 153 |
|
| 154 |
### Common Issues
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
### Performance Optimization
|
| 177 |
|
| 178 |
-
-
|
| 179 |
-
-
|
| 180 |
-
-
|
| 181 |
-
-
|
| 182 |
|
| 183 |
-
##
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
├── .env # Environment configuration
|
| 192 |
-
├── README.md # This file
|
| 193 |
-
├── logs/ # Log files (created automatically)
|
| 194 |
-
├── temp/ # Temporary files (created automatically)
|
| 195 |
-
└── cache/ # Cache directory (optional)
|
| 196 |
-
```
|
| 197 |
|
| 198 |
-
##
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
-
|
| 201 |
-
-
|
| 202 |
-
-
|
| 203 |
-
-
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
## Contributing
|
| 207 |
|
| 208 |
1. Fork the repository
|
| 209 |
2. Create a feature branch
|
| 210 |
3. Make your changes
|
| 211 |
-
4. Add tests
|
| 212 |
5. Submit a pull request
|
| 213 |
|
| 214 |
-
## License
|
| 215 |
-
|
| 216 |
-
This project is licensed under the MIT License. See LICENSE file for details.
|
| 217 |
-
|
| 218 |
## Support
|
| 219 |
|
| 220 |
-
|
| 221 |
-
-
|
| 222 |
-
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
### Version 1.0.0
|
| 227 |
-
- Initial release
|
| 228 |
-
- Azure Document Intelligence integration
|
| 229 |
-
- Multiple OCR fallback methods
|
| 230 |
-
- Gradio web interface
|
| 231 |
-
- Processing history and analytics
|
|
|
|
| 1 |
# PDF OCR Service
|
| 2 |
|
| 3 |
+
A comprehensive PDF OCR service with HTML processing, smart table detection, and multiple export formats. Convert PDF documents to text with preserved formatting, enhanced table handling, and advanced preprocessing options.
|
| 4 |
|
| 5 |
## Features
|
| 6 |
|
| 7 |
+
- **Multiple OCR Engines**: Azure Document Intelligence, Tesseract OCR, and PyMuPDF
|
| 8 |
+
- **Smart Table Detection**: Preserves text while accurately detecting and formatting tables
|
| 9 |
+
- **HTML Processing**: Intermediate HTML format for better structure preservation
|
| 10 |
+
- **Advanced Crop Control**: Remove headers/footers with per-page customization
|
| 11 |
+
- **Multiple Export Formats**: TXT, DOCX, and HTML downloads
|
| 12 |
+
- **Real-time Preview**: Visual crop preview with live updates
|
| 13 |
+
- **Enhanced Resolution**: High-quality processing for better accuracy
|
| 14 |
+
- **Automatic Page Numbering**: Clear page separation in extracted content
|
| 15 |
|
| 16 |
+
## Installation
|
| 17 |
|
| 18 |
+
### Prerequisites
|
| 19 |
|
| 20 |
+
- Python 3.8 or higher
|
| 21 |
+
- System dependencies for OCR engines
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
#### Ubuntu/Debian
|
| 24 |
```bash
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
sudo apt-get update
|
| 26 |
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 27 |
sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 28 |
+
sudo apt-get install -y libxml2-dev libxslt1-dev
|
| 29 |
+
```
|
| 30 |
|
| 31 |
+
#### macOS
|
| 32 |
+
```bash
|
| 33 |
brew install tesseract
|
| 34 |
+
brew install opencv
|
| 35 |
+
brew install libxml2
|
|
|
|
|
|
|
| 36 |
```
|
| 37 |
|
| 38 |
+
#### Windows
|
| 39 |
+
- Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 40 |
+
- Add Tesseract to PATH environment variable
|
| 41 |
+
|
| 42 |
+
### Python Dependencies
|
| 43 |
|
| 44 |
```bash
|
| 45 |
+
pip install -r requirements.txt
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Environment Configuration
|
| 49 |
+
|
| 50 |
+
Create a `.env` file in the project root:
|
| 51 |
|
| 52 |
+
```env
|
| 53 |
+
# Azure Document Intelligence (Optional)
|
| 54 |
+
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=your_azure_endpoint
|
| 55 |
+
AZURE_DOCUMENT_INTELLIGENCE_KEY=your_azure_key
|
| 56 |
+
|
| 57 |
+
# File Processing Limits
|
| 58 |
+
MAX_FILE_SIZE_MB=50
|
| 59 |
+
MAX_HISTORY_SIZE=100
|
| 60 |
```
|
| 61 |
|
| 62 |
+
## Usage
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
### Starting the Service
|
| 65 |
|
| 66 |
```bash
|
|
|
|
| 67 |
python app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
```
|
| 69 |
|
| 70 |
+
The service will start on `http://localhost:7860`
|
| 71 |
|
| 72 |
+
### Web Interface
|
| 73 |
|
| 74 |
+
1. **Upload PDF**: Select your PDF file using the file upload button
|
| 75 |
+
2. **Choose OCR Method**:
|
| 76 |
+
- `auto`: Automatically selects the best available method
|
| 77 |
+
- `azure`: Azure Document Intelligence (requires API key)
|
| 78 |
+
- `tesseract`: Open-source Tesseract OCR
|
| 79 |
+
- `pymupdf`: Fast PyMuPDF text extraction
|
| 80 |
+
3. **Configure Preprocessing** (Optional):
|
| 81 |
+
- Enable header/footer removal
|
| 82 |
+
- Adjust crop percentages for each edge
|
| 83 |
+
- Use real-time preview to see crop effects
|
| 84 |
+
- Apply settings to all pages or customize per page
|
| 85 |
+
4. **Process**: Click "Process PDF with HTML Enhancement"
|
| 86 |
+
5. **Download**: Choose from TXT, DOCX, or HTML formats
|
| 87 |
|
| 88 |
+
### Crop Control
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
The crop feature allows you to remove headers, footers, and margins:
|
| 91 |
|
| 92 |
+
- **Top/Bottom Crop**: Remove headers and footers (0-40% of page height)
|
| 93 |
+
- **Left/Right Crop**: Remove side margins (0-30% of page width)
|
| 94 |
+
- **Per-page Settings**: Customize crop for individual pages
|
| 95 |
+
- **Real-time Preview**: See crop effects with red (removed) and green (content) areas
|
|
|
|
| 96 |
|
| 97 |
+
#### Preset Options
|
| 98 |
+
- **Light Crop (5%)**: Minimal header/footer removal
|
| 99 |
+
- **Medium Crop (10%)**: Standard header/footer removal
|
| 100 |
+
- **Heavy Crop (15%)**: Aggressive header/footer removal
|
| 101 |
+
- **Reset**: Remove all cropping
|
| 102 |
|
| 103 |
+
### API Usage
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
The service can be integrated programmatically:
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
```python
|
| 108 |
from backend import BackendManager
|
| 109 |
|
| 110 |
# Initialize backend
|
| 111 |
+
backend = BackendManager()
|
| 112 |
+
|
| 113 |
+
# Process PDF with options
|
| 114 |
+
preprocessing_options = {
|
| 115 |
+
'enable_header_footer_removal': True,
|
| 116 |
+
'crop_settings': {
|
| 117 |
+
'per_page_crops': {
|
| 118 |
+
0: {'top': 10, 'bottom': 10, 'left': 5, 'right': 5}
|
| 119 |
+
},
|
| 120 |
+
'enhanced_resolution': True
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
result = backend.process_pdf_with_enhanced_resolution(
|
| 125 |
+
pdf_path='document.pdf',
|
| 126 |
+
method='auto',
|
| 127 |
+
preprocessing_options=preprocessing_options
|
| 128 |
+
)
|
| 129 |
|
| 130 |
if result['success']:
|
| 131 |
+
print("Extracted text:", result['text'])
|
| 132 |
+
print("HTML content:", result['html'])
|
|
|
|
|
|
|
| 133 |
else:
|
| 134 |
+
print("Error:", result['error'])
|
| 135 |
```
|
| 136 |
|
| 137 |
+
## Output Formats
|
| 138 |
+
|
| 139 |
+
### Text (TXT)
|
| 140 |
+
- Plain text with preserved formatting
|
| 141 |
+
- Page numbers and separators
|
| 142 |
+
- Table formatting with borders
|
| 143 |
+
- Proper indentation and spacing
|
| 144 |
+
|
| 145 |
+
### Microsoft Word (DOCX)
|
| 146 |
+
- Structured document with headings
|
| 147 |
+
- Tables converted to Word tables
|
| 148 |
+
- Preserved formatting and layout
|
| 149 |
+
- Metadata and processing information
|
| 150 |
+
|
| 151 |
+
### HTML
|
| 152 |
+
- Web-viewable format
|
| 153 |
+
- CSS styling for better readability
|
| 154 |
+
- Interactive tables
|
| 155 |
+
- Responsive design
|
| 156 |
|
| 157 |
+
## OCR Method Selection
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
+
### Auto (Recommended)
|
| 160 |
+
Automatically chooses the best available method based on:
|
| 161 |
+
1. Azure Document Intelligence (if configured)
|
| 162 |
+
2. Tesseract OCR (if available)
|
| 163 |
+
3. PyMuPDF (fallback)
|
| 164 |
|
| 165 |
+
### Azure Document Intelligence
|
| 166 |
+
- **Best for**: Complex documents with tables and forms
|
| 167 |
+
- **Requires**: Azure API credentials
|
| 168 |
+
- **Features**: Advanced layout detection, handwriting recognition
|
| 169 |
+
- **Speed**: Medium (cloud processing)
|
| 170 |
+
|
| 171 |
+
### Tesseract OCR
|
| 172 |
+
- **Best for**: Scanned documents and images
|
| 173 |
+
- **Requires**: Local Tesseract installation
|
| 174 |
+
- **Features**: Open-source, multilingual support
|
| 175 |
+
- **Speed**: Slow (local processing)
|
| 176 |
+
|
| 177 |
+
### PyMuPDF
|
| 178 |
+
- **Best for**: Text-based PDFs
|
| 179 |
+
- **Requires**: No additional setup
|
| 180 |
+
- **Features**: Fast extraction, basic formatting
|
| 181 |
+
- **Speed**: Fast (direct text extraction)
|
| 182 |
+
|
| 183 |
+
## Configuration
|
| 184 |
+
|
| 185 |
+
### Environment Variables
|
| 186 |
+
|
| 187 |
+
| Variable | Description | Default |
|
| 188 |
+
|----------|-------------|---------|
|
| 189 |
+
| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | Azure service endpoint | None |
|
| 190 |
+
| `AZURE_DOCUMENT_INTELLIGENCE_KEY` | Azure API key | None |
|
| 191 |
+
| `MAX_FILE_SIZE_MB` | Maximum file size limit | 50 |
|
| 192 |
+
| `MAX_HISTORY_SIZE` | Processing history limit | 100 |
|
| 193 |
+
|
| 194 |
+
### Service Status
|
| 195 |
+
|
| 196 |
+
Check available OCR methods and service health at the bottom of the web interface. The status panel shows:
|
| 197 |
+
- Available OCR methods
|
| 198 |
+
- Feature availability
|
| 199 |
+
- Configuration status
|
| 200 |
+
- Export format support
|
| 201 |
|
| 202 |
## Troubleshooting
|
| 203 |
|
| 204 |
### Common Issues
|
| 205 |
|
| 206 |
+
**PDF Upload Fails**
|
| 207 |
+
- Check file size (default limit: 50MB)
|
| 208 |
+
- Ensure PDF is not password protected
|
| 209 |
+
- Verify PDF is not corrupted
|
| 210 |
|
| 211 |
+
**OCR Processing Errors**
|
| 212 |
+
- Check Azure credentials if using Azure method
|
| 213 |
+
- Verify Tesseract installation for Tesseract method
|
| 214 |
+
- Try different OCR method using auto-selection
|
| 215 |
|
| 216 |
+
**Crop Preview Not Showing**
|
| 217 |
+
- Ensure PDF is loaded successfully
|
| 218 |
+
- Enable header/footer removal option
|
| 219 |
+
- Check browser console for JavaScript errors
|
| 220 |
|
| 221 |
+
**Export Downloads Not Available**
|
| 222 |
+
- Verify processing completed successfully
|
| 223 |
+
- Check python-docx installation for DOCX export
|
| 224 |
+
- Ensure sufficient disk space for temporary files
|
| 225 |
|
| 226 |
### Performance Optimization
|
| 227 |
|
| 228 |
+
- Use PyMuPDF for simple text-based PDFs
|
| 229 |
+
- Enable crop processing only when needed
|
| 230 |
+
- Reduce crop resolution scale for better performance
|
| 231 |
+
- Regular cleanup of temporary files
|
| 232 |
|
| 233 |
+
## Dependencies
|
| 234 |
|
| 235 |
+
### Core Dependencies
|
| 236 |
+
- `gradio>=4.0.0` - Web interface
|
| 237 |
+
- `python-dotenv>=1.0.0` - Environment configuration
|
| 238 |
+
- `PyMuPDF>=1.23.0` - PDF processing
|
| 239 |
+
- `opencv-python>=4.8.0` - Image processing
|
| 240 |
+
- `numpy>=1.24.0` - Numerical operations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
### OCR Dependencies
|
| 243 |
+
- `azure-ai-documentintelligence>=1.0.0b1` - Azure OCR
|
| 244 |
+
- `pytesseract>=0.3.10` - Tesseract integration
|
| 245 |
+
- `Pillow>=10.0.0` - Image processing
|
| 246 |
|
| 247 |
+
### Export Dependencies
|
| 248 |
+
- `python-docx>=0.8.11` - DOCX generation
|
| 249 |
+
- `beautifulsoup4>=4.12.0` - HTML processing
|
| 250 |
+
- `lxml>=4.9.0` - XML processing
|
| 251 |
+
|
| 252 |
+
## License
|
| 253 |
+
|
| 254 |
+
This project is licensed under the MIT License. See LICENSE file for details.
|
| 255 |
|
| 256 |
## Contributing
|
| 257 |
|
| 258 |
1. Fork the repository
|
| 259 |
2. Create a feature branch
|
| 260 |
3. Make your changes
|
| 261 |
+
4. Add tests for new functionality
|
| 262 |
5. Submit a pull request
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
## Support
|
| 265 |
|
| 266 |
+
For issues and questions:
|
| 267 |
+
- Check the troubleshooting section
|
| 268 |
+
- Review the service status panel
|
| 269 |
+
- Check system dependencies
|
| 270 |
+
- Verify environment configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# PDF OCR Service Requirements - Enhanced Version
|
| 2 |
|
| 3 |
# Core web framework and UI
|
| 4 |
gradio>=4.0.0
|
|
@@ -19,13 +19,21 @@ numpy>=1.24.0
|
|
| 19 |
# PDF processing and manipulation
|
| 20 |
PyMuPDF>=1.23.0
|
| 21 |
|
| 22 |
-
# Document export formats
|
| 23 |
python-docx>=0.8.11
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Additional dependencies for enhanced preprocessing
|
| 26 |
matplotlib>=3.7.0 # For image visualization in development
|
| 27 |
scikit-image>=0.21.0 # Advanced image processing (optional)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# System dependencies information (install separately):
|
| 30 |
#
|
| 31 |
# For Ubuntu/Debian:
|
|
@@ -33,30 +41,38 @@ scikit-image>=0.21.0 # Advanced image processing (optional)
|
|
| 33 |
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 34 |
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 35 |
# sudo apt-get install -y python3-opencv # Alternative OpenCV installation
|
|
|
|
| 36 |
#
|
| 37 |
# For CentOS/RHEL:
|
| 38 |
# sudo yum install -y tesseract tesseract-langpack-eng
|
| 39 |
# sudo yum install -y opencv-python
|
|
|
|
| 40 |
#
|
| 41 |
# For macOS:
|
| 42 |
# brew install tesseract
|
| 43 |
# brew install opencv
|
|
|
|
| 44 |
#
|
| 45 |
# For Windows:
|
| 46 |
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 47 |
# Add Tesseract to PATH environment variable
|
| 48 |
-
# OpenCV should install automatically with pip
|
| 49 |
|
| 50 |
# Development and testing (optional)
|
| 51 |
pytest>=7.0.0
|
| 52 |
pytest-cov>=4.0.0
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# Performance monitoring (optional)
|
| 55 |
memory-profiler>=0.60.0
|
|
|
|
| 56 |
|
| 57 |
# Note: The enhanced version includes:
|
| 58 |
-
# -
|
| 59 |
-
# -
|
| 60 |
-
# - Enhanced
|
| 61 |
-
# -
|
| 62 |
-
# -
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF OCR Service Requirements - Enhanced Version with HTML Processing
|
| 2 |
|
| 3 |
# Core web framework and UI
|
| 4 |
gradio>=4.0.0
|
|
|
|
| 19 |
# PDF processing and manipulation
|
| 20 |
PyMuPDF>=1.23.0
|
| 21 |
|
| 22 |
+
# Document export formats (ENHANCED)
|
| 23 |
python-docx>=0.8.11
|
| 24 |
|
| 25 |
+
# HTML processing and parsing (NEW)
|
| 26 |
+
beautifulsoup4>=4.12.0
|
| 27 |
+
lxml>=4.9.0
|
| 28 |
+
|
| 29 |
# Additional dependencies for enhanced preprocessing
|
| 30 |
matplotlib>=3.7.0 # For image visualization in development
|
| 31 |
scikit-image>=0.21.0 # Advanced image processing (optional)
|
| 32 |
|
| 33 |
+
# Performance and utility libraries
|
| 34 |
+
tqdm>=4.65.0 # Progress bars for long operations
|
| 35 |
+
requests>=2.31.0 # HTTP requests for external services
|
| 36 |
+
|
| 37 |
# System dependencies information (install separately):
|
| 38 |
#
|
| 39 |
# For Ubuntu/Debian:
|
|
|
|
| 41 |
# sudo apt-get install -y tesseract-ocr tesseract-ocr-eng
|
| 42 |
# sudo apt-get install -y libgl1-mesa-glx libglib2.0-0
|
| 43 |
# sudo apt-get install -y python3-opencv # Alternative OpenCV installation
|
| 44 |
+
# sudo apt-get install -y libxml2-dev libxslt1-dev # For lxml
|
| 45 |
#
|
| 46 |
# For CentOS/RHEL:
|
| 47 |
# sudo yum install -y tesseract tesseract-langpack-eng
|
| 48 |
# sudo yum install -y opencv-python
|
| 49 |
+
# sudo yum install -y libxml2-devel libxslt-devel
|
| 50 |
#
|
| 51 |
# For macOS:
|
| 52 |
# brew install tesseract
|
| 53 |
# brew install opencv
|
| 54 |
+
# brew install libxml2
|
| 55 |
#
|
| 56 |
# For Windows:
|
| 57 |
# Install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
|
| 58 |
# Add Tesseract to PATH environment variable
|
| 59 |
+
# OpenCV and other packages should install automatically with pip
|
| 60 |
|
| 61 |
# Development and testing (optional)
|
| 62 |
pytest>=7.0.0
|
| 63 |
pytest-cov>=4.0.0
|
| 64 |
+
black>=23.0.0 # Code formatting
|
| 65 |
+
flake8>=6.0.0 # Code linting
|
| 66 |
|
| 67 |
# Performance monitoring (optional)
|
| 68 |
memory-profiler>=0.60.0
|
| 69 |
+
psutil>=5.9.0 # System monitoring
|
| 70 |
|
| 71 |
# Note: The enhanced version includes:
|
| 72 |
+
# - Fixed table processing that prevents text loss
|
| 73 |
+
# - HTML intermediate processing for better formatting
|
| 74 |
+
# - Enhanced export capabilities (TXT, DOCX, HTML)
|
| 75 |
+
# - Smart overlap detection with 70% threshold
|
| 76 |
+
# - Improved coordinate calculations for table boundaries
|
| 77 |
+
# - Better document structure preservation
|
| 78 |
+
# - Multi-format download options
|