Spaces:
Running
Running
Upload 5 files
Browse files
- app.py +3 -0
- backend.py +17 -3
- ocr_service.py +118 -30
app.py
CHANGED
|
@@ -440,6 +440,9 @@ def process_pdf_with_opencv_enhancement(pdf_file, ocr_method, enable_header_foot
|
|
| 440 |
progress(1.0, desc="Complete!")
|
| 441 |
|
| 442 |
if result['success']:
|
|
|
|
|
|
|
|
|
|
| 443 |
metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
|
| 444 |
status_parts = [f"Success: Processed using {result['method_used']}"]
|
| 445 |
status_parts.append("OpenCV text block analysis: Enabled")
|
|
|
|
| 440 |
progress(1.0, desc="Complete!")
|
| 441 |
|
| 442 |
if result['success']:
|
| 443 |
+
# Clean any remaining artifacts from text and HTML
|
| 444 |
+
result['text'] = result['text'].replace(':unselected:', '').replace(':selected:', '')
|
| 445 |
+
result['html'] = result['html'].replace(':unselected:', '').replace(':selected:', '')
|
| 446 |
metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
|
| 447 |
status_parts = [f"Success: Processed using {result['method_used']}"]
|
| 448 |
status_parts.append("OpenCV text block analysis: Enabled")
|
backend.py
CHANGED
|
@@ -259,7 +259,16 @@ class EnhancedDocumentExporter:
|
|
| 259 |
|
| 260 |
def handle_data(self, data):
|
| 261 |
if data.strip():
|
|
|
|
|
|
|
|
|
|
| 262 |
data = data.replace(' ', ' ')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
if self.in_table:
|
| 265 |
self.current_table_row.append(data.strip())
|
|
@@ -290,9 +299,14 @@ class EnhancedDocumentExporter:
|
|
| 290 |
run.font.size = Pt(14)
|
| 291 |
run.font.color.rgb = RGBColor(52, 73, 94) # Darker blue
|
| 292 |
elif self.in_page_header:
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
else:
|
| 297 |
# Apply pattern-specific formatting with OpenCV enhancement
|
| 298 |
self._apply_opencv_pattern_formatting(run, indent_info, text_classification)
|
|
|
|
| 259 |
|
| 260 |
def handle_data(self, data):
|
| 261 |
if data.strip():
|
| 262 |
+
# Clean OCR artifacts
|
| 263 |
+
data = data.replace(':unselected:', '')
|
| 264 |
+
data = data.replace(':selected:', '')
|
| 265 |
data = data.replace(' ', ' ')
|
| 266 |
+
if self.in_page_header:
|
| 267 |
+
page_match = re.search(r'Page (\d+)', data)
|
| 268 |
+
if page_match:
|
| 269 |
+
page_num = int(page_match.group(1))
|
| 270 |
+
page_header = f"PAGE {page_num}"
|
| 271 |
+
self.text_parts.append(page_header.center(80))
|
| 272 |
|
| 273 |
if self.in_table:
|
| 274 |
self.current_table_row.append(data.strip())
|
|
|
|
| 299 |
run.font.size = Pt(14)
|
| 300 |
run.font.color.rgb = RGBColor(52, 73, 94) # Darker blue
|
| 301 |
elif self.in_page_header:
|
| 302 |
+
page_match = re.search(r'Page (\d+)', data)
|
| 303 |
+
if page_match:
|
| 304 |
+
page_num = int(page_match.group(1))
|
| 305 |
+
page_header = f"PAGE {page_num}"
|
| 306 |
+
run.bold = True
|
| 307 |
+
run.font.size = Pt(14)
|
| 308 |
+
run.font.color.rgb = RGBColor(44, 62, 80)
|
| 309 |
+
self.text_parts.append(page_header.center(80))
|
| 310 |
else:
|
| 311 |
# Apply pattern-specific formatting with OpenCV enhancement
|
| 312 |
self._apply_opencv_pattern_formatting(run, indent_info, text_classification)
|
ocr_service.py
CHANGED
|
@@ -645,20 +645,46 @@ class EnhancedHTMLProcessor:
|
|
| 645 |
return f'<div{class_str}>{content}</div>'
|
| 646 |
|
| 647 |
def _table_to_html(self, table, table_idx):
|
| 648 |
-
"""Convert table to HTML with
|
| 649 |
if not table.cells:
|
| 650 |
return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
|
| 651 |
|
| 652 |
-
#
|
| 653 |
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 654 |
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 655 |
|
| 656 |
-
|
|
|
|
|
|
|
| 657 |
|
| 658 |
-
# Fill matrix
|
| 659 |
for cell in table.cells:
|
| 660 |
-
|
| 661 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
|
| 663 |
# Generate HTML
|
| 664 |
html_parts = [f'<div class="table-container">']
|
|
@@ -666,19 +692,18 @@ class EnhancedHTMLProcessor:
|
|
| 666 |
html_parts.append('<table class="table">')
|
| 667 |
|
| 668 |
for row_idx, row in enumerate(table_matrix):
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
html_parts.append('</tr>')
|
| 682 |
|
| 683 |
html_parts.append('</table></div>')
|
| 684 |
return '\n'.join(html_parts)
|
|
@@ -783,6 +808,21 @@ class EnhancedHTMLProcessor:
|
|
| 783 |
|
| 784 |
return max_overlap
|
| 785 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
@staticmethod
|
| 787 |
def html_to_formatted_text_enhanced(html_content):
|
| 788 |
"""Convert HTML back to formatted text with OpenCV-enhanced preservation"""
|
|
@@ -896,6 +936,9 @@ class EnhancedHTMLProcessor:
|
|
| 896 |
|
| 897 |
def handle_data(self, data):
|
| 898 |
if data.strip():
|
|
|
|
|
|
|
|
|
|
| 899 |
data = data.replace(' ', ' ')
|
| 900 |
|
| 901 |
if self.in_page_header:
|
|
@@ -1004,6 +1047,38 @@ class EnhancedHTMLProcessor:
|
|
| 1004 |
|
| 1005 |
return result.strip()
|
| 1006 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1007 |
|
| 1008 |
class OCRService:
|
| 1009 |
"""Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
|
|
@@ -1135,25 +1210,38 @@ class OCRService:
|
|
| 1135 |
with open(pdf_path, 'rb') as pdf_file:
|
| 1136 |
file_content = pdf_file.read()
|
| 1137 |
|
| 1138 |
-
#
|
|
|
|
|
|
|
| 1139 |
try:
|
|
|
|
| 1140 |
poller = self.azure_client.begin_analyze_document(
|
| 1141 |
"prebuilt-layout",
|
| 1142 |
body=file_content,
|
| 1143 |
-
content_type="application/pdf"
|
|
|
|
|
|
|
| 1144 |
)
|
| 1145 |
-
except TypeError:
|
|
|
|
| 1146 |
try:
|
| 1147 |
-
poller = self.azure_client.begin_analyze_document(
|
| 1148 |
-
model_id="prebuilt-layout",
|
| 1149 |
-
body=file_content
|
| 1150 |
-
)
|
| 1151 |
-
except TypeError:
|
| 1152 |
-
pdf_file.seek(0)
|
| 1153 |
poller = self.azure_client.begin_analyze_document(
|
| 1154 |
"prebuilt-layout",
|
| 1155 |
-
|
|
|
|
| 1156 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1157 |
|
| 1158 |
analysis_result = poller.result()
|
| 1159 |
|
|
@@ -1622,4 +1710,4 @@ class OCRService:
|
|
| 1622 |
methods.append("tesseract")
|
| 1623 |
methods.append("pymupdf")
|
| 1624 |
|
| 1625 |
-
return methods
|
|
|
|
| 645 |
return f'<div{class_str}>{content}</div>'
|
| 646 |
|
| 647 |
def _table_to_html(self, table, table_idx):
|
| 648 |
+
"""Convert table to HTML with improved cell alignment and artifact removal"""
|
| 649 |
if not table.cells:
|
| 650 |
return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
|
| 651 |
|
| 652 |
+
# Get table dimensions
|
| 653 |
max_row = max(cell.row_index for cell in table.cells) + 1
|
| 654 |
max_col = max(cell.column_index for cell in table.cells) + 1
|
| 655 |
|
| 656 |
+
# Create table matrix with cell span information
|
| 657 |
+
table_matrix = [[{"content": "", "rowspan": 1, "colspan": 1, "occupied": False}
|
| 658 |
+
for _ in range(max_col)] for _ in range(max_row)]
|
| 659 |
|
| 660 |
+
# Fill matrix with proper handling of spans
|
| 661 |
for cell in table.cells:
|
| 662 |
+
row_idx = cell.row_index
|
| 663 |
+
col_idx = cell.column_index
|
| 664 |
+
|
| 665 |
+
# Clean the content
|
| 666 |
+
content = self.clean_ocr_artifacts(cell.content or "").strip()
|
| 667 |
+
|
| 668 |
+
# Get span information
|
| 669 |
+
rowspan = getattr(cell, 'row_span', 1) or 1
|
| 670 |
+
colspan = getattr(cell, 'column_span', 1) or 1
|
| 671 |
+
|
| 672 |
+
# Mark this cell and any cells it spans over
|
| 673 |
+
if row_idx < max_row and col_idx < max_col:
|
| 674 |
+
# Find the first non-occupied cell in this position
|
| 675 |
+
while col_idx < max_col and table_matrix[row_idx][col_idx]["occupied"]:
|
| 676 |
+
col_idx += 1
|
| 677 |
+
|
| 678 |
+
if col_idx < max_col:
|
| 679 |
+
table_matrix[row_idx][col_idx]["content"] = content
|
| 680 |
+
table_matrix[row_idx][col_idx]["rowspan"] = rowspan
|
| 681 |
+
table_matrix[row_idx][col_idx]["colspan"] = colspan
|
| 682 |
+
|
| 683 |
+
# Mark spanned cells as occupied
|
| 684 |
+
for r in range(row_idx, min(row_idx + rowspan, max_row)):
|
| 685 |
+
for c in range(col_idx, min(col_idx + colspan, max_col)):
|
| 686 |
+
if r != row_idx or c != col_idx:
|
| 687 |
+
table_matrix[r][c]["occupied"] = True
|
| 688 |
|
| 689 |
# Generate HTML
|
| 690 |
html_parts = [f'<div class="table-container">']
|
|
|
|
| 692 |
html_parts.append('<table class="table">')
|
| 693 |
|
| 694 |
for row_idx, row in enumerate(table_matrix):
|
| 695 |
+
html_parts.append('<tr>')
|
| 696 |
+
for col_idx, cell in enumerate(row):
|
| 697 |
+
if not cell["occupied"]:
|
| 698 |
+
content = cell["content"]
|
| 699 |
+
rowspan_attr = f' rowspan="{cell["rowspan"]}"' if cell["rowspan"] > 1 else ''
|
| 700 |
+
colspan_attr = f' colspan="{cell["colspan"]}"' if cell["colspan"] > 1 else ''
|
| 701 |
+
|
| 702 |
+
if row_idx == 0 and content.strip(): # Header row
|
| 703 |
+
html_parts.append(f'<th{rowspan_attr}{colspan_attr}>{content}</th>')
|
| 704 |
+
else:
|
| 705 |
+
html_parts.append(f'<td{rowspan_attr}{colspan_attr}>{content}</td>')
|
| 706 |
+
html_parts.append('</tr>')
|
|
|
|
| 707 |
|
| 708 |
html_parts.append('</table></div>')
|
| 709 |
return '\n'.join(html_parts)
|
|
|
|
| 808 |
|
| 809 |
return max_overlap
|
| 810 |
|
| 811 |
+
@staticmethod
|
| 812 |
+
def clean_ocr_artifacts(text: str) -> str:
    """Remove OCR checkbox markers from ``text`` and normalize whitespace.

    The literal tokens ``:unselected:`` and ``:selected:`` are deleted
    outright (they are not replaced by any glyph). Every run of
    whitespace, including newlines and tabs, is then collapsed to a
    single space and the result is stripped at both ends.

    Args:
        text: Raw text to clean. A falsy value (``""`` or ``None``) is
            returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is falsy.
    """
    if not text:
        return text

    # The markers are fixed literals, so plain str.replace is sufficient.
    # ':unselected:' is removed first, then ':selected:'.
    text = text.replace(':unselected:', '').replace(':selected:', '')

    # Deleting a marker can leave doubled spaces behind; collapse them.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
| 825 |
+
|
| 826 |
@staticmethod
|
| 827 |
def html_to_formatted_text_enhanced(html_content):
|
| 828 |
"""Convert HTML back to formatted text with OpenCV-enhanced preservation"""
|
|
|
|
| 936 |
|
| 937 |
def handle_data(self, data):
|
| 938 |
if data.strip():
|
| 939 |
+
# Clean OCR artifacts first
|
| 940 |
+
data = data.replace(':unselected:', '')
|
| 941 |
+
data = data.replace(':selected:', '')
|
| 942 |
data = data.replace(' ', ' ')
|
| 943 |
|
| 944 |
if self.in_page_header:
|
|
|
|
| 1047 |
|
| 1048 |
return result.strip()
|
| 1049 |
|
| 1050 |
+
def _validate_and_fix_table_structure(self, table_matrix):
    """Pad ragged rows, drop blank rows and merge duplicated adjacent cells.

    Each cell is a dict with the keys ``content``, ``rowspan``,
    ``colspan`` and ``occupied``.

    Args:
        table_matrix: List of rows, each a list of cell dicts. Rows and
            cells are modified in place (padding cells are appended,
            ``colspan`` / ``occupied`` are updated on merged cells).

    Returns:
        A new list holding the rows that contain at least one cell with
        non-blank content, or ``table_matrix`` itself when it is empty.
    """
    if not table_matrix:
        return table_matrix

    # Pad every row to the width of the widest row. Using only the first
    # row's length would leave the matrix ragged whenever that row is short.
    width = max(len(row) for row in table_matrix)
    for row in table_matrix:
        row.extend(
            {"content": "", "rowspan": 1, "colspan": 1, "occupied": False}
            for _ in range(width - len(row))
        )

    # Remove rows in which no cell carries visible content.
    # NOTE(review): a row consisting only of cells covered by a rowspan from
    # above also has empty content and is dropped here, while the spanning
    # cell's rowspan is left untouched - confirm that this is intended.
    table_matrix = [
        row for row in table_matrix
        if any(cell["content"].strip() for cell in row)
    ]

    # Merge runs of adjacent cells with identical non-blank content (likely
    # one logical cell split across columns). The whole run is folded into
    # its first cell; the remaining cells are flagged as occupied.
    for row in table_matrix:
        anchor_idx = 0
        while anchor_idx < len(row) - 1:
            anchor = row[anchor_idx]
            next_idx = anchor_idx + 1

            if anchor["content"].strip() and not anchor["occupied"]:
                while next_idx < len(row):
                    candidate = row[next_idx]
                    if (candidate["occupied"]
                            or candidate["content"] != anchor["content"]):
                        break
                    anchor["colspan"] += candidate["colspan"]
                    candidate["occupied"] = True
                    next_idx += 1

            # Resume after the cells that were just absorbed (or at the
            # next cell when nothing was merged).
            anchor_idx = next_idx

    return table_matrix
|
| 1082 |
|
| 1083 |
class OCRService:
|
| 1084 |
"""Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
|
|
|
|
| 1210 |
with open(pdf_path, 'rb') as pdf_file:
|
| 1211 |
file_content = pdf_file.read()
|
| 1212 |
|
| 1213 |
+
# Use enhanced analysis features
|
| 1214 |
+
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
| 1215 |
+
|
| 1216 |
try:
|
| 1217 |
+
# Try with features parameter for better table extraction
|
| 1218 |
poller = self.azure_client.begin_analyze_document(
|
| 1219 |
"prebuilt-layout",
|
| 1220 |
body=file_content,
|
| 1221 |
+
content_type="application/pdf",
|
| 1222 |
+
features=["keyValuePairs"], # Enable key-value pair detection
|
| 1223 |
+
output_content_format="markdown" # Better structure preservation
|
| 1224 |
)
|
| 1225 |
+
except (TypeError, AttributeError):
|
| 1226 |
+
# Fallback to basic call
|
| 1227 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
poller = self.azure_client.begin_analyze_document(
|
| 1229 |
"prebuilt-layout",
|
| 1230 |
+
body=file_content,
|
| 1231 |
+
content_type="application/pdf"
|
| 1232 |
)
|
| 1233 |
+
except TypeError:
|
| 1234 |
+
try:
|
| 1235 |
+
poller = self.azure_client.begin_analyze_document(
|
| 1236 |
+
model_id="prebuilt-layout",
|
| 1237 |
+
body=file_content
|
| 1238 |
+
)
|
| 1239 |
+
except TypeError:
|
| 1240 |
+
pdf_file.seek(0)
|
| 1241 |
+
poller = self.azure_client.begin_analyze_document(
|
| 1242 |
+
"prebuilt-layout",
|
| 1243 |
+
document=pdf_file
|
| 1244 |
+
)
|
| 1245 |
|
| 1246 |
analysis_result = poller.result()
|
| 1247 |
|
|
|
|
| 1710 |
methods.append("tesseract")
|
| 1711 |
methods.append("pymupdf")
|
| 1712 |
|
| 1713 |
+
return methods
|