Chirapath committed on
Commit
48a5ad7
·
verified ·
1 Parent(s): 4b6cb81

Upload 5 files

Browse files
Files changed (3) hide show
  1. app.py +3 -0
  2. backend.py +17 -3
  3. ocr_service.py +118 -30
app.py CHANGED
@@ -440,6 +440,9 @@ def process_pdf_with_opencv_enhancement(pdf_file, ocr_method, enable_header_foot
440
  progress(1.0, desc="Complete!")
441
 
442
  if result['success']:
 
 
 
443
  metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
444
  status_parts = [f"Success: Processed using {result['method_used']}"]
445
  status_parts.append("OpenCV text block analysis: Enabled")
 
440
  progress(1.0, desc="Complete!")
441
 
442
  if result['success']:
443
+ # Clean any remaining artifacts from text and HTML
444
+ result['text'] = result['text'].replace(':unselected:', '').replace(':selected:', '')
445
+ result['html'] = result['html'].replace(':unselected:', '').replace(':selected:', '')
446
  metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
447
  status_parts = [f"Success: Processed using {result['method_used']}"]
448
  status_parts.append("OpenCV text block analysis: Enabled")
backend.py CHANGED
@@ -259,7 +259,16 @@ class EnhancedDocumentExporter:
259
 
260
  def handle_data(self, data):
261
  if data.strip():
 
 
 
262
  data = data.replace(' ', ' ')
 
 
 
 
 
 
263
 
264
  if self.in_table:
265
  self.current_table_row.append(data.strip())
@@ -290,9 +299,14 @@ class EnhancedDocumentExporter:
290
  run.font.size = Pt(14)
291
  run.font.color.rgb = RGBColor(52, 73, 94) # Darker blue
292
  elif self.in_page_header:
293
- run.bold = True
294
- run.font.size = Pt(14)
295
- run.font.color.rgb = RGBColor(44, 62, 80)
 
 
 
 
 
296
  else:
297
  # Apply pattern-specific formatting with OpenCV enhancement
298
  self._apply_opencv_pattern_formatting(run, indent_info, text_classification)
 
259
 
260
  def handle_data(self, data):
261
  if data.strip():
262
+ # Clean OCR artifacts
263
+ data = data.replace(':unselected:', '')
264
+ data = data.replace(':selected:', '')
265
  data = data.replace(' ', ' ')
266
+ if self.in_page_header:
267
+ page_match = re.search(r'Page (\d+)', data)
268
+ if page_match:
269
+ page_num = int(page_match.group(1))
270
+ page_header = f"PAGE {page_num}"
271
+ self.text_parts.append(page_header.center(80))
272
 
273
  if self.in_table:
274
  self.current_table_row.append(data.strip())
 
299
  run.font.size = Pt(14)
300
  run.font.color.rgb = RGBColor(52, 73, 94) # Darker blue
301
  elif self.in_page_header:
302
+ page_match = re.search(r'Page (\d+)', data)
303
+ if page_match:
304
+ page_num = int(page_match.group(1))
305
+ page_header = f"PAGE {page_num}"
306
+ run.bold = True
307
+ run.font.size = Pt(14)
308
+ run.font.color.rgb = RGBColor(44, 62, 80)
309
+ self.text_parts.append(page_header.center(80))
310
  else:
311
  # Apply pattern-specific formatting with OpenCV enhancement
312
  self._apply_opencv_pattern_formatting(run, indent_info, text_classification)
ocr_service.py CHANGED
@@ -645,20 +645,46 @@ class EnhancedHTMLProcessor:
645
  return f'<div{class_str}>{content}</div>'
646
 
647
  def _table_to_html(self, table, table_idx):
648
- """Convert table to HTML with proper structure"""
649
  if not table.cells:
650
  return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
651
 
652
- # Create table matrix
653
  max_row = max(cell.row_index for cell in table.cells) + 1
654
  max_col = max(cell.column_index for cell in table.cells) + 1
655
 
656
- table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
 
 
657
 
658
- # Fill matrix
659
  for cell in table.cells:
660
- content = (cell.content or "").strip()
661
- table_matrix[cell.row_index][cell.column_index] = content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
 
663
  # Generate HTML
664
  html_parts = [f'<div class="table-container">']
@@ -666,19 +692,18 @@ class EnhancedHTMLProcessor:
666
  html_parts.append('<table class="table">')
667
 
668
  for row_idx, row in enumerate(table_matrix):
669
- if row_idx == 0 and any(cell.strip() for cell in row):
670
- # Header row
671
- html_parts.append('<tr>')
672
- for cell in row:
673
- html_parts.append(f'<th>{cell}</th>')
674
- html_parts.append('</tr>')
675
- else:
676
- # Data row
677
- if any(cell.strip() for cell in row): # Skip empty rows
678
- html_parts.append('<tr>')
679
- for cell in row:
680
- html_parts.append(f'<td>{cell}</td>')
681
- html_parts.append('</tr>')
682
 
683
  html_parts.append('</table></div>')
684
  return '\n'.join(html_parts)
@@ -783,6 +808,21 @@ class EnhancedHTMLProcessor:
783
 
784
  return max_overlap
785
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  @staticmethod
787
  def html_to_formatted_text_enhanced(html_content):
788
  """Convert HTML back to formatted text with OpenCV-enhanced preservation"""
@@ -896,6 +936,9 @@ class EnhancedHTMLProcessor:
896
 
897
  def handle_data(self, data):
898
  if data.strip():
 
 
 
899
  data = data.replace('&nbsp;', ' ')
900
 
901
  if self.in_page_header:
@@ -1004,6 +1047,38 @@ class EnhancedHTMLProcessor:
1004
 
1005
  return result.strip()
1006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007
 
1008
  class OCRService:
1009
  """Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
@@ -1135,25 +1210,38 @@ class OCRService:
1135
  with open(pdf_path, 'rb') as pdf_file:
1136
  file_content = pdf_file.read()
1137
 
1138
- # Try different API call patterns
 
 
1139
  try:
 
1140
  poller = self.azure_client.begin_analyze_document(
1141
  "prebuilt-layout",
1142
  body=file_content,
1143
- content_type="application/pdf"
 
 
1144
  )
1145
- except TypeError:
 
1146
  try:
1147
- poller = self.azure_client.begin_analyze_document(
1148
- model_id="prebuilt-layout",
1149
- body=file_content
1150
- )
1151
- except TypeError:
1152
- pdf_file.seek(0)
1153
  poller = self.azure_client.begin_analyze_document(
1154
  "prebuilt-layout",
1155
- document=pdf_file
 
1156
  )
 
 
 
 
 
 
 
 
 
 
 
 
1157
 
1158
  analysis_result = poller.result()
1159
 
@@ -1622,4 +1710,4 @@ class OCRService:
1622
  methods.append("tesseract")
1623
  methods.append("pymupdf")
1624
 
1625
- return methods
 
645
  return f'<div{class_str}>{content}</div>'
646
 
647
  def _table_to_html(self, table, table_idx):
648
+ """Convert table to HTML with improved cell alignment and artifact removal"""
649
  if not table.cells:
650
  return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
651
 
652
+ # Get table dimensions
653
  max_row = max(cell.row_index for cell in table.cells) + 1
654
  max_col = max(cell.column_index for cell in table.cells) + 1
655
 
656
+ # Create table matrix with cell span information
657
+ table_matrix = [[{"content": "", "rowspan": 1, "colspan": 1, "occupied": False}
658
+ for _ in range(max_col)] for _ in range(max_row)]
659
 
660
+ # Fill matrix with proper handling of spans
661
  for cell in table.cells:
662
+ row_idx = cell.row_index
663
+ col_idx = cell.column_index
664
+
665
+ # Clean the content
666
+ content = self.clean_ocr_artifacts(cell.content or "").strip()
667
+
668
+ # Get span information
669
+ rowspan = getattr(cell, 'row_span', 1) or 1
670
+ colspan = getattr(cell, 'column_span', 1) or 1
671
+
672
+ # Mark this cell and any cells it spans over
673
+ if row_idx < max_row and col_idx < max_col:
674
+ # Find the first non-occupied cell in this position
675
+ while col_idx < max_col and table_matrix[row_idx][col_idx]["occupied"]:
676
+ col_idx += 1
677
+
678
+ if col_idx < max_col:
679
+ table_matrix[row_idx][col_idx]["content"] = content
680
+ table_matrix[row_idx][col_idx]["rowspan"] = rowspan
681
+ table_matrix[row_idx][col_idx]["colspan"] = colspan
682
+
683
+ # Mark spanned cells as occupied
684
+ for r in range(row_idx, min(row_idx + rowspan, max_row)):
685
+ for c in range(col_idx, min(col_idx + colspan, max_col)):
686
+ if r != row_idx or c != col_idx:
687
+ table_matrix[r][c]["occupied"] = True
688
 
689
  # Generate HTML
690
  html_parts = [f'<div class="table-container">']
 
692
  html_parts.append('<table class="table">')
693
 
694
  for row_idx, row in enumerate(table_matrix):
695
+ html_parts.append('<tr>')
696
+ for col_idx, cell in enumerate(row):
697
+ if not cell["occupied"]:
698
+ content = cell["content"]
699
+ rowspan_attr = f' rowspan="{cell["rowspan"]}"' if cell["rowspan"] > 1 else ''
700
+ colspan_attr = f' colspan="{cell["colspan"]}"' if cell["colspan"] > 1 else ''
701
+
702
+ if row_idx == 0 and content.strip(): # Header row
703
+ html_parts.append(f'<th{rowspan_attr}{colspan_attr}>{content}</th>')
704
+ else:
705
+ html_parts.append(f'<td{rowspan_attr}{colspan_attr}>{content}</td>')
706
+ html_parts.append('</tr>')
 
707
 
708
  html_parts.append('</table></div>')
709
  return '\n'.join(html_parts)
 
808
 
809
  return max_overlap
810
 
811
+ @staticmethod
812
+ def clean_ocr_artifacts(text: str) -> str:
813
+ """Remove OCR artifacts like checkbox markers and clean up text"""
814
+ if not text:
815
+ return text
816
+
817
+ # Remove checkbox markers
818
+ text = re.sub(r':unselected:', '', text)
819
+ text = re.sub(r':selected:', '', text) # Replace with checkmark
820
+
821
+ # Clean up multiple spaces
822
+ text = re.sub(r'\s+', ' ', text)
823
+
824
+ return text.strip()
825
+
826
  @staticmethod
827
  def html_to_formatted_text_enhanced(html_content):
828
  """Convert HTML back to formatted text with OpenCV-enhanced preservation"""
 
936
 
937
  def handle_data(self, data):
938
  if data.strip():
939
+ # Clean OCR artifacts first
940
+ data = data.replace(':unselected:', '')
941
+ data = data.replace(':selected:', '')
942
  data = data.replace('&nbsp;', ' ')
943
 
944
  if self.in_page_header:
 
1047
 
1048
  return result.strip()
1049
 
1050
+ def _validate_and_fix_table_structure(self, table_matrix):
1051
+ """Validate and fix common table structure issues"""
1052
+ if not table_matrix:
1053
+ return table_matrix
1054
+
1055
+ max_row = len(table_matrix)
1056
+ max_col = len(table_matrix[0]) if table_matrix else 0
1057
+
1058
+ # Ensure all rows have same number of columns
1059
+ for row in table_matrix:
1060
+ while len(row) < max_col:
1061
+ row.append({"content": "", "rowspan": 1, "colspan": 1, "occupied": False})
1062
+
1063
+ # Remove completely empty rows
1064
+ table_matrix = [row for row in table_matrix if any(cell["content"].strip() for cell in row)]
1065
+
1066
+ # Merge cells with identical content in adjacent columns (likely split cells)
1067
+ for row_idx, row in enumerate(table_matrix):
1068
+ col_idx = 0
1069
+ while col_idx < len(row) - 1:
1070
+ current = row[col_idx]
1071
+ next_cell = row[col_idx + 1]
1072
+
1073
+ if (current["content"] == next_cell["content"] and
1074
+ current["content"].strip() and
1075
+ not current["occupied"] and not next_cell["occupied"]):
1076
+ # Merge cells
1077
+ current["colspan"] += next_cell["colspan"]
1078
+ next_cell["occupied"] = True
1079
+ col_idx += 1
1080
+
1081
+ return table_matrix
1082
 
1083
  class OCRService:
1084
  """Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
 
1210
  with open(pdf_path, 'rb') as pdf_file:
1211
  file_content = pdf_file.read()
1212
 
1213
+ # Use enhanced analysis features
1214
+ from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
1215
+
1216
  try:
1217
+ # Try with features parameter for better table extraction
1218
  poller = self.azure_client.begin_analyze_document(
1219
  "prebuilt-layout",
1220
  body=file_content,
1221
+ content_type="application/pdf",
1222
+ features=["keyValuePairs"], # Enable key-value pair detection
1223
+ output_content_format="markdown" # Better structure preservation
1224
  )
1225
+ except (TypeError, AttributeError):
1226
+ # Fallback to basic call
1227
  try:
 
 
 
 
 
 
1228
  poller = self.azure_client.begin_analyze_document(
1229
  "prebuilt-layout",
1230
+ body=file_content,
1231
+ content_type="application/pdf"
1232
  )
1233
+ except TypeError:
1234
+ try:
1235
+ poller = self.azure_client.begin_analyze_document(
1236
+ model_id="prebuilt-layout",
1237
+ body=file_content
1238
+ )
1239
+ except TypeError:
1240
+ pdf_file.seek(0)
1241
+ poller = self.azure_client.begin_analyze_document(
1242
+ "prebuilt-layout",
1243
+ document=pdf_file
1244
+ )
1245
 
1246
  analysis_result = poller.result()
1247
 
 
1710
  methods.append("tesseract")
1711
  methods.append("pymupdf")
1712
 
1713
+ return methods