Spaces:

Betimes-Solution
/

PDFtoDocx-OCR

Running

App Files Files Community

Chirapath committed on Sep 29, 2025

Commit

48a5ad7

verified ·

1 Parent(s): 4b6cb81

Upload 5 files

Browse files

Files changed (3) hide show

app.py +3 -0
backend.py +17 -3
ocr_service.py +118 -30

app.py CHANGED Viewed

@@ -440,6 +440,9 @@ def process_pdf_with_opencv_enhancement(pdf_file, ocr_method, enable_header_foot
         progress(1.0, desc="Complete!")
         if result['success']:
             metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
             status_parts = [f"Success: Processed using {result['method_used']}"]
             status_parts.append("OpenCV text block analysis: Enabled")

         progress(1.0, desc="Complete!")
         if result['success']:
+            # Clean any remaining artifacts from text and HTML
+            result['text'] = result['text'].replace(':unselected:', '').replace(':selected:', '')
+            result['html'] = result['html'].replace(':unselected:', '').replace(':selected:', '')
             metadata_info = format_opencv_enhanced_metadata(result['metadata'], result['method_used'])
             status_parts = [f"Success: Processed using {result['method_used']}"]
             status_parts.append("OpenCV text block analysis: Enabled")

backend.py CHANGED Viewed

@@ -259,7 +259,16 @@ class EnhancedDocumentExporter:
                 def handle_data(self, data):
                     if data.strip():
                         data = data.replace('&nbsp;', ' ')
                         if self.in_table:
                             self.current_table_row.append(data.strip())
@@ -290,9 +299,14 @@ class EnhancedDocumentExporter:
                                 run.font.size = Pt(14)
                                 run.font.color.rgb = RGBColor(52, 73, 94)  # Darker blue
                             elif self.in_page_header:
-                                run.bold = True
-                                run.font.size = Pt(14)
-                                run.font.color.rgb = RGBColor(44, 62, 80)
                             else:
                                 # Apply pattern-specific formatting with OpenCV enhancement
                                 self._apply_opencv_pattern_formatting(run, indent_info, text_classification)

                 def handle_data(self, data):
                     if data.strip():
+                        # Clean OCR artifacts
+                        data = data.replace(':unselected:', '')
+                        data = data.replace(':selected:', '')
                         data = data.replace('&nbsp;', ' ')
+                        if self.in_page_header:
+                            page_match = re.search(r'Page (\d+)', data)
+                            if page_match:
+                                page_num = int(page_match.group(1))
+                                page_header = f"PAGE {page_num}"
+                                self.text_parts.append(page_header.center(80))
                         if self.in_table:
                             self.current_table_row.append(data.strip())
                                 run.font.size = Pt(14)
                                 run.font.color.rgb = RGBColor(52, 73, 94)  # Darker blue
                             elif self.in_page_header:
+                                page_match = re.search(r'Page (\d+)', data)
+                                if page_match:
+                                    page_num = int(page_match.group(1))
+                                    page_header = f"PAGE {page_num}"
+                                    run.bold = True
+                                    run.font.size = Pt(14)
+                                    run.font.color.rgb = RGBColor(44, 62, 80)
+                                    self.text_parts.append(page_header.center(80))
                             else:
                                 # Apply pattern-specific formatting with OpenCV enhancement
                                 self._apply_opencv_pattern_formatting(run, indent_info, text_classification)

ocr_service.py CHANGED Viewed

@@ -645,20 +645,46 @@ class EnhancedHTMLProcessor:
             return f'<div{class_str}>{content}</div>'
     def _table_to_html(self, table, table_idx):
-        """Convert table to HTML with proper structure"""
         if not table.cells:
             return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
-        # Create table matrix
         max_row = max(cell.row_index for cell in table.cells) + 1
         max_col = max(cell.column_index for cell in table.cells) + 1
-        table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
-        # Fill matrix
         for cell in table.cells:
-            content = (cell.content or "").strip()
-            table_matrix[cell.row_index][cell.column_index] = content
         # Generate HTML
         html_parts = [f'<div class="table-container">']
@@ -666,19 +692,18 @@ class EnhancedHTMLProcessor:
         html_parts.append('<table class="table">')
         for row_idx, row in enumerate(table_matrix):
-            if row_idx == 0 and any(cell.strip() for cell in row):
-                # Header row
-                html_parts.append('<tr>')
-                for cell in row:
-                    html_parts.append(f'<th>{cell}</th>')
-                html_parts.append('</tr>')
-            else:
-                # Data row
-                if any(cell.strip() for cell in row):  # Skip empty rows
-                    html_parts.append('<tr>')
-                    for cell in row:
-                        html_parts.append(f'<td>{cell}</td>')
-                    html_parts.append('</tr>')
         html_parts.append('</table></div>')
         return '\n'.join(html_parts)
@@ -783,6 +808,21 @@ class EnhancedHTMLProcessor:
         return max_overlap
     @staticmethod
     def html_to_formatted_text_enhanced(html_content):
         """Convert HTML back to formatted text with OpenCV-enhanced preservation"""
@@ -896,6 +936,9 @@ class EnhancedHTMLProcessor:
             def handle_data(self, data):
                 if data.strip():
                     data = data.replace('&nbsp;', ' ')
                     if self.in_page_header:
@@ -1004,6 +1047,38 @@ class EnhancedHTMLProcessor:
         return result.strip()
 class OCRService:
     """Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
@@ -1135,25 +1210,38 @@ class OCRService:
             with open(pdf_path, 'rb') as pdf_file:
                 file_content = pdf_file.read()
-                # Try different API call patterns
                 try:
                     poller = self.azure_client.begin_analyze_document(
                         "prebuilt-layout",
                         body=file_content,
-                        content_type="application/pdf"
                     )
-                except TypeError:
                     try:
-                        poller = self.azure_client.begin_analyze_document(
-                            model_id="prebuilt-layout",
-                            body=file_content
-                        )
-                    except TypeError:
-                        pdf_file.seek(0)
                         poller = self.azure_client.begin_analyze_document(
                             "prebuilt-layout",
-                            document=pdf_file
                         )
             analysis_result = poller.result()
@@ -1622,4 +1710,4 @@ class OCRService:
             methods.append("tesseract")
         methods.append("pymupdf")
-        return methods

             return f'<div{class_str}>{content}</div>'
     def _table_to_html(self, table, table_idx):
+        """Convert table to HTML with improved cell alignment and artifact removal"""
         if not table.cells:
             return f'<div class="table-container"><h4>Table {table_idx + 1} (Empty)</h4></div>'
+        # Table dimensions: one past the largest row/column index seen in any cell
         max_row = max(cell.row_index for cell in table.cells) + 1
         max_col = max(cell.column_index for cell in table.cells) + 1
+        # One dict per grid slot; "occupied" marks slots covered by another cell's span
+        table_matrix = [[{"content": "", "rowspan": 1, "colspan": 1, "occupied": False}
+                        for _ in range(max_col)] for _ in range(max_row)]
+        # Fill matrix with proper handling of spans
         for cell in table.cells:
+            row_idx = cell.row_index
+            col_idx = cell.column_index
+            # Strip checkbox markers and collapse whitespace in the cell text
+            content = self.clean_ocr_artifacts(cell.content or "").strip()
+            # Span attributes may be missing or None; treat both as a span of 1
+            rowspan = getattr(cell, 'row_span', 1) or 1
+            colspan = getattr(cell, 'column_span', 1) or 1
+            # Bounds guard; max_row/max_col are derived from these same cells, so it holds for every cell
+            if row_idx < max_row and col_idx < max_col:
+                # Shift right past slots already claimed by an earlier cell's span
+                while col_idx < max_col and table_matrix[row_idx][col_idx]["occupied"]:
+                    col_idx += 1
+                if col_idx < max_col:
+                    table_matrix[row_idx][col_idx]["content"] = content
+                    table_matrix[row_idx][col_idx]["rowspan"] = rowspan
+                    table_matrix[row_idx][col_idx]["colspan"] = colspan
+                    # Mark every slot under the span, except the anchor itself, as occupied
+                    for r in range(row_idx, min(row_idx + rowspan, max_row)):
+                        for c in range(col_idx, min(col_idx + colspan, max_col)):
+                            if r != row_idx or c != col_idx:
+                                table_matrix[r][c]["occupied"] = True
         # Generate HTML
         html_parts = [f'<div class="table-container">']
         html_parts.append('<table class="table">')
         for row_idx, row in enumerate(table_matrix):
+            html_parts.append('<tr>')
+            for col_idx, cell in enumerate(row):
+                if not cell["occupied"]:
+                    content = cell["content"]
+                    rowspan_attr = f' rowspan="{cell["rowspan"]}"' if cell["rowspan"] > 1 else ''
+                    colspan_attr = f' colspan="{cell["colspan"]}"' if cell["colspan"] > 1 else ''
+                    if row_idx == 0 and content.strip():  # Non-empty first-row cells become <th>
+                        html_parts.append(f'<th{rowspan_attr}{colspan_attr}>{content}</th>')
+                    else:
+                        html_parts.append(f'<td{rowspan_attr}{colspan_attr}>{content}</td>')
+            html_parts.append('</tr>')
         html_parts.append('</table></div>')
         return '\n'.join(html_parts)
         return max_overlap
+    @staticmethod
+    def clean_ocr_artifacts(text: str) -> str:
+        """Remove OCR artifacts like checkbox markers and clean up text"""
+        if not text:
+            return text
+        # Remove checkbox markers
+        text = re.sub(r':unselected:', '', text)
+        text = re.sub(r':selected:', '', text)  # Removed outright; no checkmark is substituted
+        # Collapse every whitespace run (spaces, tabs, newlines) to a single space
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
     @staticmethod
     def html_to_formatted_text_enhanced(html_content):
         """Convert HTML back to formatted text with OpenCV-enhanced preservation"""
             def handle_data(self, data):
                 if data.strip():
+                    # Clean OCR artifacts first
+                    data = data.replace(':unselected:', '')
+                    data = data.replace(':selected:', '')
                     data = data.replace('&nbsp;', ' ')
                     if self.in_page_header:
         return result.strip()
+    def _validate_and_fix_table_structure(self, table_matrix):
+        """Validate and fix common table structure issues"""
+        if not table_matrix:
+            return table_matrix
+        max_row = len(table_matrix)
+        max_col = len(table_matrix[0]) if table_matrix else 0
+        # Pad short rows with blank cells up to the first row's width (longer rows are left as-is)
+        for row in table_matrix:
+            while len(row) < max_col:
+                row.append({"content": "", "rowspan": 1, "colspan": 1, "occupied": False})
+        # Drop rows in which every cell's content is blank
+        table_matrix = [row for row in table_matrix if any(cell["content"].strip() for cell in row)]
+        # Merge cells with identical content in adjacent columns (likely split cells)
+        for row_idx, row in enumerate(table_matrix):
+            col_idx = 0
+            while col_idx < len(row) - 1:
+                current = row[col_idx]
+                next_cell = row[col_idx + 1]
+                if (current["content"] == next_cell["content"] and
+                    current["content"].strip() and
+                    not current["occupied"] and not next_cell["occupied"]):
+                    # Widen the left cell by the neighbour's colspan and mark the neighbour occupied
+                    current["colspan"] += next_cell["colspan"]
+                    next_cell["occupied"] = True
+                col_idx += 1
+        return table_matrix
 class OCRService:
     """Main OCR service with OpenCV-enhanced text analysis, spacing detection, and bold text recognition"""
             with open(pdf_path, 'rb') as pdf_file:
                 file_content = pdf_file.read()
+                # Use enhanced analysis features
+                from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
                 try:
+                    # Try with features parameter for better table extraction
                     poller = self.azure_client.begin_analyze_document(
                         "prebuilt-layout",
                         body=file_content,
+                        content_type="application/pdf",
+                        features=["keyValuePairs"],  # Enable key-value pair detection
+                        output_content_format="markdown"  # Better structure preservation
                     )
+                except (TypeError, AttributeError):
+                    # Fallback to basic call
                     try:
                         poller = self.azure_client.begin_analyze_document(
                             "prebuilt-layout",
+                            body=file_content,
+                            content_type="application/pdf"
                         )
+                    except TypeError:
+                        try:
+                            poller = self.azure_client.begin_analyze_document(
+                                model_id="prebuilt-layout",
+                                body=file_content
+                            )
+                        except TypeError:
+                            pdf_file.seek(0)
+                            poller = self.azure_client.begin_analyze_document(
+                                "prebuilt-layout",
+                                document=pdf_file
+                            )
             analysis_result = poller.result()
             methods.append("tesseract")
         methods.append("pymupdf")
+        return methods