Ayaan Sharif committed on
Commit
1d76058
·
1 Parent(s): 933ba3b

Add picture classification with higher accuracy (images_scale=3.0) and improved bbox matching

Browse files
Files changed (2) hide show
  1. app.py +98 -24
  2. sample/Screenshot 2025-10-15 191615.png +3 -0
app.py CHANGED
@@ -21,6 +21,20 @@ COLORS = {
21
  "page_header": "#4D96FF",
22
  "page_footer": "#9D84B7",
23
  "picture": "#FF8C42",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
  def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
@@ -45,6 +59,7 @@ def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
45
  for cluster in layout_data:
46
  label = cluster.get("label", "unknown")
47
  bbox = cluster.get("bbox")
 
48
 
49
  if bbox:
50
  # bbox format: [x0, y0, x1, y1] from PDF coordinates
@@ -61,8 +76,13 @@ def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
61
  # Draw rectangle
62
  draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
63
 
64
- # Draw label background
65
- label_text = label.replace("_", " ").title()
 
 
 
 
 
66
  bbox_text = draw.textbbox((x0, y0 - 25), label_text, font=small_font)
67
  draw.rectangle([bbox_text[0] - 2, bbox_text[1] - 2, bbox_text[2] + 2, bbox_text[3] + 2],
68
  fill=color)
@@ -88,6 +108,8 @@ def process_document(file_path, mode, enable_ocr, enable_tables):
88
  pipeline_options.do_ocr = enable_ocr
89
  pipeline_options.generate_page_images = True
90
  pipeline_options.generate_picture_images = True
 
 
91
 
92
  # Create converter
93
  converter = DocumentConverter(
@@ -105,17 +127,66 @@ def process_document(file_path, mode, enable_ocr, enable_tables):
105
  total_clusters = 0
106
  table_count = 0
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  for page_no, page in enumerate(result.pages, 1):
109
  if page.predictions.layout:
110
  clusters = page.predictions.layout.clusters
111
  total_clusters += len(clusters)
112
 
113
  for cluster in clusters:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  layout_info.append({
115
  "page": page_no,
116
- "label": cluster.label,
117
  "bbox": [cluster.bbox.l, cluster.bbox.t, cluster.bbox.r, cluster.bbox.b],
118
- "confidence": getattr(cluster, "confidence", None)
 
119
  })
120
 
121
  # Count tables
@@ -268,11 +339,13 @@ with gr.Blocks(title="Document Layout Detection", theme=gr.themes.Soft()) as dem
268
  gr.Markdown("""
269
  ### Legend
270
  Different colors represent different document elements:
271
- - 🔴 Title
272
- - 🔵 Text
273
- - 🟢 Section Header
274
- - 🟠 Table
275
- - 🟣 List/Figure/Formula
 
 
276
 
277
  ### How to Use
278
  1. Upload your document (PDF or image of ID card, invoice, report, etc.)
@@ -281,23 +354,24 @@ with gr.Blocks(title="Document Layout Detection", theme=gr.themes.Soft()) as dem
281
  4. View the visualization with bounding boxes and explore the outputs
282
 
283
  ### 💡 Try Examples Below!
284
- Click on any example to see instant results on different document types.
285
  """)
286
 
287
- # Add examples
288
- gr.Examples(
289
- examples=[
290
- ["sample/Screenshot 2025-10-13 114010.png", "Fast", True, True],
291
- ["sample/Screenshot 2025-10-13 114606.png", "Fast", True, True],
292
- ["sample/Screenshot 2025-10-15 111602.png", "Fast", True, True],
293
- ["sample/Screenshot 2025-10-15 175735.png", "Fast", True, True],
294
- ],
295
- inputs=[file_input, mode_dropdown, ocr_checkbox, tables_checkbox],
296
- outputs=[visualization_output, summary_output, markdown_output, json_output],
297
- fn=gradio_interface,
298
- cache_examples=False,
299
- label="📚 Example Documents"
300
- )
 
301
 
302
  # Connect the button
303
  process_btn.click(
 
21
  "page_header": "#4D96FF",
22
  "page_footer": "#9D84B7",
23
  "picture": "#FF8C42",
24
+ # Picture classifications
25
+ "signature": "#9D4EDD",
26
+ "qr_code": "#06FFA5",
27
+ "bar_code": "#06FFA5",
28
+ "logo": "#FFB627",
29
+ "stamp": "#E63946",
30
+ "icon": "#F4A261",
31
+ "bar_chart": "#2A9D8F",
32
+ "pie_chart": "#E76F51",
33
+ "line_chart": "#264653",
34
+ "flow_chart": "#8338EC",
35
+ "map": "#3A86FF",
36
+ "screenshot": "#FB5607",
37
+ "other": "#CCCCCC",
38
  }
39
 
40
  def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
 
59
  for cluster in layout_data:
60
  label = cluster.get("label", "unknown")
61
  bbox = cluster.get("bbox")
62
+ classification = cluster.get("classification")
63
 
64
  if bbox:
65
  # bbox format: [x0, y0, x1, y1] from PDF coordinates
 
76
  # Draw rectangle
77
  draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
78
 
79
+ # Draw label with classification confidence if available
80
+ if classification:
81
+ confidence_pct = classification['confidence'] * 100
82
+ label_text = f"{label.replace('_', ' ').title()} ({confidence_pct:.0f}%)"
83
+ else:
84
+ label_text = label.replace("_", " ").title()
85
+
86
  bbox_text = draw.textbbox((x0, y0 - 25), label_text, font=small_font)
87
  draw.rectangle([bbox_text[0] - 2, bbox_text[1] - 2, bbox_text[2] + 2, bbox_text[3] + 2],
88
  fill=color)
 
108
  pipeline_options.do_ocr = enable_ocr
109
  pipeline_options.generate_page_images = True
110
  pipeline_options.generate_picture_images = True
111
+ pipeline_options.do_picture_classification = True # Enable classification
112
+ pipeline_options.images_scale = 3.0 # Higher resolution for better accuracy
113
 
114
  # Create converter
115
  converter = DocumentConverter(
 
127
  total_clusters = 0
128
  table_count = 0
129
 
130
+ # Get picture classifications for enrichment
131
+ # We need to store by page number and use a more flexible matching
132
+ picture_classifications_by_page = {}
133
+ print(f"DEBUG: Total pictures found: {len(result.document.pictures)}")
134
+ for picture in result.document.pictures:
135
+ page_num = picture.prov[0].page_no
136
+ bbox = picture.prov[0].bbox
137
+
138
+ if page_num not in picture_classifications_by_page:
139
+ picture_classifications_by_page[page_num] = []
140
+
141
+ # Get classification if available
142
+ for annotation in picture.annotations:
143
+ if hasattr(annotation, 'predicted_classes') and annotation.predicted_classes:
144
+ top_pred = annotation.predicted_classes[0]
145
+ picture_classifications_by_page[page_num].append({
146
+ 'bbox': bbox,
147
+ 'class': top_pred.class_name,
148
+ 'confidence': top_pred.confidence
149
+ })
150
+ print(f"DEBUG: Found classification - page: {page_num}, bbox: ({bbox.l:.2f}, {bbox.t:.2f}, {bbox.r:.2f}, {bbox.b:.2f}), class: {top_pred.class_name}")
151
+ break
152
+
153
  for page_no, page in enumerate(result.pages, 1):
154
  if page.predictions.layout:
155
  clusters = page.predictions.layout.clusters
156
  total_clusters += len(clusters)
157
 
158
  for cluster in clusters:
159
+ # Check if this is a picture with classification
160
+ label = cluster.label
161
+ classification = None
162
+ if cluster.label == "picture" and page_no in picture_classifications_by_page:
163
+ print(f"DEBUG: Picture cluster at page {page_no}: ({cluster.bbox.l:.2f}, {cluster.bbox.t:.2f}, {cluster.bbox.r:.2f}, {cluster.bbox.b:.2f})")
164
+
165
+ # Find matching classification by comparing bounding boxes with tolerance
166
+ for pic_class in picture_classifications_by_page[page_no]:
167
+ pic_bbox = pic_class['bbox']
168
+ # Check if bboxes match with small tolerance (allowing for floating point differences)
169
+ # Compare left and right which should match exactly
170
+ if (abs(cluster.bbox.l - pic_bbox.l) < 1.0 and
171
+ abs(cluster.bbox.r - pic_bbox.r) < 1.0):
172
+ # X coordinates match, this is likely the same picture
173
+ classification = {
174
+ 'class': pic_class['class'],
175
+ 'confidence': pic_class['confidence']
176
+ }
177
+ label = f"{classification['class']}"
178
+ print(f"DEBUG: Matched classification: {label} (conf: {classification['confidence']:.2%})")
179
+ break
180
+
181
+ if not classification:
182
+ print(f"DEBUG: No classification match found")
183
+
184
  layout_info.append({
185
  "page": page_no,
186
+ "label": label,
187
  "bbox": [cluster.bbox.l, cluster.bbox.t, cluster.bbox.r, cluster.bbox.b],
188
+ "confidence": getattr(cluster, "confidence", None),
189
+ "classification": classification
190
  })
191
 
192
  # Count tables
 
339
  gr.Markdown("""
340
  ### Legend
341
  Different colors represent different document elements:
342
+
343
+ **Layout Elements:**
344
+ - 🔴 Title • 🔵 Text • 🟢 Section Header • 🟠 Table • 🟣 List/Figure/Formula
345
+
346
+ **Picture Classifications (AI-detected):**
347
+ - 🟣 Signature • 🟢 QR Code • 🟢 Barcode • 🟡 Logo • 🔴 Stamp
348
+ - 🟦 Charts (Bar/Pie/Line) • 🟣 Flow Chart • 🟠 Screenshot • ⚪ Other
349
 
350
  ### How to Use
351
  1. Upload your document (PDF or image of ID card, invoice, report, etc.)
 
354
  4. View the visualization with bounding boxes and explore the outputs
355
 
356
  ### 💡 Try Examples Below!
357
+ Click on any example document to see instant results on different document types.
358
  """)
359
 
360
+ # Add examples with image previews
361
+ with gr.Row():
362
+ gr.Examples(
363
+ examples=[
364
+ ["sample/Screenshot 2025-10-13 114010.png", "Fast", True, True],
365
+ ["sample/Screenshot 2025-10-13 114606.png", "Fast", True, True],
366
+ ["sample/Screenshot 2025-10-15 191615.png", "Fast", True, True],
367
+ ],
368
+ inputs=[file_input, mode_dropdown, ocr_checkbox, tables_checkbox],
369
+ outputs=[visualization_output, summary_output, markdown_output, json_output],
370
+ fn=gradio_interface,
371
+ cache_examples=False,
372
+ label="📚 Example Documents",
373
+ examples_per_page=3
374
+ )
375
 
376
  # Connect the button
377
  process_btn.click(
sample/Screenshot 2025-10-15 191615.png ADDED

Git LFS Details

  • SHA256: de6a3bbda5454200ef4e53fca4be935ff5dd0b71604b788f5ffba3dc590fdf02
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB