Ayaan Sharif committed on
Commit
1d76058
·
1 Parent(s): 933ba3b

Add picture classification with higher accuracy (images_scale=3.0) and improved bbox matching

Browse files
Files changed (2) hide show
  1. app.py +98 -24
  2. sample/Screenshot 2025-10-15 191615.png +3 -0
app.py CHANGED
@@ -21,6 +21,20 @@ COLORS = {
21
  "page_header": "#4D96FF",
22
  "page_footer": "#9D84B7",
23
  "picture": "#FF8C42",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
  def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
@@ -45,6 +59,7 @@ def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
45
  for cluster in layout_data:
46
  label = cluster.get("label", "unknown")
47
  bbox = cluster.get("bbox")
 
48
 
49
  if bbox:
50
  # bbox format: [x0, y0, x1, y1] from PDF coordinates
@@ -61,8 +76,13 @@ def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
61
  # Draw rectangle
62
  draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
63
 
64
- # Draw label background
65
- label_text = label.replace("_", " ").title()
 
 
 
 
 
66
  bbox_text = draw.textbbox((x0, y0 - 25), label_text, font=small_font)
67
  draw.rectangle([bbox_text[0] - 2, bbox_text[1] - 2, bbox_text[2] + 2, bbox_text[3] + 2],
68
  fill=color)
@@ -88,6 +108,8 @@ def process_document(file_path, mode, enable_ocr, enable_tables):
88
  pipeline_options.do_ocr = enable_ocr
89
  pipeline_options.generate_page_images = True
90
  pipeline_options.generate_picture_images = True
 
 
91
 
92
  # Create converter
93
  converter = DocumentConverter(
@@ -105,17 +127,66 @@ def process_document(file_path, mode, enable_ocr, enable_tables):
105
  total_clusters = 0
106
  table_count = 0
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  for page_no, page in enumerate(result.pages, 1):
109
  if page.predictions.layout:
110
  clusters = page.predictions.layout.clusters
111
  total_clusters += len(clusters)
112
 
113
  for cluster in clusters:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  layout_info.append({
115
  "page": page_no,
116
- "label": cluster.label,
117
  "bbox": [cluster.bbox.l, cluster.bbox.t, cluster.bbox.r, cluster.bbox.b],
118
- "confidence": getattr(cluster, "confidence", None)
 
119
  })
120
 
121
  # Count tables
@@ -268,11 +339,13 @@ with gr.Blocks(title="Document Layout Detection", theme=gr.themes.Soft()) as dem
268
  gr.Markdown("""
269
  ### Legend
270
  Different colors represent different document elements:
271
- - 🔴 Title
272
- - 🔵 Text
273
- - 🟢 Section Header
274
- - 🟠 Table
275
- - 🟣 List/Figure/Formula
 
 
276
 
277
  ### How to Use
278
  1. Upload your document (PDF or image of ID card, invoice, report, etc.)
@@ -281,23 +354,24 @@ with gr.Blocks(title="Document Layout Detection", theme=gr.themes.Soft()) as dem
281
  4. View the visualization with bounding boxes and explore the outputs
282
 
283
  ### 💡 Try Examples Below!
284
- Click on any example to see instant results on different document types.
285
  """)
286
 
287
- # Add examples
288
- gr.Examples(
289
- examples=[
290
- ["sample/Screenshot 2025-10-13 114010.png", "Fast", True, True],
291
- ["sample/Screenshot 2025-10-13 114606.png", "Fast", True, True],
292
- ["sample/Screenshot 2025-10-15 111602.png", "Fast", True, True],
293
- ["sample/Screenshot 2025-10-15 175735.png", "Fast", True, True],
294
- ],
295
- inputs=[file_input, mode_dropdown, ocr_checkbox, tables_checkbox],
296
- outputs=[visualization_output, summary_output, markdown_output, json_output],
297
- fn=gradio_interface,
298
- cache_examples=False,
299
- label="📚 Example Documents"
300
- )
 
301
 
302
  # Connect the button
303
  process_btn.click(
 
21
  "page_header": "#4D96FF",
22
  "page_footer": "#9D84B7",
23
  "picture": "#FF8C42",
24
+ # Picture classifications
25
+ "signature": "#9D4EDD",
26
+ "qr_code": "#06FFA5",
27
+ "bar_code": "#06FFA5",
28
+ "logo": "#FFB627",
29
+ "stamp": "#E63946",
30
+ "icon": "#F4A261",
31
+ "bar_chart": "#2A9D8F",
32
+ "pie_chart": "#E76F51",
33
+ "line_chart": "#264653",
34
+ "flow_chart": "#8338EC",
35
+ "map": "#3A86FF",
36
+ "screenshot": "#FB5607",
37
+ "other": "#CCCCCC",
38
  }
39
 
40
  def draw_layout_boxes(image_path, layout_data, scale_x=1.0, scale_y=1.0):
 
59
  for cluster in layout_data:
60
  label = cluster.get("label", "unknown")
61
  bbox = cluster.get("bbox")
62
+ classification = cluster.get("classification")
63
 
64
  if bbox:
65
  # bbox format: [x0, y0, x1, y1] from PDF coordinates
 
76
  # Draw rectangle
77
  draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
78
 
79
+ # Draw label with classification confidence if available
80
+ if classification:
81
+ confidence_pct = classification['confidence'] * 100
82
+ label_text = f"{label.replace('_', ' ').title()} ({confidence_pct:.0f}%)"
83
+ else:
84
+ label_text = label.replace("_", " ").title()
85
+
86
  bbox_text = draw.textbbox((x0, y0 - 25), label_text, font=small_font)
87
  draw.rectangle([bbox_text[0] - 2, bbox_text[1] - 2, bbox_text[2] + 2, bbox_text[3] + 2],
88
  fill=color)
 
108
  pipeline_options.do_ocr = enable_ocr
109
  pipeline_options.generate_page_images = True
110
  pipeline_options.generate_picture_images = True
111
+ pipeline_options.do_picture_classification = True # Enable classification
112
+ pipeline_options.images_scale = 3.0 # Higher resolution for better accuracy
113
 
114
  # Create converter
115
  converter = DocumentConverter(
 
127
  total_clusters = 0
128
  table_count = 0
129
 
130
+ # Get picture classifications for enrichment
131
+ # We need to store by page number and use a more flexible matching
132
+ picture_classifications_by_page = {}
133
+ print(f"DEBUG: Total pictures found: {len(result.document.pictures)}")
134
+ for picture in result.document.pictures:
135
+ page_num = picture.prov[0].page_no
136
+ bbox = picture.prov[0].bbox
137
+
138
+ if page_num not in picture_classifications_by_page:
139
+ picture_classifications_by_page[page_num] = []
140
+
141
+ # Get classification if available
142
+ for annotation in picture.annotations:
143
+ if hasattr(annotation, 'predicted_classes') and annotation.predicted_classes:
144
+ top_pred = annotation.predicted_classes[0]
145
+ picture_classifications_by_page[page_num].append({
146
+ 'bbox': bbox,
147
+ 'class': top_pred.class_name,
148
+ 'confidence': top_pred.confidence
149
+ })
150
+ print(f"DEBUG: Found classification - page: {page_num}, bbox: ({bbox.l:.2f}, {bbox.t:.2f}, {bbox.r:.2f}, {bbox.b:.2f}), class: {top_pred.class_name}")
151
+ break
152
+
153
  for page_no, page in enumerate(result.pages, 1):
154
  if page.predictions.layout:
155
  clusters = page.predictions.layout.clusters
156
  total_clusters += len(clusters)
157
 
158
  for cluster in clusters:
159
+ # Check if this is a picture with classification
160
+ label = cluster.label
161
+ classification = None
162
+ if cluster.label == "picture" and page_no in picture_classifications_by_page:
163
+ print(f"DEBUG: Picture cluster at page {page_no}: ({cluster.bbox.l:.2f}, {cluster.bbox.t:.2f}, {cluster.bbox.r:.2f}, {cluster.bbox.b:.2f})")
164
+
165
+ # Find matching classification by comparing bounding boxes with tolerance
166
+ for pic_class in picture_classifications_by_page[page_no]:
167
+ pic_bbox = pic_class['bbox']
168
+ # Check if bboxes match with small tolerance (allowing for floating point differences)
169
+ # Compare left and right which should match exactly
170
+ if (abs(cluster.bbox.l - pic_bbox.l) < 1.0 and
171
+ abs(cluster.bbox.r - pic_bbox.r) < 1.0):
172
+ # X coordinates match, this is likely the same picture
173
+ classification = {
174
+ 'class': pic_class['class'],
175
+ 'confidence': pic_class['confidence']
176
+ }
177
+ label = f"{classification['class']}"
178
+ print(f"DEBUG: Matched classification: {label} (conf: {classification['confidence']:.2%})")
179
+ break
180
+
181
+ if not classification:
182
+ print(f"DEBUG: No classification match found")
183
+
184
  layout_info.append({
185
  "page": page_no,
186
+ "label": label,
187
  "bbox": [cluster.bbox.l, cluster.bbox.t, cluster.bbox.r, cluster.bbox.b],
188
+ "confidence": getattr(cluster, "confidence", None),
189
+ "classification": classification
190
  })
191
 
192
  # Count tables
 
339
  gr.Markdown("""
340
  ### Legend
341
  Different colors represent different document elements:
342
+
343
+ **Layout Elements:**
344
+ - 🔴 Title • 🔵 Text • 🟢 Section Header • 🟠 Table • 🟣 List/Figure/Formula
345
+
346
+ **Picture Classifications (AI-detected):**
347
+ - 🟣 Signature • 🟢 QR Code • 🟢 Barcode • 🟡 Logo • 🔴 Stamp
348
+ - 🟦 Charts (Bar/Pie/Line) • 🟣 Flow Chart • 🟠 Screenshot • ⚪ Other
349
 
350
  ### How to Use
351
  1. Upload your document (PDF or image of ID card, invoice, report, etc.)
 
354
  4. View the visualization with bounding boxes and explore the outputs
355
 
356
  ### 💡 Try Examples Below!
357
+ Click on any example document to see instant results on different document types.
358
  """)
359
 
360
+ # Add examples with image previews
361
+ with gr.Row():
362
+ gr.Examples(
363
+ examples=[
364
+ ["sample/Screenshot 2025-10-13 114010.png", "Fast", True, True],
365
+ ["sample/Screenshot 2025-10-13 114606.png", "Fast", True, True],
366
+ ["sample/Screenshot 2025-10-15 191615.png", "Fast", True, True],
367
+ ],
368
+ inputs=[file_input, mode_dropdown, ocr_checkbox, tables_checkbox],
369
+ outputs=[visualization_output, summary_output, markdown_output, json_output],
370
+ fn=gradio_interface,
371
+ cache_examples=False,
372
+ label="📚 Example Documents",
373
+ examples_per_page=3
374
+ )
375
 
376
  # Connect the button
377
  process_btn.click(
sample/Screenshot 2025-10-15 191615.png ADDED

Git LFS Details

  • SHA256: de6a3bbda5454200ef4e53fca4be935ff5dd0b71604b788f5ffba3dc590fdf02
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB