ricklon committed on
Commit
3edf7ce
·
1 Parent(s): 7e8815a

Populate full-page cropped gallery from detected refs

Browse files
Files changed (1) hide show
  1. app.py +33 -3
app.py CHANGED
@@ -242,6 +242,32 @@ def draw_bounding_boxes(image, refs, extract_images=False):
242
  img_draw.paste(overlay, (0, 0), overlay)
243
  return img_draw, crops
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  def clean_output(text, include_images=False):
246
  if not text:
247
  return ""
@@ -994,6 +1020,7 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
994
 
995
  img_out = None
996
  crops = []
 
997
  result_for_layout = result
998
 
999
  if has_grounding and '<|ref|>' in result:
@@ -1001,12 +1028,15 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
1001
  if task == "📋 Markdown" and enable_equation_zoom:
1002
  refs.extend(_refine_equation_refs(image, result))
1003
  if refs:
1004
- img_out, crops = draw_bounding_boxes(image, refs, True)
 
1005
  synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
1006
  if synthetic:
1007
  result_for_layout = result + "\n" + "\n".join(synthetic)
1008
-
1009
- markdown = embed_images(markdown, crops)
 
 
1010
 
1011
  return cleaned, markdown, result_for_layout, img_out, crops
1012
 
 
242
  img_draw.paste(overlay, (0, 0), overlay)
243
  return img_draw, crops
244
 
245
+ def _extract_labeled_crops_from_refs(image, refs, max_items=24):
246
+ img_w, img_h = image.size
247
+ items = []
248
+ seen = set()
249
+
250
+ for ref in refs:
251
+ label = str(ref[1])
252
+ coords = _parse_coord_payload(ref[2])
253
+ for box in coords:
254
+ x1 = int(box[0] / 999.0 * img_w)
255
+ y1 = int(box[1] / 999.0 * img_h)
256
+ x2 = int(box[2] / 999.0 * img_w)
257
+ y2 = int(box[3] / 999.0 * img_h)
258
+ if x2 - x1 < 8 or y2 - y1 < 8:
259
+ continue
260
+ key = (label.lower(), x1, y1, x2, y2)
261
+ if key in seen:
262
+ continue
263
+ seen.add(key)
264
+ crop = image.crop((x1, y1, x2, y2))
265
+ caption = f"{label} ({crop.width}x{crop.height})"
266
+ items.append((crop, caption))
267
+ if len(items) >= max_items:
268
+ return items
269
+ return items
270
+
271
  def clean_output(text, include_images=False):
272
  if not text:
273
  return ""
 
1020
 
1021
  img_out = None
1022
  crops = []
1023
+ figure_crops = []
1024
  result_for_layout = result
1025
 
1026
  if has_grounding and '<|ref|>' in result:
 
1028
  if task == "📋 Markdown" and enable_equation_zoom:
1029
  refs.extend(_refine_equation_refs(image, result))
1030
  if refs:
1031
+ img_out, figure_crops = draw_bounding_boxes(image, refs, True)
1032
+ crops = _extract_labeled_crops_from_refs(image, refs)
1033
  synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
1034
  if synthetic:
1035
  result_for_layout = result + "\n" + "\n".join(synthetic)
1036
+
1037
+ markdown = embed_images(markdown, figure_crops)
1038
+ if not crops and figure_crops:
1039
+ crops = _label_gallery_items(figure_crops, prefix="Figure")
1040
 
1041
  return cleaned, markdown, result_for_layout, img_out, crops
1042