tiiuae
/

Falcon-OCR

@@ -777,6 +777,12 @@ class FalconOCRForCausalLM(PreTrainedModel):
         crop_origins: list[tuple[int, int]] = []  # (image_idx, det_idx)
         for img_idx, (pil_img, dets) in enumerate(zip(pil_images, all_layout_dets)):
             img_w, img_h = pil_img.size
             for det_idx, det in enumerate(dets):
                 cat_key = det["category"].strip().lower()
@@ -819,12 +825,21 @@ class FalconOCRForCausalLM(PreTrainedModel):
         # --- Reassemble per-image results ---
         results: list[list[dict]] = [[] for _ in range(len(pil_images))]
         for (img_idx, det_idx), text in zip(crop_origins, flat_texts):
-            det = all_layout_dets[img_idx][det_idx]
-            results[img_idx].append({
-                "category": det["category"],
-                "bbox": det["bbox"],
-                "score": det["score"],
-                "text": text,
-            })
         return results

         crop_origins: list[tuple[int, int]] = []  # (image_idx, det_idx)
         for img_idx, (pil_img, dets) in enumerate(zip(pil_images, all_layout_dets)):
+            if not dets or (len(dets) == 1 and dets[0]["category"].strip().lower() == "image"):
+                prompt = f"<|image|>{CATEGORY_PROMPTS['plain']}\n<|OCR_PLAIN|>"
+                flat_crops.append((pil_img, prompt))
+                crop_origins.append((img_idx, -1))
+                continue
             img_w, img_h = pil_img.size
             for det_idx, det in enumerate(dets):
                 cat_key = det["category"].strip().lower()
         # --- Reassemble per-image results ---
         results: list[list[dict]] = [[] for _ in range(len(pil_images))]
         for (img_idx, det_idx), text in zip(crop_origins, flat_texts):
+            if det_idx == -1:
+                img_w, img_h = pil_images[img_idx].size
+                results[img_idx].append({
+                    "category": "plain",
+                    "bbox": [0, 0, img_w, img_h],
+                    "score": 1.0,
+                    "text": text,
+                })
+            else:
+                det = all_layout_dets[img_idx][det_idx]
+                results[img_idx].append({
+                    "category": det["category"],
+                    "bbox": det["bbox"],
+                    "score": det["score"],
+                    "text": text,
+                })
         return results