Spaces:

deepkansara-123
/

extrect_mcqs_from_pdf

Running

deepkansara-123 committed on Oct 29, 2025

Commit

e8a4941

verified ·

1 Parent(s): 5f58f84

Update charcnn_bylstm.py

Files changed (1) hide show

charcnn_bylstm.py CHANGED Viewed

@@ -111,14 +111,17 @@ def extract_pdf_pages(path: str):
         images = []
         for img_info in page.get_images(full=True):
             xref = img_info[0]
-            try:
-                base_image = doc.extract_image(xref)
-                img_bytes = base_image["image"]
-                img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-                img_rect = page.get_image_bbox(img_info)
                 images.append({"bbox": (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1), "image": img})
-            except Exception as e:
-                print(f"Warning: Could not extract image {xref} on page {pno+1}. Error: {e}")
         # OCR fallback if text is too little
         total_chars = sum(len(b["text"]) for b in text_blocks)

         images = []
         for img_info in page.get_images(full=True):
             xref = img_info[0]
+            base_image = doc.extract_image(xref)
+            img_bytes = base_image["image"]
+            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+            rects = page.get_image_rects(img_info)
+            if rects:
+                img_rect = rects[0]
                 images.append({"bbox": (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1), "image": img})
+            else:
+                # no bounding box available; skip image or assign a dummy one
+                print(f"⚠️ Warning: No image bbox found for xref {xref} on page {pno+1}")
         # OCR fallback if text is too little
         total_chars = sum(len(b["text"]) for b in text_blocks)