deepkansara-123 committed on
Commit
e8a4941
·
verified ·
1 Parent(s): 5f58f84

Update charcnn_bylstm.py

Browse files
Files changed (1) hide show
  1. charcnn_bylstm.py +10 -7
charcnn_bylstm.py CHANGED
@@ -111,14 +111,17 @@ def extract_pdf_pages(path: str):
111
  images = []
112
  for img_info in page.get_images(full=True):
113
  xref = img_info[0]
114
- try:
115
- base_image = doc.extract_image(xref)
116
- img_bytes = base_image["image"]
117
- img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
118
- img_rect = page.get_image_bbox(img_info)
 
 
119
  images.append({"bbox": (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1), "image": img})
120
- except Exception as e:
121
- print(f"Warning: Could not extract image {xref} on page {pno+1}. Error: {e}")
 
122
 
123
  # OCR fallback if text is too little
124
  total_chars = sum(len(b["text"]) for b in text_blocks)
 
111
  images = []
112
  for img_info in page.get_images(full=True):
113
  xref = img_info[0]
114
+
115
+ base_image = doc.extract_image(xref)
116
+ img_bytes = base_image["image"]
117
+ img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
118
+ rects = page.get_image_rects(img_info)
119
+ if rects:
120
+ img_rect = rects[0]
121
  images.append({"bbox": (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1), "image": img})
122
+ else:
123
+ # no bounding box available; skip image or assign a dummy one
124
+ print(f"⚠️ Warning: No image bbox found for xref {xref} on page {pno+1}")
125
 
126
  # OCR fallback if text is too little
127
  total_chars = sum(len(b["text"]) for b in text_blocks)