Update charcnn_bylstm.py
Browse files
- charcnn_bylstm.py +10 -7
charcnn_bylstm.py
CHANGED
|
@@ -111,14 +111,17 @@ def extract_pdf_pages(path: str):
|
|
| 111 |
images = []
|
| 112 |
for img_info in page.get_images(full=True):
|
| 113 |
xref = img_info[0]
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
| 119 |
images.append({"bbox": (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1), "image": img})
|
| 120 |
-
|
| 121 |
-
|
|
|
|
| 122 |
|
| 123 |
# OCR fallback if text is too little
|
| 124 |
total_chars = sum(len(b["text"]) for b in text_blocks)
|
|
|
|
| 111 |
images = []
|
| 112 |
for img_info in page.get_images(full=True):
|
| 113 |
xref = img_info[0]
|
| 114 |
+
|
| 115 |
+
base_image = doc.extract_image(xref)
|
| 116 |
+
img_bytes = base_image["image"]
|
| 117 |
+
img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
|
| 118 |
+
rects = page.get_image_rects(img_info)
|
| 119 |
+
if rects:
|
| 120 |
+
img_rect = rects[0]
|
| 121 |
images.append({"bbox": (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1), "image": img})
|
| 122 |
+
else:
|
| 123 |
+
# no bounding box available; skip image or assign a dummy one
|
| 124 |
+
print(f"⚠️ Warning: No image bbox found for xref {xref} on page {pno+1}")
|
| 125 |
|
| 126 |
# OCR fallback if text is too little
|
| 127 |
total_chars = sum(len(b["text"]) for b in text_blocks)
|