# File size: 2,244 Bytes (revision d81169f)
import fitz
import base64
import io
from PIL import Image
from typing import List, Dict
# Size thresholds applied by extract_images_from_pdf: an embedded image whose
# width is below MIN_WIDTH or whose height is below MIN_HEIGHT is skipped.
MIN_WIDTH = 50
MIN_HEIGHT = 50
def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """Extract the embedded images of a PDF as base64 data URIs.

    Each image object (identified by its xref) is returned at most once,
    attributed to the first page that references it. Images narrower than
    ``MIN_WIDTH`` or shorter than ``MIN_HEIGHT`` are skipped.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        A list of dicts with the keys ``page`` (1-based page number),
        ``format`` (lower-cased extension reported by the PDF library),
        ``width``, ``height`` and ``data`` (a ``data:image/...;base64,``
        URI). The list is empty when the input is empty or the document
        cannot be processed; errors are printed rather than raised.
    """
    if not pdf_bytes:
        return []

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        images = []
        seen = set()
        try:
            for page_num, page in enumerate(doc, start=1):
                for img in page.get_images(full=True):
                    xref = img[0]
                    # The same image object can be referenced from several
                    # pages; emit it only once.
                    if xref in seen:
                        continue
                    seen.add(xref)
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"].lower()
                        # The image is opened only to measure it; close it
                        # right away instead of waiting for garbage collection.
                        with Image.open(io.BytesIO(image_bytes)) as pil_image:
                            width, height = pil_image.size
                        if width < MIN_WIDTH or height < MIN_HEIGHT:
                            continue
                        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
                        images.append({
                            "page": page_num,
                            "format": image_ext,
                            "width": width,
                            "height": height,
                            "data": f"data:image/{image_ext};base64,{image_b64}",
                        })
                    except Exception as e:
                        # Best effort: one unreadable image must not abort
                        # the extraction of the remaining ones.
                        print(f"Failed to extract image xref={xref} page={page_num}: {e}")
        finally:
            # Release the document even when a page-level call fails.
            doc.close()
        return images
    except Exception as e:
        print(f"Image extraction error: {e}")
        return []
def extract_images_from_image_file(file_bytes: bytes, ext: str) -> List[Dict]:
    """Wrap a standalone image file in the same record shape as PDF images.

    Args:
        file_bytes: Raw content of the image file.
        ext: File extension naming the image format. It is lower-cased and
            any leading dots are removed, so ``"png"``, ``"PNG"`` and
            ``".png"`` all produce a ``data:image/png`` URI.

    Returns:
        A single-element list holding a dict with the keys ``page``
        (always 1), ``format``, ``width``, ``height`` and ``data`` (a
        ``data:image/...;base64,`` URI). The list is empty when the input
        is empty or cannot be read as an image; errors are printed rather
        than raised.
    """
    if not file_bytes:
        return []

    try:
        image_format = ext.lower().lstrip(".")
        # The image is opened only to measure it; close it right away.
        with Image.open(io.BytesIO(file_bytes)) as pil_image:
            width, height = pil_image.size
        image_b64 = base64.b64encode(file_bytes).decode("utf-8")
        return [{
            "page": 1,
            "format": image_format,
            "width": width,
            "height": height,
            "data": f"data:image/{image_format};base64,{image_b64}",
        }]
    except Exception as e:
        print(f"Image file read error: {e}")
        return []