qp-parser / app /image_extractor.py
MakPr016
QP Parser
d81169f
import fitz
import base64
import io
from PIL import Image
from typing import List, Dict
# Minimum dimensions an embedded PDF image must have to be reported by
# extract_images_from_pdf; anything narrower than MIN_WIDTH or shorter than
# MIN_HEIGHT is skipped (presumably to drop icons, bullets and other
# decorative artwork - confirm with the callers).
MIN_WIDTH = 50
MIN_HEIGHT = 50
def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """Extract the images embedded in a PDF as base64 data URIs.

    Every distinct image object (identified by its PDF xref) is reported
    once, attributed to the first page that references it. Images narrower
    than ``MIN_WIDTH`` or shorter than ``MIN_HEIGHT`` are skipped.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A list of dicts with the keys ``page`` (1-based page number),
        ``format`` (lower-cased extension reported by the PDF library),
        ``width``, ``height`` and ``data`` (a
        ``data:image/<format>;base64,...`` URI). The list is empty when
        *pdf_bytes* is empty or the document cannot be processed; failures
        are printed and never raised to the caller.
    """
    if not pdf_bytes:
        return []

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        images = []
        seen = set()
        try:
            for page_num, page in enumerate(doc, start=1):
                for img in page.get_images(full=True):
                    xref = img[0]
                    # The same image object can be referenced from several
                    # pages; only its first occurrence is reported.
                    if xref in seen:
                        continue
                    seen.add(xref)
                    try:
                        record = _pdf_image_record(doc, xref, page_num)
                    except Exception as e:
                        # Best effort: one broken image must not abort the
                        # extraction of the remaining ones.
                        print(f"Failed to extract image xref={xref} page={page_num}: {e}")
                        continue
                    if record is not None:
                        images.append(record)
        finally:
            # Release the document even when iterating its pages fails.
            doc.close()
        return images
    except Exception as e:
        print(f"Image extraction error: {e}")
        return []


def _pdf_image_record(doc, xref, page_num):
    """Build the result dict for one embedded image, or None if it is too small."""
    base_image = doc.extract_image(xref)
    image_bytes = base_image["image"]
    image_ext = base_image["ext"].lower()

    # Only the dimensions are needed, so close the PIL handle right away.
    with Image.open(io.BytesIO(image_bytes)) as pil_image:
        width, height = pil_image.size

    if width < MIN_WIDTH or height < MIN_HEIGHT:
        return None

    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    return {
        "page": page_num,
        "format": image_ext,
        "width": width,
        "height": height,
        "data": f"data:image/{image_ext};base64,{image_b64}",
    }
def extract_images_from_image_file(file_bytes: bytes, ext: str) -> List[Dict]:
    """Wrap a standalone image file in the same record format as PDF images.

    Args:
        file_bytes: Raw content of the image file.
        ext: File extension of the image (e.g. ``"png"``, ``"jpg"``). It is
            returned unchanged under ``format``; for the data URI it is
            lower-cased, stripped of a leading dot and mapped to its
            registered MIME subtype (``jpg`` -> ``jpeg``, ``tif`` -> ``tiff``).

    Returns:
        A single-element list holding a dict with the keys ``page`` (always
        1), ``format``, ``width``, ``height`` and ``data`` (a
        ``data:image/<subtype>;base64,...`` URI), or an empty list when
        *file_bytes* is empty or cannot be read as an image. Failures are
        printed and never raised to the caller.
    """
    if not file_bytes:
        return []

    try:
        # Only the dimensions are needed, so close the PIL handle right away.
        with Image.open(io.BytesIO(file_bytes)) as pil_image:
            width, height = pil_image.size

        # "image/jpg" and "image/tif" are not registered media types, and a
        # raw extension may carry a leading dot or upper-case letters.
        subtype = ext.lower().lstrip(".")
        subtype = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff"}.get(subtype, subtype)

        image_b64 = base64.b64encode(file_bytes).decode("utf-8")
        return [{
            "page": 1,
            "format": ext,
            "width": width,
            "height": height,
            "data": f"data:image/{subtype};base64,{image_b64}",
        }]
    except Exception as e:
        print(f"Image file read error: {e}")
        return []