import base64
import io
from typing import List, Dict

import fitz
from PIL import Image


# Minimum pixel dimensions for an embedded PDF image to be kept.
# extract_images_from_pdf skips any image whose width is below MIN_WIDTH or
# whose height is below MIN_HEIGHT (presumably to drop icons, bullets and
# other decorative fragments -- confirm the threshold with the consumers).
MIN_WIDTH = 50
MIN_HEIGHT = 50


def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """Extract the embedded images of a PDF as base64 data URIs.

    Each distinct image object (identified by its xref) is returned once,
    attributed to the first page that references it. Images smaller than
    ``MIN_WIDTH`` x ``MIN_HEIGHT`` are skipped.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A list of dicts with the keys ``page`` (1-based page number),
        ``format`` (lower-cased extension reported by PyMuPDF), ``width``,
        ``height`` and ``data`` (a ``data:image/<format>;base64,...`` URI).
        The list is empty when the input is empty or the document cannot
        be processed.

    This function is best-effort and never raises: a failure on a single
    image is reported and that image is skipped; a failure at document or
    page level is reported and an empty list is returned.
    """
    if not pdf_bytes:
        return []

    try:
        images = []
        seen = set()

        # The context manager closes the document on every exit path,
        # including when a page raises part-way through the loop.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page_num, page in enumerate(doc, start=1):
                for img in page.get_images(full=True):
                    xref = img[0]
                    # The same image object can be referenced from several
                    # pages; emit it only once.
                    if xref in seen:
                        continue
                    seen.add(xref)

                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"].lower()

                        # Pillow is only needed to measure the image, so
                        # release it as soon as the size has been read.
                        with Image.open(io.BytesIO(image_bytes)) as pil_image:
                            width, height = pil_image.size

                        if width < MIN_WIDTH or height < MIN_HEIGHT:
                            continue

                        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
                        images.append({
                            "page": page_num,
                            "format": image_ext,
                            "width": width,
                            "height": height,
                            "data": f"data:image/{image_ext};base64,{image_b64}",
                        })
                    except Exception as e:
                        # One unreadable image must not abort the rest.
                        print(f"Failed to extract image xref={xref} page={page_num}: {e}")
                        continue

        return images

    except Exception as e:
        print(f"Image extraction error: {e}")
        return []


def extract_images_from_image_file(file_bytes: bytes, ext: str) -> List[Dict]:
    """Wrap a standalone image file in the same record shape as PDF images.

    Args:
        file_bytes: Raw content of the image file.
        ext: Image format/extension; it is copied as-is into the ``format``
            field and into the data URI's ``image/<ext>`` media type.

    Returns:
        A single-element list holding a dict with the keys ``page``
        (always 1), ``format``, ``width``, ``height`` and ``data`` (a
        ``data:image/<ext>;base64,...`` URI), or an empty list when the
        input is empty or Pillow cannot read it.

    This function is best-effort and never raises: read failures are
    reported and an empty list is returned.
    """
    # Mirror extract_images_from_pdf: empty input yields no images instead
    # of being reported as a read error.
    if not file_bytes:
        return []

    try:
        # Pillow is only needed to measure the image, so release it as soon
        # as the size has been read.
        with Image.open(io.BytesIO(file_bytes)) as pil_image:
            width, height = pil_image.size

        image_b64 = base64.b64encode(file_bytes).decode("utf-8")
        return [{
            "page": 1,
            "format": ext,
            "width": width,
            "height": height,
            "data": f"data:image/{ext};base64,{image_b64}",
        }]
    except Exception as e:
        print(f"Image file read error: {e}")
        return []