"""
Extract embedded images from PDF files
"""

import base64
import io
from typing import Dict, List

import fitz  # PyMuPDF
from PIL import Image

# Image modes that Pillow's JPEG writer accepts as-is. Anything else
# (e.g. RGBA, LA, P) must be converted before saving as JPEG.
_JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _build_image_entry(doc, xref: int, page_number: int) -> Dict:
    """Return the result dictionary for the image stored at *xref* in *doc*."""
    base_image = doc.extract_image(xref)
    image_bytes = base_image["image"]
    image_ext = base_image["ext"]

    # Decode with Pillow to read the dimensions; an image Pillow cannot open
    # raises here and is skipped by the caller.
    with Image.open(io.BytesIO(image_bytes)) as pil_image:
        width = pil_image.width
        height = pil_image.height

    image_b64 = base64.b64encode(image_bytes).decode('utf-8')
    return {
        "page": page_number,
        "format": image_ext,
        "width": width,
        "height": height,
        "data": f"data:image/{image_ext};base64,{image_b64}",
    }


def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """
    Extract all embedded images from a PDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A list with one dictionary per extracted image, holding the keys
        "page" (1-based page number), "format" (extension reported by
        PyMuPDF), "width", "height" and "data" (a base64 data URI).
        Returns an empty list when *pdf_bytes* is empty or the document
        cannot be processed. Images that fail individually are reported on
        stdout and skipped.
    """
    if not pdf_bytes:
        return []

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            images = []
            for page_number, page in enumerate(doc, start=1):
                for img_index, img in enumerate(page.get_images(full=True)):
                    try:
                        # The first item of each image tuple is its xref.
                        images.append(
                            _build_image_entry(doc, img[0], page_number)
                        )
                    except Exception as e:
                        # Best effort: one bad image must not abort the rest.
                        print(f"⚠ Failed to extract image {img_index} from page {page_number}: {e}")
        finally:
            # Release the document even when a page fails mid-iteration.
            doc.close()

        print(f"✓ Extracted {len(images)} images from PDF")
        return images

    except Exception as e:
        print(f"✗ Image extraction error: {e}")
        return []


def create_thumbnail(image_bytes: bytes, size: tuple = (200, 200)) -> str:
    """
    Create a JPEG thumbnail of an image as a base64 data URI.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.
        size: Maximum (width, height) of the thumbnail; the aspect ratio
            is preserved.

    Returns:
        A "data:image/jpeg;base64,..." string, or an empty string when the
        thumbnail cannot be created (the failure is reported on stdout).
    """
    try:
        with Image.open(io.BytesIO(image_bytes)) as source:
            # JPEG cannot store alpha or palette data; convert such images
            # to RGB instead of failing on save. Transparency is discarded.
            if source.mode in _JPEG_WRITABLE_MODES:
                thumb = source
            else:
                thumb = source.convert("RGB")

            thumb.thumbnail(size, Image.Resampling.LANCZOS)

            buffered = io.BytesIO()
            thumb.save(buffered, format="JPEG", quality=85)

        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
        return f"data:image/jpeg;base64,{img_str}"

    except Exception as e:
        print(f"✗ Thumbnail creation failed: {e}")
        return ""