Spaces:
Sleeping
Sleeping
| """ | |
| Extract embedded images from PDF files | |
| """ | |
| import fitz # PyMuPDF | |
| import base64 | |
| from PIL import Image | |
| import io | |
| from typing import List, Dict | |
def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """
    Extract all embedded images from a PDF.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        One dictionary per image reference found on each page, with keys
        "page" (1-based page number), "format" (extension reported by
        PyMuPDF), "width", "height" and "data" (base64 data URI).
        An empty list is returned when the input is empty or when the
        document as a whole cannot be processed. Images that fail
        individually are reported and skipped.
    """
    if not pdf_bytes:
        return []

    try:
        images = []
        # Context manager guarantees the document is closed even when a
        # page-level call raises; a trailing close() would be skipped then.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page_num, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list):
                    try:
                        xref = img[0]  # first item of each entry is the xref
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]

                        # Open only to read the dimensions, then release it
                        with Image.open(io.BytesIO(image_bytes)) as pil_image:
                            width, height = pil_image.size

                        # Convert to base64
                        image_b64 = base64.b64encode(image_bytes).decode('utf-8')
                        images.append({
                            "page": page_num,
                            "format": image_ext,
                            "width": width,
                            "height": height,
                            "data": f"data:image/{image_ext};base64,{image_b64}",
                        })
                    except Exception as e:
                        # Best effort: one bad image must not abort the rest
                        print(f"⚠ Failed to extract image {img_index} from page {page_num}: {e}")
                        continue

        print(f"✓ Extracted {len(images)} images from PDF")
        return images
    except Exception as e:
        print(f"✗ Image extraction error: {e}")
        return []
def create_thumbnail(image_bytes: bytes, size: tuple = (200, 200)) -> str:
    """
    Create a JPEG thumbnail of an image as a base64 data URI.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.
        size: Maximum (width, height) of the thumbnail; the aspect ratio
            is preserved.

    Returns:
        A "data:image/jpeg;base64,..." string, or "" when the input is
        empty or the thumbnail cannot be produced.
    """
    if not image_bytes:
        return ""

    try:
        with Image.open(io.BytesIO(image_bytes)) as source:
            image = source
            has_alpha = image.mode in ("RGBA", "LA") or (
                image.mode == "P" and "transparency" in image.info
            )
            if has_alpha:
                # JPEG has no alpha channel; flatten onto white so that
                # transparent areas do not make the save fail or turn black.
                rgba = image.convert("RGBA")
                flattened = Image.new("RGB", rgba.size, (255, 255, 255))
                flattened.paste(rgba, mask=rgba.getchannel("A"))
                image = flattened
            elif image.mode not in ("RGB", "L"):
                # Palette, CMYK, 16-bit, etc. -> a mode JPEG can store
                image = image.convert("RGB")

            image.thumbnail(size, Image.Resampling.LANCZOS)

            buffered = io.BytesIO()
            image.save(buffered, format="JPEG", quality=85)

        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
        return f"data:image/jpeg;base64,{img_str}"
    except Exception as e:
        print(f"✗ Thumbnail creation failed: {e}")
        return ""