Spaces:
Sleeping
Sleeping
File size: 2,509 Bytes
e158d2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
"""
Extract embedded images from PDF files
"""
import fitz # PyMuPDF
import base64
from PIL import Image
import io
from typing import List, Dict
def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """
    Extract all embedded images from a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list with one dict per successfully extracted image, containing:
        "page" (1-based page number), "format" (extension reported by
        PyMuPDF), "width" and "height" (as read by PIL) and "data" (the
        image encoded as a base64 data URI). An empty list is returned
        when ``pdf_bytes`` is empty or the document cannot be processed.
    """
    if not pdf_bytes:
        return []

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            images = []
            for page_num, page in enumerate(doc, start=1):
                for img_index, img in enumerate(page.get_images(full=True)):
                    # Best effort: one unreadable image must not abort
                    # the extraction of the remaining ones.
                    try:
                        xref = img[0]
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]

                        # Open with PIL only to read the dimensions; the
                        # context manager releases the image afterwards.
                        with Image.open(io.BytesIO(image_bytes)) as pil_image:
                            width = pil_image.width
                            height = pil_image.height

                        image_b64 = base64.b64encode(image_bytes).decode('utf-8')
                        images.append({
                            "page": page_num,
                            "format": image_ext,
                            "width": width,
                            "height": height,
                            "data": f"data:image/{image_ext};base64,{image_b64}"
                        })
                    except Exception as e:
                        print(f"⚠ Failed to extract image {img_index} from page {page_num}: {e}")
        finally:
            # Close the document even when iteration fails part-way,
            # so the underlying resources are never leaked.
            doc.close()

        print(f"✓ Extracted {len(images)} images from PDF")
        return images
    except Exception as e:
        print(f"✗ Image extraction error: {e}")
        return []
def create_thumbnail(image_bytes: bytes, size: tuple = (200, 200)) -> str:
    """
    Create a JPEG thumbnail of an image, encoded as a base64 data URI.

    Args:
        image_bytes: Raw bytes of the source image in any format PIL can open.
        size: Maximum (width, height) of the thumbnail; the aspect ratio
            is preserved by ``Image.thumbnail``.

    Returns:
        A ``data:image/jpeg;base64,...`` string, or an empty string when
        the thumbnail could not be created.
    """
    try:
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.thumbnail(size, Image.Resampling.LANCZOS)

            # JPEG cannot store alpha or palette modes (RGBA, LA, P, ...);
            # without this conversion saving raises and every such image
            # would silently yield an empty thumbnail.
            if image.mode not in ("RGB", "L"):
                image = image.convert("RGB")

            buffered = io.BytesIO()
            image.save(buffered, format="JPEG", quality=85)

        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
        return f"data:image/jpeg;base64,{img_str}"
    except Exception as e:
        print(f"✗ Thumbnail creation failed: {e}")
        return ""
|