import os

import fitz  # PyMuPDF


def extract_images_from_pdf(input_pdf: str, output_dir: str = "extracted_images"):
    """
    Extract all images from a PDF and save them as individual image files.

    Each image is written to ``output_dir`` as ``page<P>_img<I>.<ext>``, where
    ``P`` is the 1-based page number, ``I`` is the 1-based position of the
    image in that page's image list, and ``ext`` is the extension reported by
    PyMuPDF for the image. An existing file with the same name is overwritten.
    Progress messages and a final summary are printed to stdout.

    Args:
        input_pdf (str): Path to the PDF file.
        output_dir (str): Directory to save extracted images; it is created
            if it does not exist. Default is 'extracted_images'.

    Returns:
        List of saved image file paths, in the order they were written.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    saved_images = []
    print(f"Extracting images from: {input_pdf}")

    # The context manager closes the document even if extraction or a file
    # write raises part-way through.
    with fitz.open(input_pdf) as pdf:
        for page_num, page in enumerate(pdf, start=1):
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                # The first item of each image entry is its xref, which is
                # what extract_image() needs.
                xref = img[0]
                base_image = pdf.extract_image(xref)

                image_filename = f"page{page_num}_img{img_index}.{base_image['ext']}"
                output_path = os.path.join(output_dir, image_filename)

                with open(output_path, "wb") as img_file:
                    img_file.write(base_image["image"])

                saved_images.append(output_path)
                print(f"Saved: {output_path}")

    if saved_images:
        print(f"✅ Extracted {len(saved_images)} images to: {output_dir}")
    else:
        print("⚠️ No images found in the PDF.")

    return saved_images


# Example usage (guarded so that importing this module does not trigger it)
if __name__ == "__main__":
    extract_images_from_pdf("../CaptionCreator/media/Jebin passport.pdf")