import io
import json
import os

import fitz  # PyMuPDF
import pytesseract
from PIL import Image


def extract_text_and_images(pdf_path: str, save_dir: str = "poster/pdf_result") -> str:
    """Extract per-page text and embedded images from a PDF.

    For every page, the page text is collected and each image reported by
    ``page.get_images(full=True)`` is written to *save_dir* as
    ``page<P>_img<I>.<ext>`` (both numbers are 1-based). A summary of all
    pages is then written to ``layout.json`` inside *save_dir*.

    Args:
        pdf_path: Path of the PDF document to open.
        save_dir: Directory that receives the images and ``layout.json``;
            it is created if it does not exist.

    Returns:
        The path of the written ``layout.json``. The file holds a list with
        one ``{"page": int, "text": str, "images": [path, ...]}`` entry per
        page.
    """
    os.makedirs(save_dir, exist_ok=True)

    result = []
    # Open the document as a context manager so the underlying file handle
    # is released even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()
            image_list = page.get_images(full=True)
            page_data = {"page": page_number, "text": text, "images": []}

            for img_number, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each image entry is its xref
                base_image = doc.extract_image(xref)
                image_ext = base_image["ext"]
                img_path = os.path.join(
                    save_dir, f"page{page_number}_img{img_number}.{image_ext}"
                )
                # The extracted bytes are already a complete image file in
                # the `ext` format; writing them verbatim avoids a lossy
                # decode/re-encode round trip.
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
                page_data["images"].append(img_path)

            result.append(page_data)

    # Save structured output
    json_path = os.path.join(save_dir, "layout.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return json_path