P2P / pdf_extract.py
SFEREWQW's picture
Create pdf_extract.py
f62c57e verified
import fitz # PyMuPDF
import os
import json
import pytesseract
from PIL import Image
import io
def extract_text_and_images(pdf_path, save_dir="poster/pdf_result"):
    """Extract the text and embedded images of every page of a PDF.

    Each embedded image is written to ``save_dir`` as
    ``page<P>_img<I>.<ext>`` (both numbers 1-based), and a ``layout.json``
    file describing all pages is written next to the images.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        save_dir: Directory receiving the images and ``layout.json``;
            created if it does not exist.

    Returns:
        The path of the written ``layout.json``. The file holds a list with
        one ``{"page": int, "text": str, "images": [path, ...]}`` entry per
        page, in document order.
    """
    os.makedirs(save_dir, exist_ok=True)
    result = []

    # The context manager closes the document even when extraction fails
    # part-way, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_data = {
                "page": page_number,
                "text": page.get_text(),
                "images": [],
            }
            image_list = page.get_images(full=True)
            for img_number, img in enumerate(image_list, start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                img_path = os.path.join(
                    save_dir, f"page{page_number}_img{img_number}.{image_ext}"
                )
                try:
                    with Image.open(io.BytesIO(image_bytes)) as image:
                        image.save(img_path)
                except (OSError, ValueError):
                    # Pillow cannot decode or re-encode this format; keep the
                    # bytes exactly as extracted instead of aborting the run.
                    with open(img_path, "wb") as raw_file:
                        raw_file.write(image_bytes)
                page_data["images"].append(img_path)
            result.append(page_data)

    # Save structured output
    json_path = os.path.join(save_dir, "layout.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    return json_path