|
|
import fitz |
|
|
import os |
|
|
import json |
|
|
import pytesseract |
|
|
from PIL import Image |
|
|
import io |
|
|
|
|
|
def extract_text_and_images(pdf_path, save_dir="poster/pdf_result"):
    """Extract per-page text and embedded images from a PDF.

    For every page of the document, the page text is collected and each
    embedded image is written to ``save_dir`` as
    ``page<P>_img<I>.<ext>`` (both numbers are 1-based). A summary of all
    pages is then written to ``layout.json`` inside ``save_dir``.

    Args:
        pdf_path: Path of the PDF file to open.
        save_dir: Directory that receives the images and ``layout.json``.
            It is created if it does not exist.

    Returns:
        The path of the written ``layout.json`` file. The JSON content is a
        list with one object per page, each holding the keys ``"page"``
        (1-based page number), ``"text"`` and ``"images"`` (list of saved
        image paths).
    """
    os.makedirs(save_dir, exist_ok=True)

    pages = []
    # Open the document as a context manager so it is closed even when
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_data = {
                "page": page_number,
                "text": page.get_text(),
                "images": [],
            }

            embedded_images = page.get_images(full=True)
            for image_number, image_info in enumerate(embedded_images, start=1):
                xref = image_info[0]  # first item of the entry is the image xref
                extracted = doc.extract_image(xref)

                img_path = os.path.join(
                    save_dir,
                    f"page{page_number}_img{image_number}.{extracted['ext']}",
                )
                # The extracted bytes already form a complete image file in
                # the format named by "ext"; writing them unchanged avoids a
                # lossy decode/re-encode round trip.
                with open(img_path, "wb") as image_file:
                    image_file.write(extracted["image"])

                page_data["images"].append(img_path)

            pages.append(page_data)

    json_path = os.path.join(save_dir, "layout.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)

    return json_path
|
|
|