File size: 1,184 Bytes
f62c57e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import fitz  # PyMuPDF
import os
import json
import pytesseract
from PIL import Image
import io

def extract_text_and_images(pdf_path: str, save_dir: str = "poster/pdf_result") -> str:
    """Extract per-page text and embedded images from a PDF into ``save_dir``.

    For every page, the page text is collected and each image referenced by
    the page is written to ``save_dir`` as ``page{P}_img{I}.{ext}`` (``P`` and
    ``I`` are 1-based). A ``layout.json`` file describing all pages is then
    written to the same directory.

    Args:
        pdf_path: Path of the PDF document to open.
        save_dir: Output directory; created if it does not exist.

    Returns:
        Path of the written ``layout.json``. The file holds a list with one
        entry per page: ``{"page": int, "text": str, "images": [path, ...]}``.
    """
    os.makedirs(save_dir, exist_ok=True)
    result = []

    # The context manager closes the document even if extraction fails
    # part-way through, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_data = {"page": page_number, "text": page.get_text(), "images": []}

            for img_number, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first item of each image entry is its xref
                base_image = doc.extract_image(xref)
                image_ext = base_image["ext"]

                img_path = os.path.join(
                    save_dir, f"page{page_number}_img{img_number}.{image_ext}"
                )
                # Write the extracted bytes as-is: they are already encoded in
                # the format named by ``ext``, so decoding and re-encoding
                # would only recompress lossy formats or fail on formats the
                # imaging library cannot write.
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
                page_data["images"].append(img_path)

            result.append(page_data)

    # Save structured output
    json_path = os.path.join(save_dir, "layout.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return json_path