File size: 1,102 Bytes
b752d16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import fitz  # PyMuPDF
import os
from PIL import Image

def convert_pdf_to_images(pdf_path, output_dir, max_pages=1):
    """Render the leading pages of a PDF to PNG files.

    Each rendered page is written to
    ``<output_dir>/<pdf file name without extension>_page_<n>.png`` where
    ``n`` is the 1-based page number.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Directory that receives the PNG files; created if it
            does not exist yet.
        max_pages: Maximum number of pages to render, counted from the first
            page. Defaults to 1 (first page only, to save time/memory while
            testing). Pass ``None`` to render every page.

    Returns:
        A list with the paths of the PNG files that were written, in page
        order. Empty when the document has no pages or ``max_pages`` <= 0.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # splitext strips only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" instead of colliding with "report.v1.pdf" on "report".
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    image_paths = []

    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        if max_pages is not None:
            page_count = min(max_pages, page_count)

        for i in range(page_count):
            page = doc.load_page(i)
            # 2x zoom for better OCR; alpha=False keeps the sample buffer at
            # three bytes per pixel so it matches the "RGB" mode used below.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            output_file = os.path.join(output_dir, f"{base_name}_page_{i+1}.png")
            # Convert to PIL Image
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            img.save(output_file)
            image_paths.append(output_file)
            print(f"Converted {pdf_path} page {i+1} to {output_file}")
    finally:
        # Release the document even if rendering or saving raised.
        doc.close()

    return image_paths

if __name__ == "__main__":
    # Convert every PDF found directly inside pdf_dir, writing the PNGs to
    # output_dir.
    pdf_dir = "doc_for_testing"
    output_dir = "doc_images"
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(pdf_dir)):
        # Compare case-insensitively so "SCAN.PDF" is not silently skipped.
        if filename.lower().endswith(".pdf"):
            convert_pdf_to_images(os.path.join(pdf_dir, filename), output_dir)