Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,102 Bytes
b752d16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
import fitz # PyMuPDF
import os
from PIL import Image
def convert_pdf_to_images(pdf_path, output_dir, max_pages=1):
    """Render the leading pages of a PDF to PNG files.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        output_dir: Directory that receives the PNG files; created
            (including parents) when it does not exist yet.
        max_pages: Upper bound on how many pages, counted from the first,
            are rendered. Defaults to 1 so test runs stay cheap in time and
            memory. Pass ``None`` to render every page.

    Returns:
        The paths of the PNG files written, in page order. Each file is
        named ``<pdf name without extension>_page_<1-based page no>.png``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # splitext strips only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" instead of collapsing to "report" (which would make
    # differently-versioned files overwrite each other's images).
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    image_paths = []

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        limit = total_pages if max_pages is None else min(max_pages, total_pages)
        for i in range(limit):
            page = doc.load_page(i)
            # 2x zoom for better OCR; alpha=False keeps the sample buffer at
            # three bytes per pixel, matching the "RGB" mode used below.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            output_file = os.path.join(output_dir, f"{base_name}_page_{i+1}.png")

            # Convert to PIL Image
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            img.save(output_file)
            image_paths.append(output_file)
            print(f"Converted {pdf_path} page {i+1} to {output_file}")
    finally:
        # Release the document even if rendering or saving raised.
        doc.close()

    return image_paths
if __name__ == "__main__":
    # Batch-convert every PDF located directly inside pdf_dir.
    pdf_dir = "doc_for_testing"
    output_dir = "doc_images"
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and the printed log) reproducible between runs.
    for filename in sorted(os.listdir(pdf_dir)):
        # Compare the extension case-insensitively so "SCAN.PDF" is not skipped.
        if filename.lower().endswith(".pdf"):
            convert_pdf_to_images(os.path.join(pdf_dir, filename), output_dir)
|