Spaces:
Running
on
Zero
Running
on
Zero
| import fitz # PyMuPDF | |
| import os | |
| from PIL import Image | |
def convert_full_pdf_to_images(pdf_path, output_dir, zoom=2.0):
    """Render every page of a PDF to a PNG file and return the file paths.

    Each page is rasterised with PyMuPDF, converted to a PIL image and saved
    as ``<pdf name without extension>_page_<n>.png`` (1-based ``n``) inside
    ``output_dir``. Progress is reported on stdout.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the PNG files; created if missing.
        zoom: Scale factor applied on both axes when rendering. The default
            of 2.0 renders at twice the base resolution for better OCR.

    Returns:
        A list with the path of every PNG written, in page order.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # splitext strips only the final extension, so "report.v2.pdf" keeps
    # "report.v2" instead of being truncated at the first dot.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    matrix = fitz.Matrix(zoom, zoom)
    image_paths = []

    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        print(f"Converting all {page_count} pages of {pdf_path}...")

        for page_number, page in enumerate(doc, start=1):
            # alpha=False keeps the buffer at 3 bytes per pixel, which is
            # what the "RGB" mode passed to PIL below requires.
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            output_file = os.path.join(
                output_dir, f"{base_name}_page_{page_number}.png"
            )

            # Convert to PIL Image
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            img.save(output_file)

            image_paths.append(output_file)
            print(f"Converted page {page_number}/{page_count}")

    return image_paths
if __name__ == "__main__":
    # Script entry point: convert the bundled sample PDF when it is present.
    output_dir = "doc_images_full"
    pdf_file = "doc_for_testing/pdf12_un.pdf"

    if not os.path.exists(pdf_file):
        # Nothing to convert; report the missing input and finish.
        print(f"File {pdf_file} not found.")
    else:
        convert_full_pdf_to_images(pdf_file, output_dir)