Upload 10 files
Browse files- .gitignore +31 -0
- LICENSE +21 -0
- README.md +89 -0
- STACKS.md +57 -0
- all_page.py +70 -0
- dump/README.md +62 -0
- dump/pdf_to_image.py +38 -0
- dump/requirements.txt +1 -0
- requirements.txt +5 -0
- single_page.py +75 -0
.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environments
|
| 2 |
+
.env
|
| 3 |
+
.venv
|
| 4 |
+
env/
|
| 5 |
+
venv/
|
| 6 |
+
ENV/
|
| 7 |
+
env.bak/
|
| 8 |
+
venv.bak/
|
| 9 |
+
|
| 10 |
+
# Environment Variables
|
| 11 |
+
.env
|
| 12 |
+
.env.local
|
| 13 |
+
|
| 14 |
+
# Python
|
| 15 |
+
__pycache__/
|
| 16 |
+
*.py[cod]
|
| 17 |
+
*$py.class
|
| 18 |
+
*.so
|
| 19 |
+
|
| 20 |
+
# Output directories
|
| 21 |
+
output_images/
|
| 22 |
+
*.png
|
| 23 |
+
*.jpg
|
| 24 |
+
*.jpeg
|
| 25 |
+
|
| 26 |
+
# OS generated files
|
| 27 |
+
.DS_Store
|
| 28 |
+
Thumbs.db
|
| 29 |
+
|
| 30 |
+
PDF/*
|
| 31 |
+
output/*
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Rembrant Oyangoren Albeos
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF to Image (Python)
|
| 2 |
+
|
| 3 |
+
[](https://github.com/not-algorembrant/pdf-to-image-python)
|
| 4 |
+
[](https://github.com/not-algorembrant/pdf-to-image-python)
|
| 5 |
+
[](https://github.com/not-algorembrant/pdf-to-image-python)
|
| 6 |
+
[](https://www.python.org/)
|
| 7 |
+
[](https://en.wikipedia.org/wiki/Markdown)
|
| 8 |
+
|
| 9 |
+
Simply convert PDF files into rendered image pages at high resolution.
|
| 10 |
+
|
| 11 |
+
This project was inspired by and serves as a Python alternative to the PHP package [spatie/pdf-to-image](https://github.com/spatie/pdf-to-image).
|
| 12 |
+
|
| 13 |
+
## System Overview
|
| 14 |
+
|
| 15 |
+
```mermaid
|
| 16 |
+
graph TD
|
| 17 |
+
A[PDF Input] --> B{Process Type}
|
| 18 |
+
B -->|Single| C[single_page.py]
|
| 19 |
+
B -->|Batch| D[all_page.py]
|
| 20 |
+
C --> E[First Page Export]
|
| 21 |
+
D --> F[Full Document Export]
|
| 22 |
+
E --> G[Output Folder]
|
| 23 |
+
F --> G
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
## Project Structure
|
| 27 |
+
|
| 28 |
+
```text
|
| 29 |
+
pdf-to-image-python/
|
| 30 |
+
├── .gitignore # Git ignore rules
|
| 31 |
+
├── PDF/ # Input PDF directory
|
| 32 |
+
├── output/ # Generated images directory (auto-created)
|
| 33 |
+
├── LICENSE # Project license
|
| 34 |
+
├── README.md # Main documentation
|
| 35 |
+
├── STACKS.md # Technical stack audit
|
| 36 |
+
├── all_page.py # Full PDF conversion script
|
| 37 |
+
├── requirements.txt # Dependency list
|
| 38 |
+
└── single_page.py # First page conversion script
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Requirements
|
| 42 |
+
|
| 43 |
+
- Python 3.14+
|
| 44 |
+
- PyMuPDF library
|
| 45 |
+
|
| 46 |
+
## Setup Instructions
|
| 47 |
+
|
| 48 |
+
Make sure your environment is ready before running the tool:
|
| 49 |
+
|
| 50 |
+
1. Create a virtual environment:
|
| 51 |
+
```bash
|
| 52 |
+
python -m venv venv
|
| 53 |
+
```
|
| 54 |
+
2. Activate the virtual environment:
|
| 55 |
+
```bash
|
| 56 |
+
# On Windows
|
| 57 |
+
venv\Scripts\activate
|
| 58 |
+
```
|
| 59 |
+
3. Install the dependencies:
|
| 60 |
+
```bash
|
| 61 |
+
pip install -r requirements.txt
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Usage
|
| 65 |
+
|
| 66 |
+
Basic usage to convert a PDF in the most efficient way:
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
# Convert all pages of a PDF (provide folder or file)
|
| 70 |
+
python all_page.py "PDF_folder" --dpi 300 --format png
|
| 71 |
+
|
| 72 |
+
# Convert only the first page (Cover) for quick previews
|
| 73 |
+
python single_page.py "PDF_folder" --dpi 300 --format png
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
## Citation
|
| 77 |
+
|
| 78 |
+
If you use this tool in your research or project, please cite it as follows:
|
| 79 |
+
|
| 80 |
+
```bibtex
|
| 81 |
+
@misc{pdf_to_image_2026,
|
| 82 |
+
author = {Rembrant Oyangoren Albeos},
|
| 83 |
+
title = {PDF to Image Python Utility},
|
| 84 |
+
year = {2026},
|
| 85 |
+
publisher = {GitHub},
|
| 86 |
+
journal = {GitHub repository},
|
| 87 |
+
howpublished = {\url{https://github.com/not-algorembrant/pdf-to-image-python}},
|
| 88 |
+
}
|
| 89 |
+
```
|
STACKS.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Description
|
| 2 |
+
This project, `pdf-to-image-python`, is a high-performance utility designed to convert PDF documents into high-quality images while perfectly preserving native page proportions. It leverages the PyMuPDF (fitz) engine to handle complex PDF layouts and provides both batch processing for entire documents and single-page extraction (specifically for cover pages). The system dynamically calculates resolution scaling based on target DPI, ensuring crisp output for any source page size.
|
| 3 |
+
|
| 4 |
+
## System Overview
|
| 5 |
+
|
| 6 |
+
```mermaid
|
| 7 |
+
graph TD
|
| 8 |
+
A[PDF Input] --> B{Process Type}
|
| 9 |
+
B -->|Single| C[single_page.py]
|
| 10 |
+
B -->|Batch| D[all_page.py]
|
| 11 |
+
C --> E[First Page Export]
|
| 12 |
+
D --> F[Full Document Export]
|
| 13 |
+
E --> G[Output Folder]
|
| 14 |
+
F --> G
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
## Project Structure
|
| 18 |
+
|
| 19 |
+
```text
|
| 20 |
+
pdf-to-image-python/
|
| 21 |
+
├── .gitignore # Git ignore rules
|
| 22 |
+
├── PDF/ # Input PDF directory
|
| 23 |
+
├── output/ # Generated images directory (auto-created)
|
| 24 |
+
├── LICENSE # Project license
|
| 25 |
+
├── README.md # Main documentation
|
| 26 |
+
├── STACKS.md # Technical stack audit
|
| 27 |
+
├── all_page.py # Full PDF conversion script
|
| 28 |
+
├── requirements.txt # Dependency list
|
| 29 |
+
└── single_page.py # First page conversion script
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## Techstack
|
| 33 |
+
Audit of project files (excluding environment and cache):
|
| 34 |
+
|
| 35 |
+
| File Type | Count | Size (KB) |
|
| 36 |
+
| :--- | :--- | :--- |
|
| 37 |
+
| PDF (.pdf) | 16 | 15152 |
|
| 38 |
+
| PNG (.png) | 17 | 1952 |
|
| 39 |
+
| Python (.py) | 3 | 9.8 |
|
| 40 |
+
| Markdown (.md) | 3 | 4.3 |
|
| 41 |
+
| Text (.txt) | 2 | 0.1 |
|
| 42 |
+
| License | 1 | 1.1 |
|
| 43 |
+
|
| 44 |
+
**Total Files**: 42
|
| 45 |
+
|
| 46 |
+
## Dependencies
|
| 47 |
+
- **Python**:
|
| 48 |
+
- `PyMuPDF` (fitz): Core PDF rendering and processing.
|
| 49 |
+
- `argparse`: Command-line argument parsing.
|
| 50 |
+
- `os`: File system operations.
|
| 51 |
+
- `glob`: Filename pattern matching.
|
| 52 |
+
|
| 53 |
+
## Applications
|
| 54 |
+
- Google Antigravity
|
| 55 |
+
- Google Gemini Pro
|
| 56 |
+
- Visual Studio Code
|
| 57 |
+
- Windows PowerShell
|
all_page.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import glob
|
| 5 |
+
|
| 6 |
+
def convert_pdf_to_images(pdf_path, output_dir, dpi=300, image_format="png"):
    """
    Convert every page of a PDF to an image, maintaining native page proportions.

    Images are written to ``<output_dir>/<PDF file name without extension>/``
    as ``page_NN.<image_format>``, numbered from 1 and zero-padded to at
    least two digits.

    Args:
        pdf_path: Path to the PDF file to render.
        output_dir: Base output directory; a subfolder named after the PDF is
            created inside it.
        dpi: Target resolution; pages are scaled by ``dpi / 72``.
        image_format: File extension used for the saved images.

    Returns:
        None. Any error while creating the folder, opening the document or
        rendering a page is caught and printed, so a batch caller can
        continue with the next file.
    """
    # Create a dedicated subfolder for each PDF's images to keep things organized
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_dir = os.path.join(output_dir, pdf_name)

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(pdf_output_dir, exist_ok=True)

        # The context manager closes the document (and its file handle) even
        # when rendering fails part-way through.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            print(f"Processing '{pdf_name}' | Total pages: {page_count}")

            # PDFs have a standard base resolution of 72 DPI, so this zoom
            # factor scales the native page size up to the requested DPI.
            zoom = dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)  # same scale on both axes keeps proportions

            for page_index in range(page_count):
                page = doc.load_page(page_index)

                # alpha=False renders without a transparency channel
                pix = page.get_pixmap(matrix=mat, alpha=False)

                output_file = os.path.join(
                    pdf_output_dir, f"page_{page_index + 1:02d}.{image_format}"
                )
                pix.save(output_file)
                print(f" -> Saved page {page_index + 1} (Dynamic Resolution: {pix.width}x{pix.height})")

    except Exception as e:
        # Best-effort by design: report the failure and let the caller move on.
        print(f"Error processing {pdf_path}: {e}")
|
| 38 |
+
|
| 39 |
+
def process_all_pdfs(input_path, output_dir, dpi=300, image_format="png"):
    """
    Convert a single PDF file, or every PDF directly inside a directory.

    Args:
        input_path: Path to one PDF file or to a folder containing PDFs.
        output_dir: Base directory passed on to ``convert_pdf_to_images``.
        dpi: Target resolution passed on to ``convert_pdf_to_images``.
        image_format: Image file extension passed on to ``convert_pdf_to_images``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if os.path.isfile(input_path):
        convert_pdf_to_images(input_path, output_dir, dpi, image_format)
        return

    if not os.path.isdir(input_path):
        print("Invalid input path. Please provide a valid PDF file or folder.")
        return

    # glob.escape stops characters such as "[" or "*" in the folder name from
    # being interpreted as wildcards. The extension is compared
    # case-insensitively so "REPORT.PDF" is found on case-sensitive file
    # systems too, and sorting makes the processing order deterministic.
    candidates = glob.glob(os.path.join(glob.escape(input_path), "*"))
    pdf_files = sorted(path for path in candidates if path.lower().endswith(".pdf"))

    if not pdf_files:
        print(f"No PDF files found in directory: {input_path}")
        return

    print(f"Found {len(pdf_files)} PDF(s). Starting batch conversion...\n")
    for pdf in pdf_files:
        convert_pdf_to_images(pdf, output_dir, dpi, image_format)
        print("-" * 40)
    print("All batch conversions complete!")
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options, make sure the base output
    # folder exists, then convert the requested file or folder.
    cli = argparse.ArgumentParser(description="Convert PDFs to auto-adjusting high-quality images.")
    cli.add_argument("input_path", help="Path to a single PDF file OR a folder containing PDFs")
    cli.add_argument("--output", "-o", default="output", help="Output directory (default: 'output')")
    cli.add_argument("--dpi", type=int, default=300, help="Output image DPI (default: 300)")
    cli.add_argument("--format", "-f", default="png", help="Output image format (default: png)")
    options = cli.parse_args()

    # The per-PDF subfolders are created later, underneath this base folder.
    base_output = options.output
    if not os.path.exists(base_output):
        os.makedirs(base_output)

    process_all_pdfs(options.input_path, base_output, options.dpi, options.format)
|
dump/README.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF to Image (Python)
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+

|
| 5 |
+

|
| 6 |
+

|
| 7 |
+
|
| 8 |
+
This is a Python-based utility to convert PDF files into rendered images at high resolution.
|
| 9 |
+
It renders each page at its native dimensions using 300 DPI by default (for example, a US Letter page of 8.5 x 11 inches becomes a 2550 x 3300 pixel image).
|
| 10 |
+
|
| 11 |
+
This project was inspired by and serves as a Python alternative to the PHP package [spatie/pdf-to-image](https://github.com/spatie/pdf-to-image).
|
| 12 |
+
|
| 13 |
+
## Requirements
|
| 14 |
+
|
| 15 |
+
- Python 3.14+
|
| 16 |
+
- PyMuPDF library
|
| 17 |
+
|
| 18 |
+
## Setup Instructions
|
| 19 |
+
|
| 20 |
+
Make sure your environment is ready before running the tool:
|
| 21 |
+
|
| 22 |
+
1. Create a virtual environment:
|
| 23 |
+
```bash
|
| 24 |
+
python -m venv venv
|
| 25 |
+
```
|
| 26 |
+
2. Activate the virtual environment:
|
| 27 |
+
```bash
|
| 28 |
+
# On Windows
|
| 29 |
+
venv\Scripts\activate
|
| 30 |
+
```
|
| 31 |
+
3. Install the dependencies:
|
| 32 |
+
```bash
|
| 33 |
+
pip install -r requirements.txt
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Usage
|
| 37 |
+
|
| 38 |
+
Basic usage to convert a PDF:
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
python pdf_to_image.py "path/to/your/document.pdf" --output "output_folder"
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Citation
|
| 45 |
+
|
| 46 |
+
If you use this tool in your research or project, please cite it as follows:
|
| 47 |
+
|
| 48 |
+
```bibtex
|
| 49 |
+
@misc{pdf_to_image_2026,
|
| 50 |
+
author = {Rembrant Oyangoren Albeos},
|
| 51 |
+
title = {PDF to Image Python Utility},
|
| 52 |
+
year = {2026},
|
| 53 |
+
publisher = {GitHub},
|
| 54 |
+
journal = {GitHub repository},
|
| 55 |
+
howpublished = {\url{https://github.com/unban-algorembrant/pdf-to-image-python}},
|
| 56 |
+
note = {cite: {https://github.com/unban-algorembrant/*}}
|
| 57 |
+
}
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## License
|
| 61 |
+
|
| 62 |
+
This project is licensed under the MIT License - see the [LICENSE](../LICENSE) file in the repository root for details.
|
dump/pdf_to_image.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
def convert_pdf_to_images(pdf_path, output_dir, dpi=300, image_format="png"):
    """
    Convert a PDF to a series of images.

    Each page is saved in ``output_dir`` as ``page_NN.<image_format>``,
    numbered from 1 and zero-padded to at least two digits.

    Args:
        pdf_path: Path to the input PDF file.
        output_dir: Directory that receives the images; created if missing.
        dpi: Resolution passed to ``page.get_pixmap``.
        image_format: File extension used for the saved images.

    Returns:
        None. Any error while creating the directory, opening the document or
        rendering a page is caught and printed.
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(output_dir, exist_ok=True)

        # The context manager closes the document (and its file handle) even
        # when rendering fails part-way through.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            print(f"Opened {pdf_path}. Total pages: {page_count}")

            for page_index in range(page_count):
                page = doc.load_page(page_index)
                # alpha=False ensures a white background instead of transparent
                pix = page.get_pixmap(dpi=dpi, alpha=False)

                output_file = os.path.join(output_dir, f"page_{page_index + 1:02d}.{image_format}")
                pix.save(output_file)
                print(f"Saved {output_file}")

        print("Conversion complete.")
    except Exception as e:
        # Best-effort by design: report the failure instead of raising.
        print(f"Error processing PDF: {e}")
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options and convert the given PDF.
    cli = argparse.ArgumentParser(description="Convert a PDF to images.")
    cli.add_argument("pdf_path", help="Path to the input PDF file")
    cli.add_argument("--output", "-o", default="output", help="Output directory")
    cli.add_argument("--dpi", type=int, default=300, help="Output image DPI (default: 300)")
    cli.add_argument("--format", "-f", default="png", help="Output image format (default: png)")
    options = cli.parse_args()

    convert_pdf_to_images(options.pdf_path, options.output, options.dpi, options.format)
|
dump/requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
PyMuPDF
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PyMuPDF
|
| 2 |
+
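# fitz: the "fitz" module is provided by PyMuPDF; the separate "fitz" package on PyPI is unrelated and conflicts with it, so it must not be installed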
|
| 3 |
+
# argparse: part of the Python standard library, no installation needed
|
| 4 |
+
# os: part of the Python standard library, no installation needed
|
| 5 |
+
# glob: part of the Python standard library, no installation needed
|
single_page.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import glob
|
| 5 |
+
|
| 6 |
+
def convert_pdf_to_images(pdf_path, output_dir, dpi=300, image_format="png"):
    """
    Convert the FIRST PAGE of a PDF to an image, maintaining native page proportions.

    The image is written to
    ``<output_dir>/<PDF file name without extension>/cover_page.<image_format>``.

    Args:
        pdf_path: Path to the PDF file to render.
        output_dir: Base output directory; a subfolder named after the PDF is
            created inside it.
        dpi: Target resolution; the page is scaled by ``dpi / 72``.
        image_format: File extension used for the saved image.

    Returns:
        None. A PDF without pages is skipped with a message. Any error while
        creating the folder, opening the document or rendering is caught and
        printed, so a batch caller can continue with the next file.
    """
    # Create a dedicated subfolder for each PDF's images to keep things organized
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_dir = os.path.join(output_dir, pdf_name)

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(pdf_output_dir, exist_ok=True)

        # The context manager closes the document on every exit path,
        # including the early return for an empty PDF below.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            print(f"Processing '{pdf_name}' | Total pages: {page_count} (Extracting page 1 only)")

            # Prevent errors if an empty PDF is somehow loaded
            if page_count == 0:
                print(f" -> Skipping '{pdf_name}': PDF has no pages.")
                return

            # PDFs have a standard base resolution of 72 DPI, so this zoom
            # factor scales the native page size up to the requested DPI.
            zoom = dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)  # same scale on both axes keeps proportions

            # Load ONLY the first page (index 0)
            page = doc.load_page(0)

            # alpha=False renders without a transparency channel
            pix = page.get_pixmap(matrix=mat, alpha=False)

            output_file = os.path.join(pdf_output_dir, f"cover_page.{image_format}")
            pix.save(output_file)
            print(f" -> Saved first page (Dynamic Resolution: {pix.width}x{pix.height})")

    except Exception as e:
        # Best-effort by design: report the failure and let the caller move on.
        print(f"Error processing {pdf_path}: {e}")
|
| 43 |
+
|
| 44 |
+
def process_all_pdfs(input_path, output_dir, dpi=300, image_format="png"):
    """
    Extract the first page of a single PDF, or of every PDF directly inside a directory.

    Args:
        input_path: Path to one PDF file or to a folder containing PDFs.
        output_dir: Base directory passed on to ``convert_pdf_to_images``.
        dpi: Target resolution passed on to ``convert_pdf_to_images``.
        image_format: Image file extension passed on to ``convert_pdf_to_images``.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if os.path.isfile(input_path):
        convert_pdf_to_images(input_path, output_dir, dpi, image_format)
        return

    if not os.path.isdir(input_path):
        print("Invalid input path. Please provide a valid PDF file or folder.")
        return

    # glob.escape stops characters such as "[" or "*" in the folder name from
    # being interpreted as wildcards. The extension is compared
    # case-insensitively so "REPORT.PDF" is found on case-sensitive file
    # systems too, and sorting makes the processing order deterministic.
    candidates = glob.glob(os.path.join(glob.escape(input_path), "*"))
    pdf_files = sorted(path for path in candidates if path.lower().endswith(".pdf"))

    if not pdf_files:
        print(f"No PDF files found in directory: {input_path}")
        return

    print(f"Found {len(pdf_files)} PDF(s). Starting batch conversion of first pages...\n")
    for pdf in pdf_files:
        convert_pdf_to_images(pdf, output_dir, dpi, image_format)
        print("-" * 40)
    print("All batch conversions complete!")
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options, make sure the base output
    # folder exists, then extract the first page of the requested file(s).
    cli = argparse.ArgumentParser(description="Convert the first page of PDFs to auto-adjusting high-quality images.")
    cli.add_argument("input_path", help="Path to a single PDF file OR a folder containing PDFs")
    cli.add_argument("--output", "-o", default="output", help="Output directory (default: 'output')")
    cli.add_argument("--dpi", type=int, default=300, help="Output image DPI (default: 300)")
    cli.add_argument("--format", "-f", default="png", help="Output image format (default: png)")
    options = cli.parse_args()

    # The per-PDF subfolders are created later, underneath this base folder.
    base_output = options.output
    if not os.path.exists(base_output):
        os.makedirs(base_output)

    process_all_pdfs(options.input_path, base_output, options.dpi, options.format)
|