mangatranslator / src /pdf_handler.py
bartwisch
Initial release v1.0.0 – Manga Translator with MIT License
376598e
import fitz # PyMuPDF
from PIL import Image
import io
import os
from typing import List, Union
class PDFHandler:
def __init__(self):
pass
def extract_images_from_pdf(self, pdf_path: str, zoom: int = 2, pages: List[int] = None) -> List[Image.Image]:
"""
Converts each page of the PDF into a PIL Image.
Args:
pdf_path: Path to the source PDF file.
zoom: Zoom factor for higher resolution (default 2 for better OCR).
pages: Optional list of 0-indexed page numbers to extract. If None, extracts all.
Returns:
List of PIL Image objects.
"""
if not os.path.exists(pdf_path):
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
doc = fitz.open(pdf_path)
images = []
# Matrix for zooming (higher resolution for better OCR)
mat = fitz.Matrix(zoom, zoom)
# Determine which pages to process
if pages is None:
page_indices = range(len(doc))
else:
# Filter out invalid page numbers
page_indices = [p for p in pages if 0 <= p < len(doc)]
for page_num in page_indices:
page = doc.load_page(page_num)
pix = page.get_pixmap(matrix=mat)
# Convert to PIL Image
img_data = pix.tobytes("png")
img = Image.open(io.BytesIO(img_data))
images.append(img)
doc.close()
return images
def save_images_as_pdf(self, images: List[Image.Image], output_path: str):
"""
Saves a list of PIL Images as a single PDF file.
Args:
images: List of PIL Image objects.
output_path: Path where the new PDF should be saved.
"""
if not images:
print("No images to save.")
return
# Convert PIL images to RGB if necessary and save
pdf_images = []
for img in images:
if img.mode == 'RGBA':
img = img.convert('RGB')
pdf_images.append(img)
if pdf_images:
pdf_images[0].save(
output_path,
save_all=True,
append_images=pdf_images[1:],
resolution=100.0,
quality=95,
optimize=True
)
print(f"PDF saved successfully at {output_path}")