Spaces:

Hugobartwisch
/

mangatranslator

Sleeping

mangatranslator / src /pdf_handler.py

bartwisch

Initial release v1.0.0 – Manga Translator with MIT License

376598e 2 months ago

2.47 kB

	import fitz # PyMuPDF
	from PIL import Image
	import io
	import os
	from typing import List, Union

	class PDFHandler:
	    """Render PDF pages to PIL images and bundle PIL images into one PDF."""

	    # Image modes handed to the PDF writer unchanged. Any other mode
	    # (RGBA, LA, PA, I, F, ...) is flattened to RGB first, because PDF
	    # export of those modes fails or depends on the installed Pillow build.
	    _PDF_SAFE_MODES = frozenset({"1", "L", "P", "RGB", "CMYK"})

	    def __init__(self):
	        """Create a handler; it keeps no state between calls."""
	        pass

	    def extract_images_from_pdf(
	        self,
	        pdf_path: str,
	        zoom: Union[int, float] = 2,
	        pages: Union[List[int], None] = None,
	    ) -> List["Image.Image"]:
	        """
	        Converts the requested pages of the PDF into PIL Images.

	        Args:
	            pdf_path: Path to the source PDF file.
	            zoom: Zoom factor applied to both axes (default 2 for better
	                OCR). Must be greater than zero.
	            pages: Optional list of 0-indexed page numbers to extract, in
	                the order given. Numbers outside the document are skipped.
	                If None, extracts all pages.

	        Returns:
	            List of PIL Image objects, one per rendered page.

	        Raises:
	            FileNotFoundError: If pdf_path does not exist.
	            ValueError: If zoom is not greater than zero.
	        """
	        if not os.path.exists(pdf_path):
	            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
	        if zoom <= 0:
	            raise ValueError(f"zoom must be greater than 0, got {zoom}")

	        doc = fitz.open(pdf_path)
	        try:
	            page_count = len(doc)

	            # Determine which pages to process
	            if pages is None:
	                page_indices = range(page_count)
	            else:
	                # Filter out invalid page numbers
	                page_indices = [p for p in pages if 0 <= p < page_count]

	            # Matrix for zooming (higher resolution for better OCR)
	            mat = fitz.Matrix(zoom, zoom)

	            images = []
	            for page_num in page_indices:
	                pix = doc.load_page(page_num).get_pixmap(matrix=mat)

	                # Convert to PIL Image via an in-memory PNG
	                img_data = pix.tobytes("png")
	                images.append(Image.open(io.BytesIO(img_data)))
	            return images
	        finally:
	            # Release the document even when rendering a page fails.
	            doc.close()

	    def save_images_as_pdf(self, images: List["Image.Image"], output_path: str):
	        """
	        Saves a list of PIL Images as a single PDF file, one image per page.

	        Prints a notice and returns without writing anything when images
	        is empty; prints a confirmation after a successful save.

	        Args:
	            images: List of PIL Image objects.
	            output_path: Path where the new PDF should be saved.
	        """
	        if not images:
	            print("No images to save.")
	            return

	        # Flatten every mode the PDF writer may reject (not only RGBA).
	        pdf_images = [
	            img if img.mode in self._PDF_SAFE_MODES else img.convert("RGB")
	            for img in images
	        ]

	        first_page, *other_pages = pdf_images
	        first_page.save(
	            output_path,
	            save_all=True,
	            append_images=other_pages,
	            resolution=100.0,
	            quality=95,
	            optimize=True,
	        )
	        print(f"PDF saved successfully at {output_path}")