# 67rp / scraper / code / _3picture.py
# Jovanseetk
# Prepare Hugging Face Spaces deploy
# 554d9f2
from __future__ import annotations

import os
import shutil
from concurrent.futures import ThreadPoolExecutor

import pillow_heif
from pdf2image import convert_from_path
# Directory that is scanned for source PDF files.
INPUT_DIR = "downloads"
# Root directory for converted images; each PDF gets its own subfolder here.
OUTPUT_DIR = "images"
# Resolution passed to convert_from_path when rendering PDF pages.
PDF_DPI = 200
# Quality value passed to page.save() for every HEIF page.
HEIF_QUALITY = 90
# Size of the thread pool used to convert several PDFs concurrently.
MAX_WORKERS = 10

# Register pillow_heif with Pillow so pages can be saved with format "HEIF".
pillow_heif.register_heif_opener()
def pdf_to_heif(pdf_path: str, output_folder: str) -> None:
    """Render each page of a PDF and store it as a separate HEIF file.

    Pages are rendered with ``convert_from_path`` at ``PDF_DPI`` and written
    to ``output_folder`` as ``<pdf name>_page_<n>.heif``, numbered from 1.
    The folder is created when it does not exist yet.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_folder: Directory that receives the page images.

    Raises:
        Any exception raised while rendering or saving propagates to the
        caller; nothing is caught here.
    """
    os.makedirs(output_folder, exist_ok=True)
    print(f"Converting PDF: {pdf_path}")

    rendered = convert_from_path(pdf_path, dpi=PDF_DPI)
    base_name = os.path.basename(pdf_path)
    stem, _extension = os.path.splitext(base_name)

    # Page numbers in the file names are 1-based.
    out_names = [f"{stem}_page_{n}.heif" for n in range(1, len(rendered) + 1)]
    for image, out_name in zip(rendered, out_names):
        target = os.path.join(output_folder, out_name)
        image.save(target, "HEIF", quality=HEIF_QUALITY)
        print(f"Saved: {out_name}")

    print(f"Converted {len(rendered)} pages for {stem}.")
def convert_single_pdf(filename: str) -> int:
    """Convert one entry of ``INPUT_DIR`` to HEIF pages if it is a PDF.

    The pages are written to ``OUTPUT_DIR/<file name without extension>``.
    A PDF whose output folder already exists and is non-empty is treated as
    converted and skipped.

    Args:
        filename: Bare file name (not a path) of an entry in ``INPUT_DIR``.

    Returns:
        ``1`` when the PDF was converted by this call, ``0`` when the file is
        not a PDF, was already converted, or failed to convert.
    """
    # Only PDFs are converted; any other file is ignored.
    if not filename.lower().endswith(".pdf"):
        return 0

    pdf_stem = os.path.splitext(filename)[0]
    output_path = os.path.join(OUTPUT_DIR, pdf_stem)
    pdf_path = os.path.join(INPUT_DIR, filename)

    # A non-empty output folder marks a finished conversion.
    if os.path.isdir(output_path) and os.listdir(output_path):
        print(f"Skipping {filename}: output already exists.")
        return 0

    try:
        pdf_to_heif(pdf_path, output_path)
    except Exception as exc:
        # Worker boundary: one broken PDF must not abort the whole batch.
        print(f"Failed conversion for {filename}: {exc}")
        # Remove any partially written pages. Otherwise the non-empty folder
        # would make every later run skip this PDF as "already exists".
        # The folder was empty or missing before this call, so nothing that
        # predates this attempt is deleted.
        shutil.rmtree(output_path, ignore_errors=True)
        return 0
    return 1
def run() -> None:
    """Convert every PDF found in ``INPUT_DIR`` using a thread pool.

    Both ``INPUT_DIR`` and ``OUTPUT_DIR`` are created when missing. Each
    directory entry is handed to ``convert_single_pdf`` on a pool of
    ``MAX_WORKERS`` threads, and the number of PDFs converted in this run is
    printed at the end.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(INPUT_DIR, exist_ok=True)

    # No filtering here: convert_single_pdf itself ignores non-PDF entries
    # and skips PDFs whose output already exists.
    files = sorted(os.listdir(INPUT_DIR))
    print(f"Found {len(files)} files in {INPUT_DIR}.")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Consume the result iterator so the per-file 0/1 return values are
        # used instead of being discarded.
        converted = sum(executor.map(convert_single_pdf, files))

    print(f"Converted {converted} PDF(s).")
    print("Conversion process completed.")
# Start the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    run()