| | import os |
| | import tempfile |
| | import logging |
| | import subprocess |
| | from typing import List |
| |
|
| | import gradio as gr |
| | import requests |
| | from PIL import Image |
| | from pdf2image import convert_from_path, convert_from_bytes |
| | from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError |
| |
|
| | |
# Root logging configuration: INFO and above, each record prefixed with a
# timestamp and its level name.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by every function in this file.
logger = logging.getLogger(__name__)
| |
|
| |
|
def check_poppler():
    """
    Report whether Poppler's ``pdftoppm`` command can be executed.

    Runs ``pdftoppm -v`` and counts the probe as successful when the process
    exits with status 0 or when the text "pdftoppm version" appears on its
    stderr. Failures are logged rather than raised.

    Returns:
        True when the probe succeeds, False when the command is missing, the
        probe output is not recognised, or the probe raises an exception.
    """
    try:
        probe = subprocess.run(
            ["pdftoppm", "-v"],
            capture_output=True,
            text=True,
            check=False,
        )
        # A version banner on stderr counts as success even if the exit
        # status is non-zero.
        available = probe.returncode == 0 or "pdftoppm version" in probe.stderr
        if not available:
            logger.error(f"Poppler check failed. stderr: {probe.stderr.strip()}")
            return False
        logger.info("Poppler check successful.")
        return True
    except FileNotFoundError:
        # The executable itself could not be located.
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in the system's PATH.")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred during Poppler check: {e}")
        return False
| |
|
| |
|
def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
    """
    Stack PIL images top-to-bottom on a single white canvas.

    The canvas is as wide as the widest input and as tall as all inputs
    combined. Each image is pasted at x=0, directly below the previous one,
    so narrower images leave white canvas to their right.

    Args:
        images: PIL Image objects, in the order they should appear.

    Returns:
        A new RGB image containing every input, or None when ``images`` is
        empty.
    """
    if not images:
        return None

    widths = [page.width for page in images]
    heights = [page.height for page in images]

    # White background so any area not covered by a page stays blank.
    canvas = Image.new('RGB', (max(widths), sum(heights)), (255, 255, 255))

    top = 0
    for page, page_height in zip(images, heights):
        canvas.paste(page, (0, top))
        top += page_height

    return canvas
| |
|
| |
|
def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
    """
    Convert a PDF into one tall PNG for the Gradio interface.

    The PDF is taken from the uploaded file when one is given, otherwise it
    is downloaded from ``pdf_url``. Every page is rendered at 200 DPI, the
    pages are stitched vertically, and the result is written to a temporary
    PNG file that is not deleted by this function.

    Args:
        pdf_file: The uploaded file, or None. Either an object exposing its
            path as ``.name`` or a plain path string.
        pdf_url: URL of a PDF to download; only used when ``pdf_file`` is
            None. Surrounding whitespace is ignored.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A ``(path, path)`` tuple holding the temporary PNG path twice, one
        value per output component.

    Raises:
        gr.Error: If neither input is supplied, the download fails, Poppler
            is missing, the PDF cannot be converted, no pages are produced,
            or stitching yields no image.
    """
    pdf_input_source = None
    is_bytes = False
    source_name = "document"

    # --- 1. Resolve the input source -------------------------------------
    progress(0, desc="Validating input...")
    if pdf_file is not None:
        # Accept both a wrapper object (path in ``.name``) and a bare path.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        logger.info(f"Processing uploaded file: {pdf_path}")
        pdf_input_source = pdf_path
        source_name = os.path.splitext(os.path.basename(pdf_path))[0] or source_name
    elif pdf_url and pdf_url.strip():
        url = pdf_url.strip()
        logger.info(f"Processing file from URL: {url}")
        progress(0.1, desc="Downloading PDF from URL...")
        try:
            response = requests.get(url, timeout=45)
            response.raise_for_status()
            pdf_input_source = response.content
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e
        # Drop the query string and fragment before deriving a file name, and
        # keep the "document" default when the URL path has no usable name
        # (for example when it ends with "/").
        url_path = url.split('?')[0].split('#')[0]
        source_name = os.path.splitext(os.path.basename(url_path))[0] or source_name
        is_bytes = True
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    # --- 2. Render the pages ----------------------------------------------
    progress(0.3, desc="Converting PDF pages to images...")
    try:
        if is_bytes:
            images = convert_from_bytes(pdf_input_source, dpi=200)
        else:
            images = convert_from_path(pdf_input_source, dpi=200)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        raise gr.Error("Poppler not found. Please ensure poppler-utils is installed and in your system's PATH.") from e
    except Exception as e:
        # Covers PDFPageCountError as well as any other conversion failure.
        raise gr.Error(f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}") from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info(f"Successfully converted {len(images)} pages to images.")

    # --- 3. Stitch ----------------------------------------------------------
    progress(0.7, desc=f"Stitching {len(images)} images together...")
    try:
        stitched_image = stitch_images_vertically(images)
    finally:
        # The per-page images are not used past this point; release them
        # before the final image is encoded.
        for page in images:
            page.close()

    if stitched_image is None:
        raise gr.Error("Image stitching failed.")

    logger.info("Image stitching complete.")

    # --- 4. Save ------------------------------------------------------------
    progress(0.9, desc="Saving final image...")

    # delete=False keeps the file on disk after the handle closes so the
    # returned path stays valid for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix=f"{source_name}_stitched_") as tmp_file:
        # Write through the open handle instead of opening the same file a
        # second time by name.
        stitched_image.save(tmp_file, "PNG")
        output_path = tmp_file.name

    logger.info(f"Final image saved to temporary path: {output_path}")
    progress(1, desc="Done!")

    # The same path is returned once per output component.
    return output_path, output_path
| |
|
| |
|
| | |
# Build the Gradio interface: PDF inputs and the submit button in the first
# column, the stitched-image preview and download in the second.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # PDF Page Stitcher 📄 ➡️ 🖼️
        Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image
        and then append them beneath each other to create a single, tall image that you can download.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # One tab per way of supplying the PDF.
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(
                        file_types=[".pdf"],
                        label="Upload PDF File",
                    )
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf",
                        label="PDF URL",
                    )

            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                height=600,
                interactive=False,
                type="filepath",
                label="Stitched Image Preview",
            )
            output_image_download = gr.File(
                interactive=False,
                label="Download Stitched Image",
            )

    # The button and the examples drive the same function with the same
    # components, so the component lists are defined once.
    _stitch_inputs = [pdf_file_input, pdf_url_input]
    _stitch_outputs = [output_image_preview, output_image_download]

    submit_btn.click(
        fn=process_pdf,
        inputs=_stitch_inputs,
        outputs=_stitch_outputs,
    )

    # Each example leaves the file input empty and fills in only the URL.
    _example_urls = (
        "https://arxiv.org/pdf/1706.03762.pdf",
        "https://bitcoin.org/bitcoin.pdf",
    )
    gr.Examples(
        examples=[[None, example_url] for example_url in _example_urls],
        inputs=_stitch_inputs,
        outputs=_stitch_outputs,
        fn=process_pdf,
        cache_examples=True,
    )
| |
|
| |
|
| | |
if __name__ == '__main__':
    # A failed Poppler probe only produces a warning; the interface is
    # launched either way.
    poppler_ok = check_poppler()
    if not poppler_ok:
        logger.warning(
            "Poppler utilities could not be verified. The application may fail to process PDFs."
        )

    demo.launch()