File size: 7,840 Bytes
45cdd58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00dc935
45cdd58
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import os
import tempfile
import logging
import subprocess
from typing import List

import gradio as gr
import requests
from PIL import Image
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError

# --- Logging Configuration ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def check_poppler():
    """
    Probes for the Poppler command-line tools by invoking `pdftoppm -v`.

    Returns:
        True when the command exits with status 0 or prints a
        "pdftoppm version" banner on stderr; False when the command is
        missing, reports anything else, or the probe itself raises.
    """
    try:
        # check=False: a non-zero exit is inspected below instead of raising
        probe = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True, check=False)

        # Some builds exit non-zero for "-v" but still print the version banner
        # to stderr, so only treat it as a failure when both signals are absent.
        if probe.returncode != 0 and "pdftoppm version" not in probe.stderr:
            logger.error(f"Poppler check failed. stderr: {probe.stderr.strip()}")
            return False

        logger.info("Poppler check successful.")
        return True
    except FileNotFoundError:
        # The executable could not be located at all
        logger.error("Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in the system's PATH.")
        return False
    except Exception as e:
        # Best-effort probe: any other failure is reported as "not available"
        logger.error(f"An unexpected error occurred during Poppler check: {e}")
        return False


def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
    """
    Stacks PIL images top-to-bottom on a single white RGB canvas.

    The canvas is as wide as the widest input and as tall as all inputs
    combined. Every image is pasted at x=0, so narrower images are
    left-aligned and the area to their right stays white.

    Args:
        images: The PIL Image objects to stack, in top-to-bottom order.

    Returns:
        The combined PIL Image, or None when `images` is empty.
    """
    if not images:
        return None

    # Canvas dimensions: widest page by the sum of all page heights
    canvas_width = max(page.width for page in images)
    page_heights = [page.height for page in images]
    canvas = Image.new('RGB', (canvas_width, sum(page_heights)), (255, 255, 255))

    # Walk down the canvas, dropping each page directly below the previous one
    top = 0
    for page, page_height in zip(images, page_heights):
        canvas.paste(page, (0, top))
        top += page_height

    return canvas


def _safe_output_stem(raw_name, fallback="document", max_len=64):
    """
    Derives a filesystem-safe file stem from a file path or URL path.

    Args:
        raw_name: A path (or URL without query/fragment) whose last component
            names the PDF.
        fallback: Stem returned when nothing usable remains after cleaning.
        max_len: Maximum number of characters kept from the cleaned stem.

    Returns:
        A non-empty string in which every character that is not alphanumeric,
        '-', '_' or '.' has been replaced by '_'.
    """
    stem = os.path.splitext(os.path.basename(str(raw_name)))[0]
    cleaned = "".join(ch if (ch.isalnum() or ch in "-_.") else "_" for ch in stem)
    # Drop leading/trailing dots and underscores, then cap the length so the
    # temp-file name cannot exceed filesystem limits.
    cleaned = cleaned.strip("._")[:max_len]
    return cleaned or fallback


def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
    """
    The main processing function for the Gradio interface.
    It takes a PDF (either as an uploaded file or a URL), converts all its
    pages to images, and stitches them into a single tall image.

    Args:
        pdf_file: Value of the upload component: None, a path string, or an
            object whose `.name` attribute is the path of the uploaded file.
            Takes precedence over `pdf_url` when both are given.
        pdf_url: URL of a PDF to download; used only when `pdf_file` is None.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A tuple `(path, path)` holding the same PNG file path twice: once for
        the image preview and once for the file download.

    Raises:
        gr.Error: If no input is given, the download fails, Poppler is
            missing, the PDF cannot be converted, or stitching yields nothing.
    """
    # --- 1. Determine Input Source ---
    progress(0, desc="Validating input...")
    url = pdf_url.strip() if pdf_url else ""

    if pdf_file is not None:
        # Depending on the Gradio version/configuration the upload arrives as a
        # plain path string or as a file-like object exposing the path in .name.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        logger.info("Processing uploaded file: %s", pdf_path)
        pdf_input_source = pdf_path
        source_name = _safe_output_stem(pdf_path)
        is_bytes = False
    elif url:
        logger.info("Processing file from URL: %s", url)
        progress(0.1, desc="Downloading PDF from URL...")
        try:
            response = requests.get(url, timeout=45)
            response.raise_for_status()
            pdf_input_source = response.content
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e
        # Ignore the query string and fragment when deriving the output name
        url_path = url.split("?", 1)[0].split("#", 1)[0]
        source_name = _safe_output_stem(url_path)
        is_bytes = True
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    # --- 2. Convert PDF to a List of Images ---
    progress(0.3, desc="Converting PDF pages to images...")
    try:
        if is_bytes:
            images = convert_from_bytes(pdf_input_source, dpi=200)
        else:
            images = convert_from_path(pdf_input_source, dpi=200)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        raise gr.Error("Poppler not found. Please ensure poppler-utils is installed and in your system's PATH.") from e
    except Exception as e:
        # Boundary handler: covers PDFPageCountError and every other conversion
        # failure. Keep the traceback in the server log and show the user a
        # readable message.
        logger.exception("PDF conversion failed.")
        raise gr.Error(f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}") from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info("Successfully converted %d pages to images.", len(images))

    # --- 3. Stitch the Images Together ---
    progress(0.7, desc=f"Stitching {len(images)} images together...")

    stitched_image = stitch_images_vertically(images)
    if stitched_image is None:
        raise gr.Error("Image stitching failed.")

    logger.info("Image stitching complete.")

    # --- 4. Save the Final Image to a Temporary File ---
    progress(0.9, desc="Saving final image...")

    # delete=False keeps the file on disk after the handle closes so Gradio can
    # serve it. Writing through the open handle avoids re-opening the file by
    # name while it is still open, which is not portable across platforms.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix=f"{source_name}_stitched_") as tmp_file:
        stitched_image.save(tmp_file, "PNG")
        output_path = tmp_file.name

    logger.info("Final image saved to temporary path: %s", output_path)
    progress(1, desc="Done!")

    # --- 5. Return the path for the Gradio output components ---
    # The first path is for the gr.Image preview, the second for the gr.File download.
    return output_path, output_path


# --- Gradio Interface Definition ---
# Builds the module-level `demo` app. Components are laid out in the order they
# are created inside the nested context managers, so statement order matters.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page title and a short description of what the tool does
    gr.Markdown(
        """
        # PDF Page Stitcher 📄 ➡️ 🖼️
        Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image 
        and then append them beneath each other to create a single, tall image that you can download.
        """
    )

    with gr.Row():
        # Input column: one tab per way of supplying the PDF. process_pdf uses
        # the uploaded file when present and only falls back to the URL otherwise.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        label="PDF URL",
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf"
                    )
            
            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")

        # Output column (scale=2 vs. scale=1 for the input column): preview
        # plus download of the same file.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                label="Stitched Image Preview",
                type="filepath",
                interactive=False,
                height=600, # Set a fixed height for the preview area
            )
            output_image_download = gr.File(
                label="Download Stitched Image",
                interactive=False
            )

    # Connect the button click event to the processing function.
    # process_pdf returns the same file path twice, one per output component.
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download]
    )

    # Sample inputs: each row is [uploaded file, URL], matching `inputs` below
    gr.Examples(
        examples=[
            [None, "https://arxiv.org/pdf/1706.03762.pdf"], # "Attention is All You Need" paper
            [None, "https://bitcoin.org/bitcoin.pdf"],     # Bitcoin whitepaper
        ],
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download],
        fn=process_pdf,
        cache_examples=False # Caching is disabled: example outputs are not precomputed
    )


# --- Main Execution ---
if __name__ == '__main__':
    # Probe for Poppler before serving; a failed probe only produces a warning
    # and does not prevent the app from starting.
    poppler_available = check_poppler()
    if not poppler_available:
        logger.warning("Poppler utilities could not be verified. The application may fail to process PDFs.")

    # Start serving the Gradio UI
    demo.launch()