File size: 6,372 Bytes
45cdd58
 
 
 
1cc3f72
45cdd58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cc3f72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787a9bf
45cdd58
 
40f4278
45cdd58
 
 
 
40f4278
45cdd58
 
 
 
40f4278
45cdd58
 
 
 
 
 
 
 
 
 
 
 
3451163
45cdd58
 
3451163
45cdd58
3451163
45cdd58
40f4278
45cdd58
 
 
 
 
 
 
 
 
 
1cc3f72
 
 
 
 
45cdd58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cc3f72
45cdd58
 
 
 
 
 
 
 
 
 
 
 
 
1cc3f72
3451163
 
 
dd75ef8
3451163
 
 
 
1cc3f72
 
 
 
 
 
 
 
 
45cdd58
 
 
 
 
 
 
 
 
40f4278
45cdd58
 
 
 
 
 
 
 
1cc3f72
45cdd58
 
 
67e4414
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import os
import tempfile
import logging
from typing import List
import math

import gradio as gr
import requests
from PIL import Image
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def stitch_images_vertically(images: List[Image.Image]) -> Image.Image:
    """Stack images top-to-bottom on a white RGB canvas.

    The canvas is as wide as the widest image and as tall as the sum of all
    image heights. Every image is pasted flush against the left edge, so
    narrower images leave white space on their right.

    Args:
        images: Images to stack, in top-to-bottom order.

    Returns:
        The combined image, or ``None`` when ``images`` is empty.
    """
    if images:
        widths = [page.width for page in images]
        heights = [page.height for page in images]

        canvas = Image.new('RGB', (max(widths), sum(heights)), (255, 255, 255))

        # Walk down the canvas, dropping each page right below the previous one.
        top = 0
        for page, page_height in zip(images, heights):
            canvas.paste(page, (0, top))
            top += page_height

        return canvas

    return None

def stitch_images_in_grid(images: List[Image.Image], num_columns: int) -> Image.Image:
    """Lay images out in ``num_columns`` side-by-side columns on a white canvas.

    Images are dealt out round-robin: image ``i`` lands in column
    ``i % num_columns``, so reading the grid left-to-right, top-to-bottom
    follows the input order. Each column is stacked with
    ``stitch_images_vertically`` and the columns are then placed next to each
    other, aligned at the top.

    Args:
        images: Images to arrange, in reading order.
        num_columns: Number of columns; must be at least 1. Values that
            ``int()`` accepts (e.g. ``2.0`` coming from a UI slider) are
            converted, truncating any fractional part.

    Returns:
        The combined RGB image, or ``None`` when ``images`` is empty.

    Raises:
        ValueError: If ``num_columns`` is smaller than 1.
    """
    if not images:
        return None

    num_columns = int(num_columns)
    if num_columns < 1:
        raise ValueError(f"num_columns must be at least 1, got {num_columns}")

    # With more columns than images the trailing columns would be empty;
    # only build the ones that actually receive an image.
    used_columns = min(num_columns, len(images))
    columns = [images[i::num_columns] for i in range(used_columns)]

    stitched_columns = [
        column_image
        for column_image in (stitch_images_vertically(col) for col in columns)
        if column_image is not None
    ]

    max_height = max(col.height for col in stitched_columns)
    total_width = sum(col.width for col in stitched_columns)

    grid_image = Image.new('RGB', (total_width, max_height), (255, 255, 255))

    # Place columns left to right, each starting where the previous one ended.
    current_x = 0
    for col_img in stitched_columns:
        grid_image.paste(col_img, (current_x, 0))
        current_x += col_img.width

    return grid_image

def _safe_filename_stem(name, fallback="document", max_length=64):
    """Reduce ``name`` to a short string that is safe to use as a temp-file prefix."""
    cleaned = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in (name or "")
    )
    cleaned = cleaned.strip("._")[:max_length]
    return cleaned or fallback


def process_pdf(pdf_file, pdf_url, dpi, num_columns, progress=gr.Progress()):
    """Render every page of a PDF and stitch the pages into one PNG file.

    The PDF comes from ``pdf_file`` when it is given; otherwise it is
    downloaded from ``pdf_url``.

    Args:
        pdf_file: Uploaded file, either an object whose ``.name`` attribute is
            the path on disk or a plain path string; ``None`` when nothing was
            uploaded.
        pdf_url: URL of a PDF; only consulted when ``pdf_file`` is ``None``.
        dpi: Rendering resolution handed to pdf2image.
        num_columns: Values above 1 produce a grid with that many columns;
            anything else produces a single vertical strip.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A ``(path, path)`` tuple holding the path of the saved PNG twice; the
        click handler in this file feeds it to a preview and a download output.

    Raises:
        gr.Error: If no input is supplied, the download fails, the PDF cannot
            be converted, no pages are produced, or stitching yields nothing.
    """
    pdf_input_source = None
    is_bytes = False
    source_name = "document"

    progress(0, desc="Validating input...")
    if pdf_file is not None:
        # Accept both file-like upload objects (``.name``) and bare path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        logger.info("Processing uploaded file: %s", pdf_path)
        pdf_input_source = pdf_path
        source_name = os.path.splitext(os.path.basename(pdf_path))[0]
    elif pdf_url and pdf_url.strip():
        url = pdf_url.strip()
        logger.info("Processing file from URL: %s", url)
        progress(0.1, desc="Downloading PDF from URL...")
        # NOTE(review): the URL is user-supplied and fetched server-side with no
        # host allow-list or size cap; consider restricting both.
        try:
            response = requests.get(url, timeout=45)
            response.raise_for_status()
            pdf_input_source = response.content
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e
        # Drop query string and fragment before taking the last path segment.
        url_path = url.split('?')[0].split('#')[0]
        source_name = os.path.splitext(os.path.basename(url_path))[0]
        is_bytes = True
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    progress(0.3, desc="Converting PDF pages to images...")
    logger.info("Using DPI: %s", dpi)
    try:
        if is_bytes:
            images = convert_from_bytes(pdf_input_source, dpi=dpi)
        else:
            images = convert_from_path(pdf_input_source, dpi=dpi)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        raise gr.Error("Server configuration error: Poppler dependency is missing.") from e
    except Exception as e:
        # Covers PDFPageCountError and any other conversion failure.
        raise gr.Error(
            f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}"
        ) from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info("Successfully converted %d pages to images.", len(images))

    progress(0.7, desc=f"Stitching {len(images)} images together...")

    # UI sliders may deliver an integral float; the grid helper needs an int.
    column_count = int(num_columns)
    if column_count > 1:
        stitched_image = stitch_images_in_grid(images, column_count)
    else:
        stitched_image = stitch_images_vertically(images)

    # The page renders are no longer needed once they are copied to the canvas.
    for page in images:
        page.close()

    if stitched_image is None:
        raise gr.Error("Image stitching failed.")

    logger.info("Image stitching complete.")

    progress(0.9, desc="Saving final image...")

    # The stem comes from user input (file name or URL), so sanitize it before
    # it becomes part of a file name. delete=False keeps the file on disk
    # after the handle is closed so the caller can serve it.
    prefix = f"{_safe_filename_stem(source_name)}_stitched_"
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png", prefix=prefix) as tmp_file:
        stitched_image.save(tmp_file, "PNG")
        output_path = tmp_file.name

    logger.info("Final image saved to temporary path: %s", output_path)
    progress(1, desc="Done!")

    return output_path, output_path

# Gradio UI: input controls in the left column, results in the right column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header rendered above the controls.
    gr.Markdown(
        """
        # PDF Page Stitcher 📄 ➡️ 🖼️
        Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image 
        and then append them to create a single image that you can download.
        """
    )

    with gr.Row():
        # Left column: PDF source selection and rendering options.
        with gr.Column(scale=1):
            # Both tab inputs are always passed to process_pdf, which uses the
            # uploaded file when present and falls back to the URL otherwise.
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        label="PDF URL",
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf"
                    )
            
            # Rendering resolution forwarded to process_pdf as `dpi`.
            dpi_slider = gr.Slider(
                minimum=100,
                maximum=600,
                step=10,
                value=200,
                label="Image Resolution (DPI)",
                info="Higher DPI results in a clearer image but increases processing time and file size."
            )

            # Forwarded to process_pdf as `num_columns`; 1 means a single vertical strip.
            columns_slider = gr.Slider(
                minimum=1,
                maximum=10,
                step=1,
                value=1,
                label="Number of Columns",
                info="Increase to make the final image wider and less tall."
            )
            
            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")

        # Right column: preview and download of the stitched result.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                label="Stitched Image Preview",
                type="filepath",
                interactive=False,
                height=600,
            )
            output_image_download = gr.File(
                label="Download Stitched Image",
                interactive=False
            )

    # process_pdf returns the same file path twice: one for the preview, one
    # for the download component.
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_file_input, pdf_url_input, dpi_slider, columns_slider],
        outputs=[output_image_preview, output_image_download]
    )

# Runs at module level, so the server also starts when this file is imported.
# NOTE(review): binds to all interfaces with debug=True — confirm this is
# intended for the deployment environment.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)