|
|
import logging
import os
import subprocess
import tempfile
from typing import List, Optional
from urllib.parse import unquote, urlsplit

import gradio as gr
import requests
from pdf2image import convert_from_bytes, convert_from_path
from pdf2image.exceptions import PDFInfoNotInstalledError, PDFPageCountError
from PIL import Image
|
|
|
|
|
|
|
|
# Configure logging once at import time so every message emitted by this
# script carries a timestamp and severity level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by all functions in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
def check_poppler(timeout: float = 10.0) -> bool:
    """Report whether Poppler's ``pdftoppm`` tool can be run from the PATH.

    Runs ``pdftoppm -v`` and treats the probe as successful when the process
    exits with status 0 or prints a ``pdftoppm version`` banner. Every failure
    is logged and reported through the return value instead of being raised,
    so the function is safe to use as a start-up check.

    Args:
        timeout: Maximum number of seconds to wait for ``pdftoppm`` to answer
            before the probe is considered failed.

    Returns:
        True if Poppler responded as expected, False otherwise.
    """
    # Keep the try body limited to the call that can actually raise.
    try:
        result = subprocess.run(
            ["pdftoppm", "-v"],
            capture_output=True,
            text=True,
            check=False,
            timeout=timeout,
        )
    except FileNotFoundError:
        logger.error(
            "Poppler (pdftoppm command) not found. Ensure poppler-utils is installed and in the system's PATH."
        )
        return False
    except subprocess.TimeoutExpired:
        # Without a timeout a stuck binary would block application start-up.
        logger.error("Poppler check timed out after %s seconds.", timeout)
        return False
    except Exception as e:
        # Deliberately broad: this is a best-effort probe and must never crash
        # the caller, whatever the platform throws at us.
        logger.error("An unexpected error occurred during Poppler check: %s", e)
        return False

    # Look for the banner in both streams; it is not guaranteed which one a
    # given build writes its version information to.
    banner = f"{result.stdout}\n{result.stderr}"
    if result.returncode == 0 or "pdftoppm version" in banner:
        logger.info("Poppler check successful.")
        return True

    logger.error("Poppler check failed. stderr: %s", result.stderr.strip())
    return False
|
|
|
|
|
|
|
|
def stitch_images_vertically(images: List[Image.Image]) -> Optional[Image.Image]:
    """Stack images top-to-bottom on a single white RGB canvas.

    The canvas is as wide as the widest input and as tall as the sum of all
    input heights. Each image is pasted flush against the left edge, so any
    image narrower than the canvas leaves white space to its right.

    Args:
        images: The images to stack, in top-to-bottom order. Any iterable of
            PIL images is accepted; it is materialized once so that one-shot
            iterables such as generators are handled correctly.

    Returns:
        The stitched image, or None when ``images`` is None or empty.
    """
    # Materialize up front: the input is traversed three times below, which
    # would silently exhaust a generator after the first pass.
    pages = [] if images is None else list(images)
    if not pages:
        return None

    canvas_width = max(page.width for page in pages)
    canvas_height = sum(page.height for page in pages)

    # White background so the area beside narrower pages is not left black.
    canvas = Image.new("RGB", (canvas_width, canvas_height), (255, 255, 255))

    top = 0
    for page in pages:
        canvas.paste(page, (0, top))
        top += page.height

    return canvas
|
|
|
|
|
|
|
|
def _safe_stem(name: str, default: str = "document", max_length: int = 100) -> str:
    """Reduce a path-like *name* to a short stem that is safe as a temp-file prefix."""
    stem = os.path.splitext(os.path.basename(name))[0]
    # Anything outside a conservative whitelist becomes "_" so the prefix can
    # never contain path separators or other characters a filesystem rejects.
    cleaned = "".join(ch if ch.isalnum() or ch in " ._-" else "_" for ch in stem)
    cleaned = cleaned[:max_length].strip(" .")
    return cleaned or default


def _source_name_from_url(url: str, default: str = "document") -> str:
    """Derive a display/file stem from the path component of *url*."""
    try:
        # urlsplit drops the query string and fragment; unquote turns
        # percent-escapes such as "%20" back into readable characters.
        path = unquote(urlsplit(url).path)
    except ValueError:
        return default
    return _safe_stem(path, default=default)


def process_pdf(pdf_file, pdf_url, progress=gr.Progress()):
    """Convert a PDF into one tall PNG containing all of its pages.

    This is the callback wired to the Gradio interface. The PDF comes either
    from an uploaded file or, when no file is given, from a URL that is
    downloaded first. Every page is rendered at 200 dpi, the pages are stacked
    vertically, and the result is written to a temporary PNG file.

    Args:
        pdf_file: The uploaded file, or None. Either an object exposing the
            file path as ``.name`` or a plain path string is accepted.
        pdf_url: URL of a PDF to download; only used when ``pdf_file`` is None.
            None, empty and whitespace-only values count as "not provided".
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A tuple holding the output path twice, one entry for each of the two
        output components (preview and download) bound to this function.

    Raises:
        gr.Error: If no input is provided, the download fails, Poppler is
            missing, the PDF cannot be rendered, or the stitched image cannot
            be built or saved.
    """
    progress(0, desc="Validating input...")
    url = pdf_url.strip() if pdf_url else ""
    pdf_path = None
    pdf_bytes = None

    if pdf_file is not None:
        # Fall back to the value itself so a plain path string works as well
        # as an object that carries its path in ``.name``.
        pdf_path = str(getattr(pdf_file, "name", pdf_file))
        logger.info("Processing uploaded file: %s", pdf_path)
        source_name = _safe_stem(pdf_path)
    elif url:
        logger.info("Processing file from URL: %s", url)
        progress(0.1, desc="Downloading PDF from URL...")
        # NOTE(review): the URL is user-supplied and fetched from the server
        # side; if this app is exposed publicly, consider restricting target
        # hosts and response size to limit SSRF / resource-exhaustion risk.
        try:
            response = requests.get(url, timeout=45)
            response.raise_for_status()
            pdf_bytes = response.content
        except requests.RequestException as e:
            raise gr.Error(f"Failed to download PDF from URL. Error: {e}") from e
        # Falls back to "document" when the URL has no usable file name.
        source_name = _source_name_from_url(url)
    else:
        raise gr.Error("Please upload a PDF file or provide a valid URL.")

    progress(0.3, desc="Converting PDF pages to images...")
    try:
        if pdf_bytes is not None:
            images = convert_from_bytes(pdf_bytes, dpi=200)
        else:
            images = convert_from_path(pdf_path, dpi=200)
    except (PDFInfoNotInstalledError, FileNotFoundError) as e:
        raise gr.Error(
            "Poppler not found. Please ensure poppler-utils is installed and in your system's PATH."
        ) from e
    except Exception as e:
        # Boundary handler: surface any rendering failure as a UI error.
        raise gr.Error(
            f"Failed to process the PDF. It might be corrupted or password-protected. Error: {e}"
        ) from e

    if not images:
        raise gr.Error("Could not extract any pages from the PDF. The file might be empty or invalid.")

    logger.info("Successfully converted %d pages to images.", len(images))

    progress(0.7, desc=f"Stitching {len(images)} images together...")
    try:
        stitched_image = stitch_images_vertically(images)
    except (MemoryError, ValueError) as e:
        # Long documents at 200 dpi can exceed what fits in a single image.
        raise gr.Error(
            f"Image stitching failed. The document may be too large to combine into one image. Error: {e}"
        ) from e
    if stitched_image is None:
        raise gr.Error("Image stitching failed.")

    logger.info("Image stitching complete.")

    progress(0.9, desc="Saving final image...")
    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".png", prefix=f"{source_name}_stitched_"
    ) as tmp_file:
        output_path = tmp_file.name
        try:
            # Write through the already-open handle instead of reopening by name.
            stitched_image.save(tmp_file, format="PNG")
            save_error = None
        except Exception as e:
            save_error = e

    if save_error is not None:
        # Remove the partial file only after the handle above is closed.
        try:
            os.remove(output_path)
        except OSError as cleanup_error:
            logger.warning("Could not remove temporary file %s: %s", output_path, cleanup_error)
        raise gr.Error(f"Failed to save the stitched image. Error: {save_error}") from save_error

    logger.info("Final image saved to temporary path: %s", output_path)
    progress(1, desc="Done!")

    return output_path, output_path
|
|
|
|
|
|
|
|
|
|
|
# Declarative layout of the web UI: inputs on the left, results on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # PDF Page Stitcher 📄 ➡️ 🖼️
        Upload a PDF file or provide a URL. This tool will convert every page of the PDF into an image
        and then append them beneath each other to create a single, tall image that you can download.
        """
    )

    with gr.Row():
        # Left column: the two alternative ways of supplying a PDF.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Upload PDF"):
                    pdf_file_input = gr.File(file_types=[".pdf"], label="Upload PDF File")
                with gr.TabItem("From URL"):
                    pdf_url_input = gr.Textbox(
                        placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf",
                        label="PDF URL",
                    )

            submit_btn = gr.Button("Stitch PDF Pages", variant="primary")

        # Right column: preview of the stitched image plus a download slot.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            output_image_preview = gr.Image(
                type="filepath",
                label="Stitched Image Preview",
                height=600,
                interactive=False,
            )
            output_image_download = gr.File(
                interactive=False,
                label="Download Stitched Image",
            )

    # Both outputs receive the path that process_pdf returns.
    submit_btn.click(
        process_pdf,
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download],
    )

    # URL-only examples; the file slot is left empty (None). Results are not
    # cached, so clicking an example runs the conversion live.
    gr.Examples(
        examples=[
            [None, "https://arxiv.org/pdf/1706.03762.pdf"],
            [None, "https://bitcoin.org/bitcoin.pdf"],
        ],
        fn=process_pdf,
        inputs=[pdf_file_input, pdf_url_input],
        outputs=[output_image_preview, output_image_download],
        cache_examples=False,
    )
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Probe for Poppler before starting. A failed probe only produces a
    # warning: the UI is still launched either way.
    poppler_available = check_poppler()
    if not poppler_available:
        logger.warning(
            "Poppler utilities could not be verified. The application may fail to process PDFs."
        )

    demo.launch()