# pdf-incorporate / app.py
# Alessio Vertemati
# Update validation and wording
# f531a74
import gradio as gr
import os
import json
from typing import List, Tuple, Optional, Dict
from dataclasses import dataclass, asdict
from dotenv import load_dotenv
from pathlib import Path
from parxy_cli.services.pdf_service import PdfService
# Load environment variables via python-dotenv at import time.
# NOTE(review): no os.environ / os.getenv lookup appears in this module, so
# the loaded values are presumably consumed by a dependency — confirm.
load_dotenv()
# Upper bound, in bytes, for a single attachment (10 MB); checked by
# validate_file_size().
MAX_ATTACHMENT_SIZE = 10 * 1024 * 1024
# Upper bound, in bytes, for the uploaded PDF itself (25 MB); checked by
# validate_pdf_size().
MAX_PDF_SIZE = 25 * 1024 * 1024
@dataclass
class Attachment:
    """
    A file attachment together with the metadata shown in the UI.

    Instances are built either from session-state dictionaries (files the
    user uploaded) or from entries read back out of an existing PDF.
    """
    # Name under which the file is embedded in, or listed from, the PDF.
    filename: str
    # Path of the source file on disk; '' for entries read back from a PDF.
    filepath: str
    # Free-text description stored alongside the attachment.
    description: str
    # Size in bytes.
    size: int
def get_attachments_from_state(attachments_state: List[Dict]) -> List[Attachment]:
    """
    Turn the session's attachment dictionaries into Attachment objects.

    Entries whose 'filepath' is missing, empty, or no longer present on disk
    are skipped, so attachments that were only read back from a PDF (empty
    filepath) never appear in the result.

    Args:
        attachments_state: The session state containing attachment dictionaries
    Returns:
        List of Attachment objects whose source file still exists
    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent state.
    """
    def _source_still_exists(entry: Dict) -> bool:
        source = entry.get('filepath')
        return bool(source) and os.path.exists(source)

    return [
        Attachment(
            filename=entry['filename'],
            filepath=entry['filepath'],
            description=entry['description'],
            size=entry['size'],
        )
        for entry in (attachments_state or [])
        if _source_still_exists(entry)
    ]
def add_attachment_to_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Embed one attachment into a PDF and save the PDF in place.

    Args:
        pdf_path: Path to the PDF file (also used as the save target)
        attachment: The Attachment object to add
    Returns:
        True if successful, False otherwise (the error is printed)
    """
    try:
        target = Path(pdf_path)
        with PdfService(target) as service:
            service.add_attachment(
                Path(attachment.filepath),
                name=attachment.filename,
                desc=attachment.description,
            )
            service.save(target)
    except Exception as err:
        print(f"Error adding attachment to PDF: {err}")
        return False
    return True
def remove_attachment_from_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Delete one embedded attachment from a PDF and save the PDF in place.

    Args:
        pdf_path: Path to the PDF file (also used as the save target)
        attachment: The Attachment object to remove; only its filename is used
    Returns:
        True if successful, False otherwise. A KeyError is reported as
        "not found"; any other error is printed as-is.
    """
    try:
        target = Path(pdf_path)
        with PdfService(target) as service:
            service.remove_attachment(attachment.filename)
            service.save(target)
    except KeyError:
        print(f"Attachment '{attachment.filename}' not found in PDF")
        succeeded = False
    except Exception as err:
        print(f"Error removing attachment from PDF: {err}")
        succeeded = False
    else:
        succeeded = True
    return succeeded
def list_pdf_attachments(pdf_path: str) -> List[Attachment]:
    """
    Read back the attachments already embedded in a PDF.

    Args:
        pdf_path: Path to the PDF file
    Returns:
        One Attachment per embedded file, with an empty filepath because
        embedded files have no source path on disk. An empty list is returned
        if anything goes wrong (the error is printed).
    """
    try:
        with PdfService(Path(pdf_path)) as service:
            # Pair every embedded name with its detailed info record.
            described = (
                (embedded_name, service.get_attachment_info(embedded_name))
                for embedded_name in service.list_attachments()
            )
            return [
                Attachment(
                    filename=embedded_name,
                    filepath='',
                    description=details.get('desc', ''),
                    size=details.get('size', 0),
                )
                for embedded_name, details in described
            ]
    except Exception as err:
        print(f"Error listing PDF attachments: {err}")
        return []
def attach_files_to_pdf(pdf_path: str, attachments: List[Attachment]) -> str:
    """
    Create a new PDF with all attachments embedded.

    The output is written next to the input as
    "<name>_with_attachments<ext>"; the input file itself is never the
    save target.

    Args:
        pdf_path: Path to the input PDF file
        attachments: List of Attachment objects to add to the PDF
    Returns:
        Path to the output PDF file with attachments
    Raises:
        Exception: Whatever the PDF service raised; it is re-raised after a
            best-effort copy of the original PDF to the output path.
    """
    # Split off only the real extension. A plain str.replace(".pdf", ...)
    # would also rewrite ".pdf" inside directory names and would leave the
    # path unchanged for ".PDF" or extension-less uploads, making the output
    # overwrite the input.
    base, extension = os.path.splitext(pdf_path)
    output_path = f"{base}_with_attachments{extension or '.pdf'}"
    try:
        with PdfService(Path(pdf_path)) as pdf:
            # Add each attachment to the PDF
            for att in attachments:
                pdf.add_attachment(
                    Path(att.filepath),
                    name=att.filename,
                    desc=att.description
                )
            # Save to output path
            pdf.save(Path(output_path))
        print(f"Successfully attached {len(attachments)} files to PDF")
        for att in attachments:
            print(f"  - {att.filename} ({att.size} bytes): {att.description}")
        return output_path
    except Exception as e:
        print(f"Error attaching files to PDF: {e}")
        # Fallback: copy original file if attachment fails. This is best
        # effort only, so a failing copy must not hide the original error.
        import shutil
        try:
            shutil.copy2(pdf_path, output_path)
        except OSError as copy_error:
            print(f"Could not create fallback copy: {copy_error}")
        raise
def validate_file_size(file_path: str) -> Tuple[bool, str]:
    """
    Check that an attachment exists and does not exceed MAX_ATTACHMENT_SIZE.

    Args:
        file_path: Path of the file to check
    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid
    """
    path_is_usable = bool(file_path) and os.path.exists(file_path)
    if not path_is_usable:
        return False, "File does not exist"

    size_in_bytes = os.path.getsize(file_path)
    if size_in_bytes <= MAX_ATTACHMENT_SIZE:
        return True, ""

    size_mb = size_in_bytes / (1024 * 1024)
    return False, f"File size ({size_mb:.2f} MB) exceeds 10 MB limit"
def validate_pdf_size(file_path: str) -> Tuple[bool, str]:
    """
    Check that an uploaded PDF exists and does not exceed MAX_PDF_SIZE.

    Args:
        file_path: Path of the PDF file to check
    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid
    """
    path_is_usable = bool(file_path) and os.path.exists(file_path)
    if not path_is_usable:
        return False, "PDF file does not exist"

    size_in_bytes = os.path.getsize(file_path)
    if size_in_bytes <= MAX_PDF_SIZE:
        return True, ""

    size_mb = size_in_bytes / (1024 * 1024)
    return False, f"PDF file size ({size_mb:.2f} MB) exceeds 25 MB limit"
def process_pdf_with_attachments(
    pdf_file,
    attachments_state: List[Dict]
) -> Tuple[Optional[str], str]:
    """
    Validate the upload and produce a copy of the PDF with the attachments.

    Checks run in this order: a PDF was uploaded, the PDF is within the size
    limit, the session has attachments, and at least one of them still has a
    source file on disk.

    Args:
        pdf_file: The uploaded PDF file (its path is read from `.name`)
        attachments_state: List of attachment dictionaries (session-specific)
    Returns:
        Tuple of (output_file_path, status_message); the path is None
        whenever a check fails or processing raises
    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent attachments_state.
    """
    if not pdf_file:
        return None, "❌ Please upload a PDF file"

    size_ok, size_problem = validate_pdf_size(pdf_file.name)
    if not size_ok:
        return None, f"❌ {size_problem}"

    if not attachments_state:
        return None, "❌ Please add at least one attachment"

    # Only entries whose source file still exists can be embedded.
    usable = get_attachments_from_state(attachments_state)
    if not usable:
        return None, "❌ No valid attachments found"

    try:
        result_path = attach_files_to_pdf(pdf_file.name, usable)
    except Exception as err:
        return None, f"❌ Error processing PDF: {str(err)}"
    return result_path, f"✓ Successfully processed PDF with {len(usable)} attachment(s)"
def add_attachment(
    attachment_file,
    description: str,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str, str]:
    """
    Validate an uploaded file and append it to the session's attachment list.

    The file must be present, within the attachment size limit, and come
    with a non-blank description. On any validation failure the list is
    returned unchanged together with an error status.

    Args:
        attachment_file: The file to attach (its path is read from `.name`)
        description: Description of the attachment
        current_attachments: Current session-specific list of attachments;
            appended to in place when the attachment is accepted
    Returns:
        Tuple of (updated_attachments_list, attachment_list_html, status_message)
    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session;
        the PDF itself is not touched here.
    """
    def _unchanged(status: str) -> Tuple[List[Dict], str, str]:
        # Same list, freshly rendered, plus the reason nothing was added.
        return current_attachments, render_attachments_list(current_attachments), status

    if not attachment_file:
        return _unchanged("❌ Please select a file to attach")

    upload_path = attachment_file.name
    size_ok, size_problem = validate_file_size(upload_path)
    if not size_ok:
        return _unchanged(f"❌ Error {size_problem}")

    if not (description and description.strip()):
        return _unchanged("❌ Please provide a description for the attachment")

    filename = os.path.basename(upload_path)
    size_in_bytes = os.path.getsize(upload_path)
    entry = {
        'filename': filename,
        'filepath': upload_path,
        'description': description.strip(),
        'size': size_in_bytes,
    }

    if current_attachments is None:
        current_attachments = []
    current_attachments.append(entry)

    # Extension point: to embed into the PDF immediately on add, the PDF path
    # would have to be passed in and add_attachment_to_pdf() called here with
    # an Attachment built from `entry`.
    return (
        current_attachments,
        render_attachments_list(current_attachments),
        f"✓ Added attachment: {filename}",
    )
def remove_attachment(
    index: int,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str]:
    """
    Drop the attachment at the given position from the session's list.

    An out-of-range index, or an empty/None list, leaves the list untouched.

    Args:
        index: Index of the attachment to remove
        current_attachments: Current session-specific list of attachments;
            modified in place when the index is valid
    Returns:
        Tuple of (updated_attachments_list, attachment_list_html)
    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session;
        the PDF itself is not touched here.
    """
    index_is_valid = bool(current_attachments) and 0 <= index < len(current_attachments)
    if index_is_valid:
        # Extension point: to remove from the PDF immediately, the PDF path
        # would have to be passed in and remove_attachment_from_pdf() called
        # with an Attachment built from the popped entry.
        current_attachments.pop(index)
    return current_attachments, render_attachments_list(current_attachments)
def load_pdf_attachments(pdf_file, previous_output_file) -> Tuple[List[Dict], str, None, str]:
    """
    Reset the session for a newly uploaded PDF and list what it already embeds.

    Any previously generated output file is deleted first (best effort), then
    the session's attachment list is replaced by the attachments found inside
    the uploaded PDF.

    Args:
        pdf_file: The uploaded PDF file (its path is read from `.name`)
        previous_output_file: The previous output file path to clean up
    Returns:
        Tuple of (attachments_list, attachment_list_html, None for output_file,
        status message). The list is empty when no PDF is given, the PDF is
        too large, or loading fails.
    """
    def _empty_result(status: str) -> Tuple[List[Dict], str, None, str]:
        return [], render_attachments_list([]), None, status

    # Clean up the previous output; a failure here is only reported.
    if previous_output_file and os.path.exists(previous_output_file):
        try:
            os.remove(previous_output_file)
            print(f"Cleaned up temporary file: {previous_output_file}")
        except Exception as cleanup_error:
            print(f"Error removing temporary file: {cleanup_error}")

    if not pdf_file:
        return _empty_result("")

    size_ok, size_problem = validate_pdf_size(pdf_file.name)
    if not size_ok:
        return _empty_result(f"❌ {size_problem}")

    try:
        # State holds plain dictionaries, so flatten each Attachment.
        attachments_list = [
            {
                'filename': found.filename,
                'filepath': found.filepath,
                'description': found.description,
                'size': found.size,
            }
            for found in list_pdf_attachments(pdf_file.name)
        ]
        if attachments_list:
            status_msg = f"✓ Loaded PDF with {len(attachments_list)} existing attachment(s)"
        else:
            status_msg = "✓ PDF loaded successfully"
        return attachments_list, render_attachments_list(attachments_list), None, status_msg
    except Exception as load_error:
        print(f"Error loading PDF attachments: {load_error}")
        return _empty_result(f"❌ Error loading PDF: {str(load_error)}")
def extract_and_download_attachment(pdf_file, attachment_name: str, current_attachments: List[Dict]) -> Tuple[Optional[str], str]:
    """
    Extract an attachment from the PDF and save it for download.

    The extracted file is written into the directory that holds the uploaded
    PDF. The attachment name comes from the PDF and is therefore untrusted:
    it is reduced to a bare file name before being used as a path, so it can
    neither escape that directory nor overwrite the uploaded PDF.

    Args:
        pdf_file: The PDF file containing the attachment (path read from `.name`)
        attachment_name: Name of the attachment to extract
        current_attachments: Current list of attachments (not used here)
    Returns:
        Tuple of (file_path, status_message); file_path is None on failure
    """
    if not pdf_file:
        return None, "❌ Please upload a PDF file first"
    if not attachment_name:
        return None, "❌ Please select an attachment to download"
    try:
        pdf_path = Path(pdf_file.name)
        # Extract the attachment content under its original embedded name
        with PdfService(pdf_path) as pdf:
            content = pdf.extract_attachment(attachment_name)
        # Save next to the PDF under a sanitized file name
        output_path = pdf_path.parent / _safe_download_name(attachment_name, pdf_path.name)
        with open(output_path, 'wb') as f:
            f.write(content)
        return str(output_path), f"✓ Successfully extracted: {attachment_name}"
    except KeyError:
        return None, f"❌ Attachment '{attachment_name}' not found in PDF"
    except Exception as e:
        return None, f"❌ Error extracting attachment: {str(e)}"


def _safe_download_name(attachment_name: str, reserved_name: str) -> str:
    """Reduce an embedded attachment name to a bare file name that is safe to write beside the PDF."""
    # Treat backslashes as separators too, so "..\\x" is caught on POSIX.
    candidate = Path(attachment_name.replace("\\", "/")).name
    if candidate in ("", ".", ".."):
        candidate = "attachment"
    # Never write over the uploaded PDF itself.
    if candidate == reserved_name:
        candidate = f"extracted_{candidate}"
    return candidate
def get_attachment_choices(attachments: List[Dict]):
    """
    Build the dropdown update listing every attachment's file name.

    Args:
        attachments: List of attachment dictionaries
    Returns:
        Gradio Dropdown update with the new choices and no selection
    """
    names = [entry['filename'] for entry in attachments] if attachments else []
    return gr.Dropdown(choices=names, value=None)
def render_attachments_list(attachments: Optional[List[Dict]]) -> str:
    """
    Render the list of attachments as HTML.

    File names and descriptions are HTML-escaped before being inserted,
    because they come from user input or from the uploaded PDF.

    Args:
        attachments: List of attachment dictionaries with 'filename',
            'description' and 'size' (bytes) keys; may be None or empty
    Returns:
        An HTML fragment with one card per attachment, or a placeholder
        paragraph when there are no attachments
    """
    # Imported locally so this function does not depend on a module-level
    # import of the html module.
    from html import escape

    if not attachments:
        return "<p style='color: #666; font-style: italic;'>No attachments added yet</p>"

    cards = []
    for att in attachments:
        size_mb = att['size'] / (1024 * 1024)
        # Untrusted text: escape so markup in a name or description is shown
        # literally instead of being interpreted by the browser.
        safe_name = escape(str(att['filename']))
        safe_description = escape(str(att['description']))
        cards.append(f"""
        <div style='border: 1px solid #ddd; padding: 12px; margin: 8px 0; border-radius: 6px; background: #f9f9f9;'>
            <div style='display: flex; justify-content: space-between; align-items: start;'>
                <div style='flex: 1;'>
                    <strong style='color: #333;'>📎 {safe_name}</strong>
                    <span style='color: #666; font-size: 0.9em;'> ({size_mb:.2f} MB)</span>
                    <p style='margin: 8px 0 0 0; color: #555;'>{safe_description}</p>
                </div>
            </div>
        </div>
        """)
    return "<div style='font-family: sans-serif;'>" + "".join(cards) + "</div>"
def main():
    """
    Build the Gradio Blocks interface, wire its event handlers, and launch it.

    The page is laid out top to bottom as: an intro text, the PDF upload, a
    two-column row (current attachments with a download picker on the left,
    the add-attachment form on the right), and a final process-and-download
    section. The attachment list is kept in a gr.State value.
    """
    with gr.Blocks(title="Incorporate data in PDF") as demo:
        # Intro text shown at the top of the page.
        gr.Markdown("""
# 📦 Incorporate Data in PDF
**Transform your PDF into a data container** by incorporate files directly within it. PDFs aren't just static documents—they can carry datasets, supplementary files, and supporting materials alongside your content.
**Why Embed Files in PDFs?**
- **📊 Research & Reports**: Attach raw datasets, analysis scripts, or supplementary tables to academic papers and technical reports
- **📈 Business Documents**: Include spreadsheets, financial data, or supporting evidence within proposals and presentations
- **📝 Documentation**: Bundle configuration files, code samples, or reference materials with technical documentation
- **🔗 Data Provenance**: Keep source data and processed documents together for complete traceability
- **✉️ Simplified Sharing**: Send one file instead of managing multiple attachments—everything travels together
Upload your PDF, add files with descriptions, and create a self-contained document package.
Brought you by [OneOffTech](https://oneofftech.xyz). Created using [Parxy](https://github.com/OneOffTech/parxy)
""")
        # State to store attachments (a list of attachment dictionaries)
        attachments_state = gr.State([])
        # Step 1: PDF upload (full width at top)
        gr.Markdown("### 1. Upload PDF File")
        gr.Markdown("*Maximum file size: 25 MB*")
        pdf_input = gr.File(
            label="Select PDF File",
            file_types=[".pdf"],
            type="filepath"
        )
        # Steps 2 and 3: middle section with two columns
        with gr.Row():
            # Left column: list of current attachments
            with gr.Column(scale=1):
                gr.Markdown("### 2. Current Attachments")
                # Starts with the same placeholder text that
                # render_attachments_list() returns for an empty list.
                attachments_display = gr.HTML(
                    value="<p style='color: #666; font-style: italic;'>No attachments added yet</p>"
                )
                # Download existing attachment section
                gr.Markdown("#### Download Attachment")
                attachment_selector = gr.Dropdown(
                    label="Select attachment to download",
                    choices=[],
                    interactive=True
                )
                download_attachment_btn = gr.Button("Download Selected Attachment", variant="secondary", size="sm")
                download_status = gr.Markdown("")
                # Hidden until an attachment has been extracted.
                downloaded_file = gr.File(label="Downloaded Attachment", visible=False)
            # Right column: form to add new attachments
            with gr.Column(scale=1):
                gr.Markdown("### 3. Add New Attachment")
                gr.Markdown("*Maximum file size: 10 MB per attachment*")
                attachment_file = gr.File(
                    label="Select File to Attach",
                    file_types=None,  # Allow any file type
                    type="filepath"
                )
                attachment_description = gr.Textbox(
                    label="Attachment Description",
                    placeholder="Enter a description for this attachment...",
                    lines=2
                )
                add_btn = gr.Button("Add Attachment", variant="primary")
                add_status = gr.Markdown("")
        # Step 4: process and download (full width at bottom)
        gr.Markdown("### 4. Process and Download")
        process_btn = gr.Button("Incorporate Attachments", variant="primary", size="lg")
        process_status = gr.Markdown("")
        # Hidden until a processed PDF is available.
        output_file = gr.File(
            label="Download PDF with Attachments",
            visible=False
        )
        # Event handlers
        # PDF changed: load its existing attachments and reset the output
        # file, then hide the output widget, then refresh the dropdown.
        pdf_input.change(
            fn=load_pdf_attachments,
            inputs=[pdf_input, output_file],
            outputs=[attachments_state, attachments_display, output_file, process_status]
        ).then(
            lambda: gr.File(visible=False),
            outputs=[output_file]
        ).then(
            fn=get_attachment_choices,
            inputs=[attachments_state],
            outputs=[attachment_selector]
        )
        # Add clicked: update the state and list, then clear the form, then
        # refresh the dropdown. The form is cleared whatever status
        # add_attachment returned, including validation errors.
        add_btn.click(
            fn=add_attachment,
            inputs=[attachment_file, attachment_description, attachments_state],
            outputs=[attachments_state, attachments_display, add_status]
        ).then(
            lambda: (None, ""),  # Clear the file input and description
            outputs=[attachment_file, attachment_description]
        ).then(
            fn=get_attachment_choices,
            inputs=[attachments_state],
            outputs=[attachment_selector]
        )
        # Download clicked: extract the selected attachment, then show the
        # file widget only if a file was actually produced.
        download_attachment_btn.click(
            fn=extract_and_download_attachment,
            inputs=[pdf_input, attachment_selector, attachments_state],
            outputs=[downloaded_file, download_status]
        ).then(
            lambda x: gr.File(visible=x is not None),
            inputs=[downloaded_file],
            outputs=[downloaded_file]
        )
        # Process clicked: build the PDF with attachments, then show the
        # output widget only if a file was actually produced.
        process_btn.click(
            fn=process_pdf_with_attachments,
            inputs=[pdf_input, attachments_state],
            outputs=[output_file, process_status]
        ).then(
            lambda x: gr.File(visible=x is not None),
            inputs=[output_file],
            outputs=[output_file]
        )
    # Start the app using Gradio's Soft theme.
    demo.launch(theme=gr.themes.Soft())
# Build and launch the UI only when this file is run as a script, not when
# it is imported.
if __name__ == "__main__":
    main()