Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import json | |
| from typing import List, Tuple, Optional, Dict | |
| from dataclasses import dataclass, asdict | |
| from dotenv import load_dotenv | |
| from pathlib import Path | |
| from parxy_cli.services.pdf_service import PdfService | |
# Load environment variables
load_dotenv()

# Maximum file size for attachments (10 MB in bytes).
# Compared against os.path.getsize() in validate_file_size().
MAX_ATTACHMENT_SIZE = 10 * 1024 * 1024

# Maximum file size for PDF uploads (25 MB in bytes).
# Compared against os.path.getsize() in validate_pdf_size().
MAX_PDF_SIZE = 25 * 1024 * 1024
@dataclass
class Attachment:
    """
    Represents a file attachment with its metadata.

    The class is a dataclass so that it can be built with keyword arguments
    (``Attachment(filename=..., filepath=..., ...)``), which is how every
    caller in this module constructs it.

    Attributes:
        filename: Name under which the file is embedded in the PDF.
        filepath: Path of the source file on disk; an empty string for
            attachments listed from a PDF, which have no source file.
        description: Free-text description stored with the attachment.
        size: Size of the attachment in bytes.
    """

    filename: str
    filepath: str
    description: str
    size: int
def get_attachments_from_state(attachments_state: List[Dict]) -> List[Attachment]:
    """
    Build Attachment objects from the session's stored attachment dictionaries.

    Entries whose 'filepath' is missing, empty, or does not exist on disk are
    skipped, so only attachments backed by a real file are returned.

    Args:
        attachments_state: The session state containing attachment dictionaries

    Returns:
        List of Attachment objects (empty when the state is empty or None)

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent state.
    """
    if not attachments_state:
        return []

    def _has_source_file(entry: Dict) -> bool:
        # An entry is usable only when it points at a file that still exists.
        source = entry.get('filepath')
        return bool(source) and os.path.exists(source)

    return [
        Attachment(
            filename=entry['filename'],
            filepath=entry['filepath'],
            description=entry['description'],
            size=entry['size'],
        )
        for entry in attachments_state
        if _has_source_file(entry)
    ]
def add_attachment_to_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Embed one attachment into a PDF and save the result over the same file.

    Any failure is printed and reported through the return value rather than
    raised.

    Args:
        pdf_path: Path to the PDF file
        attachment: The Attachment object to add

    Returns:
        True if successful, False otherwise
    """
    try:
        target = Path(pdf_path)
        with PdfService(target) as document:
            document.add_attachment(
                Path(attachment.filepath),
                name=attachment.filename,
                desc=attachment.description,
            )
            # Save in place: the input path is also the output path.
            document.save(target)
    except Exception as exc:
        print(f"Error adding attachment to PDF: {exc}")
        return False
    return True
def remove_attachment_from_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Delete an embedded attachment from a PDF and save the result in place.

    The attachment is looked up by ``attachment.filename``. Failures are
    printed and reported through the return value rather than raised.

    Args:
        pdf_path: Path to the PDF file
        attachment: The Attachment object to remove

    Returns:
        True if successful, False otherwise
    """
    try:
        target = Path(pdf_path)
        with PdfService(target) as document:
            document.remove_attachment(attachment.filename)
            document.save(target)
    except KeyError:
        # Reported separately from other errors: the name is not in the PDF.
        print(f"Attachment '{attachment.filename}' not found in PDF")
        return False
    except Exception as exc:
        print(f"Error removing attachment from PDF: {exc}")
        return False
    return True
def list_pdf_attachments(pdf_path: str) -> List[Attachment]:
    """
    Collect the attachments currently embedded in a PDF file.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Attachment objects found in the PDF. The list is empty when
        the PDF has no attachments or when reading it fails (the error is
        printed, not raised).
    """
    found = []
    try:
        with PdfService(Path(pdf_path)) as document:
            for embedded_name in document.list_attachments():
                # Get detailed info for each attachment
                details = document.get_attachment_info(embedded_name)
                found.append(Attachment(
                    filename=embedded_name,
                    filepath='',  # PDF attachments don't have a source filepath
                    description=details.get('desc', ''),
                    size=details.get('size', 0),
                ))
    except Exception as exc:
        print(f"Error listing PDF attachments: {exc}")
        return []
    return found
def attach_files_to_pdf(pdf_path: str, attachments: List[Attachment]) -> str:
    """
    Create a new PDF with all attachments embedded.

    The output is written next to the input as ``<stem>_with_attachments.pdf``.
    The name is derived from the file stem, so directory names that contain
    ".pdf" are left alone and the output never resolves to the input path
    (which happened before for names without a lowercase ".pdf" suffix).

    Args:
        pdf_path: Path to the input PDF file
        attachments: List of Attachment objects to add to the PDF

    Returns:
        Path to the output PDF file with attachments

    Raises:
        Exception: Whatever error occurred while embedding or saving. Before
            re-raising, the original PDF is copied to the output path on a
            best-effort basis.
    """
    import shutil

    source = Path(pdf_path)
    output = source.with_name(f"{source.stem}_with_attachments.pdf")

    try:
        with PdfService(source) as pdf:
            # Add each attachment to the PDF
            for att in attachments:
                pdf.add_attachment(
                    Path(att.filepath),
                    name=att.filename,
                    desc=att.description
                )
            # Save to output path
            pdf.save(output)
    except Exception as e:
        print(f"Error attaching files to PDF: {e}")
        # Fallback: copy original file if attachment fails. A failure of the
        # copy itself is only logged so it cannot mask the original error.
        try:
            shutil.copy2(source, output)
        except OSError as copy_error:
            print(f"Error copying original PDF to output path: {copy_error}")
        raise

    # Reporting happens outside the try block so that a problem while logging
    # cannot trigger the fallback copy over a successfully written output.
    print(f"Successfully attached {len(attachments)} files to PDF")
    for att in attachments:
        print(f" - {att.filename} ({att.size} bytes): {att.description}")
    return str(output)
def validate_file_size(file_path: str, max_size: Optional[int] = None) -> Tuple[bool, str]:
    """
    Validate that a file is within the size limit.

    Args:
        file_path: Path of the file to check
        max_size: Maximum allowed size in bytes. When omitted, the module
            constant MAX_ATTACHMENT_SIZE is used.

    Returns:
        Tuple of (is_valid, error_message). The message is empty when the
        file is valid; otherwise it says that the file is missing or reports
        the file size and the limit in MB.
    """
    if not file_path or not os.path.exists(file_path):
        return False, "File does not exist"

    limit = MAX_ATTACHMENT_SIZE if max_size is None else max_size
    file_size = os.path.getsize(file_path)
    if file_size > limit:
        bytes_per_mb = 1024 * 1024
        size_mb = file_size / bytes_per_mb
        # The limit shown is derived from the effective limit so the message
        # cannot drift from the value that is actually enforced.
        limit_mb = limit / bytes_per_mb
        return False, f"File size ({size_mb:.2f} MB) exceeds {limit_mb:g} MB limit"
    return True, ""
def validate_pdf_size(file_path: str, max_size: Optional[int] = None) -> Tuple[bool, str]:
    """
    Validate that a PDF file is within the size limit.

    Args:
        file_path: Path of the PDF file to check
        max_size: Maximum allowed size in bytes. When omitted, the module
            constant MAX_PDF_SIZE is used.

    Returns:
        Tuple of (is_valid, error_message). The message is empty when the
        file is valid; otherwise it says that the file is missing or reports
        the file size and the limit in MB.
    """
    if not file_path or not os.path.exists(file_path):
        return False, "PDF file does not exist"

    limit = MAX_PDF_SIZE if max_size is None else max_size
    file_size = os.path.getsize(file_path)
    if file_size > limit:
        bytes_per_mb = 1024 * 1024
        size_mb = file_size / bytes_per_mb
        # The limit shown is derived from the effective limit so the message
        # cannot drift from the value that is actually enforced.
        limit_mb = limit / bytes_per_mb
        return False, f"PDF file size ({size_mb:.2f} MB) exceeds {limit_mb:g} MB limit"
    return True, ""
def process_pdf_with_attachments(
    pdf_file,
    attachments_state: List[Dict]
) -> Tuple[Optional[str], str]:
    """
    Embed the session's attachments into the uploaded PDF.

    The checks run in order: a PDF must be present, it must pass the PDF size
    validation, the session must hold at least one attachment, and at least
    one of them must still be backed by a file on disk.

    Args:
        pdf_file: The uploaded PDF file
        attachments_state: List of attachment dictionaries (session-specific)

    Returns:
        Tuple of (output_file_path, status_message); the path is None
        whenever a check fails or processing raises.

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent attachments_state.
    """
    if not pdf_file:
        return None, "❌ Please upload a PDF file"

    # Validate PDF file size
    size_ok, size_error = validate_pdf_size(pdf_file.name)
    if not size_ok:
        return None, f"❌ {size_error}"

    if not attachments_state:
        return None, "❌ Please add at least one attachment"

    # Get attachments for this session
    usable = get_attachments_from_state(attachments_state)
    if not usable:
        return None, "❌ No valid attachments found"

    try:
        result_path = attach_files_to_pdf(pdf_file.name, usable)
    except Exception as exc:
        return None, f"❌ Error processing PDF: {exc}"
    return result_path, f"✓ Successfully processed PDF with {len(usable)} attachment(s)"
def add_attachment(
    attachment_file,
    description: str,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str, str]:
    """
    Add a new attachment to the session's list.

    On success a new list is returned; the list passed in is not modified.
    On any validation failure the list passed in is returned unchanged.

    Args:
        attachment_file: The file to attach
        description: Description of the attachment
        current_attachments: Current session-specific list of attachments

    Returns:
        Tuple of (updated_attachments_list, attachment_list_html, status_message)

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session;
        nothing is written to the PDF here.
    """
    if not attachment_file:
        return current_attachments, render_attachments_list(current_attachments), "❌ Please select a file to attach"

    # Validate file size
    is_valid, error_msg = validate_file_size(attachment_file.name)
    if not is_valid:
        return current_attachments, render_attachments_list(current_attachments), f"❌ Error {error_msg}"

    # Strip once; a description made only of whitespace counts as missing.
    clean_description = description.strip() if description else ""
    if not clean_description:
        return current_attachments, render_attachments_list(current_attachments), "❌ Please provide a description for the attachment"

    # Create new attachment
    filename = os.path.basename(attachment_file.name)
    new_attachment = {
        'filename': filename,
        'filepath': attachment_file.name,
        'description': clean_description,
        'size': os.path.getsize(attachment_file.name)
    }

    # Build a new session-specific list instead of appending in place, so the
    # caller's list object is never mutated (None is treated as empty).
    updated_attachments = [*(current_attachments or []), new_attachment]

    # NOTE: the PDF is only updated when the user triggers processing; to
    # attach immediately, the PDF path would have to be passed in and
    # add_attachment_to_pdf() called here.
    return (
        updated_attachments,
        render_attachments_list(updated_attachments),
        f"✓ Added attachment: {filename}"
    )
def remove_attachment(
    index: int,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str]:
    """
    Drop the attachment at the given position from the session's list.

    An out-of-range index (or an empty/None list) leaves the list untouched.

    Args:
        index: Index of the attachment to remove
        current_attachments: Current session-specific list of attachments

    Returns:
        Tuple of (updated_attachments_list, attachment_list_html)

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session;
        nothing is removed from the PDF here.
    """
    index_in_range = bool(current_attachments) and 0 <= index < len(current_attachments)
    if index_in_range:
        # The list is modified in place; the same object is handed back.
        # To remove from the PDF immediately as well, the PDF path would have
        # to be passed in and remove_attachment_from_pdf() called here.
        del current_attachments[index]
    return current_attachments, render_attachments_list(current_attachments)
def load_pdf_attachments(pdf_file, previous_output_file) -> Tuple[List[Dict], str, None, str]:
    """
    Read the attachments of a newly uploaded PDF and reset the download state.

    Args:
        pdf_file: The uploaded PDF file
        previous_output_file: The previous output file path to clean up

    Returns:
        Tuple of (attachments_list, attachment_list_html, None for output_file,
        status message). The status is empty when no PDF is given.
    """
    def _empty_result(status: str):
        # Shared shape for every "nothing loaded" outcome.
        return [], render_attachments_list([]), None, status

    # Clean up previous output file if it exists; failure is only logged.
    if previous_output_file and os.path.exists(previous_output_file):
        try:
            os.remove(previous_output_file)
            print(f"Cleaned up temporary file: {previous_output_file}")
        except Exception as cleanup_error:
            print(f"Error removing temporary file: {cleanup_error}")

    if not pdf_file:
        return _empty_result("")

    # Validate PDF file size
    size_ok, size_error = validate_pdf_size(pdf_file.name)
    if not size_ok:
        return _empty_result(f"❌ {size_error}")

    try:
        # Get existing attachments from the PDF, in dictionary format for state
        attachments_list = [
            {
                'filename': found.filename,
                'filepath': found.filepath,
                'description': found.description,
                'size': found.size,
            }
            for found in list_pdf_attachments(pdf_file.name)
        ]
        if attachments_list:
            status_msg = f"✓ Loaded PDF with {len(attachments_list)} existing attachment(s)"
        else:
            status_msg = "✓ PDF loaded successfully"
        return attachments_list, render_attachments_list(attachments_list), None, status_msg
    except Exception as load_error:
        print(f"Error loading PDF attachments: {load_error}")
        return _empty_result(f"❌ Error loading PDF: {load_error}")
def extract_and_download_attachment(pdf_file, attachment_name: str, current_attachments: List[Dict]) -> Tuple[Optional[str], str]:
    """
    Extract an attachment from the PDF and save it for download.

    The attachment name is stored inside the PDF and is therefore untrusted.
    Only its final path component is used as the output file name, and the
    file is written to an "extracted_attachments" sub-directory next to the
    PDF. This prevents names such as "../x" or absolute paths from writing
    outside that directory, and prevents an attachment that shares the PDF's
    file name from overwriting the uploaded PDF.

    Args:
        pdf_file: The PDF file containing the attachment
        attachment_name: Name of the attachment to extract
        current_attachments: Current list of attachments (not used here; kept
            for the existing event-handler signature)

    Returns:
        Tuple of (file_path, status_message); file_path is None on failure.
    """
    if not pdf_file:
        return None, "❌ Please upload a PDF file first"
    if not attachment_name:
        return None, "❌ Please select an attachment to download"

    try:
        # Treat both separator styles as directory separators, then keep only
        # the last component.
        safe_name = Path(str(attachment_name).replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            return None, f"❌ Error extracting attachment: invalid attachment name '{attachment_name}'"

        pdf_path = Path(pdf_file.name)

        # Extract the attachment content (looked up by its original name)
        with PdfService(pdf_path) as pdf:
            content = pdf.extract_attachment(attachment_name)

        # Save to a dedicated directory so no sibling of the PDF is clobbered
        output_dir = pdf_path.parent / "extracted_attachments"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / safe_name
        with open(output_path, 'wb') as f:
            f.write(content)
        return str(output_path), f"✓ Successfully extracted: {attachment_name}"
    except KeyError:
        return None, f"❌ Attachment '{attachment_name}' not found in PDF"
    except Exception as e:
        return None, f"❌ Error extracting attachment: {str(e)}"
def get_attachment_choices(attachments: List[Dict]):
    """
    Build the dropdown update listing the attachment file names.

    Args:
        attachments: List of attachment dictionaries

    Returns:
        Gradio Dropdown update with new choices and no selected value; the
        choices are empty when there are no attachments.
    """
    names = [entry['filename'] for entry in attachments] if attachments else []
    return gr.Dropdown(choices=names, value=None)
def render_attachments_list(attachments: Optional[List[Dict]]) -> str:
    """
    Render the list of attachments as HTML.

    File names and descriptions come from the user or from inside an uploaded
    PDF, so both are HTML-escaped before being placed in the markup.

    Args:
        attachments: List of attachment dictionaries with 'filename',
            'description' and 'size' (bytes) keys; may be None or empty.

    Returns:
        An HTML fragment: a placeholder paragraph when there is nothing to
        show, otherwise one card per attachment inside a wrapper div.
    """
    # Local import keeps this function self-contained; only escape() is needed.
    from html import escape

    if not attachments:
        return "<p style='color: #666; font-style: italic;'>No attachments added yet</p>"

    cards = []
    for att in attachments:
        size_mb = att['size'] / (1024 * 1024)
        safe_filename = escape(str(att['filename']))
        safe_description = escape(str(att['description']))
        cards.append(f"""
        <div style='border: 1px solid #ddd; padding: 12px; margin: 8px 0; border-radius: 6px; background: #f9f9f9;'>
            <div style='display: flex; justify-content: space-between; align-items: start;'>
                <div style='flex: 1;'>
                    <strong style='color: #333;'>📎 {safe_filename}</strong>
                    <span style='color: #666; font-size: 0.9em;'> ({size_mb:.2f} MB)</span>
                    <p style='margin: 8px 0 0 0; color: #555;'>{safe_description}</p>
                </div>
            </div>
        </div>
        """)

    # Join once instead of growing a string inside the loop.
    return "<div style='font-family: sans-serif;'>" + "".join(cards) + "</div>"
def main() -> None:
    """
    Create and launch the Gradio interface.

    The page is laid out top to bottom as: PDF upload, a two-column row
    (current attachments and download on the left, the add-attachment form on
    the right), and the process/download section. The session's attachment
    list lives in a gr.State component and is passed to and returned from the
    event handlers wired at the end of the layout.
    """
    with gr.Blocks(title="Incorporate data in PDF") as demo:
        gr.Markdown("""
        # 📦 Incorporate Data in PDF
        **Transform your PDF into a data container** by incorporate files directly within it. PDFs aren't just static documents—they can carry datasets, supplementary files, and supporting materials alongside your content.
        **Why Embed Files in PDFs?**
        - **📊 Research & Reports**: Attach raw datasets, analysis scripts, or supplementary tables to academic papers and technical reports
        - **📈 Business Documents**: Include spreadsheets, financial data, or supporting evidence within proposals and presentations
        - **📝 Documentation**: Bundle configuration files, code samples, or reference materials with technical documentation
        - **🔗 Data Provenance**: Keep source data and processed documents together for complete traceability
        - **✉️ Simplified Sharing**: Send one file instead of managing multiple attachments—everything travels together
        Upload your PDF, add files with descriptions, and create a self-contained document package.
        Brought you by [OneOffTech](https://oneofftech.xyz). Created using [Parxy](https://github.com/OneOffTech/parxy)
        """)

        # State to store attachments (a list of attachment dictionaries)
        attachments_state = gr.State([])

        # 1. File Upload Section (full width at top)
        gr.Markdown("### 1. Upload PDF File")
        gr.Markdown("*Maximum file size: 25 MB*")
        pdf_input = gr.File(
            label="Select PDF File",
            file_types=[".pdf"],
            type="filepath"
        )

        # 2. Middle section with two columns
        with gr.Row():
            # Left column: List of current attachments
            with gr.Column(scale=1):
                gr.Markdown("### 2. Current Attachments")
                attachments_display = gr.HTML(
                    value="<p style='color: #666; font-style: italic;'>No attachments added yet</p>"
                )

                # Download existing attachment section
                gr.Markdown("#### Download Attachment")
                attachment_selector = gr.Dropdown(
                    label="Select attachment to download",
                    choices=[],
                    interactive=True
                )
                download_attachment_btn = gr.Button("Download Selected Attachment", variant="secondary", size="sm")
                download_status = gr.Markdown("")
                # Hidden until an attachment has been extracted
                downloaded_file = gr.File(label="Downloaded Attachment", visible=False)

            # Right column: Form to add new attachments
            with gr.Column(scale=1):
                gr.Markdown("### 3. Add New Attachment")
                gr.Markdown("*Maximum file size: 10 MB per attachment*")
                attachment_file = gr.File(
                    label="Select File to Attach",
                    file_types=None,  # Allow any file type
                    type="filepath"
                )
                attachment_description = gr.Textbox(
                    label="Attachment Description",
                    placeholder="Enter a description for this attachment...",
                    lines=2
                )
                add_btn = gr.Button("Add Attachment", variant="primary")
                add_status = gr.Markdown("")

        # 3. Download Section (full width at bottom)
        gr.Markdown("### 4. Process and Download")
        process_btn = gr.Button("Incorporate Attachments", variant="primary", size="lg")
        process_status = gr.Markdown("")
        # Hidden until a processed PDF is available
        output_file = gr.File(
            label="Download PDF with Attachments",
            visible=False
        )

        # Event handlers
        # Load existing attachments when PDF is uploaded and reset download state
        pdf_input.change(
            fn=load_pdf_attachments,
            inputs=[pdf_input, output_file],
            outputs=[attachments_state, attachments_display, output_file, process_status]
        ).then(
            # Hide the previous result; it no longer matches the uploaded PDF
            lambda: gr.File(visible=False),
            outputs=[output_file]
        ).then(
            # Refresh the download dropdown from the newly loaded state
            fn=get_attachment_choices,
            inputs=[attachments_state],
            outputs=[attachment_selector]
        )

        # Add the selected file to the session list, then reset the form
        add_btn.click(
            fn=add_attachment,
            inputs=[attachment_file, attachment_description, attachments_state],
            outputs=[attachments_state, attachments_display, add_status]
        ).then(
            lambda: (None, ""),  # Clear the file input and description
            outputs=[attachment_file, attachment_description]
        ).then(
            fn=get_attachment_choices,
            inputs=[attachments_state],
            outputs=[attachment_selector]
        )

        # Download attachment handler
        download_attachment_btn.click(
            fn=extract_and_download_attachment,
            inputs=[pdf_input, attachment_selector, attachments_state],
            outputs=[downloaded_file, download_status]
        ).then(
            # Show the file component only when extraction produced a file
            lambda x: gr.File(visible=x is not None),
            inputs=[downloaded_file],
            outputs=[downloaded_file]
        )

        # Embed the attachments and expose the resulting PDF for download
        process_btn.click(
            fn=process_pdf_with_attachments,
            inputs=[pdf_input, attachments_state],
            outputs=[output_file, process_status]
        ).then(
            # Show the output component only when processing produced a file
            lambda x: gr.File(visible=x is not None),
            inputs=[output_file],
            outputs=[output_file]
        )

    demo.launch(theme=gr.themes.Soft())
# Build and launch the interface only when this file is run as a script.
if __name__ == "__main__":
    main()