import json
import os
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import gradio as gr
from dotenv import load_dotenv
from parxy_cli.services.pdf_service import PdfService

# Load environment variables
load_dotenv()

_BYTES_PER_MB = 1024 * 1024

# Maximum file size for attachments (10 MB in bytes)
MAX_ATTACHMENT_SIZE = 10 * _BYTES_PER_MB

# Maximum file size for PDF uploads (25 MB in bytes)
MAX_PDF_SIZE = 25 * _BYTES_PER_MB


@dataclass
class Attachment:
    """Represents a file attachment with its metadata."""

    filename: str
    filepath: str
    description: str
    size: int


def get_attachments_from_state(attachments_state: List[Dict]) -> List[Attachment]:
    """
    Retrieve the current list of attachments for the session.

    Only entries whose ``filepath`` points at an existing file are returned.
    Entries with an empty ``filepath`` (the form ``load_pdf_attachments``
    produces for attachments already embedded in the PDF) are skipped.

    Args:
        attachments_state: The session state containing attachment dictionaries

    Returns:
        List of Attachment objects

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent state.
    """
    if not attachments_state:
        return []

    attachments = []
    for att_dict in attachments_state:
        filepath = att_dict.get('filepath')
        if filepath and os.path.exists(filepath):
            attachments.append(Attachment(
                filename=att_dict['filename'],
                filepath=filepath,
                description=att_dict['description'],
                size=att_dict['size'],
            ))
    return attachments


def add_attachment_to_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Add a single attachment to the PDF file, saving the PDF in place.

    Args:
        pdf_path: Path to the PDF file
        attachment: The Attachment object to add

    Returns:
        True if successful, False otherwise
    """
    try:
        with PdfService(Path(pdf_path)) as pdf:
            pdf.add_attachment(
                Path(attachment.filepath),
                name=attachment.filename,
                desc=attachment.description,
            )
            pdf.save(Path(pdf_path))
        return True
    except Exception as e:
        # Best-effort helper: failures are reported through the return value.
        print(f"Error adding attachment to PDF: {e}")
        return False


def remove_attachment_from_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Remove an attachment from the PDF file, saving the PDF in place.

    Args:
        pdf_path: Path to the PDF file
        attachment: The Attachment object to remove

    Returns:
        True if successful, False otherwise
    """
    try:
        with PdfService(Path(pdf_path)) as pdf:
            pdf.remove_attachment(attachment.filename)
            pdf.save(Path(pdf_path))
        return True
    except KeyError:
        print(f"Attachment '{attachment.filename}' not found in PDF")
        return False
    except Exception as e:
        # Best-effort helper: failures are reported through the return value.
        print(f"Error removing attachment from PDF: {e}")
        return False


def list_pdf_attachments(pdf_path: str) -> List[Attachment]:
    """
    List all attachments currently embedded in the PDF file.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Attachment objects found in the PDF. An empty list is
        returned when the PDF cannot be read.
    """
    try:
        with PdfService(Path(pdf_path)) as pdf:
            attachments = []
            for name in pdf.list_attachments():
                # Get detailed info for each attachment
                info = pdf.get_attachment_info(name)
                attachments.append(Attachment(
                    filename=name,
                    filepath='',  # PDF attachments don't have a source filepath
                    # `or` guards against keys that are present but None.
                    description=info.get('desc', '') or '',
                    size=info.get('size', 0) or 0,
                ))
            return attachments
    except Exception as e:
        print(f"Error listing PDF attachments: {e}")
        return []


def attach_files_to_pdf(pdf_path: str, attachments: List[Attachment]) -> str:
    """
    Create a new PDF with all attachments embedded.

    The result is written next to the input as ``<stem>_with_attachments.pdf``.
    The name is derived from the file name only, so directory components that
    contain ".pdf" are left alone and inputs with an upper-case or missing
    extension no longer resolve to the input path itself.

    Args:
        pdf_path: Path to the input PDF file
        attachments: List of Attachment objects to add to the PDF

    Returns:
        Path to the output PDF file with attachments

    Raises:
        Exception: Whatever PdfService raises is logged and re-raised. Any
            partially written output file is removed first.
    """
    source = Path(pdf_path)
    output = source.with_name(f"{source.stem}_with_attachments.pdf")

    try:
        with PdfService(source) as pdf:
            # Add each attachment to the PDF
            for att in attachments:
                pdf.add_attachment(
                    Path(att.filepath),
                    name=att.filename,
                    desc=att.description,
                )
            # Save to output path
            pdf.save(output)
    except Exception as e:
        print(f"Error attaching files to PDF: {e}")
        # Do not leave a file that claims to contain attachments but does not.
        try:
            output.unlink()
        except OSError:
            pass
        raise

    print(f"Successfully attached {len(attachments)} files to PDF")
    for att in attachments:
        print(f"  - {att.filename} ({att.size} bytes): {att.description}")
    return str(output)


def _validate_size(file_path: str, max_bytes: int, label: str) -> Tuple[bool, str]:
    """Check that *file_path* exists and is at most *max_bytes* long."""
    if not file_path or not os.path.exists(file_path):
        return False, f"{label} does not exist"

    file_size = os.path.getsize(file_path)
    if file_size > max_bytes:
        size_mb = file_size / _BYTES_PER_MB
        limit_mb = max_bytes // _BYTES_PER_MB
        return False, f"{label} size ({size_mb:.2f} MB) exceeds {limit_mb} MB limit"
    return True, ""


def validate_file_size(file_path: str) -> Tuple[bool, str]:
    """Validate that a file is within the attachment size limit.

    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid.
    """
    return _validate_size(file_path, MAX_ATTACHMENT_SIZE, "File")


def validate_pdf_size(file_path: str) -> Tuple[bool, str]:
    """Validate that a PDF file is within the PDF size limit.

    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid.
    """
    return _validate_size(file_path, MAX_PDF_SIZE, "PDF file")


def process_pdf_with_attachments(
    pdf_file,
    attachments_state: List[Dict]
) -> Tuple[Optional[str], str]:
    """
    Process the PDF file and add attachments to it.

    Args:
        pdf_file: The uploaded PDF file
        attachments_state: List of attachment dictionaries (session-specific)

    Returns:
        Tuple of (output_file_path, status_message). The path is None when
        validation or processing fails.

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent attachments_state.
    """
    if not pdf_file:
        return None, "❌ Please upload a PDF file"

    # Validate PDF file size
    is_valid, error_msg = validate_pdf_size(pdf_file.name)
    if not is_valid:
        return None, f"❌ {error_msg}"

    if not attachments_state:
        return None, "❌ Please add at least one attachment"

    # Get attachments for this session
    attachments = get_attachments_from_state(attachments_state)
    if not attachments:
        return None, "❌ No valid attachments found"

    try:
        output_path = attach_files_to_pdf(pdf_file.name, attachments)
    except Exception as e:
        return None, f"❌ Error processing PDF: {str(e)}"
    return output_path, f"✓ Successfully processed PDF with {len(attachments)} attachment(s)"


def add_attachment(
    attachment_file,
    description: str,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str, str]:
    """
    Add a new attachment to the session's list.

    Args:
        attachment_file: The file to attach
        description: Description of the attachment
        current_attachments: Current session-specific list of attachments

    Returns:
        Tuple of (updated_attachments_list, attachment_list_html, status_message)

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session;
        the PDF itself is not touched here.
    """
    if not attachment_file:
        return (
            current_attachments,
            render_attachments_list(current_attachments),
            "❌ Please select a file to attach",
        )

    # Validate file size
    is_valid, error_msg = validate_file_size(attachment_file.name)
    if not is_valid:
        return (
            current_attachments,
            render_attachments_list(current_attachments),
            f"❌ {error_msg}",
        )

    if not description or not description.strip():
        return (
            current_attachments,
            render_attachments_list(current_attachments),
            "❌ Please provide a description for the attachment",
        )

    # Create new attachment
    filename = os.path.basename(attachment_file.name)
    new_attachment = {
        'filename': filename,
        'filepath': attachment_file.name,
        'description': description.strip(),
        'size': os.path.getsize(attachment_file.name),
    }

    # Build a new list rather than mutating the one held in session state.
    updated_attachments = [*(current_attachments or []), new_attachment]

    # TODO: to embed into the PDF immediately, accept the PDF path as a
    # parameter and call add_attachment_to_pdf() here.

    return (
        updated_attachments,
        render_attachments_list(updated_attachments),
        f"✓ Added attachment: {filename}",
    )


def remove_attachment(
    index: int,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str]:
    """
    Remove an attachment from the session's list by index.

    An index that is missing, not a whole number, or out of range leaves the
    list unchanged.

    Args:
        index: Index of the attachment to remove
        current_attachments: Current session-specific list of attachments

    Returns:
        Tuple of (updated_attachments_list, attachment_list_html)

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session.
    """
    # Accept whole-number floats so a numeric widget value does not crash us.
    if isinstance(index, float) and index.is_integer():
        index = int(index)

    if (
        current_attachments
        and isinstance(index, int)
        and 0 <= index < len(current_attachments)
    ):
        current_attachments = (
            current_attachments[:index] + current_attachments[index + 1:]
        )
        # TODO: to remove from the PDF immediately, accept the PDF path as a
        # parameter and call remove_attachment_from_pdf() here.

    return current_attachments, render_attachments_list(current_attachments)


def load_pdf_attachments(pdf_file, previous_output_file) -> Tuple[List[Dict], str, None, str]:
    """
    Load attachments from an uploaded PDF file and reset download state.

    Args:
        pdf_file: The uploaded PDF file
        previous_output_file: The previous output file path to clean up

    Returns:
        Tuple of (attachments_list, attachment_list_html, None for output_file,
        status message)
    """
    # Clean up previous output file if it exists
    if previous_output_file and os.path.exists(previous_output_file):
        try:
            os.remove(previous_output_file)
            print(f"Cleaned up temporary file: {previous_output_file}")
        except Exception as e:
            print(f"Error removing temporary file: {e}")

    if not pdf_file:
        return [], render_attachments_list([]), None, ""

    # Validate PDF file size
    is_valid, error_msg = validate_pdf_size(pdf_file.name)
    if not is_valid:
        return [], render_attachments_list([]), None, f"❌ {error_msg}"

    try:
        # Get existing attachments from the PDF and convert to the
        # dictionary format kept in session state.
        attachments_list = [
            asdict(att) for att in list_pdf_attachments(pdf_file.name)
        ]
        if attachments_list:
            status_msg = f"✓ Loaded PDF with {len(attachments_list)} existing attachment(s)"
        else:
            status_msg = "✓ PDF loaded successfully"
        return attachments_list, render_attachments_list(attachments_list), None, status_msg
    except Exception as e:
        print(f"Error loading PDF attachments: {e}")
        return [], render_attachments_list([]), None, f"❌ Error loading PDF: {str(e)}"


def extract_and_download_attachment(
    pdf_file,
    attachment_name: str,
    current_attachments: List[Dict]
) -> Tuple[Optional[str], str]:
    """
    Extract an attachment from the PDF and save it for download.

    The file is written next to the uploaded PDF. Attachment names can come
    from the uploaded PDF, so only the final path component is used as the
    output file name; directory parts, absolute paths and ".." are discarded.

    Args:
        pdf_file: The PDF file containing the attachment
        attachment_name: Name of the attachment to extract
        current_attachments: Current list of attachments (unused here)

    Returns:
        Tuple of (file_path, status_message). The path is None on failure.
    """
    if not pdf_file:
        return None, "❌ Please upload a PDF file first"

    if not attachment_name:
        return None, "❌ Please select an attachment to download"

    try:
        pdf_path = Path(pdf_file.name)

        # Extract the attachment content
        with PdfService(pdf_path) as pdf:
            content = pdf.extract_attachment(attachment_name)

        # Reduce the embedded name to a bare file name before touching disk.
        safe_name = Path(str(attachment_name).replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            safe_name = "attachment"

        output_path = pdf_path.parent / safe_name
        if output_path == pdf_path:
            # Never overwrite the uploaded PDF with one of its attachments.
            output_path = pdf_path.parent / f"attachment_{safe_name}"

        output_path.write_bytes(content)
        return str(output_path), f"✓ Successfully extracted: {attachment_name}"
    except KeyError:
        return None, f"❌ Attachment '{attachment_name}' not found in PDF"
    except Exception as e:
        return None, f"❌ Error extracting attachment: {str(e)}"


def get_attachment_choices(attachments: List[Dict]):
    """
    Get list of attachment names for dropdown.

    Args:
        attachments: List of attachment dictionaries

    Returns:
        Gradio Dropdown update with new choices and no selection
    """
    choices = [att['filename'] for att in attachments or []]
    return gr.Dropdown(choices=choices, value=None)


# NOTE(review): The source from this point on is damaged. The HTML inside the
# string literals was stripped, which destroyed the body of
# `render_attachments_list` and the opening of `main()`. The surviving header
# is preserved verbatim below so the function can be restored from the
# original file:
#
#   def render_attachments_list(attachments: Optional[List[Dict]]) -> str:
#       """Render the list of attachments as HTML."""
#       if not attachments or len(attachments) == 0:
#           return "
No attachments added yet
" html = "{att['description']}
No attachments added yet
" ) # Download existing attachment section gr.Markdown("#### Download Attachment") attachment_selector = gr.Dropdown( label="Select attachment to download", choices=[], interactive=True ) download_attachment_btn = gr.Button("Download Selected Attachment", variant="secondary", size="sm") download_status = gr.Markdown("") downloaded_file = gr.File(label="Downloaded Attachment", visible=False) # Right column: Form to add new attachments with gr.Column(scale=1): gr.Markdown("### 3. Add New Attachment") gr.Markdown("*Maximum file size: 10 MB per attachment*") attachment_file = gr.File( label="Select File to Attach", file_types=None, # Allow any file type type="filepath" ) attachment_description = gr.Textbox( label="Attachment Description", placeholder="Enter a description for this attachment...", lines=2 ) add_btn = gr.Button("Add Attachment", variant="primary") add_status = gr.Markdown("") # 3. Download Section (full width at bottom) gr.Markdown("### 4. Process and Download") process_btn = gr.Button("Incorporate Attachments", variant="primary", size="lg") process_status = gr.Markdown("") output_file = gr.File( label="Download PDF with Attachments", visible=False ) # Event handlers # Load existing attachments when PDF is uploaded and reset download state pdf_input.change( fn=load_pdf_attachments, inputs=[pdf_input, output_file], outputs=[attachments_state, attachments_display, output_file, process_status] ).then( lambda: gr.File(visible=False), outputs=[output_file] ).then( fn=get_attachment_choices, inputs=[attachments_state], outputs=[attachment_selector] ) add_btn.click( fn=add_attachment, inputs=[attachment_file, attachment_description, attachments_state], outputs=[attachments_state, attachments_display, add_status] ).then( lambda: (None, ""), # Clear the file input and description outputs=[attachment_file, attachment_description] ).then( fn=get_attachment_choices, inputs=[attachments_state], outputs=[attachment_selector] ) # Download attachment handler 
download_attachment_btn.click( fn=extract_and_download_attachment, inputs=[pdf_input, attachment_selector, attachments_state], outputs=[downloaded_file, download_status] ).then( lambda x: gr.File(visible=x is not None), inputs=[downloaded_file], outputs=[downloaded_file] ) process_btn.click( fn=process_pdf_with_attachments, inputs=[pdf_input, attachments_state], outputs=[output_file, process_status] ).then( lambda x: gr.File(visible=x is not None), inputs=[output_file], outputs=[output_file] ) demo.launch(theme=gr.themes.Soft()) if __name__ == "__main__": main()