Spaces:

MicroHealth
/

pdf-split

Build error

App Files Files Community

bluenevus committed on Sep 18, 2025

Commit

3809b5f

verified ·

1 Parent(s): b2d0f10

Update app.py

Browse files

Files changed (1) hide show

app.py +174 -92

app.py CHANGED Viewed

@@ -2,50 +2,77 @@ import gradio as gr
 import pikepdf
 import os
 import zipfile
-import tempfile
 import shutil
 from pathlib import Path
 import uuid
 from datetime import datetime, timedelta
 import threading
 import time
-from typing import Tuple, Optional
-import logging
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Configuration
-MAX_FILE_SIZE_MB = 5
-CHUNK_SIZE_MB = 4.5
-MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
-CHUNK_SIZE_BYTES = int(CHUNK_SIZE_MB * 1024 * 1024)
 TEMP_DIR = Path("temp_files")
 CLEANUP_AFTER_MINUTES = 10
 # Create temp directory
 TEMP_DIR.mkdir(exist_ok=True)
-# Store user sessions
 user_sessions = {}
 class PDFProcessor:
     """Handle PDF splitting with qpdf/pikepdf"""
     @staticmethod
-    def split_pdf(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[list, dict]:
         """
-        Split PDF into chunks using pikepdf (qpdf wrapper)
-        Returns: (list of output files, statistics dict)
         """
-        output_files = []
         stats = {
             "total_pages": 0,
             "segments_created": 0,
             "segments_discarded": 0,
             "original_size_mb": 0,
-            "total_output_size_mb": 0
         }
         try:
@@ -57,25 +84,31 @@ class PDFProcessor:
                 total_pages = len(pdf.pages)
                 stats["total_pages"] = total_pages
-                # Calculate pages per segment
-                file_size = input_path.stat().st_size
-                avg_page_size = file_size / total_pages if total_pages > 0 else file_size
-                pages_per_segment = max(1, int(CHUNK_SIZE_BYTES / avg_page_size))
                 segment_num = 0
                 page_start = 0
                 while page_start < total_pages:
                     page_end = min(page_start + pages_per_segment, total_pages)
-                    segment_num += 1
                     # Update progress
                     if progress_callback:
                         progress = (page_start / total_pages)
-                        progress_callback(progress, f"Processing segment {segment_num}...")
-                    # Create segment filename
-                    segment_filename = f"segment_{segment_num:04d}_pages_{page_start+1}-{page_end}.pdf"
                     segment_path = output_dir / segment_filename
                     # Create new PDF with selected pages
@@ -83,35 +116,75 @@ class PDFProcessor:
                     for page_num in range(page_start, page_end):
                         segment_pdf.pages.append(pdf.pages[page_num])
-                    # Save with compression
                     segment_pdf.save(
                         segment_path,
                         compress_streams=True,
                         object_stream_mode=pikepdf.ObjectStreamMode.generate,
-                        linearize=True
                     )
                     # Check segment size
                     segment_size = segment_path.stat().st_size
-                    if segment_size <= MAX_FILE_SIZE_BYTES:
-                        output_files.append(segment_path)
                         stats["segments_created"] += 1
-                        stats["total_output_size_mb"] += segment_size / 1024 / 1024
-                        logger.info(f"Created segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")
                     else:
-                        # If single page is too large, still keep it but mark as oversized
                         if page_end - page_start == 1:
-                            output_files.append(segment_path)
                             stats["segments_discarded"] += 1
-                            logger.warning(f"Segment {segment_num} exceeds size limit but kept (single page)")
                         else:
-                            # Try with fewer pages
-                            segment_path.unlink()
-                            pages_per_segment = max(1, pages_per_segment // 2)
-                            continue
-                    page_start = page_end
                 if progress_callback:
                     progress_callback(1.0, "Splitting complete!")
@@ -120,7 +193,7 @@ class PDFProcessor:
             logger.error(f"Error splitting PDF: {str(e)}")
             raise
-        return output_files, stats
 class SessionManager:
     """Manage user sessions and cleanup"""
@@ -170,12 +243,11 @@ cleanup_thread.start()
 def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, str]:
     """
     Main processing function for Gradio interface
-    Returns: (zip_file_path, statistics_html, status_message)
     """
     if file_obj is None:
         return None, "", "⚠️ Please upload a PDF file"
-    session_id = str(uuid.uuid4())
     session_dir = SessionManager.create_session(session_id)
     try:
@@ -202,63 +274,68 @@ def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, s
         output_dir = session_dir / "output"
         output_dir.mkdir(exist_ok=True)
-        # Split PDF with progress updates
-        progress(0.3, "Splitting PDF into segments...")
         def update_progress(value, message):
-            # Scale progress from 0.3 to 0.8 for splitting phase
             scaled_progress = 0.3 + (value * 0.5)
             progress(scaled_progress, message)
-        output_files, stats = PDFProcessor.split_pdf(
             input_path,
             output_dir,
             progress_callback=update_progress
         )
         if not output_files:
-            return None, "", "❌ No valid segments created"
         # Create ZIP file
         progress(0.9, "Creating ZIP archive...")
-        zip_path = session_dir / f"pdf_segments_{session_id[:8]}.zip"
         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
             for file_path in output_files:
                 zipf.write(file_path, file_path.name)
-        # Generate statistics HTML with fixed styling
         stats_html = f"""
-        <div style="padding: 20px; background: #e8f4f8; border-radius: 10px; margin: 10px 0; border: 1px solid #0369a1;">
-            <h3 style="color: #0369a1; margin-top: 0;">📊 Processing Results</h3>
-            <table style="width: 100%; border-collapse: collapse;">
-                <tr style="border-bottom: 1px solid #94a3b8;">
-                    <td style="padding: 8px; font-weight: bold; color: #1e293b;">📄 Total Pages:</td>
-                    <td style="padding: 8px; text-align: right; color: #334155;">{stats['total_pages']}</td>
                 </tr>
-                <tr style="border-bottom: 1px solid #94a3b8;">
-                    <td style="padding: 8px; font-weight: bold; color: #1e293b;">✅ Segments Created:</td>
-                    <td style="padding: 8px; text-align: right; color: #334155;">{stats['segments_created']}</td>
                 </tr>
-                <tr style="border-bottom: 1px solid #94a3b8;">
-                    <td style="padding: 8px; font-weight: bold; color: #1e293b;">📦 Original Size:</td>
-                    <td style="padding: 8px; text-align: right; color: #334155;">{stats['original_size_mb']:.2f} MB</td>
                 </tr>
-                <tr style="border-bottom: 1px solid #94a3b8;">
-                    <td style="padding: 8px; font-weight: bold; color: #1e293b;">📁 Total Output Size:</td>
-                    <td style="padding: 8px; text-align: right; color: #334155;">{stats['total_output_size_mb']:.2f} MB</td>
                 </tr>
-                <tr>
-                    <td style="padding: 8px; font-weight: bold; color: #1e293b;">💾 Compression Ratio:</td>
-                    <td style="padding: 8px; text-align: right; color: #334155;">
-                        {((1 - stats['total_output_size_mb'] / stats['original_size_mb']) * 100) if stats['original_size_mb'] > 0 else 0:.1f}%
-                    </td>
                 </tr>
             </table>
             <p style="margin-top: 15px; color: #059669; font-weight: bold;">
                 ✨ Your file has been split successfully!
             </p>
-            <p style="margin-top: 10px; color: #64748b; font-size: 0.9em;">
                 ⏱️ Files will be automatically deleted after {CLEANUP_AFTER_MINUTES} minutes
             </p>
         </div>
@@ -280,8 +357,11 @@ def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, s
             pass
         return None, "", f"❌ Error: {str(e)}"
-# Custom CSS to fix text visibility and button styling
-custom_css = """
     .gradio-container {
         max-width: 800px;
         margin: auto;
@@ -319,27 +399,21 @@ custom_css = """
         color: #1f2937 !important;
         font-weight: 500;
     }
-"""
-# Create Gradio interface with fixed theme
-with gr.Blocks(
-    title="PDF Splitter - Fast & Simple",
-    theme=gr.themes.Base(),  # Using Base theme for better control
-    css=custom_css
 ) as app:
     gr.Markdown("""
     # 📄 PDF Splitter Tool
-    **Split large PDFs into smaller segments quickly and efficiently!**
-    This tool uses advanced compression to split your PDF into segments of approximately **4.5 MB** each.
-    Files are processed using qpdf for optimal performance without decompressing the PDF.
     ### How to use:
     1. Upload your PDF file
     2. Click "Split PDF"
-    3. Download the ZIP file containing all segments
     *Note: Files are automatically deleted after 10 minutes for your privacy.*
     """)
@@ -354,7 +428,7 @@ with gr.Blocks(
             )
             split_btn = gr.Button(
-                "🚀 Split PDF",
                 variant="primary",
                 size="lg",
                 elem_classes="split-button"
@@ -368,7 +442,7 @@ with gr.Blocks(
     with gr.Row():
         download_file = gr.File(
-            label="📦 Download ZIP",
             visible=True,
             elem_classes="download-section",
             interactive=False  # Make it non-interactive until file is ready
@@ -384,16 +458,24 @@ with gr.Blocks(
     # Add features with proper styling
     gr.Markdown("""
     ---
-    ### 💡 Features:
-    - ✅ Handles compressed PDFs efficiently using qpdf
-    - ✅ Automatic file cleanup for privacy
-    - ✅ Progress tracking during processing
-    - ✅ Creates ZIP archive for easy download
-    - ✅ Optimized for Hugging Face Spaces
-    ### 🔒 Privacy:
-    All uploaded files are automatically deleted after processing and download.
-    No files are stored permanently on the server.
     """, elem_classes="features-section")
 # Launch the app

 import pikepdf
 import os
 import zipfile
 import shutil
 from pathlib import Path
 import uuid
 from datetime import datetime, timedelta
+import logging
 import threading
 import time
+from typing import Tuple, List, Optional
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Configuration - FIXED VALUES
+TARGET_SEGMENT_SIZE_MB = 4.5  # Target size for each segment
+MAX_ALLOWED_SIZE_MB = 5.0     # Maximum allowed size - discard if larger
+TARGET_SEGMENT_SIZE_BYTES = int(TARGET_SEGMENT_SIZE_MB * 1024 * 1024)  # 4.5MB in bytes
+MAX_ALLOWED_SIZE_BYTES = int(MAX_ALLOWED_SIZE_MB * 1024 * 1024)       # 5MB in bytes
 TEMP_DIR = Path("temp_files")
 CLEANUP_AFTER_MINUTES = 10
 # Create temp directory
 TEMP_DIR.mkdir(exist_ok=True)
+# Store user sessions for cleanup
 user_sessions = {}
 class PDFProcessor:
     """Handle PDF splitting with qpdf/pikepdf"""
     @staticmethod
+    def estimate_pages_for_size(pdf, total_pages: int, target_size_bytes: int) -> int:
+        """
+        Estimate how many pages fit in the target size.
+
+        The whole document is saved once to a scratch file to measure its
+        size; the average bytes-per-page is then used to work out how many
+        pages fit in 90% of ``target_size_bytes``.
+
+        Args:
+            pdf: Object exposing ``save(path)`` (the opened PDF passed by the caller).
+            total_pages: Number of pages in ``pdf``.
+            target_size_bytes: Desired size of one segment, in bytes.
+
+        Returns:
+            Estimated pages per segment, never less than 1. If the estimate
+            cannot be computed, falls back to 10% of ``total_pages``
+            (also at least 1).
+        """
+        # Local import: the module-level `import tempfile` is not present in this file.
+        import tempfile
+        try:
+            # A private scratch directory per call keeps concurrent requests from
+            # overwriting each other's file, and the context manager removes it
+            # even when save() or stat() raises.
+            with tempfile.TemporaryDirectory() as scratch_dir:
+                temp_file = Path(scratch_dir) / "temp_estimate.pdf"
+                # Save the entire PDF temporarily to get its size
+                pdf.save(temp_file)
+                total_size = temp_file.stat().st_size
+            # Calculate average page size
+            avg_page_size = total_size / total_pages if total_pages > 0 else total_size
+            # Estimate pages that fit in target size (with 10% safety margin)
+            estimated_pages = int((target_size_bytes * 0.9) / avg_page_size)
+            return max(1, estimated_pages)  # At least 1 page
+        except Exception as e:
+            # Best-effort estimate: any failure (including a zero-byte save
+            # causing a division by zero) degrades to a coarse fallback.
+            logger.error(f"Error estimating page size: {e}")
+            return max(1, total_pages // 10)  # Fallback to 10% of pages
+    @staticmethod
+    def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
         """
+        Split PDF into segments of approximately 4.5MB, discarding any over 5MB
         """
+        kept_files = []
+        discarded_count = 0
         stats = {
             "total_pages": 0,
             "segments_created": 0,
             "segments_discarded": 0,
             "original_size_mb": 0,
+            "total_output_size_mb": 0,
+            "largest_segment_mb": 0,
+            "smallest_segment_mb": float('inf')
         }
         try:
                 total_pages = len(pdf.pages)
                 stats["total_pages"] = total_pages
+                if total_pages == 0:
+                    return kept_files, stats
+                # Initial estimate of pages per segment
+                pages_per_segment = PDFProcessor.estimate_pages_for_size(
+                    pdf, total_pages, TARGET_SEGMENT_SIZE_BYTES
+                )
                 segment_num = 0
                 page_start = 0
+                retry_count = 0
+                max_retries = 3
                 while page_start < total_pages:
+                    # Calculate page range for this segment
                     page_end = min(page_start + pages_per_segment, total_pages)
                     # Update progress
                     if progress_callback:
                         progress = (page_start / total_pages)
+                        progress_callback(progress, f"Processing pages {page_start+1}-{page_end} of {total_pages}...")
+                    # Create segment
+                    segment_num += 1
+                    segment_filename = f"segment_{segment_num:03d}_p{page_start+1}-{page_end}.pdf"
                     segment_path = output_dir / segment_filename
                     # Create new PDF with selected pages
                     for page_num in range(page_start, page_end):
                         segment_pdf.pages.append(pdf.pages[page_num])
+                    # Save with compression to minimize size
                     segment_pdf.save(
                         segment_path,
                         compress_streams=True,
+                        stream_decode_level=pikepdf.StreamDecodeLevel.none,
                         object_stream_mode=pikepdf.ObjectStreamMode.generate,
+                        linearize=True,
+                        recompress_flate=True
                     )
                     # Check segment size
                     segment_size = segment_path.stat().st_size
+                    segment_size_mb = segment_size / 1024 / 1024
+                    logger.info(f"Segment {segment_num}: {segment_size_mb:.2f} MB ({page_end - page_start} pages)")
+                    if segment_size <= MAX_ALLOWED_SIZE_BYTES:
+                        # File is under 5MB limit - keep it
+                        kept_files.append(segment_path)
                         stats["segments_created"] += 1
+                        stats["total_output_size_mb"] += segment_size_mb
+                        # Track largest and smallest segments
+                        stats["largest_segment_mb"] = max(stats["largest_segment_mb"], segment_size_mb)
+                        stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], segment_size_mb)
+                        # Move to next segment
+                        page_start = page_end
+                        retry_count = 0  # Reset retry count for next segment
+                        # Adjust pages per segment based on actual size
+                        if segment_size_mb < 4.0 and pages_per_segment < total_pages:
+                            # Segment is too small, try more pages next time
+                            pages_per_segment = min(pages_per_segment + 1, total_pages - page_end)
+                        elif segment_size_mb > 4.8:
+                            # Segment is getting close to limit, use fewer pages
+                            pages_per_segment = max(1, pages_per_segment - 1)
                     else:
+                        # File exceeds 5MB limit
+                        logger.warning(f"Segment {segment_num} too large ({segment_size_mb:.2f} MB)")
                         if page_end - page_start == 1:
+                            # Single page is over 5MB - discard and move on
+                            logger.warning(f"Single page {page_start+1} exceeds 5MB limit - discarding")
+                            segment_path.unlink()  # Delete the file
                             stats["segments_discarded"] += 1
+                            discarded_count += 1
+                            page_start = page_end  # Move to next page
+                            retry_count = 0
                         else:
+                            # Multiple pages - try with fewer pages
+                            segment_path.unlink()  # Delete the oversized file
+                            if retry_count < max_retries:
+                                # Reduce pages by half and retry
+                                pages_per_segment = max(1, (page_end - page_start) // 2)
+                                retry_count += 1
+                                segment_num -= 1  # Reuse segment number
+                                logger.info(f"Retrying with {pages_per_segment} pages")
+                            else:
+                                # Too many retries, try single pages
+                                pages_per_segment = 1
+                                retry_count = 0
+                                segment_num -= 1
+                # Clean up stats
+                if stats["smallest_segment_mb"] == float('inf'):
+                    stats["smallest_segment_mb"] = 0
                 if progress_callback:
                     progress_callback(1.0, "Splitting complete!")
             logger.error(f"Error splitting PDF: {str(e)}")
             raise
+        return kept_files, stats
 class SessionManager:
     """Manage user sessions and cleanup"""
 def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, str]:
     """
     Main processing function for Gradio interface
     """
     if file_obj is None:
         return None, "", "⚠️ Please upload a PDF file"
+    session_id = str(uuid.uuid4())[:8]
     session_dir = SessionManager.create_session(session_id)
     try:
         output_dir = session_dir / "output"
         output_dir.mkdir(exist_ok=True)
+        # Split PDF with size constraints
+        progress(0.3, "Splitting PDF into 4.5MB segments...")
         def update_progress(value, message):
             scaled_progress = 0.3 + (value * 0.5)
             progress(scaled_progress, message)
+        output_files, stats = PDFProcessor.split_pdf_by_size(
             input_path,
             output_dir,
             progress_callback=update_progress
         )
         if not output_files:
+            return None, "", "❌ No valid segments created (all segments exceeded 5MB limit)"
         # Create ZIP file
         progress(0.9, "Creating ZIP archive...")
+        zip_path = session_dir / f"pdf_segments_{session_id}.zip"
         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
             for file_path in output_files:
                 zipf.write(file_path, file_path.name)
+        # Generate statistics with proper styling
         stats_html = f"""
+        <div style="padding: 20px; background: #f0f9ff; border-radius: 10px; margin: 10px 0; border: 2px solid #0284c7;">
+            <h3 style="color: #0c4a6e; margin-top: 0;">📊 Processing Results</h3>
+            <table style="width: 100%; border-collapse: collapse; background: white; border-radius: 5px;">
+                <tr style="border-bottom: 1px solid #e2e8f0;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">📄 Total Pages:</td>
+                    <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['total_pages']}</td>
+                </tr>
+                <tr style="border-bottom: 1px solid #e2e8f0; background: #f8fafc;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">✅ Segments Created (≤5MB):</td>
+                    <td style="padding: 10px; text-align: right; color: #16a34a; font-weight: 600;">{stats['segments_created']}</td>
                 </tr>
+                <tr style="border-bottom: 1px solid #e2e8f0;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">❌ Segments Discarded (>5MB):</td>
+                    <td style="padding: 10px; text-align: right; color: #dc2626; font-weight: 600;">{stats['segments_discarded']}</td>
                 </tr>
+                <tr style="border-bottom: 1px solid #e2e8f0; background: #f8fafc;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">📦 Original Size:</td>
+                    <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['original_size_mb']:.2f} MB</td>
                 </tr>
+                <tr style="border-bottom: 1px solid #e2e8f0;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">📁 Total Output Size:</td>
+                    <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['total_output_size_mb']:.2f} MB</td>
                 </tr>
+                <tr style="border-bottom: 1px solid #e2e8f0; background: #f8fafc;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">📈 Largest Segment:</td>
+                    <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['largest_segment_mb']:.2f} MB</td>
+                </tr>
+                <tr style="background: white;">
+                    <td style="padding: 10px; font-weight: bold; color: #334155;">📉 Smallest Segment:</td>
+                    <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['smallest_segment_mb']:.2f} MB</td>
                 </tr>
             </table>
             <p style="margin-top: 15px; color: #059669; font-weight: bold;">
                 ✨ Your file has been split successfully!
             </p>
+            <p style="margin-top: 10px; color: #6b7280; font-size: 0.9em;">
                 ⏱️ Files will be automatically deleted after {CLEANUP_AFTER_MINUTES} minutes
             </p>
         </div>
             pass
         return None, "", f"❌ Error: {str(e)}"
+# Create Gradio interface with fixed theme
+with gr.Blocks(
+    title="PDF Splitter - Fast & Simple",
+    theme=gr.themes.Base(),  # Using Base theme for better control
+    css="""
     .gradio-container {
         max-width: 800px;
         margin: auto;
         color: #1f2937 !important;
         font-weight: 500;
     }
+    """
 ) as app:
     gr.Markdown("""
     # 📄 PDF Splitter Tool
+    **Split large PDFs into 4.5MB segments - Files over 5MB are automatically discarded!**
+    This tool uses advanced compression with qpdf to split your PDF into segments of approximately **4.5 MB** each.
+    Any segments that exceed **5 MB** are automatically discarded to ensure all output files meet size requirements.
     ### How to use:
     1. Upload your PDF file
     2. Click "Split PDF"
+    3. Download the ZIP file containing only segments ≤5MB
     *Note: Files are automatically deleted after 10 minutes for your privacy.*
     """)
             )
             split_btn = gr.Button(
+                "🚀 Split PDF into 4.5MB Segments",
                 variant="primary",
                 size="lg",
                 elem_classes="split-button"
     with gr.Row():
         download_file = gr.File(
+            label="📦 Download ZIP (Contains only segments ≤5MB)",
             visible=True,
             elem_classes="download-section",
             interactive=False  # Make it non-interactive until file is ready
     # Add features with proper styling
     gr.Markdown("""
     ---
+    ### 💡 Key Features:
+    - ✅ **Target segment size: 4.5MB** - Optimized for most systems
+    - ✅ **Maximum allowed size: 5MB** - Segments over 5MB are automatically discarded
+    - ✅ **Smart splitting** - Adjusts page count per segment dynamically
+    - ✅ **Compressed output** - Uses qpdf for efficient PDF compression
+    - ✅ **Automatic cleanup** - Files deleted after 10 minutes
+    - ✅ **Progress tracking** - Real-time updates during processing
+    ### 🔒 Privacy & Security:
+    - All uploaded files are automatically deleted after processing
+    - No files are stored permanently on the server
+    - Each user gets a unique session ID for file isolation
+    ### ⚙️ Technical Details:
+    - Uses **pikepdf** (qpdf wrapper) for efficient PDF manipulation
+    - Maintains PDF compression without decompressing
+    - Dynamically adjusts segment size based on page content
+    - Automatically retries with fewer pages if segment exceeds limits
     """, elem_classes="features-section")
 # Launch the app