Spaces:

MicroHealth
/

pdf-split

Build error

App Files Files Community

bluenevus committed on Sep 18, 2025

Commit

21c0900

verified ·

1 Parent(s): 024b572

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -130

app.py CHANGED Viewed

@@ -10,6 +10,8 @@ import logging
 import threading
 import time
 from typing import Tuple, List, Optional
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -31,168 +33,165 @@ TEMP_DIR.mkdir(exist_ok=True)
 user_sessions = {}
 class PDFProcessor:
-    """Handle PDF splitting with qpdf/pikepdf - testing actual file sizes like bash script"""
     @staticmethod
     def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
         """
-        Split PDF into segments of approximately 4.5MB, discarding any over 5MB
-        Mimics the bash script logic - incrementally test actual file sizes
         """
         kept_files = []
         stats = {
             "total_pages": 0,
             "segments_created": 0,
             "segments_discarded": 0,
-            "original_size_mb": 0,
             "total_output_size_mb": 0,
             "largest_segment_mb": 0,
             "smallest_segment_mb": float('inf')
         }
         try:
-            # Get original file size
-            stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
-            logger.info(f"Original PDF size: {stats['original_size_mb']:.2f} MB")
-            # Open PDF with pikepdf
-            with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
-                total_pages = len(pdf.pages)
-                stats["total_pages"] = total_pages
-                logger.info(f"Total pages: {total_pages}")
-                if total_pages == 0:
-                    return kept_files, stats
-                start_page = 0  # 0-indexed in Python
-                part = 1
-                while start_page < total_pages:
-                    # Update progress
-                    if progress_callback:
-                        progress = (start_page / total_pages)
-                        progress_callback(progress, f"Creating segment {part}...")
-                    # Start with just the start page
-                    end_page = start_page
-                    last_valid_end = None
-                    last_valid_path = None
-                    # Keep adding pages until we exceed the target size
-                    while end_page < total_pages:
-                        # Create test segment
-                        test_filename = f"test_segment_{part}_{end_page}.pdf"
-                        test_path = output_dir / test_filename
-                        try:
-                            # Create PDF with pages from start_page to end_page (inclusive)
-                            test_pdf = pikepdf.new()
-                            for page_idx in range(start_page, end_page + 1):
-                                test_pdf.pages.append(pdf.pages[page_idx])
-                            # Save to test actual size
-                            test_pdf.save(
-                                test_path,
-                                compress_streams=True,
-                                object_stream_mode=pikepdf.ObjectStreamMode.generate,
-                                recompress_flate=True,  # Enable recompression
-                                linearize=False  # Skip linearization for speed
-                            )
-                            # Get actual file size
-                            actual_size = test_path.stat().st_size
-                            actual_size_mb = actual_size / 1024 / 1024
-                            logger.debug(f"Test segment: pages {start_page+1}-{end_page+1}, size: {actual_size_mb:.2f} MB")
-                            if actual_size >= MAX_ALLOWED_SIZE_BYTES:
-                                # Too large
-                                if end_page == start_page:
-                                    # Single page is over 5MB - discard it
-                                    logger.warning(f"Single page {start_page+1} is {actual_size_mb:.2f} MB (>5MB) - discarding")
-                                    test_path.unlink()
-                                    stats["segments_discarded"] += 1
-                                    start_page += 1  # Skip this page
-                                    break
-                                else:
-                                    # Multiple pages - use the last valid segment
-                                    test_path.unlink()
-                                    break
-                            else:
-                                # Under 5MB - this is valid
-                                # Delete previous valid if exists
-                                if last_valid_path and last_valid_path.exists():
-                                    last_valid_path.unlink()
-                                last_valid_path = test_path
-                                last_valid_end = end_page
-                                # If we're under target size and not at the last page, try adding more
-                                if actual_size < TARGET_SEGMENT_SIZE_BYTES and end_page < total_pages - 1:
-                                    end_page += 1
-                                    continue
                                 else:
-                                    # We've reached target size or the last page
-                                    break
-                        except Exception as e:
-                            logger.error(f"Error creating test segment: {e}")
-                            if test_path.exists():
-                                test_path.unlink()
-                            break
-                    # Save the final valid segment
-                    if last_valid_path and last_valid_path.exists():
-                        # Rename to final name
-                        final_filename = f"segment_{part:03d}_pages_{start_page+1}-{last_valid_end+1}.pdf"
-                        final_path = output_dir / final_filename
-                        last_valid_path.rename(final_path)
-                        # Record stats
-                        final_size = final_path.stat().st_size
-                        final_size_mb = final_size / 1024 / 1024
-                        kept_files.append(final_path)
-                        stats["segments_created"] += 1
-                        stats["total_output_size_mb"] += final_size_mb
-                        stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
-                        stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
-                        logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_valid_end+1})")
-                        # Move to next segment
-                        start_page = last_valid_end + 1
-                    else:
-                        # No valid segment created (shouldn't happen unless all pages > 5MB)
-                        if start_page < total_pages:
-                            start_page += 1
-                    part += 1
-                    # Clean up any remaining test files
-                    for test_file in output_dir.glob("test_segment_*.pdf"):
-                        try:
-                            test_file.unlink()
-                        except:
-                            pass
-                # Final stats cleanup
-                if stats["smallest_segment_mb"] == float('inf'):
-                    stats["smallest_segment_mb"] = 0
-                if progress_callback:
-                    progress_callback(1.0, "Splitting complete!")
-                logger.info(f"Splitting complete: {stats['segments_created']} segments created, {stats['segments_discarded']} discarded")
         except Exception as e:
-            logger.error(f"Error splitting PDF: {str(e)}")
-            # Clean up test files on error
-            for test_file in output_dir.glob("test_segment_*.pdf"):
-                try:
-                    test_file.unlink()
-                except:
-                    pass
             raise
         return kept_files, stats

 import threading
 import time
 from typing import Tuple, List, Optional
+import subprocess
+import json
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 user_sessions = {}
 class PDFProcessor:
+    """Handle PDF splitting using qpdf directly for performance"""
+    @staticmethod
+    def get_pdf_info(pdf_path: Path) -> dict:
+        """Get PDF info using qpdf"""
+        try:
+            result = subprocess.run(
+                ["qpdf", "--show-npages", str(pdf_path)],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            return {"total_pages": int(result.stdout.strip())}
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Error getting PDF info: {e}")
+            raise
     @staticmethod
     def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
         """
+        Split PDF using qpdf directly (like your bash script) for much better performance
         """
         kept_files = []
         stats = {
             "total_pages": 0,
             "segments_created": 0,
             "segments_discarded": 0,
+            "original_size_mb": input_path.stat().st_size / 1024 / 1024,
             "total_output_size_mb": 0,
             "largest_segment_mb": 0,
             "smallest_segment_mb": float('inf')
         }
         try:
+            # Get total pages using qpdf
+            pdf_info = PDFProcessor.get_pdf_info(input_path)
+            total_pages = pdf_info["total_pages"]
+            stats["total_pages"] = total_pages
+            if total_pages == 0:
+                return kept_files, stats
+            logger.info(f"Starting split: {total_pages} pages, original size: {stats['original_size_mb']:.2f} MB")
+            start_page = 1  # qpdf uses 1-based indexing
+            part = 1
+            while start_page <= total_pages:
+                if progress_callback:
+                    progress = ((start_page - 1) / total_pages)
+                    progress_callback(progress, f"Processing segment {part}...")
+                # Binary search for the right number of pages
+                low = start_page
+                high = min(start_page + 100, total_pages)  # Start with max 100 pages
+                best_end = start_page
+                best_size = 0
+                # First, quickly find a rough upper bound
+                test_file = output_dir / f"test_{part}.pdf"
+                while low <= high:
+                    mid = (low + high) // 2
+                    # Create test segment using qpdf
+                    try:
+                        subprocess.run(
+                            ["qpdf", "--empty", "--pages", str(input_path), f"{start_page}-{mid}", "--", str(test_file)],
+                            capture_output=True,
+                            check=True,
+                            timeout=10  # 10 second timeout
+                        )
+                        # Check file size
+                        if test_file.exists():
+                            size = test_file.stat().st_size
+                            if size <= MAX_ALLOWED_SIZE_BYTES:
+                                best_end = mid
+                                best_size = size
+                                if size < TARGET_SEGMENT_SIZE_BYTES * 0.9:  # Less than 90% of target
+                                    low = mid + 1  # Try more pages
                                 else:
+                                    break  # Good enough, close to target
+                            else:
+                                high = mid - 1  # Too big, try fewer pages
+                            # Clean up test file
+                            test_file.unlink()
+                    except subprocess.CalledProcessError as e:
+                        logger.error(f"qpdf error: {e}")
+                        if test_file.exists():
+                            test_file.unlink()
+                        high = mid - 1
+                    except subprocess.TimeoutExpired:
+                        logger.error(f"qpdf timeout for pages {start_page}-{mid}")
+                        if test_file.exists():
+                            test_file.unlink()
+                        high = mid - 1
+                # Create final segment with best found size
+                if best_end >= start_page:
+                    final_filename = f"segment_{part:03d}_p{start_page}-{best_end}.pdf"
+                    final_path = output_dir / final_filename
+                    try:
+                        # Create final segment
+                        subprocess.run(
+                            ["qpdf", "--empty", "--pages", str(input_path), f"{start_page}-{best_end}", "--",
+                             str(final_path), "--compress-streams=y", "--object-streams=generate"],
+                            capture_output=True,
+                            check=True,
+                            timeout=30
+                        )
+                        if final_path.exists():
+                            final_size = final_path.stat().st_size
+                            final_size_mb = final_size / 1024 / 1024
+                            if final_size <= MAX_ALLOWED_SIZE_BYTES:
+                                kept_files.append(final_path)
+                                stats["segments_created"] += 1
+                                stats["total_output_size_mb"] += final_size_mb
+                                stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
+                                stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
+                                logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page}-{best_end})")
+                            else:
+                                # Single page over 5MB
+                                final_path.unlink()
+                                stats["segments_discarded"] += 1
+                                logger.warning(f"Segment {part} exceeded 5MB limit - discarded")
+                    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
+                        logger.error(f"Error creating final segment: {e}")
+                        if final_path.exists():
+                            final_path.unlink()
+                    start_page = best_end + 1
+                else:
+                    # Single page is too large, skip it
+                    logger.warning(f"Page {start_page} exceeds size limit - skipping")
+                    stats["segments_discarded"] += 1
+                    start_page += 1
+                part += 1
+            if stats["smallest_segment_mb"] == float('inf'):
+                stats["smallest_segment_mb"] = 0
+            if progress_callback:
+                progress_callback(1.0, "Splitting complete!")
+            logger.info(f"Completed: {stats['segments_created']} segments created, {stats['segments_discarded']} discarded")
         except Exception as e:
+            logger.error(f"Error in split_pdf_by_size: {str(e)}")
             raise
         return kept_files, stats