Spaces:

MicroHealth
/

pdf-split

Build error

App Files Files Community

bluenevus committed on Sep 18, 2025

Commit

024b572

verified ·

1 Parent(s): cdf1f66

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -87

app.py CHANGED Viewed

@@ -31,13 +31,13 @@ TEMP_DIR.mkdir(exist_ok=True)
 user_sessions = {}
 class PDFProcessor:
-    """Handle PDF splitting with qpdf/pikepdf - using incremental size checking like bash script"""
     @staticmethod
     def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
         """
         Split PDF into segments of approximately 4.5MB, discarding any over 5MB
-        Uses the same incremental approach as the bash script
         """
         kept_files = []
         stats = {
@@ -53,149 +53,144 @@ class PDFProcessor:
         try:
             # Get original file size
             stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
             # Open PDF with pikepdf
             with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
                 total_pages = len(pdf.pages)
                 stats["total_pages"] = total_pages
                 if total_pages == 0:
                     return kept_files, stats
-                start_page = 0
                 part = 1
                 while start_page < total_pages:
-                    # Start with a single page
-                    end_page = start_page
-                    temp_segment = None
-                    last_good_segment = None
-                    last_good_end = start_page
                     # Update progress
                     if progress_callback:
                         progress = (start_page / total_pages)
-                        progress_callback(progress, f"Processing segment {part}, starting at page {start_page + 1}...")
-                    # Keep adding pages until we exceed the size limit
                     while end_page < total_pages:
-                        # Create temporary segment with pages from start_page to end_page (inclusive)
-                        temp_filename = f"temp_segment_{part}.pdf"
-                        temp_path = output_dir / temp_filename
                         try:
-                            # Create new PDF with selected pages
-                            segment_pdf = pikepdf.new()
-                            # Add pages from start_page to end_page (inclusive)
-                            for page_num in range(start_page, end_page + 1):
-                                segment_pdf.pages.append(pdf.pages[page_num])
-                            # Save with compression
-                            segment_pdf.save(
-                                temp_path,
                                 compress_streams=True,
                                 object_stream_mode=pikepdf.ObjectStreamMode.generate,
-                                linearize=False  # Don't linearize to save time during testing
                             )
-                            # Check file size
-                            segment_size = temp_path.stat().st_size
-                            segment_size_mb = segment_size / 1024 / 1024
-                            logger.debug(f"Testing segment {part}: pages {start_page+1}-{end_page+1}, size: {segment_size_mb:.2f} MB")
-                            if segment_size < TARGET_SEGMENT_SIZE_BYTES:
-                                # Still under target size, keep this as last good and try adding more pages
-                                if last_good_segment and last_good_segment.exists():
-                                    last_good_segment.unlink()  # Delete previous good segment
-                                last_good_segment = temp_path
-                                last_good_end = end_page
-                                # If we're at the last page, this is our final segment
-                                if end_page == total_pages - 1:
-                                    break
-                                # Try adding one more page
-                                end_page += 1
-                            elif segment_size <= MAX_ALLOWED_SIZE_BYTES:
-                                # Between 4.5MB and 5MB - this is acceptable, use it
-                                if last_good_segment and last_good_segment.exists():
-                                    last_good_segment.unlink()
-                                last_good_segment = temp_path
-                                last_good_end = end_page
-                                break  # Stop here, we found a good size
-                            else:
-                                # Over 5MB limit
-                                temp_path.unlink()  # Delete oversized segment
                                 if end_page == start_page:
                                     # Single page is over 5MB - discard it
-                                    logger.warning(f"Single page {start_page+1} exceeds 5MB limit - discarding")
                                     stats["segments_discarded"] += 1
-                                    last_good_end = start_page  # Move past this page
                                     break
                                 else:
-                                    # Multiple pages - use the last good segment
                                     break
                         except Exception as e:
-                            logger.error(f"Error creating segment: {e}")
-                            if temp_path and temp_path.exists():
-                                temp_path.unlink()
                             break
-                    # Save the final segment for this part
-                    if last_good_segment and last_good_segment.exists():
                         # Rename to final name
-                        final_filename = f"segment_{part:03d}_p{start_page+1}-{last_good_end+1}.pdf"
                         final_path = output_dir / final_filename
-                        last_good_segment.rename(final_path)
-                        # Check final size and add to kept files
                         final_size = final_path.stat().st_size
                         final_size_mb = final_size / 1024 / 1024
-                        if final_size <= MAX_ALLOWED_SIZE_BYTES:
-                            kept_files.append(final_path)
-                            stats["segments_created"] += 1
-                            stats["total_output_size_mb"] += final_size_mb
-                            stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
-                            stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
-                            logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_good_end+1})")
-                        else:
-                            # Should not happen, but just in case
-                            final_path.unlink()
-                            stats["segments_discarded"] += 1
-                            logger.warning(f"Final segment {part} exceeded 5MB limit after rename")
-                    # Move to next segment
-                    start_page = last_good_end + 1
                     part += 1
-                    # Clean up any remaining temp files
-                    for temp_file in output_dir.glob("temp_segment_*.pdf"):
                         try:
-                            temp_file.unlink()
                         except:
                             pass
-                # Final cleanup
                 if stats["smallest_segment_mb"] == float('inf'):
                     stats["smallest_segment_mb"] = 0
                 if progress_callback:
                     progress_callback(1.0, "Splitting complete!")
         except Exception as e:
             logger.error(f"Error splitting PDF: {str(e)}")
-            # Clean up temp files on error
-            for temp_file in output_dir.glob("temp_segment_*.pdf"):
                 try:
-                    temp_file.unlink()
                 except:
                     pass
             raise

 user_sessions = {}
 class PDFProcessor:
+    """Handle PDF splitting with qpdf/pikepdf - testing actual file sizes like bash script"""
     @staticmethod
     def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
         """
         Split PDF into segments of approximately 4.5MB, discarding any over 5MB
+        Mimics the bash script logic - incrementally test actual file sizes
         """
         kept_files = []
         stats = {
         try:
             # Get original file size
             stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
+            logger.info(f"Original PDF size: {stats['original_size_mb']:.2f} MB")
             # Open PDF with pikepdf
             with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
                 total_pages = len(pdf.pages)
                 stats["total_pages"] = total_pages
+                logger.info(f"Total pages: {total_pages}")
                 if total_pages == 0:
                     return kept_files, stats
+                start_page = 0  # 0-indexed in Python
                 part = 1
                 while start_page < total_pages:
                     # Update progress
                     if progress_callback:
                         progress = (start_page / total_pages)
+                        progress_callback(progress, f"Creating segment {part}...")
+                    # Start with just the start page
+                    end_page = start_page
+                    last_valid_end = None
+                    last_valid_path = None
+                    # Keep adding pages until we exceed the target size
                     while end_page < total_pages:
+                        # Create test segment
+                        test_filename = f"test_segment_{part}_{end_page}.pdf"
+                        test_path = output_dir / test_filename
                         try:
+                            # Create PDF with pages from start_page to end_page (inclusive)
+                            test_pdf = pikepdf.new()
+                            for page_idx in range(start_page, end_page + 1):
+                                test_pdf.pages.append(pdf.pages[page_idx])
+                            # Save to test actual size
+                            test_pdf.save(
+                                test_path,
                                 compress_streams=True,
                                 object_stream_mode=pikepdf.ObjectStreamMode.generate,
+                                recompress_flate=True,  # Enable recompression
+                                linearize=False  # Skip linearization for speed
                             )
+                            # Get actual file size
+                            actual_size = test_path.stat().st_size
+                            actual_size_mb = actual_size / 1024 / 1024
+                            logger.debug(f"Test segment: pages {start_page+1}-{end_page+1}, size: {actual_size_mb:.2f} MB")
+                            if actual_size >= MAX_ALLOWED_SIZE_BYTES:
+                                # Too large
                                 if end_page == start_page:
                                     # Single page is over 5MB - discard it
+                                    logger.warning(f"Single page {start_page+1} is {actual_size_mb:.2f} MB (>5MB) - discarding")
+                                    test_path.unlink()
                                     stats["segments_discarded"] += 1
+                                    start_page += 1  # Skip this page
+                                    break
+                                else:
+                                    # Multiple pages - use the last valid segment
+                                    test_path.unlink()
                                     break
+                            else:
+                                # Under 5MB - this is valid
+                                # Delete previous valid if exists
+                                if last_valid_path and last_valid_path.exists():
+                                    last_valid_path.unlink()
+                                last_valid_path = test_path
+                                last_valid_end = end_page
+                                # If we're under target size and not at the last page, try adding more
+                                if actual_size < TARGET_SEGMENT_SIZE_BYTES and end_page < total_pages - 1:
+                                    end_page += 1
+                                    continue
                                 else:
+                                    # We've reached target size or the last page
                                     break
                         except Exception as e:
+                            logger.error(f"Error creating test segment: {e}")
+                            if test_path.exists():
+                                test_path.unlink()
                             break
+                    # Save the final valid segment
+                    if last_valid_path and last_valid_path.exists():
                         # Rename to final name
+                        final_filename = f"segment_{part:03d}_pages_{start_page+1}-{last_valid_end+1}.pdf"
                         final_path = output_dir / final_filename
+                        last_valid_path.rename(final_path)
+                        # Record stats
                         final_size = final_path.stat().st_size
                         final_size_mb = final_size / 1024 / 1024
+                        kept_files.append(final_path)
+                        stats["segments_created"] += 1
+                        stats["total_output_size_mb"] += final_size_mb
+                        stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
+                        stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
+                        logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_valid_end+1})")
+                        # Move to next segment
+                        start_page = last_valid_end + 1
+                    else:
+                        # No valid segment created (shouldn't happen unless all pages > 5MB)
+                        if start_page < total_pages:
+                            start_page += 1
                     part += 1
+                    # Clean up any remaining test files
+                    for test_file in output_dir.glob("test_segment_*.pdf"):
                         try:
+                            test_file.unlink()
                         except:
                             pass
+                # Final stats cleanup
                 if stats["smallest_segment_mb"] == float('inf'):
                     stats["smallest_segment_mb"] = 0
                 if progress_callback:
                     progress_callback(1.0, "Splitting complete!")
+                logger.info(f"Splitting complete: {stats['segments_created']} segments created, {stats['segments_discarded']} discarded")
         except Exception as e:
             logger.error(f"Error splitting PDF: {str(e)}")
+            # Clean up test files on error
+            for test_file in output_dir.glob("test_segment_*.pdf"):
                 try:
+                    test_file.unlink()
                 except:
                     pass
             raise