bluenevus committed on
Commit
024b572
·
verified ·
1 Parent(s): cdf1f66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -87
app.py CHANGED
@@ -31,13 +31,13 @@ TEMP_DIR.mkdir(exist_ok=True)
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
- """Handle PDF splitting with qpdf/pikepdf - using incremental size checking like bash script"""
35
 
36
  @staticmethod
37
  def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
38
  """
39
  Split PDF into segments of approximately 4.5MB, discarding any over 5MB
40
- Uses the same incremental approach as the bash script
41
  """
42
  kept_files = []
43
  stats = {
@@ -53,149 +53,144 @@ class PDFProcessor:
53
  try:
54
  # Get original file size
55
  stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
 
56
 
57
  # Open PDF with pikepdf
58
  with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
59
  total_pages = len(pdf.pages)
60
  stats["total_pages"] = total_pages
 
61
 
62
  if total_pages == 0:
63
  return kept_files, stats
64
 
65
- start_page = 0
66
  part = 1
67
 
68
  while start_page < total_pages:
69
- # Start with a single page
70
- end_page = start_page
71
- temp_segment = None
72
- last_good_segment = None
73
- last_good_end = start_page
74
-
75
  # Update progress
76
  if progress_callback:
77
  progress = (start_page / total_pages)
78
- progress_callback(progress, f"Processing segment {part}, starting at page {start_page + 1}...")
 
 
 
 
 
79
 
80
- # Keep adding pages until we exceed the size limit
81
  while end_page < total_pages:
82
- # Create temporary segment with pages from start_page to end_page (inclusive)
83
- temp_filename = f"temp_segment_{part}.pdf"
84
- temp_path = output_dir / temp_filename
85
 
86
  try:
87
- # Create new PDF with selected pages
88
- segment_pdf = pikepdf.new()
89
-
90
- # Add pages from start_page to end_page (inclusive)
91
- for page_num in range(start_page, end_page + 1):
92
- segment_pdf.pages.append(pdf.pages[page_num])
93
 
94
- # Save with compression
95
- segment_pdf.save(
96
- temp_path,
97
  compress_streams=True,
98
  object_stream_mode=pikepdf.ObjectStreamMode.generate,
99
- linearize=False # Don't linearize to save time during testing
 
100
  )
101
 
102
- # Check file size
103
- segment_size = temp_path.stat().st_size
104
- segment_size_mb = segment_size / 1024 / 1024
105
 
106
- logger.debug(f"Testing segment {part}: pages {start_page+1}-{end_page+1}, size: {segment_size_mb:.2f} MB")
107
 
108
- if segment_size < TARGET_SEGMENT_SIZE_BYTES:
109
- # Still under target size, keep this as last good and try adding more pages
110
- if last_good_segment and last_good_segment.exists():
111
- last_good_segment.unlink() # Delete previous good segment
112
- last_good_segment = temp_path
113
- last_good_end = end_page
114
-
115
- # If we're at the last page, this is our final segment
116
- if end_page == total_pages - 1:
117
- break
118
-
119
- # Try adding one more page
120
- end_page += 1
121
-
122
- elif segment_size <= MAX_ALLOWED_SIZE_BYTES:
123
- # Between 4.5MB and 5MB - this is acceptable, use it
124
- if last_good_segment and last_good_segment.exists():
125
- last_good_segment.unlink()
126
- last_good_segment = temp_path
127
- last_good_end = end_page
128
- break # Stop here, we found a good size
129
-
130
- else:
131
- # Over 5MB limit
132
- temp_path.unlink() # Delete oversized segment
133
-
134
  if end_page == start_page:
135
  # Single page is over 5MB - discard it
136
- logger.warning(f"Single page {start_page+1} exceeds 5MB limit - discarding")
 
137
  stats["segments_discarded"] += 1
138
- last_good_end = start_page # Move past this page
 
 
 
 
139
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  else:
141
- # Multiple pages - use the last good segment
142
  break
143
 
144
  except Exception as e:
145
- logger.error(f"Error creating segment: {e}")
146
- if temp_path and temp_path.exists():
147
- temp_path.unlink()
148
  break
149
 
150
- # Save the final segment for this part
151
- if last_good_segment and last_good_segment.exists():
152
  # Rename to final name
153
- final_filename = f"segment_{part:03d}_p{start_page+1}-{last_good_end+1}.pdf"
154
  final_path = output_dir / final_filename
155
- last_good_segment.rename(final_path)
156
 
157
- # Check final size and add to kept files
158
  final_size = final_path.stat().st_size
159
  final_size_mb = final_size / 1024 / 1024
160
 
161
- if final_size <= MAX_ALLOWED_SIZE_BYTES:
162
- kept_files.append(final_path)
163
- stats["segments_created"] += 1
164
- stats["total_output_size_mb"] += final_size_mb
165
- stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
166
- stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
167
-
168
- logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_good_end+1})")
169
- else:
170
- # Should not happen, but just in case
171
- final_path.unlink()
172
- stats["segments_discarded"] += 1
173
- logger.warning(f"Final segment {part} exceeded 5MB limit after rename")
 
174
 
175
- # Move to next segment
176
- start_page = last_good_end + 1
177
  part += 1
178
 
179
- # Clean up any remaining temp files
180
- for temp_file in output_dir.glob("temp_segment_*.pdf"):
181
  try:
182
- temp_file.unlink()
183
  except:
184
  pass
185
 
186
- # Final cleanup
187
  if stats["smallest_segment_mb"] == float('inf'):
188
  stats["smallest_segment_mb"] = 0
189
 
190
  if progress_callback:
191
  progress_callback(1.0, "Splitting complete!")
 
 
192
 
193
  except Exception as e:
194
  logger.error(f"Error splitting PDF: {str(e)}")
195
- # Clean up temp files on error
196
- for temp_file in output_dir.glob("temp_segment_*.pdf"):
197
  try:
198
- temp_file.unlink()
199
  except:
200
  pass
201
  raise
 
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
+ """Handle PDF splitting with qpdf/pikepdf - testing actual file sizes like bash script"""
35
 
36
  @staticmethod
37
  def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
38
  """
39
  Split PDF into segments of approximately 4.5MB, discarding any over 5MB
40
+ Mimics the bash script logic - incrementally test actual file sizes
41
  """
42
  kept_files = []
43
  stats = {
 
53
  try:
54
  # Get original file size
55
  stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
56
+ logger.info(f"Original PDF size: {stats['original_size_mb']:.2f} MB")
57
 
58
  # Open PDF with pikepdf
59
  with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
60
  total_pages = len(pdf.pages)
61
  stats["total_pages"] = total_pages
62
+ logger.info(f"Total pages: {total_pages}")
63
 
64
  if total_pages == 0:
65
  return kept_files, stats
66
 
67
+ start_page = 0 # 0-indexed in Python
68
  part = 1
69
 
70
  while start_page < total_pages:
 
 
 
 
 
 
71
  # Update progress
72
  if progress_callback:
73
  progress = (start_page / total_pages)
74
+ progress_callback(progress, f"Creating segment {part}...")
75
+
76
+ # Start with just the start page
77
+ end_page = start_page
78
+ last_valid_end = None
79
+ last_valid_path = None
80
 
81
+ # Keep adding pages until we exceed the target size
82
  while end_page < total_pages:
83
+ # Create test segment
84
+ test_filename = f"test_segment_{part}_{end_page}.pdf"
85
+ test_path = output_dir / test_filename
86
 
87
  try:
88
+ # Create PDF with pages from start_page to end_page (inclusive)
89
+ test_pdf = pikepdf.new()
90
+ for page_idx in range(start_page, end_page + 1):
91
+ test_pdf.pages.append(pdf.pages[page_idx])
 
 
92
 
93
+ # Save to test actual size
94
+ test_pdf.save(
95
+ test_path,
96
  compress_streams=True,
97
  object_stream_mode=pikepdf.ObjectStreamMode.generate,
98
+ recompress_flate=True, # Enable recompression
99
+ linearize=False # Skip linearization for speed
100
  )
101
 
102
+ # Get actual file size
103
+ actual_size = test_path.stat().st_size
104
+ actual_size_mb = actual_size / 1024 / 1024
105
 
106
+ logger.debug(f"Test segment: pages {start_page+1}-{end_page+1}, size: {actual_size_mb:.2f} MB")
107
 
108
+ if actual_size >= MAX_ALLOWED_SIZE_BYTES:
109
+ # Too large
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  if end_page == start_page:
111
  # Single page is over 5MB - discard it
112
+ logger.warning(f"Single page {start_page+1} is {actual_size_mb:.2f} MB (>5MB) - discarding")
113
+ test_path.unlink()
114
  stats["segments_discarded"] += 1
115
+ start_page += 1 # Skip this page
116
+ break
117
+ else:
118
+ # Multiple pages - use the last valid segment
119
+ test_path.unlink()
120
  break
121
+ else:
122
+ # Under 5MB - this is valid
123
+ # Delete previous valid if exists
124
+ if last_valid_path and last_valid_path.exists():
125
+ last_valid_path.unlink()
126
+
127
+ last_valid_path = test_path
128
+ last_valid_end = end_page
129
+
130
+ # If we're under target size and not at the last page, try adding more
131
+ if actual_size < TARGET_SEGMENT_SIZE_BYTES and end_page < total_pages - 1:
132
+ end_page += 1
133
+ continue
134
  else:
135
+ # We've reached target size or the last page
136
  break
137
 
138
  except Exception as e:
139
+ logger.error(f"Error creating test segment: {e}")
140
+ if test_path.exists():
141
+ test_path.unlink()
142
  break
143
 
144
+ # Save the final valid segment
145
+ if last_valid_path and last_valid_path.exists():
146
  # Rename to final name
147
+ final_filename = f"segment_{part:03d}_pages_{start_page+1}-{last_valid_end+1}.pdf"
148
  final_path = output_dir / final_filename
149
+ last_valid_path.rename(final_path)
150
 
151
+ # Record stats
152
  final_size = final_path.stat().st_size
153
  final_size_mb = final_size / 1024 / 1024
154
 
155
+ kept_files.append(final_path)
156
+ stats["segments_created"] += 1
157
+ stats["total_output_size_mb"] += final_size_mb
158
+ stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
159
+ stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
160
+
161
+ logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_valid_end+1})")
162
+
163
+ # Move to next segment
164
+ start_page = last_valid_end + 1
165
+ else:
166
+ # No valid segment created (shouldn't happen unless all pages > 5MB)
167
+ if start_page < total_pages:
168
+ start_page += 1
169
 
 
 
170
  part += 1
171
 
172
+ # Clean up any remaining test files
173
+ for test_file in output_dir.glob("test_segment_*.pdf"):
174
  try:
175
+ test_file.unlink()
176
  except:
177
  pass
178
 
179
+ # Final stats cleanup
180
  if stats["smallest_segment_mb"] == float('inf'):
181
  stats["smallest_segment_mb"] = 0
182
 
183
  if progress_callback:
184
  progress_callback(1.0, "Splitting complete!")
185
+
186
+ logger.info(f"Splitting complete: {stats['segments_created']} segments created, {stats['segments_discarded']} discarded")
187
 
188
  except Exception as e:
189
  logger.error(f"Error splitting PDF: {str(e)}")
190
+ # Clean up test files on error
191
+ for test_file in output_dir.glob("test_segment_*.pdf"):
192
  try:
193
+ test_file.unlink()
194
  except:
195
  pass
196
  raise