bluenevus committed on
Commit
dbcc72f
·
verified ·
1 Parent(s): 0c94b1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -150
app.py CHANGED
@@ -31,16 +31,15 @@ TEMP_DIR.mkdir(exist_ok=True)
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
- """Handle PDF splitting with qpdf/pikepdf - with corruption handling"""
35
 
36
  @staticmethod
37
  def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
38
  """
39
  Split PDF into segments of approximately 4.5MB, discarding any over 5MB
40
- Handles corrupted PDFs by attempting recovery
41
  """
42
  kept_files = []
43
- discarded_count = 0
44
  stats = {
45
  "total_pages": 0,
46
  "segments_created": 0,
@@ -55,181 +54,146 @@ class PDFProcessor:
55
  # Get original file size
56
  stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
57
 
58
- # First attempt: Try to open with recovery and stream decoding disabled
59
- pdf = None
60
- try:
61
- # Open PDF with recovery mode and suppress stream errors
62
- pdf = pikepdf.open(
63
- input_path,
64
- suppress_warnings=True,
65
- attempt_recovery=True
66
- )
67
- except pikepdf._qpdf.DataDecodingError as e:
68
- logger.warning(f"Initial open failed, attempting repair: {e}")
69
-
70
- # Second attempt: Create a repaired copy first
71
- repaired_path = output_dir.parent / "repaired_temp.pdf"
72
- try:
73
- # Use pikepdf to create a repaired version
74
- with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as damaged_pdf:
75
- # Save with recompression to fix stream errors
76
- damaged_pdf.save(
77
- repaired_path,
78
- compress_streams=False, # Disable compression first
79
- decode_level=pikepdf.StreamDecodeLevel.all, # Decode all streams
80
- object_stream_mode=pikepdf.ObjectStreamMode.disable, # Disable object streams
81
- normalize_content=True, # Normalize content streams
82
- linearize=False
83
- )
84
-
85
- # Now open the repaired version
86
- pdf = pikepdf.open(repaired_path, suppress_warnings=True)
87
-
88
- # Clean up repaired file after opening
89
- input_path = repaired_path
90
-
91
- except Exception as repair_error:
92
- logger.error(f"Repair attempt failed: {repair_error}")
93
-
94
- # Third attempt: Try with qpdf command line if available
95
- import subprocess
96
- try:
97
- repaired_path = output_dir.parent / "qpdf_repaired.pdf"
98
- subprocess.run(
99
- ["qpdf", "--replace-input", "--stream-data=uncompress",
100
- str(input_path), str(repaired_path)],
101
- check=True,
102
- capture_output=True
103
- )
104
- pdf = pikepdf.open(repaired_path, suppress_warnings=True)
105
- input_path = repaired_path
106
- except (subprocess.CalledProcessError, FileNotFoundError):
107
- raise Exception("PDF is severely corrupted and cannot be repaired")
108
-
109
- if pdf is None:
110
- raise Exception("Failed to open PDF after all recovery attempts")
111
-
112
- with pdf:
113
  total_pages = len(pdf.pages)
114
  stats["total_pages"] = total_pages
115
 
116
  if total_pages == 0:
117
  return kept_files, stats
118
 
119
- # Estimate pages per segment
120
- file_size = input_path.stat().st_size
121
- avg_page_size = file_size / total_pages if total_pages > 0 else file_size
122
- pages_per_segment = max(1, int(TARGET_SEGMENT_SIZE_BYTES * 0.8 / avg_page_size))
123
-
124
- segment_num = 0
125
- page_start = 0
126
- retry_count = 0
127
- max_retries = 3
128
 
129
- while page_start < total_pages:
130
- page_end = min(page_start + pages_per_segment, total_pages)
 
 
 
 
131
 
 
132
  if progress_callback:
133
- progress = (page_start / total_pages)
134
- progress_callback(progress, f"Processing pages {page_start+1}-{page_end} of {total_pages}...")
135
 
136
- segment_num += 1
137
- segment_filename = f"segment_{segment_num:03d}_p{page_start+1}-{page_end}.pdf"
138
- segment_path = output_dir / segment_filename
139
-
140
- try:
141
- # Create new PDF with selected pages
142
- segment_pdf = pikepdf.new()
143
-
144
- # Copy pages with error handling
145
- for page_num in range(page_start, page_end):
146
- try:
147
- # Clone the page to avoid stream errors
148
- page = pdf.pages[page_num]
149
- segment_pdf.pages.append(page)
150
- except Exception as page_error:
151
- logger.warning(f"Error copying page {page_num+1}: {page_error}")
152
- # Skip corrupted pages
153
- continue
154
-
155
- # If no pages were successfully added, skip this segment
156
- if len(segment_pdf.pages) == 0:
157
- logger.warning(f"Segment {segment_num} has no valid pages, skipping")
158
- page_start = page_end
159
- continue
160
-
161
- # Save with safe compression settings
162
- segment_pdf.save(
163
- segment_path,
164
- compress_streams=True,
165
- stream_decode_level=pikepdf.StreamDecodeLevel.specialized, # Use specialized decoding
166
- object_stream_mode=pikepdf.ObjectStreamMode.generate,
167
- normalize_content=True, # Normalize to fix issues
168
- linearize=False, # Don't linearize to avoid issues
169
- recompress_flate=False # Don't recompress to avoid corruption
170
- )
171
 
172
- except Exception as save_error:
173
- logger.error(f"Error saving segment {segment_num}: {save_error}")
174
- # Try saving without compression
175
  try:
 
 
 
 
 
 
 
 
176
  segment_pdf.save(
177
- segment_path,
178
- compress_streams=False,
179
- object_stream_mode=pikepdf.ObjectStreamMode.disable
 
180
  )
181
- except:
182
- logger.error(f"Failed to save segment {segment_num} even without compression")
183
- page_start = page_end
184
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
- # Check segment size
187
- if segment_path.exists():
188
- segment_size = segment_path.stat().st_size
189
- segment_size_mb = segment_size / 1024 / 1024
 
 
190
 
191
- logger.info(f"Segment {segment_num}: {segment_size_mb:.2f} MB")
 
 
192
 
193
- if segment_size <= MAX_ALLOWED_SIZE_BYTES:
194
- kept_files.append(segment_path)
195
  stats["segments_created"] += 1
196
- stats["total_output_size_mb"] += segment_size_mb
197
- stats["largest_segment_mb"] = max(stats["largest_segment_mb"], segment_size_mb)
198
- stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], segment_size_mb)
199
- page_start = page_end
200
- retry_count = 0
201
- else:
202
- # File exceeds 5MB limit
203
- logger.warning(f"Segment {segment_num} too large ({segment_size_mb:.2f} MB)")
204
 
205
- if page_end - page_start == 1:
206
- # Single page is over 5MB - discard
207
- segment_path.unlink()
208
- stats["segments_discarded"] += 1
209
- page_start = page_end
210
- else:
211
- # Try with fewer pages
212
- segment_path.unlink()
213
- pages_per_segment = max(1, (page_end - page_start) // 2)
214
- segment_num -= 1
 
 
 
 
 
 
 
215
 
 
216
  if stats["smallest_segment_mb"] == float('inf'):
217
  stats["smallest_segment_mb"] = 0
218
 
219
  if progress_callback:
220
  progress_callback(1.0, "Splitting complete!")
221
-
222
- # Clean up temporary repaired files if they exist
223
- for temp_file in output_dir.parent.glob("*repaired*.pdf"):
224
- try:
225
- temp_file.unlink()
226
- except:
227
- pass
228
 
229
  except Exception as e:
230
  logger.error(f"Error splitting PDF: {str(e)}")
231
- # Clean up any temporary files
232
- for temp_file in output_dir.parent.glob("*repaired*.pdf"):
233
  try:
234
  temp_file.unlink()
235
  except:
 
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
+ """Handle PDF splitting with qpdf/pikepdf - using incremental size checking like bash script"""
35
 
36
  @staticmethod
37
  def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
38
  """
39
  Split PDF into segments of approximately 4.5MB, discarding any over 5MB
40
+ Uses the same incremental approach as the bash script
41
  """
42
  kept_files = []
 
43
  stats = {
44
  "total_pages": 0,
45
  "segments_created": 0,
 
54
  # Get original file size
55
  stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
56
 
57
+ # Open PDF with pikepdf
58
+ with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  total_pages = len(pdf.pages)
60
  stats["total_pages"] = total_pages
61
 
62
  if total_pages == 0:
63
  return kept_files, stats
64
 
65
+ start_page = 0
66
+ part = 1
 
 
 
 
 
 
 
67
 
68
+ while start_page < total_pages:
69
+ # Start with a single page
70
+ end_page = start_page
71
+ temp_segment = None
72
+ last_good_segment = None
73
+ last_good_end = start_page
74
 
75
+ # Update progress
76
  if progress_callback:
77
+ progress = (start_page / total_pages)
78
+ progress_callback(progress, f"Processing segment {part}, starting at page {start_page + 1}...")
79
 
80
+ # Keep adding pages until we exceed the size limit
81
+ while end_page < total_pages:
82
+ # Create temporary segment with pages from start_page to end_page (inclusive)
83
+ temp_filename = f"temp_segment_{part}.pdf"
84
+ temp_path = output_dir / temp_filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
 
 
 
86
  try:
87
+ # Create new PDF with selected pages
88
+ segment_pdf = pikepdf.new()
89
+
90
+ # Add pages from start_page to end_page (inclusive)
91
+ for page_num in range(start_page, end_page + 1):
92
+ segment_pdf.pages.append(pdf.pages[page_num])
93
+
94
+ # Save with compression
95
  segment_pdf.save(
96
+ temp_path,
97
+ compress_streams=True,
98
+ object_stream_mode=pikepdf.ObjectStreamMode.generate,
99
+ linearize=False # Don't linearize to save time during testing
100
  )
101
+
102
+ # Check file size
103
+ segment_size = temp_path.stat().st_size
104
+ segment_size_mb = segment_size / 1024 / 1024
105
+
106
+ logger.debug(f"Testing segment {part}: pages {start_page+1}-{end_page+1}, size: {segment_size_mb:.2f} MB")
107
+
108
+ if segment_size < TARGET_SEGMENT_SIZE_BYTES:
109
+ # Still under target size, keep this as last good and try adding more pages
110
+ if last_good_segment and last_good_segment.exists():
111
+ last_good_segment.unlink() # Delete previous good segment
112
+ last_good_segment = temp_path
113
+ last_good_end = end_page
114
+
115
+ # If we're at the last page, this is our final segment
116
+ if end_page == total_pages - 1:
117
+ break
118
+
119
+ # Try adding one more page
120
+ end_page += 1
121
+
122
+ elif segment_size <= MAX_ALLOWED_SIZE_BYTES:
123
+ # Between 4.5MB and 5MB - this is acceptable, use it
124
+ if last_good_segment and last_good_segment.exists():
125
+ last_good_segment.unlink()
126
+ last_good_segment = temp_path
127
+ last_good_end = end_page
128
+ break # Stop here, we found a good size
129
+
130
+ else:
131
+ # Over 5MB limit
132
+ temp_path.unlink() # Delete oversized segment
133
+
134
+ if end_page == start_page:
135
+ # Single page is over 5MB - discard it
136
+ logger.warning(f"Single page {start_page+1} exceeds 5MB limit - discarding")
137
+ stats["segments_discarded"] += 1
138
+ last_good_end = start_page # Move past this page
139
+ break
140
+ else:
141
+ # Multiple pages - use the last good segment
142
+ break
143
+
144
+ except Exception as e:
145
+ logger.error(f"Error creating segment: {e}")
146
+ if temp_path and temp_path.exists():
147
+ temp_path.unlink()
148
+ break
149
 
150
+ # Save the final segment for this part
151
+ if last_good_segment and last_good_segment.exists():
152
+ # Rename to final name
153
+ final_filename = f"segment_{part:03d}_p{start_page+1}-{last_good_end+1}.pdf"
154
+ final_path = output_dir / final_filename
155
+ last_good_segment.rename(final_path)
156
 
157
+ # Check final size and add to kept files
158
+ final_size = final_path.stat().st_size
159
+ final_size_mb = final_size / 1024 / 1024
160
 
161
+ if final_size <= MAX_ALLOWED_SIZE_BYTES:
162
+ kept_files.append(final_path)
163
  stats["segments_created"] += 1
164
+ stats["total_output_size_mb"] += final_size_mb
165
+ stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
166
+ stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)
 
 
 
 
 
167
 
168
+ logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_good_end+1})")
169
+ else:
170
+ # Should not happen, but just in case
171
+ final_path.unlink()
172
+ stats["segments_discarded"] += 1
173
+ logger.warning(f"Final segment {part} exceeded 5MB limit after rename")
174
+
175
+ # Move to next segment
176
+ start_page = last_good_end + 1
177
+ part += 1
178
+
179
+ # Clean up any remaining temp files
180
+ for temp_file in output_dir.glob("temp_segment_*.pdf"):
181
+ try:
182
+ temp_file.unlink()
183
+ except:
184
+ pass
185
 
186
+ # Final cleanup
187
  if stats["smallest_segment_mb"] == float('inf'):
188
  stats["smallest_segment_mb"] = 0
189
 
190
  if progress_callback:
191
  progress_callback(1.0, "Splitting complete!")
 
 
 
 
 
 
 
192
 
193
  except Exception as e:
194
  logger.error(f"Error splitting PDF: {str(e)}")
195
+ # Clean up temp files on error
196
+ for temp_file in output_dir.glob("temp_segment_*.pdf"):
197
  try:
198
  temp_file.unlink()
199
  except: