bluenevus committed on
Commit
0c94b1c
·
verified ·
1 Parent(s): f34313a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -97
app.py CHANGED
@@ -31,37 +31,13 @@ TEMP_DIR.mkdir(exist_ok=True)
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
- """Handle PDF splitting with qpdf/pikepdf"""
35
-
36
- @staticmethod
37
- def estimate_pages_for_size(pdf, total_pages: int, target_size_bytes: int) -> int:
38
- """
39
- Estimate how many pages fit in the target size
40
- """
41
- # Get approximate file size
42
- temp_file = Path("temp_estimate.pdf")
43
- try:
44
- # Save the entire PDF temporarily to get its size
45
- pdf.save(temp_file)
46
- total_size = temp_file.stat().st_size
47
- temp_file.unlink()
48
-
49
- # Calculate average page size
50
- avg_page_size = total_size / total_pages if total_pages > 0 else total_size
51
-
52
- # Estimate pages that fit in target size (with 10% safety margin)
53
- estimated_pages = int((target_size_bytes * 0.9) / avg_page_size)
54
-
55
- return max(1, estimated_pages) # At least 1 page
56
-
57
- except Exception as e:
58
- logger.error(f"Error estimating page size: {e}")
59
- return max(1, int(total_pages / 10)) # Fallback to 10% of pages
60
 
61
  @staticmethod
62
  def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
63
  """
64
  Split PDF into segments of approximately 4.5MB, discarding any over 5MB
 
65
  """
66
  kept_files = []
67
  discarded_count = 0
@@ -79,18 +55,71 @@ class PDFProcessor:
79
  # Get original file size
80
  stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
81
 
82
- # Open PDF with pikepdf
83
- with pikepdf.open(input_path) as pdf:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  total_pages = len(pdf.pages)
85
  stats["total_pages"] = total_pages
86
 
87
  if total_pages == 0:
88
  return kept_files, stats
89
 
90
- # Initial estimate of pages per segment
91
- pages_per_segment = PDFProcessor.estimate_pages_for_size(
92
- pdf, total_pages, TARGET_SEGMENT_SIZE_BYTES
93
- )
94
 
95
  segment_num = 0
96
  page_start = 0
@@ -98,99 +127,113 @@ class PDFProcessor:
98
  max_retries = 3
99
 
100
  while page_start < total_pages:
101
- # Calculate page range for this segment
102
  page_end = min(page_start + pages_per_segment, total_pages)
103
 
104
- # Update progress
105
  if progress_callback:
106
  progress = (page_start / total_pages)
107
  progress_callback(progress, f"Processing pages {page_start+1}-{page_end} of {total_pages}...")
108
 
109
- # Create segment
110
  segment_num += 1
111
  segment_filename = f"segment_{segment_num:03d}_p{page_start+1}-{page_end}.pdf"
112
  segment_path = output_dir / segment_filename
113
 
114
- # Create new PDF with selected pages
115
- segment_pdf = pikepdf.new()
116
- for page_num in range(page_start, page_end):
117
- segment_pdf.pages.append(pdf.pages[page_num])
118
-
119
- # Save with compression to minimize size
120
- segment_pdf.save(
121
- segment_path,
122
- compress_streams=True,
123
- stream_decode_level=pikepdf.StreamDecodeLevel.none,
124
- object_stream_mode=pikepdf.ObjectStreamMode.generate,
125
- linearize=True,
126
- recompress_flate=True
127
- )
128
-
129
- # Check segment size
130
- segment_size = segment_path.stat().st_size
131
- segment_size_mb = segment_size / 1024 / 1024
132
-
133
- logger.info(f"Segment {segment_num}: {segment_size_mb:.2f} MB ({page_end - page_start} pages)")
134
-
135
- if segment_size <= MAX_ALLOWED_SIZE_BYTES:
136
- # File is under 5MB limit - keep it
137
- kept_files.append(segment_path)
138
- stats["segments_created"] += 1
139
- stats["total_output_size_mb"] += segment_size_mb
140
 
141
- # Track largest and smallest segments
142
- stats["largest_segment_mb"] = max(stats["largest_segment_mb"], segment_size_mb)
143
- stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], segment_size_mb)
 
 
 
 
 
 
 
144
 
145
- # Move to next segment
146
- page_start = page_end
147
- retry_count = 0 # Reset retry count for next segment
 
 
148
 
149
- # Adjust pages per segment based on actual size
150
- if segment_size_mb < 4.0 and pages_per_segment < total_pages:
151
- # Segment is too small, try more pages next time
152
- pages_per_segment = min(pages_per_segment + 1, total_pages - page_end)
153
- elif segment_size_mb > 4.8:
154
- # Segment is getting close to limit, use fewer pages
155
- pages_per_segment = max(1, pages_per_segment - 1)
156
-
157
- else:
158
- # File exceeds 5MB limit
159
- logger.warning(f"Segment {segment_num} too large ({segment_size_mb:.2f} MB)")
160
 
161
- if page_end - page_start == 1:
162
- # Single page is over 5MB - discard and move on
163
- logger.warning(f"Single page {page_start+1} exceeds 5MB limit - discarding")
164
- segment_path.unlink() # Delete the file
165
- stats["segments_discarded"] += 1
166
- discarded_count += 1
167
- page_start = page_end # Move to next page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  retry_count = 0
169
  else:
170
- # Multiple pages - try with fewer pages
171
- segment_path.unlink() # Delete the oversized file
172
 
173
- if retry_count < max_retries:
174
- # Reduce pages by half and retry
175
- pages_per_segment = max(1, (page_end - page_start) // 2)
176
- retry_count += 1
177
- segment_num -= 1 # Reuse segment number
178
- logger.info(f"Retrying with {pages_per_segment} pages")
179
  else:
180
- # Too many retries, try single pages
181
- pages_per_segment = 1
182
- retry_count = 0
183
  segment_num -= 1
184
 
185
- # Clean up stats
186
  if stats["smallest_segment_mb"] == float('inf'):
187
  stats["smallest_segment_mb"] = 0
188
 
189
  if progress_callback:
190
  progress_callback(1.0, "Splitting complete!")
 
 
 
 
 
 
 
191
 
192
  except Exception as e:
193
  logger.error(f"Error splitting PDF: {str(e)}")
 
 
 
 
 
 
194
  raise
195
 
196
  return kept_files, stats
 
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
+ """Handle PDF splitting with qpdf/pikepdf - with corruption handling"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  @staticmethod
37
  def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
38
  """
39
  Split PDF into segments of approximately 4.5MB, discarding any over 5MB
40
+ Handles corrupted PDFs by attempting recovery
41
  """
42
  kept_files = []
43
  discarded_count = 0
 
55
  # Get original file size
56
  stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
57
 
58
+ # First attempt: Try to open with recovery and stream decoding disabled
59
+ pdf = None
60
+ try:
61
+ # Open PDF with recovery mode and suppress stream errors
62
+ pdf = pikepdf.open(
63
+ input_path,
64
+ suppress_warnings=True,
65
+ attempt_recovery=True
66
+ )
67
+ except pikepdf._qpdf.DataDecodingError as e:
68
+ logger.warning(f"Initial open failed, attempting repair: {e}")
69
+
70
+ # Second attempt: Create a repaired copy first
71
+ repaired_path = output_dir.parent / "repaired_temp.pdf"
72
+ try:
73
+ # Use pikepdf to create a repaired version
74
+ with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as damaged_pdf:
75
+ # Save with recompression to fix stream errors
76
+ damaged_pdf.save(
77
+ repaired_path,
78
+ compress_streams=False, # Disable compression first
79
+ decode_level=pikepdf.StreamDecodeLevel.all, # Decode all streams
80
+ object_stream_mode=pikepdf.ObjectStreamMode.disable, # Disable object streams
81
+ normalize_content=True, # Normalize content streams
82
+ linearize=False
83
+ )
84
+
85
+ # Now open the repaired version
86
+ pdf = pikepdf.open(repaired_path, suppress_warnings=True)
87
+
88
+ # Clean up repaired file after opening
89
+ input_path = repaired_path
90
+
91
+ except Exception as repair_error:
92
+ logger.error(f"Repair attempt failed: {repair_error}")
93
+
94
+ # Third attempt: Try with qpdf command line if available
95
+ import subprocess
96
+ try:
97
+ repaired_path = output_dir.parent / "qpdf_repaired.pdf"
98
+ subprocess.run(
99
+ ["qpdf", "--replace-input", "--stream-data=uncompress",
100
+ str(input_path), str(repaired_path)],
101
+ check=True,
102
+ capture_output=True
103
+ )
104
+ pdf = pikepdf.open(repaired_path, suppress_warnings=True)
105
+ input_path = repaired_path
106
+ except (subprocess.CalledProcessError, FileNotFoundError):
107
+ raise Exception("PDF is severely corrupted and cannot be repaired")
108
+
109
+ if pdf is None:
110
+ raise Exception("Failed to open PDF after all recovery attempts")
111
+
112
+ with pdf:
113
  total_pages = len(pdf.pages)
114
  stats["total_pages"] = total_pages
115
 
116
  if total_pages == 0:
117
  return kept_files, stats
118
 
119
+ # Estimate pages per segment
120
+ file_size = input_path.stat().st_size
121
+ avg_page_size = file_size / total_pages if total_pages > 0 else file_size
122
+ pages_per_segment = max(1, int(TARGET_SEGMENT_SIZE_BYTES * 0.8 / avg_page_size))
123
 
124
  segment_num = 0
125
  page_start = 0
 
127
  max_retries = 3
128
 
129
  while page_start < total_pages:
 
130
  page_end = min(page_start + pages_per_segment, total_pages)
131
 
 
132
  if progress_callback:
133
  progress = (page_start / total_pages)
134
  progress_callback(progress, f"Processing pages {page_start+1}-{page_end} of {total_pages}...")
135
 
 
136
  segment_num += 1
137
  segment_filename = f"segment_{segment_num:03d}_p{page_start+1}-{page_end}.pdf"
138
  segment_path = output_dir / segment_filename
139
 
140
+ try:
141
+ # Create new PDF with selected pages
142
+ segment_pdf = pikepdf.new()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ # Copy pages with error handling
145
+ for page_num in range(page_start, page_end):
146
+ try:
147
+ # Clone the page to avoid stream errors
148
+ page = pdf.pages[page_num]
149
+ segment_pdf.pages.append(page)
150
+ except Exception as page_error:
151
+ logger.warning(f"Error copying page {page_num+1}: {page_error}")
152
+ # Skip corrupted pages
153
+ continue
154
 
155
+ # If no pages were successfully added, skip this segment
156
+ if len(segment_pdf.pages) == 0:
157
+ logger.warning(f"Segment {segment_num} has no valid pages, skipping")
158
+ page_start = page_end
159
+ continue
160
 
161
+ # Save with safe compression settings
162
+ segment_pdf.save(
163
+ segment_path,
164
+ compress_streams=True,
165
+ stream_decode_level=pikepdf.StreamDecodeLevel.specialized, # Use specialized decoding
166
+ object_stream_mode=pikepdf.ObjectStreamMode.generate,
167
+ normalize_content=True, # Normalize to fix issues
168
+ linearize=False, # Don't linearize to avoid issues
169
+ recompress_flate=False # Don't recompress to avoid corruption
170
+ )
 
171
 
172
+ except Exception as save_error:
173
+ logger.error(f"Error saving segment {segment_num}: {save_error}")
174
+ # Try saving without compression
175
+ try:
176
+ segment_pdf.save(
177
+ segment_path,
178
+ compress_streams=False,
179
+ object_stream_mode=pikepdf.ObjectStreamMode.disable
180
+ )
181
+ except:
182
+ logger.error(f"Failed to save segment {segment_num} even without compression")
183
+ page_start = page_end
184
+ continue
185
+
186
+ # Check segment size
187
+ if segment_path.exists():
188
+ segment_size = segment_path.stat().st_size
189
+ segment_size_mb = segment_size / 1024 / 1024
190
+
191
+ logger.info(f"Segment {segment_num}: {segment_size_mb:.2f} MB")
192
+
193
+ if segment_size <= MAX_ALLOWED_SIZE_BYTES:
194
+ kept_files.append(segment_path)
195
+ stats["segments_created"] += 1
196
+ stats["total_output_size_mb"] += segment_size_mb
197
+ stats["largest_segment_mb"] = max(stats["largest_segment_mb"], segment_size_mb)
198
+ stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], segment_size_mb)
199
+ page_start = page_end
200
  retry_count = 0
201
  else:
202
+ # File exceeds 5MB limit
203
+ logger.warning(f"Segment {segment_num} too large ({segment_size_mb:.2f} MB)")
204
 
205
+ if page_end - page_start == 1:
206
+ # Single page is over 5MB - discard
207
+ segment_path.unlink()
208
+ stats["segments_discarded"] += 1
209
+ page_start = page_end
 
210
  else:
211
+ # Try with fewer pages
212
+ segment_path.unlink()
213
+ pages_per_segment = max(1, (page_end - page_start) // 2)
214
  segment_num -= 1
215
 
 
216
  if stats["smallest_segment_mb"] == float('inf'):
217
  stats["smallest_segment_mb"] = 0
218
 
219
  if progress_callback:
220
  progress_callback(1.0, "Splitting complete!")
221
+
222
+ # Clean up temporary repaired files if they exist
223
+ for temp_file in output_dir.parent.glob("*repaired*.pdf"):
224
+ try:
225
+ temp_file.unlink()
226
+ except:
227
+ pass
228
 
229
  except Exception as e:
230
  logger.error(f"Error splitting PDF: {str(e)}")
231
+ # Clean up any temporary files
232
+ for temp_file in output_dir.parent.glob("*repaired*.pdf"):
233
+ try:
234
+ temp_file.unlink()
235
+ except:
236
+ pass
237
  raise
238
 
239
  return kept_files, stats