bluenevus committed on
Commit
21c0900
·
verified ·
1 Parent(s): 024b572

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -130
app.py CHANGED
@@ -10,6 +10,8 @@ import logging
10
  import threading
11
  import time
12
  from typing import Tuple, List, Optional
 
 
13
 
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO)
@@ -31,168 +33,165 @@ TEMP_DIR.mkdir(exist_ok=True)
31
  user_sessions = {}
32
 
33
class PDFProcessor:
    """Handle PDF splitting with qpdf/pikepdf - testing actual file sizes like bash script"""

    @staticmethod
    def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
        """
        Split PDF into segments of approximately 4.5MB, discarding any over 5MB.

        Mimics the bash script logic - incrementally test actual file sizes:
        pages are appended one at a time and each candidate segment is saved
        to disk so the *real* compressed size is measured, not estimated.

        Args:
            input_path: Source PDF to split.
            output_dir: Directory that receives the segment files.
            progress_callback: Optional callable(progress: float, message: str)
                invoked as segments are produced.

        Returns:
            Tuple of (kept segment paths, stats dict with page/segment counts
            and size information in MB).

        Raises:
            Re-raises any unexpected error after removing temporary test files.
        """
        kept_files = []
        stats = {
            "total_pages": 0,
            "segments_created": 0,
            "segments_discarded": 0,
            "original_size_mb": 0,
            "total_output_size_mb": 0,
            "largest_segment_mb": 0,
            "smallest_segment_mb": float('inf')
        }

        try:
            # Get original file size
            stats["original_size_mb"] = input_path.stat().st_size / 1024 / 1024
            logger.info(f"Original PDF size: {stats['original_size_mb']:.2f} MB")

            # Open PDF with pikepdf
            with pikepdf.open(input_path, suppress_warnings=True, attempt_recovery=True) as pdf:
                total_pages = len(pdf.pages)
                stats["total_pages"] = total_pages
                logger.info(f"Total pages: {total_pages}")

                if total_pages == 0:
                    return kept_files, stats

                start_page = 0  # 0-indexed in Python
                part = 1

                while start_page < total_pages:
                    # Update progress
                    if progress_callback:
                        progress = (start_page / total_pages)
                        progress_callback(progress, f"Creating segment {part}...")

                    # Start with just the start page
                    end_page = start_page
                    last_valid_end = None
                    last_valid_path = None

                    # Keep adding pages until we exceed the target size
                    while end_page < total_pages:
                        # Create test segment
                        test_filename = f"test_segment_{part}_{end_page}.pdf"
                        test_path = output_dir / test_filename

                        try:
                            # Create PDF with pages from start_page to end_page (inclusive)
                            test_pdf = pikepdf.new()
                            for page_idx in range(start_page, end_page + 1):
                                test_pdf.pages.append(pdf.pages[page_idx])

                            # Save to test actual size
                            test_pdf.save(
                                test_path,
                                compress_streams=True,
                                object_stream_mode=pikepdf.ObjectStreamMode.generate,
                                recompress_flate=True,  # Enable recompression
                                linearize=False  # Skip linearization for speed
                            )

                            # Get actual file size
                            actual_size = test_path.stat().st_size
                            actual_size_mb = actual_size / 1024 / 1024

                            logger.debug(f"Test segment: pages {start_page+1}-{end_page+1}, size: {actual_size_mb:.2f} MB")

                            if actual_size >= MAX_ALLOWED_SIZE_BYTES:
                                # Too large
                                if end_page == start_page:
                                    # Single page is over 5MB - discard it.
                                    # BUGFIX: do NOT advance start_page here; the
                                    # "no valid segment" branch below advances it
                                    # exactly once. The previous code incremented
                                    # in both places, silently skipping the page
                                    # after every oversized one.
                                    logger.warning(f"Single page {start_page+1} is {actual_size_mb:.2f} MB (>5MB) - discarding")
                                    test_path.unlink()
                                    stats["segments_discarded"] += 1
                                    break
                                else:
                                    # Multiple pages - use the last valid segment
                                    test_path.unlink()
                                    break
                            else:
                                # Under 5MB - this is valid.
                                # Delete previous valid candidate if it exists.
                                if last_valid_path and last_valid_path.exists():
                                    last_valid_path.unlink()

                                last_valid_path = test_path
                                last_valid_end = end_page

                                # If we're under target size and not at the last page, try adding more
                                if actual_size < TARGET_SEGMENT_SIZE_BYTES and end_page < total_pages - 1:
                                    end_page += 1
                                    continue
                                else:
                                    # We've reached target size or the last page
                                    break

                        except Exception as e:
                            logger.error(f"Error creating test segment: {e}")
                            if test_path.exists():
                                test_path.unlink()
                            break

                    # Save the final valid segment
                    if last_valid_path and last_valid_path.exists():
                        # Rename to final name
                        final_filename = f"segment_{part:03d}_pages_{start_page+1}-{last_valid_end+1}.pdf"
                        final_path = output_dir / final_filename
                        last_valid_path.rename(final_path)

                        # Record stats
                        final_size = final_path.stat().st_size
                        final_size_mb = final_size / 1024 / 1024

                        kept_files.append(final_path)
                        stats["segments_created"] += 1
                        stats["total_output_size_mb"] += final_size_mb
                        stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
                        stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)

                        logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page+1}-{last_valid_end+1})")

                        # Move to next segment
                        start_page = last_valid_end + 1
                    else:
                        # No valid segment created (oversized page discarded or
                        # save error) - advance exactly one page so the outer
                        # loop always terminates.
                        if start_page < total_pages:
                            start_page += 1

                    part += 1

                # Clean up any remaining test files.
                # BUGFIX: catch OSError instead of a bare `except:` so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                for test_file in output_dir.glob("test_segment_*.pdf"):
                    try:
                        test_file.unlink()
                    except OSError:
                        pass

                # Final stats cleanup
                if stats["smallest_segment_mb"] == float('inf'):
                    stats["smallest_segment_mb"] = 0

                if progress_callback:
                    progress_callback(1.0, "Splitting complete!")

                logger.info(f"Splitting complete: {stats['segments_created']} segments created, {stats['segments_discarded']} discarded")

        except Exception as e:
            logger.error(f"Error splitting PDF: {str(e)}")
            # Clean up test files on error (best-effort; OSError only)
            for test_file in output_dir.glob("test_segment_*.pdf"):
                try:
                    test_file.unlink()
                except OSError:
                    pass
            raise

        return kept_files, stats
 
10
  import threading
11
  import time
12
  from typing import Tuple, List, Optional
13
+ import subprocess
14
+ import json
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
 
33
  user_sessions = {}
34
 
35
class PDFProcessor:
    """Handle PDF splitting using qpdf directly for performance"""

    @staticmethod
    def get_pdf_info(pdf_path: Path) -> dict:
        """
        Get PDF info using qpdf.

        Args:
            pdf_path: Path of the PDF to inspect.

        Returns:
            Dict with "total_pages" (int), as reported by `qpdf --show-npages`.

        Raises:
            subprocess.CalledProcessError: if qpdf exits non-zero.
        """
        try:
            result = subprocess.run(
                ["qpdf", "--show-npages", str(pdf_path)],
                capture_output=True,
                text=True,
                check=True
            )
            return {"total_pages": int(result.stdout.strip())}
        except subprocess.CalledProcessError as e:
            logger.error(f"Error getting PDF info: {e}")
            raise

    @staticmethod
    def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
        """
        Split PDF using qpdf directly (like your bash script) for much better performance.

        Binary-searches the number of pages per segment so each produced file
        stays at or under MAX_ALLOWED_SIZE_BYTES while approaching
        TARGET_SEGMENT_SIZE_BYTES (search window capped at 100 pages/segment).

        Args:
            input_path: Source PDF to split.
            output_dir: Directory that receives the segment files.
            progress_callback: Optional callable(progress: float, message: str).

        Returns:
            Tuple of (kept segment paths, stats dict with page/segment counts
            and size information in MB).

        Raises:
            Re-raises any unexpected error after logging it.
        """
        kept_files = []
        stats = {
            "total_pages": 0,
            "segments_created": 0,
            "segments_discarded": 0,
            "original_size_mb": input_path.stat().st_size / 1024 / 1024,
            "total_output_size_mb": 0,
            "largest_segment_mb": 0,
            "smallest_segment_mb": float('inf')
        }

        try:
            # Get total pages using qpdf
            pdf_info = PDFProcessor.get_pdf_info(input_path)
            total_pages = pdf_info["total_pages"]
            stats["total_pages"] = total_pages

            if total_pages == 0:
                return kept_files, stats

            logger.info(f"Starting split: {total_pages} pages, original size: {stats['original_size_mb']:.2f} MB")

            start_page = 1  # qpdf uses 1-based indexing
            part = 1

            while start_page <= total_pages:
                if progress_callback:
                    progress = ((start_page - 1) / total_pages)
                    progress_callback(progress, f"Processing segment {part}...")

                # Binary search for the right number of pages.
                low = start_page
                high = min(start_page + 100, total_pages)  # Start with max 100 pages
                # BUGFIX: sentinel start_page - 1 means "no candidate fit yet".
                # The previous code initialized best_end = start_page, which
                # made the oversized-page skip branch below unreachable (and an
                # unused best_size variable has been dropped).
                best_end = start_page - 1

                test_file = output_dir / f"test_{part}.pdf"

                while low <= high:
                    mid = (low + high) // 2

                    # Create test segment using qpdf
                    try:
                        subprocess.run(
                            ["qpdf", "--empty", "--pages", str(input_path), f"{start_page}-{mid}", "--", str(test_file)],
                            capture_output=True,
                            check=True,
                            timeout=10  # 10 second timeout
                        )

                        # Check file size
                        if test_file.exists():
                            size = test_file.stat().st_size

                            if size <= MAX_ALLOWED_SIZE_BYTES:
                                best_end = mid

                                if size < TARGET_SEGMENT_SIZE_BYTES * 0.9:  # Less than 90% of target
                                    low = mid + 1  # Try more pages
                                else:
                                    # BUGFIX: unlink before breaking; the old
                                    # code broke out first and left a stale
                                    # test_N.pdf in the output directory.
                                    test_file.unlink()
                                    break  # Good enough, close to target
                            else:
                                high = mid - 1  # Too big, try fewer pages

                            # Clean up test file
                            test_file.unlink()
                        else:
                            # BUGFIX: qpdf exited 0 but produced no file -
                            # narrow the search so the loop cannot spin forever
                            # (previously neither bound changed on this path).
                            high = mid - 1

                    except subprocess.CalledProcessError as e:
                        logger.error(f"qpdf error: {e}")
                        if test_file.exists():
                            test_file.unlink()
                        high = mid - 1
                    except subprocess.TimeoutExpired:
                        logger.error(f"qpdf timeout for pages {start_page}-{mid}")
                        if test_file.exists():
                            test_file.unlink()
                        high = mid - 1

                # Create final segment with best found size
                if best_end >= start_page:
                    final_filename = f"segment_{part:03d}_p{start_page}-{best_end}.pdf"
                    final_path = output_dir / final_filename

                    try:
                        # Re-create the chosen range with compression options
                        subprocess.run(
                            ["qpdf", "--empty", "--pages", str(input_path), f"{start_page}-{best_end}", "--",
                             str(final_path), "--compress-streams=y", "--object-streams=generate"],
                            capture_output=True,
                            check=True,
                            timeout=30
                        )

                        if final_path.exists():
                            final_size = final_path.stat().st_size
                            final_size_mb = final_size / 1024 / 1024

                            if final_size <= MAX_ALLOWED_SIZE_BYTES:
                                kept_files.append(final_path)
                                stats["segments_created"] += 1
                                stats["total_output_size_mb"] += final_size_mb
                                stats["largest_segment_mb"] = max(stats["largest_segment_mb"], final_size_mb)
                                stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], final_size_mb)

                                logger.info(f"Created segment {part}: {final_size_mb:.2f} MB (pages {start_page}-{best_end})")
                            else:
                                # Compressed output still exceeded the cap
                                final_path.unlink()
                                stats["segments_discarded"] += 1
                                logger.warning(f"Segment {part} exceeded 5MB limit - discarded")

                    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
                        logger.error(f"Error creating final segment: {e}")
                        if final_path.exists():
                            final_path.unlink()

                    start_page = best_end + 1
                else:
                    # Single page is too large (or qpdf kept failing), skip it
                    logger.warning(f"Page {start_page} exceeds size limit - skipping")
                    stats["segments_discarded"] += 1
                    start_page += 1

                part += 1

            if stats["smallest_segment_mb"] == float('inf'):
                stats["smallest_segment_mb"] = 0

            if progress_callback:
                progress_callback(1.0, "Splitting complete!")

            logger.info(f"Completed: {stats['segments_created']} segments created, {stats['segments_discarded']} discarded")

        except Exception as e:
            logger.error(f"Error in split_pdf_by_size: {str(e)}")
            raise

        return kept_files, stats