bluenevus committed on
Commit
3809b5f
·
verified ·
1 Parent(s): b2d0f10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -92
app.py CHANGED
@@ -2,50 +2,77 @@ import gradio as gr
2
  import pikepdf
3
  import os
4
  import zipfile
5
- import tempfile
6
  import shutil
7
  from pathlib import Path
8
  import uuid
9
  from datetime import datetime, timedelta
 
10
  import threading
11
  import time
12
- from typing import Tuple, Optional
13
- import logging
14
 
15
  # Configure logging
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
- # Configuration
20
- MAX_FILE_SIZE_MB = 5
21
- CHUNK_SIZE_MB = 4.5
22
- MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
23
- CHUNK_SIZE_BYTES = int(CHUNK_SIZE_MB * 1024 * 1024)
 
24
  TEMP_DIR = Path("temp_files")
25
  CLEANUP_AFTER_MINUTES = 10
26
 
27
  # Create temp directory
28
  TEMP_DIR.mkdir(exist_ok=True)
29
 
30
- # Store user sessions
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
  """Handle PDF splitting with qpdf/pikepdf"""
35
 
36
  @staticmethod
37
- def split_pdf(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[list, dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  """
39
- Split PDF into chunks using pikepdf (qpdf wrapper)
40
- Returns: (list of output files, statistics dict)
41
  """
42
- output_files = []
 
43
  stats = {
44
  "total_pages": 0,
45
  "segments_created": 0,
46
  "segments_discarded": 0,
47
  "original_size_mb": 0,
48
- "total_output_size_mb": 0
 
 
49
  }
50
 
51
  try:
@@ -57,25 +84,31 @@ class PDFProcessor:
57
  total_pages = len(pdf.pages)
58
  stats["total_pages"] = total_pages
59
 
60
- # Calculate pages per segment
61
- file_size = input_path.stat().st_size
62
- avg_page_size = file_size / total_pages if total_pages > 0 else file_size
63
- pages_per_segment = max(1, int(CHUNK_SIZE_BYTES / avg_page_size))
 
 
 
64
 
65
  segment_num = 0
66
  page_start = 0
 
 
67
 
68
  while page_start < total_pages:
 
69
  page_end = min(page_start + pages_per_segment, total_pages)
70
- segment_num += 1
71
 
72
  # Update progress
73
  if progress_callback:
74
  progress = (page_start / total_pages)
75
- progress_callback(progress, f"Processing segment {segment_num}...")
76
 
77
- # Create segment filename
78
- segment_filename = f"segment_{segment_num:04d}_pages_{page_start+1}-{page_end}.pdf"
 
79
  segment_path = output_dir / segment_filename
80
 
81
  # Create new PDF with selected pages
@@ -83,35 +116,75 @@ class PDFProcessor:
83
  for page_num in range(page_start, page_end):
84
  segment_pdf.pages.append(pdf.pages[page_num])
85
 
86
- # Save with compression
87
  segment_pdf.save(
88
  segment_path,
89
  compress_streams=True,
 
90
  object_stream_mode=pikepdf.ObjectStreamMode.generate,
91
- linearize=True
 
92
  )
93
 
94
  # Check segment size
95
  segment_size = segment_path.stat().st_size
 
96
 
97
- if segment_size <= MAX_FILE_SIZE_BYTES:
98
- output_files.append(segment_path)
 
 
 
99
  stats["segments_created"] += 1
100
- stats["total_output_size_mb"] += segment_size / 1024 / 1024
101
- logger.info(f"Created segment {segment_num}: {segment_size / 1024 / 1024:.2f} MB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  else:
103
- # If single page is too large, still keep it but mark as oversized
 
 
104
  if page_end - page_start == 1:
105
- output_files.append(segment_path)
 
 
106
  stats["segments_discarded"] += 1
107
- logger.warning(f"Segment {segment_num} exceeds size limit but kept (single page)")
 
 
108
  else:
109
- # Try with fewer pages
110
- segment_path.unlink()
111
- pages_per_segment = max(1, pages_per_segment // 2)
112
- continue
113
-
114
- page_start = page_end
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  if progress_callback:
117
  progress_callback(1.0, "Splitting complete!")
@@ -120,7 +193,7 @@ class PDFProcessor:
120
  logger.error(f"Error splitting PDF: {str(e)}")
121
  raise
122
 
123
- return output_files, stats
124
 
125
  class SessionManager:
126
  """Manage user sessions and cleanup"""
@@ -170,12 +243,11 @@ cleanup_thread.start()
170
  def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, str]:
171
  """
172
  Main processing function for Gradio interface
173
- Returns: (zip_file_path, statistics_html, status_message)
174
  """
175
  if file_obj is None:
176
  return None, "", "⚠️ Please upload a PDF file"
177
 
178
- session_id = str(uuid.uuid4())
179
  session_dir = SessionManager.create_session(session_id)
180
 
181
  try:
@@ -202,63 +274,68 @@ def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, s
202
  output_dir = session_dir / "output"
203
  output_dir.mkdir(exist_ok=True)
204
 
205
- # Split PDF with progress updates
206
- progress(0.3, "Splitting PDF into segments...")
207
 
208
  def update_progress(value, message):
209
- # Scale progress from 0.3 to 0.8 for splitting phase
210
  scaled_progress = 0.3 + (value * 0.5)
211
  progress(scaled_progress, message)
212
 
213
- output_files, stats = PDFProcessor.split_pdf(
214
  input_path,
215
  output_dir,
216
  progress_callback=update_progress
217
  )
218
 
219
  if not output_files:
220
- return None, "", "❌ No valid segments created"
221
 
222
  # Create ZIP file
223
  progress(0.9, "Creating ZIP archive...")
224
- zip_path = session_dir / f"pdf_segments_{session_id[:8]}.zip"
225
 
226
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
227
  for file_path in output_files:
228
  zipf.write(file_path, file_path.name)
229
 
230
- # Generate statistics HTML with fixed styling
231
  stats_html = f"""
232
- <div style="padding: 20px; background: #e8f4f8; border-radius: 10px; margin: 10px 0; border: 1px solid #0369a1;">
233
- <h3 style="color: #0369a1; margin-top: 0;">πŸ“Š Processing Results</h3>
234
- <table style="width: 100%; border-collapse: collapse;">
235
- <tr style="border-bottom: 1px solid #94a3b8;">
236
- <td style="padding: 8px; font-weight: bold; color: #1e293b;">πŸ“„ Total Pages:</td>
237
- <td style="padding: 8px; text-align: right; color: #334155;">{stats['total_pages']}</td>
 
 
 
 
238
  </tr>
239
- <tr style="border-bottom: 1px solid #94a3b8;">
240
- <td style="padding: 8px; font-weight: bold; color: #1e293b;">βœ… Segments Created:</td>
241
- <td style="padding: 8px; text-align: right; color: #334155;">{stats['segments_created']}</td>
242
  </tr>
243
- <tr style="border-bottom: 1px solid #94a3b8;">
244
- <td style="padding: 8px; font-weight: bold; color: #1e293b;">πŸ“¦ Original Size:</td>
245
- <td style="padding: 8px; text-align: right; color: #334155;">{stats['original_size_mb']:.2f} MB</td>
246
  </tr>
247
- <tr style="border-bottom: 1px solid #94a3b8;">
248
- <td style="padding: 8px; font-weight: bold; color: #1e293b;">πŸ“ Total Output Size:</td>
249
- <td style="padding: 8px; text-align: right; color: #334155;">{stats['total_output_size_mb']:.2f} MB</td>
250
  </tr>
251
- <tr>
252
- <td style="padding: 8px; font-weight: bold; color: #1e293b;">πŸ’Ύ Compression Ratio:</td>
253
- <td style="padding: 8px; text-align: right; color: #334155;">
254
- {((1 - stats['total_output_size_mb'] / stats['original_size_mb']) * 100) if stats['original_size_mb'] > 0 else 0:.1f}%
255
- </td>
 
 
256
  </tr>
257
  </table>
258
  <p style="margin-top: 15px; color: #059669; font-weight: bold;">
259
  ✨ Your file has been split successfully!
260
  </p>
261
- <p style="margin-top: 10px; color: #64748b; font-size: 0.9em;">
262
  ⏱️ Files will be automatically deleted after {CLEANUP_AFTER_MINUTES} minutes
263
  </p>
264
  </div>
@@ -280,8 +357,11 @@ def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, s
280
  pass
281
  return None, "", f"❌ Error: {str(e)}"
282
 
283
- # Custom CSS to fix text visibility and button styling
284
- custom_css = """
 
 
 
285
  .gradio-container {
286
  max-width: 800px;
287
  margin: auto;
@@ -319,27 +399,21 @@ custom_css = """
319
  color: #1f2937 !important;
320
  font-weight: 500;
321
  }
322
- """
323
-
324
- # Create Gradio interface with fixed theme
325
- with gr.Blocks(
326
- title="PDF Splitter - Fast & Simple",
327
- theme=gr.themes.Base(), # Using Base theme for better control
328
- css=custom_css
329
  ) as app:
330
 
331
  gr.Markdown("""
332
  # πŸ“„ PDF Splitter Tool
333
 
334
- **Split large PDFs into smaller segments quickly and efficiently!**
335
 
336
- This tool uses advanced compression to split your PDF into segments of approximately **4.5 MB** each.
337
- Files are processed using qpdf for optimal performance without decompressing the PDF.
338
 
339
  ### How to use:
340
  1. Upload your PDF file
341
  2. Click "Split PDF"
342
- 3. Download the ZIP file containing all segments
343
 
344
  *Note: Files are automatically deleted after 10 minutes for your privacy.*
345
  """)
@@ -354,7 +428,7 @@ with gr.Blocks(
354
  )
355
 
356
  split_btn = gr.Button(
357
- "πŸš€ Split PDF",
358
  variant="primary",
359
  size="lg",
360
  elem_classes="split-button"
@@ -368,7 +442,7 @@ with gr.Blocks(
368
 
369
  with gr.Row():
370
  download_file = gr.File(
371
- label="πŸ“¦ Download ZIP",
372
  visible=True,
373
  elem_classes="download-section",
374
  interactive=False # Make it non-interactive until file is ready
@@ -384,16 +458,24 @@ with gr.Blocks(
384
  # Add features with proper styling
385
  gr.Markdown("""
386
  ---
387
- ### πŸ’‘ Features:
388
- - βœ… Handles compressed PDFs efficiently using qpdf
389
- - βœ… Automatic file cleanup for privacy
390
- - βœ… Progress tracking during processing
391
- - βœ… Creates ZIP archive for easy download
392
- - βœ… Optimized for Hugging Face Spaces
 
 
 
 
 
 
393
 
394
- ### πŸ”’ Privacy:
395
- All uploaded files are automatically deleted after processing and download.
396
- No files are stored permanently on the server.
 
 
397
  """, elem_classes="features-section")
398
 
399
  # Launch the app
 
2
  import pikepdf
3
  import os
4
  import zipfile
 
5
  import shutil
6
  from pathlib import Path
7
  import uuid
8
  from datetime import datetime, timedelta
9
+ import logging
10
  import threading
11
  import time
12
+ from typing import Tuple, List, Optional
 
13
 
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
+ # Configuration - FIXED VALUES
19
+ TARGET_SEGMENT_SIZE_MB = 4.5 # Target size for each segment
20
+ MAX_ALLOWED_SIZE_MB = 5.0 # Maximum allowed size - discard if larger
21
+ TARGET_SEGMENT_SIZE_BYTES = int(TARGET_SEGMENT_SIZE_MB * 1024 * 1024) # 4.5MB in bytes
22
+ MAX_ALLOWED_SIZE_BYTES = int(MAX_ALLOWED_SIZE_MB * 1024 * 1024) # 5MB in bytes
23
+
24
  TEMP_DIR = Path("temp_files")
25
  CLEANUP_AFTER_MINUTES = 10
26
 
27
  # Create temp directory
28
  TEMP_DIR.mkdir(exist_ok=True)
29
 
30
+ # Store user sessions for cleanup
31
  user_sessions = {}
32
 
33
  class PDFProcessor:
34
  """Handle PDF splitting with qpdf/pikepdf"""
35
 
36
  @staticmethod
37
+ def estimate_pages_for_size(pdf, total_pages: int, target_size_bytes: int) -> int:
38
+ """
39
+ Estimate how many pages fit in the target size
40
+ """
41
+ # Get approximate file size
42
+ temp_file = Path("temp_estimate.pdf")
43
+ try:
44
+ # Save the entire PDF temporarily to get its size
45
+ pdf.save(temp_file)
46
+ total_size = temp_file.stat().st_size
47
+ temp_file.unlink()
48
+
49
+ # Calculate average page size
50
+ avg_page_size = total_size / total_pages if total_pages > 0 else total_size
51
+
52
+ # Estimate pages that fit in target size (with 10% safety margin)
53
+ estimated_pages = int((target_size_bytes * 0.9) / avg_page_size)
54
+
55
+ return max(1, estimated_pages) # At least 1 page
56
+
57
+ except Exception as e:
58
+ logger.error(f"Error estimating page size: {e}")
59
+ return max(1, int(total_pages / 10)) # Fallback to 10% of pages
60
+
61
+ @staticmethod
62
+ def split_pdf_by_size(input_path: Path, output_dir: Path, progress_callback=None) -> Tuple[List[Path], dict]:
63
  """
64
+ Split PDF into segments of approximately 4.5MB, discarding any over 5MB
 
65
  """
66
+ kept_files = []
67
+ discarded_count = 0
68
  stats = {
69
  "total_pages": 0,
70
  "segments_created": 0,
71
  "segments_discarded": 0,
72
  "original_size_mb": 0,
73
+ "total_output_size_mb": 0,
74
+ "largest_segment_mb": 0,
75
+ "smallest_segment_mb": float('inf')
76
  }
77
 
78
  try:
 
84
  total_pages = len(pdf.pages)
85
  stats["total_pages"] = total_pages
86
 
87
+ if total_pages == 0:
88
+ return kept_files, stats
89
+
90
+ # Initial estimate of pages per segment
91
+ pages_per_segment = PDFProcessor.estimate_pages_for_size(
92
+ pdf, total_pages, TARGET_SEGMENT_SIZE_BYTES
93
+ )
94
 
95
  segment_num = 0
96
  page_start = 0
97
+ retry_count = 0
98
+ max_retries = 3
99
 
100
  while page_start < total_pages:
101
+ # Calculate page range for this segment
102
  page_end = min(page_start + pages_per_segment, total_pages)
 
103
 
104
  # Update progress
105
  if progress_callback:
106
  progress = (page_start / total_pages)
107
+ progress_callback(progress, f"Processing pages {page_start+1}-{page_end} of {total_pages}...")
108
 
109
+ # Create segment
110
+ segment_num += 1
111
+ segment_filename = f"segment_{segment_num:03d}_p{page_start+1}-{page_end}.pdf"
112
  segment_path = output_dir / segment_filename
113
 
114
  # Create new PDF with selected pages
 
116
  for page_num in range(page_start, page_end):
117
  segment_pdf.pages.append(pdf.pages[page_num])
118
 
119
+ # Save with compression to minimize size
120
  segment_pdf.save(
121
  segment_path,
122
  compress_streams=True,
123
+ stream_decode_level=pikepdf.StreamDecodeLevel.none,
124
  object_stream_mode=pikepdf.ObjectStreamMode.generate,
125
+ linearize=True,
126
+ recompress_flate=True
127
  )
128
 
129
  # Check segment size
130
  segment_size = segment_path.stat().st_size
131
+ segment_size_mb = segment_size / 1024 / 1024
132
 
133
+ logger.info(f"Segment {segment_num}: {segment_size_mb:.2f} MB ({page_end - page_start} pages)")
134
+
135
+ if segment_size <= MAX_ALLOWED_SIZE_BYTES:
136
+ # File is under 5MB limit - keep it
137
+ kept_files.append(segment_path)
138
  stats["segments_created"] += 1
139
+ stats["total_output_size_mb"] += segment_size_mb
140
+
141
+ # Track largest and smallest segments
142
+ stats["largest_segment_mb"] = max(stats["largest_segment_mb"], segment_size_mb)
143
+ stats["smallest_segment_mb"] = min(stats["smallest_segment_mb"], segment_size_mb)
144
+
145
+ # Move to next segment
146
+ page_start = page_end
147
+ retry_count = 0 # Reset retry count for next segment
148
+
149
+ # Adjust pages per segment based on actual size
150
+ if segment_size_mb < 4.0 and pages_per_segment < total_pages:
151
+ # Segment is too small, try more pages next time
152
+ pages_per_segment = min(pages_per_segment + 1, total_pages - page_end)
153
+ elif segment_size_mb > 4.8:
154
+ # Segment is getting close to limit, use fewer pages
155
+ pages_per_segment = max(1, pages_per_segment - 1)
156
+
157
  else:
158
+ # File exceeds 5MB limit
159
+ logger.warning(f"Segment {segment_num} too large ({segment_size_mb:.2f} MB)")
160
+
161
  if page_end - page_start == 1:
162
+ # Single page is over 5MB - discard and move on
163
+ logger.warning(f"Single page {page_start+1} exceeds 5MB limit - discarding")
164
+ segment_path.unlink() # Delete the file
165
  stats["segments_discarded"] += 1
166
+ discarded_count += 1
167
+ page_start = page_end # Move to next page
168
+ retry_count = 0
169
  else:
170
+ # Multiple pages - try with fewer pages
171
+ segment_path.unlink() # Delete the oversized file
172
+
173
+ if retry_count < max_retries:
174
+ # Reduce pages by half and retry
175
+ pages_per_segment = max(1, (page_end - page_start) // 2)
176
+ retry_count += 1
177
+ segment_num -= 1 # Reuse segment number
178
+ logger.info(f"Retrying with {pages_per_segment} pages")
179
+ else:
180
+ # Too many retries, try single pages
181
+ pages_per_segment = 1
182
+ retry_count = 0
183
+ segment_num -= 1
184
+
185
+ # Clean up stats
186
+ if stats["smallest_segment_mb"] == float('inf'):
187
+ stats["smallest_segment_mb"] = 0
188
 
189
  if progress_callback:
190
  progress_callback(1.0, "Splitting complete!")
 
193
  logger.error(f"Error splitting PDF: {str(e)}")
194
  raise
195
 
196
+ return kept_files, stats
197
 
198
  class SessionManager:
199
  """Manage user sessions and cleanup"""
 
243
  def process_pdf(file_obj, progress=gr.Progress()) -> Tuple[Optional[str], str, str]:
244
  """
245
  Main processing function for Gradio interface
 
246
  """
247
  if file_obj is None:
248
  return None, "", "⚠️ Please upload a PDF file"
249
 
250
+ session_id = str(uuid.uuid4())[:8]
251
  session_dir = SessionManager.create_session(session_id)
252
 
253
  try:
 
274
  output_dir = session_dir / "output"
275
  output_dir.mkdir(exist_ok=True)
276
 
277
+ # Split PDF with size constraints
278
+ progress(0.3, "Splitting PDF into 4.5MB segments...")
279
 
280
  def update_progress(value, message):
 
281
  scaled_progress = 0.3 + (value * 0.5)
282
  progress(scaled_progress, message)
283
 
284
+ output_files, stats = PDFProcessor.split_pdf_by_size(
285
  input_path,
286
  output_dir,
287
  progress_callback=update_progress
288
  )
289
 
290
  if not output_files:
291
+ return None, "", "❌ No valid segments created (all segments exceeded 5MB limit)"
292
 
293
  # Create ZIP file
294
  progress(0.9, "Creating ZIP archive...")
295
+ zip_path = session_dir / f"pdf_segments_{session_id}.zip"
296
 
297
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
298
  for file_path in output_files:
299
  zipf.write(file_path, file_path.name)
300
 
301
+ # Generate statistics with proper styling
302
  stats_html = f"""
303
+ <div style="padding: 20px; background: #f0f9ff; border-radius: 10px; margin: 10px 0; border: 2px solid #0284c7;">
304
+ <h3 style="color: #0c4a6e; margin-top: 0;">πŸ“Š Processing Results</h3>
305
+ <table style="width: 100%; border-collapse: collapse; background: white; border-radius: 5px;">
306
+ <tr style="border-bottom: 1px solid #e2e8f0;">
307
+ <td style="padding: 10px; font-weight: bold; color: #334155;">πŸ“„ Total Pages:</td>
308
+ <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['total_pages']}</td>
309
+ </tr>
310
+ <tr style="border-bottom: 1px solid #e2e8f0; background: #f8fafc;">
311
+ <td style="padding: 10px; font-weight: bold; color: #334155;">βœ… Segments Created (≀5MB):</td>
312
+ <td style="padding: 10px; text-align: right; color: #16a34a; font-weight: 600;">{stats['segments_created']}</td>
313
  </tr>
314
+ <tr style="border-bottom: 1px solid #e2e8f0;">
315
+ <td style="padding: 10px; font-weight: bold; color: #334155;">❌ Segments Discarded (>5MB):</td>
316
+ <td style="padding: 10px; text-align: right; color: #dc2626; font-weight: 600;">{stats['segments_discarded']}</td>
317
  </tr>
318
+ <tr style="border-bottom: 1px solid #e2e8f0; background: #f8fafc;">
319
+ <td style="padding: 10px; font-weight: bold; color: #334155;">πŸ“¦ Original Size:</td>
320
+ <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['original_size_mb']:.2f} MB</td>
321
  </tr>
322
+ <tr style="border-bottom: 1px solid #e2e8f0;">
323
+ <td style="padding: 10px; font-weight: bold; color: #334155;">πŸ“ Total Output Size:</td>
324
+ <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['total_output_size_mb']:.2f} MB</td>
325
  </tr>
326
+ <tr style="border-bottom: 1px solid #e2e8f0; background: #f8fafc;">
327
+ <td style="padding: 10px; font-weight: bold; color: #334155;">πŸ“ˆ Largest Segment:</td>
328
+ <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['largest_segment_mb']:.2f} MB</td>
329
+ </tr>
330
+ <tr style="background: white;">
331
+ <td style="padding: 10px; font-weight: bold; color: #334155;">πŸ“‰ Smallest Segment:</td>
332
+ <td style="padding: 10px; text-align: right; color: #475569; font-weight: 600;">{stats['smallest_segment_mb']:.2f} MB</td>
333
  </tr>
334
  </table>
335
  <p style="margin-top: 15px; color: #059669; font-weight: bold;">
336
  ✨ Your file has been split successfully!
337
  </p>
338
+ <p style="margin-top: 10px; color: #6b7280; font-size: 0.9em;">
339
  ⏱️ Files will be automatically deleted after {CLEANUP_AFTER_MINUTES} minutes
340
  </p>
341
  </div>
 
357
  pass
358
  return None, "", f"❌ Error: {str(e)}"
359
 
360
+ # Create Gradio interface with fixed theme
361
+ with gr.Blocks(
362
+ title="PDF Splitter - Fast & Simple",
363
+ theme=gr.themes.Base(), # Using Base theme for better control
364
+ css="""
365
  .gradio-container {
366
  max-width: 800px;
367
  margin: auto;
 
399
  color: #1f2937 !important;
400
  font-weight: 500;
401
  }
402
+ """
 
 
 
 
 
 
403
  ) as app:
404
 
405
  gr.Markdown("""
406
  # πŸ“„ PDF Splitter Tool
407
 
408
+ **Split large PDFs into 4.5MB segments - Files over 5MB are automatically discarded!**
409
 
410
+ This tool uses advanced compression with qpdf to split your PDF into segments of approximately **4.5 MB** each.
411
+ Any segments that exceed **5 MB** are automatically discarded to ensure all output files meet size requirements.
412
 
413
  ### How to use:
414
  1. Upload your PDF file
415
  2. Click "Split PDF"
416
+ 3. Download the ZIP file containing only segments ≀5MB
417
 
418
  *Note: Files are automatically deleted after 10 minutes for your privacy.*
419
  """)
 
428
  )
429
 
430
  split_btn = gr.Button(
431
+ "πŸš€ Split PDF into 4.5MB Segments",
432
  variant="primary",
433
  size="lg",
434
  elem_classes="split-button"
 
442
 
443
  with gr.Row():
444
  download_file = gr.File(
445
+ label="πŸ“¦ Download ZIP (Contains only segments ≀5MB)",
446
  visible=True,
447
  elem_classes="download-section",
448
  interactive=False # Make it non-interactive until file is ready
 
458
  # Add features with proper styling
459
  gr.Markdown("""
460
  ---
461
+ ### πŸ’‘ Key Features:
462
+ - βœ… **Target segment size: 4.5MB** - Optimized for most systems
463
+ - βœ… **Maximum allowed size: 5MB** - Segments over 5MB are automatically discarded
464
+ - βœ… **Smart splitting** - Adjusts page count per segment dynamically
465
+ - βœ… **Compressed output** - Uses qpdf for efficient PDF compression
466
+ - βœ… **Automatic cleanup** - Files deleted after 10 minutes
467
+ - βœ… **Progress tracking** - Real-time updates during processing
468
+
469
+ ### πŸ”’ Privacy & Security:
470
+ - All uploaded files are automatically deleted after processing
471
+ - No files are stored permanently on the server
472
+ - Each user gets a unique session ID for file isolation
473
 
474
+ ### βš™οΈ Technical Details:
475
+ - Uses **pikepdf** (qpdf wrapper) for efficient PDF manipulation
476
+ - Maintains PDF compression without decompressing
477
+ - Dynamically adjusts segment size based on page content
478
+ - Automatically retries with fewer pages if segment exceeds limits
479
  """, elem_classes="features-section")
480
 
481
  # Launch the app