mac9087 committed on
Commit
f3b1537
·
verified ·
1 Parent(s): efc9cd2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -27
app.py CHANGED
@@ -4,24 +4,76 @@ import fitz # PyMuPDF
4
  import os
5
  import tempfile
6
  import shutil
 
7
 
8
  app = Flask(__name__)
9
  CORS(app) # This will allow all origins
10
 
11
- def compress_pdf(input_path, output_path, compression_level=4):
12
  """
13
- Compresses a PDF using PyMuPDF with deflate and garbage collection.
14
- Allows adjusting the compression level (1-4, higher = more compression)
 
 
 
 
 
 
 
 
15
  """
16
  try:
17
  doc = fitz.open(input_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  doc.save(
19
  output_path,
20
- garbage=compression_level, # 4 is max compression
21
- deflate=True,
22
- clean=True,
23
- pretty=False, # Disable pretty printing for smaller size
24
- linear=True # Create a linearized PDF
25
  )
26
  doc.close()
27
  return True
@@ -29,10 +81,40 @@ def compress_pdf(input_path, output_path, compression_level=4):
29
  print(f"Error compressing PDF: {str(e)}")
30
  return False
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  @app.route("/")
33
  def home():
34
  return "PDF Compressor API is running"
35
-
36
  @app.route("/compress", methods=["POST"])
37
  def compress():
38
  # Track temporary directories to clean up
@@ -71,42 +153,81 @@ def compress():
71
 
72
  print(f"Original size: {original_size_kb:.2f} KB, Target: {target_kb:.2f} KB")
73
 
74
- # Try different compression levels if needed
75
- for compression_level in [4, 3, 2, 1]:
76
- # If original file is already below target, just use it
77
- if original_size_kb <= target_kb:
78
- shutil.copy(input_path, output_path)
79
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- compress_pdf(input_path, output_path, compression_level)
82
- final_size_kb = os.path.getsize(output_path) / 1024
 
 
83
 
84
- print(f"Compression level {compression_level} result: {final_size_kb:.2f} KB")
 
 
85
 
86
- # If we reached target or can't compress anymore, stop
87
- if final_size_kb <= target_kb or compression_level == 1:
88
- break
 
 
89
 
90
- # Get final compressed size
91
- final_size_kb = os.path.getsize(output_path) / 1024
92
 
93
  # If final size is too large, inform but still provide the file
94
  if final_size_kb > target_kb:
95
  # Important! Check Accept header to determine what client expects
96
  accepts = request.headers.get('Accept', '')
97
 
98
- # If client expects JSON (or we're not sure), send the warning JSON first
99
  if 'application/json' in accepts or '*/*' in accepts:
100
- # The frontend should handle this warning specially
 
 
 
 
 
 
 
 
 
 
 
 
101
  return jsonify({
102
  "warning": f"Unable to compress below {target_kb:.2f} KB. Best compressed size is {int(final_size_kb)} KB.",
103
  "original_size_kb": round(original_size_kb, 2),
104
  "compressed_size_kb": round(final_size_kb, 2),
105
  "target_size_kb": round(target_kb, 2),
106
- "compression_ratio": round((original_size_kb - final_size_kb) / original_size_kb * 100, 2),
 
107
  "download_available": True
108
  }), 200
109
-
110
  # Return the compressed file
111
  response = send_file(output_path, as_attachment=True, download_name="compressed.pdf")
112
 
 
4
  import os
5
  import tempfile
6
  import shutil
7
+ import io
8
 
9
  app = Flask(__name__)
10
  CORS(app) # This will allow all origins
11
 
12
+ def compress_pdf(input_path, output_path, quality=70, compression_level=4):
13
  """
14
+ Compresses a PDF using PyMuPDF with enhanced compression strategies.
15
+
16
+ Args:
17
+ input_path: Path to input PDF
18
+ output_path: Path to save compressed PDF
19
+ quality: Image quality (0-100), lower means more compression
20
+ compression_level: PDF compression level (1-4)
21
+
22
+ Returns:
23
+ True if compression was successful, False otherwise
24
  """
25
  try:
26
  doc = fitz.open(input_path)
27
+
28
+ # Check if PDF has images that can be recompressed
29
+ has_images = False
30
+ for page_num in range(doc.page_count):
31
+ page = doc[page_num]
32
+ image_list = page.get_images(full=True)
33
+ if image_list:
34
+ has_images = True
35
+ break
36
+
37
+ # If PDF has images, apply image recompression
38
+ if has_images:
39
+ for page_num in range(doc.page_count):
40
+ page = doc[page_num]
41
+ image_list = page.get_images(full=True)
42
+ for img_index, img in enumerate(image_list):
43
+ xref = img[0]
44
+ try:
45
+ # Get the image data
46
+ base_image = doc.extract_image(xref)
47
+ image_bytes = base_image["image"]
48
+
49
+ # Replace with lower quality if it's JPEG
50
+ if base_image["ext"] == "jpeg":
51
+ # Create a more compressed version of the image
52
+ # For PyMuPDF 1.20.0+, use this approach:
53
+ pix = fitz.Pixmap(image_bytes)
54
+ if pix.colorspace.n > 3: # CMYK or other colorspace
55
+ pix = fitz.Pixmap(fitz.csRGB, pix) # convert to RGB
56
+
57
+ # Compress image with reduced quality
58
+ new_bytes = pix.tobytes(output="jpeg", jpg_quality=quality)
59
+
60
+ # Replace the image in the PDF if the new one is smaller
61
+ if len(new_bytes) < len(image_bytes):
62
+ doc._deleteObject(xref)
63
+ doc._setObject(xref, new_bytes, compress=True)
64
+ except Exception as e:
65
+ print(f"Error processing image {img_index} on page {page_num}: {str(e)}")
66
+ # Continue with the next image
67
+ continue
68
+
69
+ # Apply standard PDF compression options
70
  doc.save(
71
  output_path,
72
+ garbage=compression_level, # 4 is max garbage collection
73
+ deflate=True, # Use deflate compression for streams
74
+ clean=True, # Clean document structure
75
+ pretty=False, # Disable pretty printing for smaller size
76
+ linear=True # Create a linearized PDF
77
  )
78
  doc.close()
79
  return True
 
81
  print(f"Error compressing PDF: {str(e)}")
82
  return False
83
 
84
def get_progressive_compression_settings(original_size_kb, target_kb):
    """
    Build the ordered list of compression attempts for a size target.

    The further the target lies below the original size, the lower the
    starting image quality. Each entry is a (quality, compression_level)
    tuple, and entries are ordered from least to most aggressive so the
    caller can stop at the first attempt that meets the target.

    Args:
        original_size_kb: Size of the uploaded PDF in kilobytes.
        target_kb: Desired output size in kilobytes.

    Returns:
        A list of four (quality, compression_level) tuples.
    """
    # Every attempt uses the same PDF compression level; only quality varies.
    level = 4

    if target_kb < original_size_kb * 0.25:
        # Very aggressive target (under 25% of the original): the last steps
        # may noticeably degrade images.
        quality_steps = (40, 30, 20, 10)
    elif target_kb < original_size_kb * 0.5:
        # Aggressive target (under 50% of the original).
        quality_steps = (60, 50, 40, 30)
    else:
        # Moderate reduction is enough.
        quality_steps = (80, 70, 60, 50)

    return [(quality, level) for quality in quality_steps]
113
+
114
  @app.route("/")
115
  def home():
116
  return "PDF Compressor API is running"
117
+
118
  @app.route("/compress", methods=["POST"])
119
  def compress():
120
  # Track temporary directories to clean up
 
153
 
154
  print(f"Original size: {original_size_kb:.2f} KB, Target: {target_kb:.2f} KB")
155
 
156
+ # Check if PDF is already smaller than target
157
+ if original_size_kb <= target_kb:
158
+ shutil.copy(input_path, output_path)
159
+ print("Original file already meets target size")
160
+ final_size_kb = original_size_kb
161
+ else:
162
+ # Get progressive compression settings based on size gap
163
+ compression_settings = get_progressive_compression_settings(original_size_kb, target_kb)
164
+
165
+ # Try increasingly aggressive compression until target is met or we run out of options
166
+ best_size_kb = original_size_kb
167
+ best_output_path = input_path
168
+
169
+ for quality, compression_level in compression_settings:
170
+ temp_output = os.path.join(temp_dir, f"temp_q{quality}_c{compression_level}.pdf")
171
+ print(f"Trying compression with quality={quality}, level={compression_level}")
172
+
173
+ compress_pdf(input_path, temp_output, quality, compression_level)
174
+ current_size_kb = os.path.getsize(temp_output) / 1024
175
+
176
+ print(f"Result: {current_size_kb:.2f} KB")
177
+
178
+ # Keep the smallest file that's been generated
179
+ if current_size_kb < best_size_kb:
180
+ best_size_kb = current_size_kb
181
+ best_output_path = temp_output
182
 
183
+ # If we've reached target, stop trying
184
+ if current_size_kb <= target_kb:
185
+ print(f"Target reached with quality={quality}, level={compression_level}")
186
+ break
187
 
188
+ # Copy the best result to the output path
189
+ shutil.copy(best_output_path, output_path)
190
+ final_size_kb = best_size_kb
191
 
192
+ # If best compression result is larger than original, use original
193
+ if final_size_kb > original_size_kb:
194
+ print("Compression ineffective, using original file")
195
+ shutil.copy(input_path, output_path)
196
+ final_size_kb = original_size_kb
197
 
198
+ # Get final metrics
199
+ compression_ratio = 100 * (1 - final_size_kb / original_size_kb)
200
 
201
  # If final size is too large, inform but still provide the file
202
  if final_size_kb > target_kb:
203
  # Important! Check Accept header to determine what client expects
204
  accepts = request.headers.get('Accept', '')
205
 
206
+ # If client expects JSON, send the warning JSON
207
  if 'application/json' in accepts or '*/*' in accepts:
208
+ compression_analysis = ""
209
+ if compression_ratio <= 0:
210
+ compression_analysis = (
211
+ "Your PDF may already be highly optimized or contain mostly "
212
+ "vector graphics/text which don't compress well. "
213
+ "Consider a higher target size."
214
+ )
215
+ elif compression_ratio < 10:
216
+ compression_analysis = (
217
+ "Limited compression achieved. This PDF may contain pre-compressed "
218
+ "images or be mostly text/vector content. Consider a higher target size."
219
+ )
220
+
221
  return jsonify({
222
  "warning": f"Unable to compress below {target_kb:.2f} KB. Best compressed size is {int(final_size_kb)} KB.",
223
  "original_size_kb": round(original_size_kb, 2),
224
  "compressed_size_kb": round(final_size_kb, 2),
225
  "target_size_kb": round(target_kb, 2),
226
+ "compression_ratio": round(compression_ratio, 2),
227
+ "technical_details": compression_analysis,
228
  "download_available": True
229
  }), 200
230
+
231
  # Return the compressed file
232
  response = send_file(output_path, as_attachment=True, download_name="compressed.pdf")
233