Spaces:
Paused
Paused
File size: 10,876 Bytes
97de2ec 5315934 97de2ec f3b1537 5315934 97de2ec 5315934 f3b1537 b99402c f3b1537 b99402c 97de2ec f3b1537 97de2ec f3b1537 97de2ec 5315934 f3b1537 b99402c f3b1537 5315934 97de2ec 5315934 97de2ec f3b1537 97de2ec f3b1537 97de2ec f3b1537 97de2ec f3b1537 97de2ec f3b1537 97de2ec efc9cd2 f3b1537 efc9cd2 f3b1537 efc9cd2 f3b1537 efc9cd2 f3b1537 97de2ec 5315934 97de2ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
import fitz # PyMuPDF
import os
import tempfile
import shutil
import io
# Flask application object; the route decorators further down register
# their handlers on it.
app = Flask(__name__)
# Wrap the app with flask_cors using its default settings.
CORS(app) # This will allow all origins
def compress_pdf(input_path, output_path, quality=70, compression_level=4):
    """
    Compress a PDF with PyMuPDF by re-encoding JPEG images and re-saving.

    Each embedded JPEG image is re-encoded at ``quality`` and swapped into
    the document only when the new encoding is smaller than the original.
    The document is then written to ``output_path`` with garbage collection,
    stream deflation and content cleaning enabled.

    Args:
        input_path: Path to input PDF
        output_path: Path to save compressed PDF
        quality: Image quality (0-100), lower means more compression
        compression_level: PDF compression level (1-4); passed to
            ``Document.save`` as its ``garbage`` option

    Returns:
        True if compression was successful, False otherwise
    """
    doc = None
    try:
        doc = fitz.open(input_path)

        # An image object can be referenced from several pages; re-encode
        # each xref only once.
        seen_xrefs = set()

        for page_num in range(doc.page_count):
            page = doc[page_num]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref, smask_xref = img[0], img[1]
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                # NOTE(review): images with a soft mask are skipped because
                # replacing the image object would presumably drop the mask
                # (transparency) -- confirm against PyMuPDF behaviour.
                if smask_xref:
                    continue

                try:
                    base_image = doc.extract_image(xref)
                    # Only JPEG images are recompressed.
                    if base_image["ext"] != "jpeg":
                        continue
                    image_bytes = base_image["image"]

                    pix = fitz.Pixmap(image_bytes)
                    # JPEG output needs gray/RGB; convert CMYK and similar.
                    if pix.colorspace is not None and pix.colorspace.n > 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    new_bytes = pix.tobytes(output="jpeg", jpg_quality=quality)

                    # Swap the image in only if re-encoding actually helped.
                    if len(new_bytes) < len(image_bytes):
                        page.replace_image(xref, stream=new_bytes)
                except Exception as e:
                    print(f"Error processing image {img_index} on page {page_num}: {str(e)}")
                    # Continue with the next image
                    continue

        # Apply standard PDF compression options.
        # NOTE(review): the former linear=True option was dropped; recent
        # PyMuPDF releases appear to reject linearization, and it adds size
        # overhead -- confirm against the installed version.
        doc.save(
            output_path,
            garbage=compression_level,  # 4 is max garbage collection
            deflate=True,  # Use deflate compression for streams
            clean=True,  # Clean document structure
            pretty=False,  # Disable pretty printing for smaller size
        )
        return True
    except Exception as e:
        print(f"Error compressing PDF: {str(e)}")
        return False
    finally:
        # Release the document handle on both the success and failure paths.
        if doc is not None:
            doc.close()
def get_progressive_compression_settings(original_size_kb, target_kb):
    """
    Pick the ladder of compression attempts for a given size reduction goal.

    The smaller the target is relative to the original, the lower the image
    qualities on the ladder. Every step uses compression level 4.

    Returns a list of (quality, compression_level) tuples, ordered from the
    gentlest to the most aggressive attempt.
    """
    # (fraction of original size, image qualities to try) -- checked in order;
    # the first tier whose threshold exceeds the target wins.
    aggressive_tiers = (
        (0.25, (40, 30, 20, 10)),  # target below 25% of original
        (0.5, (60, 50, 40, 30)),   # target below 50% of original
    )
    # Fallback when only moderate compression is needed.
    qualities = (80, 70, 60, 50)

    for fraction, ladder in aggressive_tiers:
        if target_kb < original_size_kb * fraction:
            qualities = ladder
            break

    return [(image_quality, 4) for image_quality in qualities]
@app.route("/")
def home():
    """Root endpoint: return a fixed plain-text message saying the API is up."""
    status_message = "PDF Compressor API is running"
    return status_message
def _parse_target_size_kb(size_text):
    """Convert a size string such as "500", "500kb" or "1.5MB" to kilobytes.

    A bare number is interpreted as kilobytes. Raises ValueError when the
    numeric part cannot be parsed as a float.
    """
    text = size_text.strip()
    unit = text[-2:].lower()
    if unit == "mb":
        return float(text[:-2]) * 1024
    if unit == "kb":
        return float(text[:-2])
    return float(text)


def _remove_temp_dirs(dir_paths):
    """Best-effort removal of the given temporary directories."""
    for dir_path in dir_paths:
        try:
            if os.path.exists(dir_path):
                shutil.rmtree(dir_path)
                print(f"Cleaned up temporary directory: {dir_path}")
        except OSError as e:
            # Cleanup must never mask the real response or error.
            print(f"Error cleaning up {dir_path}: {str(e)}")


@app.route("/compress", methods=["POST"])
def compress():
    """
    Compress an uploaded PDF towards a requested target size.

    Expects a multipart POST with a ``pdf`` file and a ``target_size`` form
    field (a number of kilobytes, optionally suffixed with "kb" or "mb").

    Returns:
        - 400 JSON error when the file or target size is missing/invalid,
          or when the uploaded file is empty.
        - 200 JSON warning when the target could not be reached and the
          request's Accept header contains "application/json" or "*/*".
        - Otherwise the best (smallest) PDF produced, as an attachment.
        - 500 JSON error on any unexpected failure.
    """
    # Track temporary directories to clean up
    temp_dirs = []
    # Set once cleanup has been handed over to the response's close hook,
    # because the file must outlive this function while it is streamed.
    cleanup_deferred = False
    try:
        if 'pdf' not in request.files:
            return jsonify({"error": "No PDF file uploaded."}), 400

        target_size_str = request.form.get("target_size")
        if not target_size_str:
            return jsonify({"error": "Target size not specified."}), 400

        # Parse target size
        try:
            target_kb = _parse_target_size_kb(target_size_str)
        except ValueError:
            return jsonify({"error": "Invalid target size format."}), 400

        # Create a temporary directory
        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)

        # Process PDF
        pdf_file = request.files['pdf']
        input_path = os.path.join(temp_dir, "input.pdf")
        output_path = os.path.join(temp_dir, "compressed.pdf")
        pdf_file.save(input_path)

        original_size_kb = os.path.getsize(input_path) / 1024
        if original_size_kb == 0:
            # An empty upload cannot be a PDF and would otherwise cause a
            # division by zero when the compression ratio is computed.
            return jsonify({"error": "Uploaded PDF is empty."}), 400
        print(f"Original size: {original_size_kb:.2f} KB, Target: {target_kb:.2f} KB")

        # Check if PDF is already smaller than target
        if original_size_kb <= target_kb:
            shutil.copy(input_path, output_path)
            print("Original file already meets target size")
            final_size_kb = original_size_kb
        else:
            # Get progressive compression settings based on size gap
            compression_settings = get_progressive_compression_settings(original_size_kb, target_kb)

            # Try increasingly aggressive compression until target is met
            # or we run out of options. Starting from the original means the
            # result can never be larger than what was uploaded.
            best_size_kb = original_size_kb
            best_output_path = input_path
            for quality, compression_level in compression_settings:
                temp_output = os.path.join(temp_dir, f"temp_q{quality}_c{compression_level}.pdf")
                print(f"Trying compression with quality={quality}, level={compression_level}")

                succeeded = compress_pdf(input_path, temp_output, quality, compression_level)
                if not succeeded or not os.path.exists(temp_output):
                    # A failed attempt leaves no usable file; move on to the
                    # next setting instead of failing the whole request.
                    print(f"Compression attempt failed with quality={quality}, level={compression_level}")
                    continue

                current_size_kb = os.path.getsize(temp_output) / 1024
                print(f"Result: {current_size_kb:.2f} KB")

                # Keep the smallest file that's been generated
                if current_size_kb < best_size_kb:
                    best_size_kb = current_size_kb
                    best_output_path = temp_output

                # If we've reached target, stop trying
                if current_size_kb <= target_kb:
                    print(f"Target reached with quality={quality}, level={compression_level}")
                    break

            # Copy the best result (possibly the original) to the output path
            shutil.copy(best_output_path, output_path)
            final_size_kb = best_size_kb

        # Get final metrics
        compression_ratio = 100 * (1 - final_size_kb / original_size_kb)

        # If final size is too large, inform but still provide the file
        if final_size_kb > target_kb:
            # Important! Check Accept header to determine what client expects
            accepts = request.headers.get('Accept', '')
            # If client expects JSON, send the warning JSON
            if 'application/json' in accepts or '*/*' in accepts:
                compression_analysis = ""
                if compression_ratio <= 0:
                    compression_analysis = (
                        "Your PDF may already be highly optimized or contain mostly "
                        "vector graphics/text which don't compress well. "
                        "Consider a higher target size."
                    )
                elif compression_ratio < 10:
                    compression_analysis = (
                        "Limited compression achieved. This PDF may contain pre-compressed "
                        "images or be mostly text/vector content. Consider a higher target size."
                    )
                # NOTE(review): "download_available" is reported as True, but
                # this handler exposes no way to fetch the file afterwards --
                # confirm what clients expect from this flag.
                return jsonify({
                    "warning": f"Unable to compress below {target_kb:.2f} KB. Best compressed size is {int(final_size_kb)} KB.",
                    "original_size_kb": round(original_size_kb, 2),
                    "compressed_size_kb": round(final_size_kb, 2),
                    "target_size_kb": round(target_kb, 2),
                    "compression_ratio": round(compression_ratio, 2),
                    "technical_details": compression_analysis,
                    "download_available": True
                }), 200

        # Return the compressed file
        response = send_file(output_path, as_attachment=True, download_name="compressed.pdf")

        # The file is read while the response is being sent, so the temp
        # directory may only be removed once the response has been closed.
        @response.call_on_close
        def cleanup():
            _remove_temp_dirs(temp_dirs)

        cleanup_deferred = True
        return response
    except Exception as e:
        print(f"Error in compression endpoint: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
    finally:
        # Every path that does not stream the file (JSON warning, validation
        # errors, unexpected failures) cleans up immediately.
        if not cleanup_deferred:
            _remove_temp_dirs(temp_dirs)
if __name__ == "__main__":
    # When run as a script, start Flask's built-in server listening on all
    # interfaces at port 7860.
    app.run(host="0.0.0.0", port=7860)