compressor / app.py
mac9087's picture
Update app.py
f3b1537 verified
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
import fitz # PyMuPDF
import os
import tempfile
import shutil
import io
# Module-level Flask application object; the @app.route handlers in this
# file register themselves on it.
app = Flask(__name__)
# Wrap the app with flask_cors using its default settings (no origin
# restriction is configured here).
CORS(app) # This will allow all origins
def _recompress_jpeg(page, xref, quality):
    """
    Re-encode the JPEG image stored at *xref* and keep it only if smaller.

    Args:
        page: A page of the document that owns the image (needed because
            image replacement is a page-level PyMuPDF operation)
        xref: Cross-reference number of the image object
        quality: JPEG quality (0-100) for the re-encoded image
    Returns:
        True if the image was replaced, False if it was left untouched
    """
    doc = page.parent
    base_image = doc.extract_image(xref)
    if base_image["ext"] != "jpeg":
        return False
    image_bytes = base_image["image"]

    pix = fitz.Pixmap(image_bytes)
    # More than three components means CMYK or similar; convert to RGB
    # before re-encoding.
    if pix.colorspace is not None and pix.colorspace.n > 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    new_bytes = pix.tobytes(output="jpeg", jpg_quality=quality)

    # Never make the document bigger.
    if len(new_bytes) >= len(image_bytes):
        return False

    # Page.replace_image is the public API for swapping an image in place;
    # every reference to this xref picks up the new data.
    page.replace_image(xref, stream=new_bytes)
    return True


def compress_pdf(input_path, output_path, quality=70, compression_level=4):
    """
    Compress a PDF with PyMuPDF by re-encoding JPEG images and re-saving.

    Args:
        input_path: Path to input PDF
        output_path: Path to save compressed PDF
        quality: Image quality (0-100), lower means more compression
        compression_level: Passed to ``Document.save`` as ``garbage``
            (garbage-collection level; 4 is the maximum)
    Returns:
        True if the compressed file was written, False otherwise. Errors
        are printed, never raised.
    """
    try:
        # The context manager closes the document even if saving fails.
        with fitz.open(input_path) as doc:
            # One image can be referenced from several pages; handle each
            # xref once so it is not re-encoded repeatedly.
            handled_xrefs = set()
            for page_num, page in enumerate(doc):
                for img_index, img in enumerate(page.get_images(full=True)):
                    xref, smask = img[0], img[1]
                    if xref in handled_xrefs:
                        continue
                    handled_xrefs.add(xref)
                    if smask:
                        # Replacing the image object would drop its soft
                        # mask (transparency), so leave masked images alone.
                        continue
                    try:
                        _recompress_jpeg(page, xref, quality)
                    except Exception as e:
                        # One bad image must not abort the whole document.
                        print(f"Error processing image {img_index} on page {page_num}: {str(e)}")
                        continue

            # Apply standard PDF compression options.
            # NOTE(review): linear=True was dropped on purpose - it works
            # against the size goal and recent PyMuPDF releases appear to
            # reject the option; confirm against the installed version.
            doc.save(
                output_path,
                garbage=compression_level,  # 4 is max garbage collection
                deflate=True,  # Use deflate compression for streams
                clean=True,  # Clean document structure
                pretty=False,  # Disable pretty printing for smaller size
            )
        return True
    except Exception as e:
        print(f"Error compressing PDF: {str(e)}")
        return False
def get_progressive_compression_settings(original_size_kb, target_kb):
    """
    Pick the ladder of compression attempts for a given size reduction.

    The further the target lies below the original size, the lower the
    starting image quality. Every step uses compression level 4.

    Args:
        original_size_kb: Size of the uploaded PDF in kilobytes
        target_kb: Requested size in kilobytes
    Returns:
        A list of (quality, compression_level) tuples, to be tried in order
    """
    if target_kb < original_size_kb * 0.25:
        # Target below a quarter of the original: start aggressive and go
        # down to a last-resort quality with significant loss.
        quality_steps = (40, 30, 20, 10)
    elif target_kb < original_size_kb * 0.5:
        # Target below half of the original.
        quality_steps = (60, 50, 40, 30)
    else:
        # Only moderate compression is needed.
        quality_steps = (80, 70, 60, 50)

    return [(step, 4) for step in quality_steps]
@app.route("/")
def home():
    """Root endpoint: reply with a fixed status string."""
    status_message = "PDF Compressor API is running"
    return status_message
def _parse_target_size_kb(target_size_str):
    """
    Convert a size string such as "2MB", "500kb" or "750" into kilobytes.

    A value without a unit suffix is taken as kilobytes.

    Raises:
        ValueError: if the text is not a number, or the size is not a
            positive finite value
    """
    text = target_size_str.strip().lower()
    if text.endswith("mb"):
        size_kb = float(text[:-2]) * 1024
    elif text.endswith("kb"):
        size_kb = float(text[:-2])
    else:
        size_kb = float(text)
    # Rejects zero, negatives, infinities and NaN (any comparison with NaN
    # is False, so the chained test fails for it too).
    if not 0 < size_kb < float("inf"):
        raise ValueError(f"target size out of range: {target_size_str!r}")
    return size_kb


def _remove_temp_dirs(dir_paths):
    """Best-effort removal of temporary directories; failures are only logged."""
    for dir_path in dir_paths:
        try:
            if os.path.exists(dir_path):
                shutil.rmtree(dir_path)
                print(f"Cleaned up temporary directory: {dir_path}")
        except OSError as e:
            print(f"Error cleaning up {dir_path}: {str(e)}")


@app.route("/compress", methods=["POST"])
def compress():
    """
    Compress an uploaded PDF towards a requested target size.

    Expects a multipart form with a ``pdf`` file and a ``target_size``
    field ("2MB", "500KB", or a bare number of kilobytes).

    Returns:
        - 400 JSON error for a missing file, a missing/invalid target size,
          or an empty upload
        - 200 JSON warning when the target could not be reached and the
          client's Accept header contains ``application/json`` or ``*/*``
        - 500 JSON error when compression fails or an unexpected error occurs
        - otherwise the best result as the attachment ``compressed.pdf``
    """
    # Track temporary directories to clean up
    temp_dirs = []
    # Set once cleanup has been handed over to the response's close hook;
    # every other exit path cleans up in the ``finally`` below.
    cleanup_deferred = False
    try:
        if 'pdf' not in request.files:
            return jsonify({"error": "No PDF file uploaded."}), 400
        target_size_str = request.form.get("target_size")
        if not target_size_str:
            return jsonify({"error": "Target size not specified."}), 400
        try:
            target_kb = _parse_target_size_kb(target_size_str)
        except ValueError:
            return jsonify({"error": "Invalid target size format."}), 400

        # Create a temporary directory
        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)

        # Store the upload on disk so PyMuPDF can open it by path
        pdf_file = request.files['pdf']
        input_path = os.path.join(temp_dir, "input.pdf")
        output_path = os.path.join(temp_dir, "compressed.pdf")
        pdf_file.save(input_path)
        original_size_kb = os.path.getsize(input_path) / 1024
        if original_size_kb == 0:
            # Also protects the compression-ratio division further down.
            return jsonify({"error": "Uploaded PDF file is empty."}), 400
        print(f"Original size: {original_size_kb:.2f} KB, Target: {target_kb:.2f} KB")

        if original_size_kb <= target_kb:
            # Already small enough: hand back the upload unchanged
            shutil.copy(input_path, output_path)
            print("Original file already meets target size")
            final_size_kb = original_size_kb
        else:
            # Get progressive compression settings based on size gap
            compression_settings = get_progressive_compression_settings(original_size_kb, target_kb)
            # Starting from the original guarantees the result is never
            # larger than the upload.
            best_size_kb = original_size_kb
            best_output_path = input_path
            any_attempt_succeeded = False
            for quality, compression_level in compression_settings:
                temp_output = os.path.join(temp_dir, f"temp_q{quality}_c{compression_level}.pdf")
                print(f"Trying compression with quality={quality}, level={compression_level}")
                succeeded = compress_pdf(input_path, temp_output, quality, compression_level)
                if not succeeded or not os.path.exists(temp_output):
                    # A failed attempt leaves no file to measure; move on.
                    print(f"Compression failed with quality={quality}, level={compression_level}")
                    continue
                any_attempt_succeeded = True
                current_size_kb = os.path.getsize(temp_output) / 1024
                print(f"Result: {current_size_kb:.2f} KB")
                # Keep the smallest file that's been generated
                if current_size_kb < best_size_kb:
                    best_size_kb = current_size_kb
                    best_output_path = temp_output
                # If we've reached target, stop trying
                if current_size_kb <= target_kb:
                    print(f"Target reached with quality={quality}, level={compression_level}")
                    break

            if not any_attempt_succeeded:
                return jsonify({
                    "error": "An error occurred: the PDF could not be processed."
                }), 500

            # Copy the best result to the output path
            shutil.copy(best_output_path, output_path)
            final_size_kb = best_size_kb

        # Get final metrics
        compression_ratio = 100 * (1 - final_size_kb / original_size_kb)

        # If final size is too large, inform the client when it accepts JSON
        if final_size_kb > target_kb:
            accepts = request.headers.get('Accept', '')
            if 'application/json' in accepts or '*/*' in accepts:
                compression_analysis = ""
                if compression_ratio <= 0:
                    compression_analysis = (
                        "Your PDF may already be highly optimized or contain mostly "
                        "vector graphics/text which don't compress well. "
                        "Consider a higher target size."
                    )
                elif compression_ratio < 10:
                    compression_analysis = (
                        "Limited compression achieved. This PDF may contain pre-compressed "
                        "images or be mostly text/vector content. Consider a higher target size."
                    )
                return jsonify({
                    "warning": f"Unable to compress below {target_kb:.2f} KB. Best compressed size is {int(final_size_kb)} KB.",
                    "original_size_kb": round(original_size_kb, 2),
                    "compressed_size_kb": round(final_size_kb, 2),
                    "target_size_kb": round(target_kb, 2),
                    "compression_ratio": round(compression_ratio, 2),
                    "technical_details": compression_analysis,
                    "download_available": True
                }), 200

        # Return the compressed file
        response = send_file(output_path, as_attachment=True, download_name="compressed.pdf")
        # The file is read while the response is sent, so the directory
        # can only be removed once the response has been closed.
        response.call_on_close(lambda: _remove_temp_dirs(temp_dirs))
        cleanup_deferred = True
        return response
    except Exception as e:
        print(f"Error in compression endpoint: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
    finally:
        if not cleanup_deferred:
            _remove_temp_dirs(temp_dirs)
if __name__ == "__main__":
    # Runs only when the file is executed as a script: start Flask's
    # built-in server on all network interfaces ("0.0.0.0"), port 7860.
    app.run(port=7860, host="0.0.0.0")