# (Hugging Face Spaces status banner captured along with the source; kept as a
# comment so the file remains valid Python.)
# Spaces: Paused
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
import fitz  # PyMuPDF
import os
import tempfile
import shutil
import io

app = Flask(__name__)
CORS(app)  # Allow cross-origin requests from any origin
def compress_pdf(input_path, output_path, quality=70, compression_level=4):
    """
    Compress a PDF using PyMuPDF, recompressing embedded JPEG images.

    Args:
        input_path: Path to the input PDF.
        output_path: Path to save the compressed PDF.
        quality: JPEG quality (0-100); lower means more compression.
        compression_level: Garbage-collection level passed to doc.save (1-4).

    Returns:
        True if compression succeeded, False otherwise.
    """
    try:
        doc = fitz.open(input_path)
        try:
            # Walk every image on every page; pages without images simply
            # contribute an empty list, so no separate "has images" pre-scan
            # is needed.
            for page_num in range(doc.page_count):
                page = doc[page_num]
                for img_index, img in enumerate(page.get_images(full=True)):
                    xref = img[0]
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        # Only JPEGs are re-encoded; other formats are left alone.
                        if base_image["ext"] != "jpeg":
                            continue
                        pix = fitz.Pixmap(image_bytes)
                        # colorspace may be None (e.g. stencil masks); guard
                        # before reading .n. Convert CMYK/other >3-channel
                        # images to RGB so JPEG encoding is valid.
                        if pix.colorspace is not None and pix.colorspace.n > 3:
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        new_bytes = pix.tobytes(output="jpeg", jpg_quality=quality)
                        # Swap in the re-encoded image only if it is smaller.
                        # BUG FIX: the previous code used the private
                        # _deleteObject/_setObject pair, which does not
                        # correctly rewrite an image stream; replace_image
                        # (PyMuPDF 1.21+) is the supported API for this.
                        if len(new_bytes) < len(image_bytes):
                            page.replace_image(xref, stream=new_bytes)
                    except Exception as e:
                        print(f"Error processing image {img_index} on page {page_num}: {str(e)}")
                        # Continue with the next image
                        continue
            # Apply standard PDF compression options.
            doc.save(
                output_path,
                garbage=compression_level,  # 4 is max garbage collection
                deflate=True,               # Deflate-compress streams
                clean=True,                 # Clean document structure
                pretty=False,               # No pretty printing (smaller file)
                linear=True                 # Create a linearized PDF
            )
        finally:
            # Ensure the document handle is released even if save() raises.
            doc.close()
        return True
    except Exception as e:
        print(f"Error compressing PDF: {str(e)}")
        return False
def get_progressive_compression_settings(original_size_kb, target_kb):
    """
    Build a ladder of (quality, compression_level) attempts to try in order.

    The more aggressive the requested shrink (target relative to original),
    the lower the starting JPEG quality. Compression level is always 4.
    """
    ratio = target_kb / original_size_kb if original_size_kb else 1.0
    if ratio < 0.25:
        # Very aggressive: may noticeably affect readability at the low end.
        qualities = (40, 30, 20, 10)
    elif ratio < 0.5:
        # Aggressive compression.
        qualities = (60, 50, 40, 30)
    else:
        # Moderate compression.
        qualities = (80, 70, 60, 50)
    return [(q, 4) for q in qualities]
def home():
    """Health-check handler: report that the API is up.

    NOTE(review): no @app.route decorator is visible in this capture; route
    registration was likely lost when the source was scraped -- confirm
    against the deployed app.
    """
    return "PDF Compressor API is running"
def compress():
    """
    Compress an uploaded PDF toward a requested target size.

    Expects multipart form data:
      * 'pdf'         -- the PDF file to compress.
      * 'target_size' -- desired size, e.g. "2mb", "500kb", or a bare number
                         interpreted as KB.

    Returns the compressed PDF as an attachment, or a JSON payload (warning
    when the target could not be reached, error on failure).

    NOTE(review): no @app.route decorator is visible in this capture; route
    registration was likely lost when the source was scraped -- confirm
    against the deployed app.
    """
    # Track temporary directories so they can be removed when done.
    temp_dirs = []

    def cleanup():
        # Best-effort removal of every temp dir created for this request.
        for dir_path in temp_dirs:
            try:
                if os.path.exists(dir_path):
                    shutil.rmtree(dir_path)
                    print(f"Cleaned up temporary directory: {dir_path}")
            except Exception as e:
                print(f"Error cleaning up {dir_path}: {str(e)}")

    try:
        if 'pdf' not in request.files:
            return jsonify({"error": "No PDF file uploaded."}), 400
        target_size_str = request.form.get("target_size")
        if not target_size_str:
            return jsonify({"error": "Target size not specified."}), 400

        # Parse target size: "Xmb" -> KB, "Xkb" -> KB, bare number -> KB.
        try:
            if target_size_str.lower().endswith("mb"):
                target_kb = float(target_size_str[:-2]) * 1024
            elif target_size_str.lower().endswith("kb"):
                target_kb = float(target_size_str[:-2])
            else:
                target_kb = float(target_size_str)
        except ValueError:
            return jsonify({"error": "Invalid target size format."}), 400

        # Create a per-request temporary working directory.
        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)

        pdf_file = request.files['pdf']
        input_path = os.path.join(temp_dir, "input.pdf")
        output_path = os.path.join(temp_dir, "compressed.pdf")
        pdf_file.save(input_path)

        original_size_kb = os.path.getsize(input_path) / 1024
        print(f"Original size: {original_size_kb:.2f} KB, Target: {target_kb:.2f} KB")

        if original_size_kb <= target_kb:
            # Already small enough -- pass the file through unchanged.
            shutil.copy(input_path, output_path)
            print("Original file already meets target size")
            final_size_kb = original_size_kb
        else:
            # Try increasingly aggressive settings until the target is met
            # or the ladder is exhausted, keeping the smallest result seen.
            compression_settings = get_progressive_compression_settings(original_size_kb, target_kb)
            best_size_kb = original_size_kb
            best_output_path = input_path
            for quality, compression_level in compression_settings:
                temp_output = os.path.join(temp_dir, f"temp_q{quality}_c{compression_level}.pdf")
                print(f"Trying compression with quality={quality}, level={compression_level}")
                compress_pdf(input_path, temp_output, quality, compression_level)
                # BUG FIX: compress_pdf returns False on failure and may leave
                # no output file; skip it instead of crashing on getsize().
                if not os.path.exists(temp_output):
                    continue
                current_size_kb = os.path.getsize(temp_output) / 1024
                print(f"Result: {current_size_kb:.2f} KB")
                if current_size_kb < best_size_kb:
                    best_size_kb = current_size_kb
                    best_output_path = temp_output
                if current_size_kb <= target_kb:
                    print(f"Target reached with quality={quality}, level={compression_level}")
                    break
            shutil.copy(best_output_path, output_path)
            final_size_kb = best_size_kb
            # If compression made things worse, fall back to the original.
            if final_size_kb > original_size_kb:
                print("Compression ineffective, using original file")
                shutil.copy(input_path, output_path)
                final_size_kb = original_size_kb

        compression_ratio = 100 * (1 - final_size_kb / original_size_kb)

        # Target missed: warn JSON clients (browsers send Accept: */*, so in
        # practice this branch returns JSON for most clients).
        if final_size_kb > target_kb:
            accepts = request.headers.get('Accept', '')
            if 'application/json' in accepts or '*/*' in accepts:
                compression_analysis = ""
                if compression_ratio <= 0:
                    compression_analysis = (
                        "Your PDF may already be highly optimized or contain mostly "
                        "vector graphics/text which don't compress well. "
                        "Consider a higher target size."
                    )
                elif compression_ratio < 10:
                    compression_analysis = (
                        "Limited compression achieved. This PDF may contain pre-compressed "
                        "images or be mostly text/vector content. Consider a higher target size."
                    )
                payload = {
                    "warning": f"Unable to compress below {target_kb:.2f} KB. Best compressed size is {int(final_size_kb)} KB.",
                    "original_size_kb": round(original_size_kb, 2),
                    "compressed_size_kb": round(final_size_kb, 2),
                    "target_size_kb": round(target_kb, 2),
                    "compression_ratio": round(compression_ratio, 2),
                    "technical_details": compression_analysis,
                    "download_available": True
                }
                # BUG FIX: this path previously returned without removing the
                # temp dirs, leaking one directory per request.
                cleanup()
                return jsonify(payload), 200

        # Return the compressed file.
        response = send_file(output_path, as_attachment=True, download_name="compressed.pdf")
        # BUG FIX: cleanup() was previously defined but never registered, so
        # temp dirs were never removed; run it once the response is closed.
        response.call_on_close(cleanup)
        return response
    except Exception as e:
        # Ensure best-effort cleanup in case of errors.
        cleanup()
        print(f"Error in compression endpoint: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    app.run(host="0.0.0.0", port=7860)