Spaces:

mac9087
/

compressor

Paused

File size: 10,876 Bytes

from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
import fitz  # PyMuPDF
import os
import tempfile
import shutil
import io

# Flask application object; the route handlers in this file are registered on it.
app = Flask(__name__)
# CORS is enabled with flask_cors defaults (no per-route or per-origin options given).
CORS(app)  # This will allow all origins

def _recompress_jpeg(doc, page, xref, quality):
    """Re-encode the JPEG stored at *xref* with a lower quality setting.

    Args:
        doc: Open PyMuPDF document that owns the image.
        page: A page of *doc* that references the image.
        xref: Cross-reference number of the image object.
        quality: JPEG quality (0-100) for the re-encoded image.

    Returns:
        True if the image was replaced by a smaller version, False if it
        was left untouched (not a JPEG, or re-encoding did not shrink it).
    """
    base_image = doc.extract_image(xref)
    if base_image["ext"] != "jpeg":
        return False

    image_bytes = base_image["image"]
    pix = fitz.Pixmap(image_bytes)
    # More than 3 components means CMYK or similar; convert to RGB first.
    # colorspace can be None for some pixmaps, so guard the access.
    if pix.colorspace is not None and pix.colorspace.n > 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)

    new_bytes = pix.tobytes(output="jpeg", jpg_quality=quality)
    if len(new_bytes) >= len(image_bytes):
        return False

    # Documented replacement API; it swaps the image object behind xref,
    # so every page that shows this image picks up the new data.
    # NOTE(review): requires a PyMuPDF release that provides
    # Page.replace_image - confirm against the deployed version.
    page.replace_image(xref, stream=new_bytes)
    return True


def compress_pdf(input_path, output_path, quality=70, compression_level=4):
    """
    Compresses a PDF using PyMuPDF: JPEG images are re-encoded at a lower
    quality, then the document is saved with garbage collection and
    deflate compression.

    Args:
        input_path: Path to input PDF
        output_path: Path to save compressed PDF
        quality: Image quality (0-100), lower means more compression
        compression_level: Value passed as ``garbage`` to ``Document.save``
            (4 is the most thorough setting)

    Returns:
        True if compression was successful, False otherwise. Failures are
        printed and never raised to the caller.
    """
    doc = None
    try:
        doc = fitz.open(input_path)

        # An image shared by several pages has one xref; process it once so
        # it is not re-encoded (and degraded) repeatedly.
        processed_xrefs = set()
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                if xref in processed_xrefs:
                    continue
                processed_xrefs.add(xref)
                try:
                    _recompress_jpeg(doc, page, xref, quality)
                except Exception as e:
                    # One bad image must not abort the whole document.
                    print(f"Error processing image {img_index} on page {page_num}: {str(e)}")

        # linear=True is intentionally not passed: linearization does not
        # reduce file size, and recent PyMuPDF releases are documented to
        # reject the option, which made every save fail.
        doc.save(
            output_path,
            garbage=compression_level,  # 4 is max garbage collection
            deflate=True,               # Use deflate compression for streams
            clean=True,                 # Clean document structure
            pretty=False,               # Disable pretty printing for smaller size
        )
        return True
    except Exception as e:
        print(f"Error compressing PDF: {str(e)}")
        return False
    finally:
        # Release the file handle on both the success and the failure path.
        if doc is not None:
            doc.close()

def get_progressive_compression_settings(original_size_kb, target_kb):
    """
    Build the ladder of compression attempts for a given size reduction.

    The starting JPEG quality depends on how small the target is relative
    to the original:
      * target below 25% of original -> start at 40 (ends at 10, heavy loss)
      * target below 50% of original -> start at 60
      * otherwise                    -> start at 80

    Returns a list of four (quality, compression_level) tuples, quality
    dropping by 10 each step and compression_level always 4.
    """
    garbage_level = 4

    if target_kb < original_size_kb * 0.25:
        first_quality = 40
    elif target_kb < original_size_kb * 0.5:
        first_quality = 60
    else:
        first_quality = 80

    return [(first_quality - drop, garbage_level) for drop in (0, 10, 20, 30)]

@app.route("/")
def home():
    """Root endpoint: reply with a fixed plain-text status line."""
    status_line = "PDF Compressor API is running"
    return status_line
    
def _parse_target_size_kb(size_text):
    """Convert a target-size string into kilobytes.

    Accepts a bare number (interpreted as KB) or a number followed by a
    case-insensitive "kb" or "mb" suffix. Surrounding whitespace is ignored.

    Raises:
        ValueError: if the text is not a number, or the size is not a
            positive finite value.
    """
    text = size_text.strip().lower()
    if text.endswith("mb"):
        size_kb = float(text[:-2]) * 1024
    elif text.endswith("kb"):
        size_kb = float(text[:-2])
    else:
        size_kb = float(text)

    # The chained comparison is False for NaN, infinity, zero and negatives.
    if not 0 < size_kb < float("inf"):
        raise ValueError(f"target size out of range: {size_text!r}")
    return size_kb


@app.route("/compress", methods=["POST"])
def compress():
    """Compress an uploaded PDF towards a requested target size.

    Expects a multipart POST with:
        pdf: the PDF file to compress
        target_size: desired size, e.g. "500", "500kb" or "1.5MB"

    Returns:
        * 400 JSON error if the file or a valid target size is missing.
        * 200 JSON warning if the target could not be reached and the
          request's Accept header contains "application/json" or "*/*".
        * Otherwise the best file produced, as the attachment
          "compressed.pdf" (the original is sent when nothing smaller
          could be produced).
        * 500 JSON error on any unexpected failure.
    """
    temp_dir = None
    # Set once the response has taken over deleting temp_dir (it still needs
    # the file while streaming); until then the finally clause removes it.
    cleanup_deferred = False

    try:
        if 'pdf' not in request.files:
            return jsonify({"error": "No PDF file uploaded."}), 400

        target_size_str = request.form.get("target_size")
        if not target_size_str:
            return jsonify({"error": "Target size not specified."}), 400

        try:
            target_kb = _parse_target_size_kb(target_size_str)
        except ValueError:
            return jsonify({"error": "Invalid target size format."}), 400

        temp_dir = tempfile.mkdtemp()
        input_path = os.path.join(temp_dir, "input.pdf")
        output_path = os.path.join(temp_dir, "compressed.pdf")

        request.files['pdf'].save(input_path)
        original_size_kb = os.path.getsize(input_path) / 1024

        print(f"Original size: {original_size_kb:.2f} KB, Target: {target_kb:.2f} KB")

        if original_size_kb <= target_kb:
            # Nothing to do: the upload already fits.
            shutil.copy(input_path, output_path)
            print("Original file already meets target size")
            final_size_kb = original_size_kb
        else:
            # Start from the original so a result is only kept when it is
            # strictly smaller than what was uploaded.
            best_size_kb = original_size_kb
            best_output_path = input_path

            compression_settings = get_progressive_compression_settings(original_size_kb, target_kb)
            for quality, compression_level in compression_settings:
                temp_output = os.path.join(temp_dir, f"temp_q{quality}_c{compression_level}.pdf")
                print(f"Trying compression with quality={quality}, level={compression_level}")

                # A failed attempt may leave no (or a partial) output file;
                # skip it instead of measuring it.
                if not compress_pdf(input_path, temp_output, quality, compression_level):
                    print(f"Compression failed with quality={quality}, level={compression_level}")
                    continue

                current_size_kb = os.path.getsize(temp_output) / 1024
                print(f"Result: {current_size_kb:.2f} KB")

                if current_size_kb < best_size_kb:
                    best_size_kb = current_size_kb
                    best_output_path = temp_output

                if current_size_kb <= target_kb:
                    print(f"Target reached with quality={quality}, level={compression_level}")
                    break

            if best_output_path == input_path:
                print("Compression ineffective, using original file")
            shutil.copy(best_output_path, output_path)
            final_size_kb = best_size_kb

        # An empty upload has size 0; avoid dividing by it.
        if original_size_kb:
            compression_ratio = 100 * (1 - final_size_kb / original_size_kb)
        else:
            compression_ratio = 0.0

        if final_size_kb > target_kb:
            accepts = request.headers.get('Accept', '')

            # NOTE(review): "*/*" also matches here, so clients that send a
            # wildcard Accept header receive this JSON instead of the file,
            # and this file has no route to fetch the result afterwards -
            # confirm this is what the front end expects.
            if 'application/json' in accepts or '*/*' in accepts:
                compression_analysis = ""
                if compression_ratio <= 0:
                    compression_analysis = (
                        "Your PDF may already be highly optimized or contain mostly "
                        "vector graphics/text which don't compress well. "
                        "Consider a higher target size."
                    )
                elif compression_ratio < 10:
                    compression_analysis = (
                        "Limited compression achieved. This PDF may contain pre-compressed "
                        "images or be mostly text/vector content. Consider a higher target size."
                    )

                return jsonify({
                    "warning": f"Unable to compress below {target_kb:.2f} KB. Best compressed size is {int(final_size_kb)} KB.",
                    "original_size_kb": round(original_size_kb, 2),
                    "compressed_size_kb": round(final_size_kb, 2),
                    "target_size_kb": round(target_kb, 2),
                    "compression_ratio": round(compression_ratio, 2),
                    "technical_details": compression_analysis,
                    "download_available": True
                }), 200

        response = send_file(output_path, as_attachment=True, download_name="compressed.pdf")

        # The file must outlive this function, so deletion is deferred until
        # the response has been closed.
        @response.call_on_close
        def cleanup():
            try:
                if os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir)
                    print(f"Cleaned up temporary directory: {temp_dir}")
            except Exception as e:
                print(f"Error cleaning up {temp_dir}: {str(e)}")

        cleanup_deferred = True
        return response

    except Exception as e:
        print(f"Error in compression endpoint: {str(e)}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500

    finally:
        # Covers every exit that does not stream the file: JSON warning,
        # unexpected errors, and failures while building the response.
        if temp_dir is not None and not cleanup_deferred:
            shutil.rmtree(temp_dir, ignore_errors=True)

# Start the Flask server only when this file is executed directly,
# listening on all network interfaces (0.0.0.0) at port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)