from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

from config import Config

# Simple global caching for the captioner (loaded lazily on first use)
captioner_processor = None
captioner_model = None


def resize_image_to_1mp(image):
    """Resizes image to approx 1MP (e.g., 1024x1024) preserving aspect ratio."""
    image = image.convert("RGB")
    w, h = image.size
    target_pixels = 1024 * 1024
    aspect_ratio = w / h

    # Calculate new dimensions: w * h ~= target_pixels with w = aspect_ratio * h
    new_h = int((target_pixels / aspect_ratio) ** 0.5)
    new_w = int(new_h * aspect_ratio)

    # Ensure divisibility by 32 for efficiency
    new_w = (new_w // 32) * 32
    new_h = (new_h // 32) * 32
    if new_w == 0 or new_h == 0:
        new_w, new_h = 1024, 1024  # Fallback for extreme aspect ratios

    return image.resize((new_w, new_h), Image.LANCZOS)


def get_caption(image):
    """Generates a caption for the image if one isn't provided."""
    global captioner_processor, captioner_model
    if captioner_model is None:
        print("Loading Captioner (BLIP)...")
        captioner_processor = BlipProcessor.from_pretrained(Config.CAPTIONER_REPO)
        captioner_model = BlipForConditionalGeneration.from_pretrained(
            Config.CAPTIONER_REPO
        ).to(Config.DEVICE)

    inputs = captioner_processor(image, return_tensors="pt").to(Config.DEVICE)
    # Inference only: disable gradient tracking to save memory
    with torch.no_grad():
        out = captioner_model.generate(**inputs)
    caption = captioner_processor.decode(out[0], skip_special_tokens=True)
    return caption
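

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline. Assumes a local
    # "example.jpg" exists and that Config defines CAPTIONER_REPO
    # (e.g., a BLIP checkpoint such as "Salesforce/blip-image-captioning-base")
    # and DEVICE (e.g., "cuda" or "cpu").
    img = Image.open("example.jpg")
    img = resize_image_to_1mp(img)
    print(f"Resized to {img.size}")
    print(f"Caption: {get_caption(img)}")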