from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

from config import Config

# Simple global caching for the captioner (loaded lazily on first use)
captioner_processor = None
captioner_model = None


def resize_image_to_1mp(image):
    """Resizes image to approx 1MP (e.g., 1024x1024) preserving aspect ratio."""
    image = image.convert("RGB")
    w, h = image.size
    target_pixels = 1024 * 1024
    aspect_ratio = w / h

    # Calculate new dimensions: w * h ~= target_pixels with w = aspect_ratio * h
    new_h = int((target_pixels / aspect_ratio) ** 0.5)
    new_w = int(new_h * aspect_ratio)

    # Ensure divisibility by 32 for efficiency
    new_w = (new_w // 32) * 32
    new_h = (new_h // 32) * 32
    if new_w == 0 or new_h == 0:
        new_w, new_h = 1024, 1024  # Fallback for extreme aspect ratios

    return image.resize((new_w, new_h), Image.LANCZOS)


def get_caption(image):
    """Generates a caption for the image if one isn't provided."""
    global captioner_processor, captioner_model
    if captioner_model is None:
        print("Loading Captioner (BLIP)...")
        captioner_processor = BlipProcessor.from_pretrained(Config.CAPTIONER_REPO)
        captioner_model = BlipForConditionalGeneration.from_pretrained(
            Config.CAPTIONER_REPO
        ).to(Config.DEVICE)

    inputs = captioner_processor(image, return_tensors="pt").to(Config.DEVICE)
    # Inference only: disable gradient tracking to save memory
    with torch.no_grad():
        out = captioner_model.generate(**inputs)
    caption = captioner_processor.decode(out[0], skip_special_tokens=True)
    return caption
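

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline. Assumes a local
    # "example.jpg" exists and that Config defines CAPTIONER_REPO
    # (e.g., a BLIP checkpoint such as "Salesforce/blip-image-captioning-base")
    # and DEVICE (e.g., "cuda" or "cpu").
    img = Image.open("example.jpg")
    img = resize_image_to_1mp(img)
    print(f"Resized to {img.size}")
    print(f"Caption: {get_caption(img)}")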