from transformers import BlipProcessor, BlipForConditionalGeneration

# Hugging Face checkpoint used for both the processor and the model.
_CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-base"

# Lazily-populated cache: None until the first get_caption_model() call,
# afterwards a (processor, model) tuple reused by every caller.
_caption_model = None


def get_caption_model():
    """Return the cached BLIP ``(processor, model)`` pair, loading it once.

    The first call loads both objects with ``from_pretrained`` and stores
    them in the module-level ``_caption_model``; later calls return the
    same tuple without reloading.

    Returns:
        A 2-tuple ``(BlipProcessor, BlipForConditionalGeneration)``.
    """
    global _caption_model
    if _caption_model is None:
        processor = BlipProcessor.from_pretrained(_CAPTION_MODEL_ID)
        model = BlipForConditionalGeneration.from_pretrained(_CAPTION_MODEL_ID)
        _caption_model = (processor, model)
    return _caption_model


def generate_caption(image_bytes: bytes) -> str:
    """Generate a caption for an encoded image.

    Args:
        image_bytes: Raw bytes of an encoded image file, wrapped in a
            ``BytesIO`` and handed to ``Image.open``.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    # Function-scope import: no `import io` is visible near this block, and
    # a repeated import is harmless if the file already has one.
    import io

    processor, model = get_caption_model()

    # NOTE(review): `Image` is presumably `PIL.Image`, imported elsewhere in
    # this file -- confirm; it is not imported in the code visible here.
    with Image.open(io.BytesIO(image_bytes)) as source:
        # convert() returns a new image, so the opened source is not needed
        # once the block exits.
        image = source.convert('RGB')

    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption