from transformers import (
    AutoProcessor,
    BlipForConditionalGeneration,
)

# Checkpoint identifier shared by the captioning model and its processor, so
# the two are always loaded from the same source.
caption_id = "Salesforce/blip-image-captioning-base"

# Both objects are created once at import time and reused by every call to
# image_captioning(), rather than being reloaded per image.
caption_processor = AutoProcessor.from_pretrained(caption_id)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_id)
def image_captioning(image):
    """Generate a text caption for ``image`` using the module-level BLIP model.

    The image is passed to ``caption_processor`` together with the text
    prompt ``"a photograph of"``; the resulting tensors are fed to
    ``caption_model.generate`` and the first generated sequence is decoded
    back to text.

    Args:
        image: The image to caption, in whatever form ``caption_processor``
            accepts (presumably a PIL image -- TODO confirm against callers).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    prompt = "a photograph of"

    # Request PyTorch tensors ("pt") so the output can be unpacked straight
    # into the model's generate() call.
    model_inputs = caption_processor(image, prompt, return_tensors="pt")
    generated_ids = caption_model.generate(**model_inputs)

    # Only the first sequence of the generated batch is decoded and returned.
    first_sequence = generated_ids[0]
    return caption_processor.decode(first_sequence, skip_special_tokens=True)