from PIL import Image
import io
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration


device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
class ImageCaptioning:
    """Generate natural-language captions for images with the BLIP base model.

    Loads ``Salesforce/blip-image-captioning-base`` once at construction time
    and reuses it for every ``get_caption`` call.
    """

    def __init__(self):
        # Loads (downloading on first use) the pretrained BLIP processor and
        # captioning model, moving the model to the module-level `device`.
        self.processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)
        # Inference-only usage: switch off train-mode behavior (dropout etc.).
        self.model.eval()

    def get_caption(self, image_bytes):
        """Return a caption string for the image encoded in *image_bytes*.

        Args:
            image_bytes: Raw encoded image data (e.g. JPEG/PNG bytes).

        Returns:
            str: The generated caption, with special tokens stripped.

        Raises:
            PIL.UnidentifiedImageError: If *image_bytes* is not a decodable image.
        """
        # convert("RGB"): BLIP expects 3-channel input; palette, RGBA, or
        # grayscale images would otherwise confuse (or crash) the processor.
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        inputs = self.processor(img, return_tensors="pt").to(device)
        # no_grad: skip autograd bookkeeping — faster and far less memory
        # during generation.
        with torch.no_grad():
            output_ids = self.model.generate(**inputs)
        # Single image in, single sequence out — decode the first (only) row.
        return self.processor.decode(output_ids[0], skip_special_tokens=True)