import open_clip
import torch
from PIL import Image
# Load the pretrained CoCa captioning model (ViT-L/14 image tower,
# fine-tuned for MSCOCO captioning) plus its preprocessing transforms.
# The middle return value — presumably the train-time transform — is
# unused here; `transform` is the eval-time image preprocessing.
model, _, transform = open_clip.create_model_and_transforms(
    "coca_ViT-L-14",
    pretrained="mscoco_finetuned_laion2B-s13B-b90k",
)
def get_captions(image):
    """Generate a caption for a single image with the CoCa model.

    Args:
        image: input image (anything `transform` accepts, e.g. a
            ``PIL.Image.Image``).

    Returns:
        str: the generated caption, with the ``<start_of_text>`` and
        ``<end_of_text>`` special tokens stripped.
    """
    # Preprocess and add a batch dimension; move the tensor to the
    # model's device so this keeps working if the model is later moved
    # to GPU (a no-op while everything lives on CPU).
    device = next(model.parameters()).device
    im = transform(image).unsqueeze(0).to(device)
    # torch.cuda.amp.autocast() is deprecated; torch.autocast(device_type)
    # is the supported spelling. Enable it only when CUDA is actually in
    # use so CPU-only runs don't emit autocast warnings — on the CPU path
    # the original cuda-autocast had no effect anyway.
    on_cuda = device.type == "cuda"
    with torch.no_grad(), torch.autocast("cuda", enabled=on_cuda):
        generated = model.generate(im)
    # Decode token ids back to text and strip the special tokens that
    # bracket the caption.
    caption = open_clip.decode(generated[0])
    return caption.split("<end_of_text>")[0].replace("<start_of_text>", "")