import torch
import gradio as gr
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from models.blip import blip_decoder
# CLIP image normalization constants, as used by BLIP's preprocessing
MEAN = torch.tensor((0.48145466, 0.4578275, 0.40821073))
STD = torch.tensor((0.26862954, 0.26130258, 0.27577711))
# Channel-wise normalization of a batched image tensor with the dataset mean/std
def normalize(img_tensor):
    img_tensor = img_tensor.squeeze(0)  # drop the batch dimension: (3, H, W)
    mean = MEAN.to(img_tensor.device).view(-1, 1, 1)
    std = STD.to(img_tensor.device).view(-1, 1, 1)
    img_tensor = (img_tensor - mean) / std
    return img_tensor.unsqueeze(0)  # restore the batch dimension: (1, 3, H, W)
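# Note: the same normalization could equivalently be folded into the torchvision
# pipeline below, e.g. transforms.Normalize(mean=MEAN.tolist(), std=STD.tolist());
# it is kept as a separate step here to preserve the original structure.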
# Convert a PIL image to a resized (img_size x img_size) float tensor with a batch dimension
def preprocess_img(raw_img, img_size):
    img = raw_img.convert("RGB")
    transform = transforms.Compose([
        transforms.Resize((img_size, img_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor()
    ])
    img = transform(img).unsqueeze(0)  # (1, 3, img_size, img_size)
    return img
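# Hypothetical usage sketch, preprocessing a local image file before captioning
# (the filename is illustrative):
#   from PIL import Image
#   tensor = preprocess_img(Image.open("example.jpg"), IMG_SIZE)  # -> (1, 3, 384, 384)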
# Configuration
IMG_SIZE = 384
MODEL_URL = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BLIP base captioning model and switch it to inference mode
model = blip_decoder(pretrained=MODEL_URL, image_size=IMG_SIZE, vit='base')
model.eval()
model = model.to(DEVICE)
# Generate a caption for a user-supplied PIL image
def generate_caption(user_image):
    img = preprocess_img(user_image, IMG_SIZE).to(DEVICE)
    img_norm = normalize(img)
    with torch.no_grad():
        # Deterministic beam-search decoding
        caption = model.generate(img_norm, sample=False, num_beams=3, max_length=20, min_length=5)[0]
    return caption
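# Sketch of an alternative decoding strategy: BLIP's generate() also supports
# nucleus sampling for more varied captions (parameters assumed to follow the
# official repo's demo):
#   caption = model.generate(img_norm, sample=True, top_p=0.9, max_length=20, min_length=5)[0]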
# Gradio interface
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="BLIP Image Captioning (Base Model)",
    description="Image captioning with the BLIP base model, following the official BLIP repository: https://github.com/salesforce/BLIP"
)
if __name__ == "__main__":
    demo.launch()
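    # To expose a temporary public URL when running locally, Gradio also
    # accepts: demo.launch(share=True)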