|
|
import gradio as gr |
|
|
import torch |
|
|
import clip |
|
|
from PIL import Image |
|
|
|
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
|
|
def calculate_similarity(image, text, model_name): |
|
|
|
|
|
model, preprocess = load_model(model_name) |
|
|
|
|
|
image = preprocess(image).unsqueeze(0).to(device) |
|
|
|
|
|
|
|
|
text = clip.tokenize([text]).to(device) |
|
|
|
|
|
|
|
|
with torch.no_grad(): |
|
|
image_features = model.encode_image(image) |
|
|
text_features = model.encode_text(text) |
|
|
|
|
|
similarity = torch.cosine_similarity(image_features, text_features).cpu().numpy()[0] |
|
|
|
|
|
return similarity |
|
|
|
|
|
def load_model(model_name): |
|
|
model, preprocess = clip.load(model_name, device=device) |
|
|
return model, preprocess |
|
|
|
|
|
iface = gr.Interface( |
|
|
fn=calculate_similarity, |
|
|
inputs=[ |
|
|
gr.Image(type="pil"), |
|
|
gr.Textbox(lines=2, placeholder="A photo of a ..."), |
|
|
gr.Radio(["ViT-B/32", "ViT-B/16", "ViT-L/14", "ViT-L/14@336px"], label="モデル選択") |
|
|
], |
|
|
outputs="number", |
|
|
title="CLIPによる画像とテキストの類似度計算", |
|
|
description="類似度を計算したい画像とテキストを入力し,使用するCLIPモデルを選択してください." |
|
|
) |
|
|
|
|
|
iface.launch() |