Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,20 +5,23 @@ import torch
|
|
| 5 |
import kelip
|
| 6 |
import gradio as gr
|
| 7 |
|
|
|
|
| 8 |
def load_model():
|
| 9 |
-
model, preprocess_img, tokenizer = kelip.build_model(
|
| 10 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 11 |
model = model.to(device)
|
| 12 |
model.eval()
|
| 13 |
|
| 14 |
-
model_dict = {
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
| 18 |
return model_dict
|
| 19 |
|
|
|
|
| 20 |
def classify(img, user_text):
|
| 21 |
-
preprocess_img = model_dict[
|
| 22 |
|
| 23 |
input_img = preprocess_img(img).unsqueeze(0)
|
| 24 |
|
|
@@ -27,17 +30,17 @@ def classify(img, user_text):
|
|
| 27 |
|
| 28 |
# extract image features
|
| 29 |
with torch.no_grad():
|
| 30 |
-
image_features = model_dict[
|
| 31 |
|
| 32 |
# extract text features
|
| 33 |
-
user_texts = user_text.split(
|
| 34 |
-
if user_text ==
|
| 35 |
user_texts = []
|
| 36 |
|
| 37 |
-
input_texts = model_dict[
|
| 38 |
if torch.cuda.is_available():
|
| 39 |
input_texts = input_texts.cuda()
|
| 40 |
-
text_features = model_dict[
|
| 41 |
|
| 42 |
# l2 normalize
|
| 43 |
image_features /= image_features.norm(dim=-1, keepdim=True)
|
|
@@ -50,28 +53,30 @@ def classify(img, user_text):
|
|
| 50 |
|
| 51 |
return result
|
| 52 |
|
| 53 |
-
|
|
|
|
| 54 |
global model_dict
|
| 55 |
|
| 56 |
model_dict = load_model()
|
| 57 |
|
| 58 |
-
inputs = [
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
-
outputs = [
|
| 63 |
|
| 64 |
title = "KELIP"
|
| 65 |
description = "Zero-shot classification with KELIP -- Korean and English bilingual contrastive Language-Image Pre-training model that is trained with collected 1.1 billion image-text pairs (708 million Korean and 476 million English).<br> <br><a href='https://arxiv.org/abs/2203.14463' target='_blank'>Arxiv</a> | <a href='https://github.com/navervision/KELIP' target='_blank'>Github</a>"
|
| 66 |
-
|
| 67 |
article = ""
|
| 68 |
|
| 69 |
-
iface=gr.Interface(
|
| 70 |
fn=classify,
|
| 71 |
inputs=inputs,
|
| 72 |
outputs=outputs,
|
| 73 |
title=title,
|
| 74 |
description=description,
|
| 75 |
-
article=article
|
| 76 |
)
|
| 77 |
-
iface.launch()
|
|
|
|
| 5 |
import kelip
|
| 6 |
import gradio as gr
|
| 7 |
|
| 8 |
+
|
| 9 |
def load_model():
|
| 10 |
+
model, preprocess_img, tokenizer = kelip.build_model("ViT-B/32")
|
| 11 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 12 |
model = model.to(device)
|
| 13 |
model.eval()
|
| 14 |
|
| 15 |
+
model_dict = {
|
| 16 |
+
"model": model,
|
| 17 |
+
"preprocess_img": preprocess_img,
|
| 18 |
+
"tokenizer": tokenizer,
|
| 19 |
+
}
|
| 20 |
return model_dict
|
| 21 |
|
| 22 |
+
|
| 23 |
def classify(img, user_text):
|
| 24 |
+
preprocess_img = model_dict["preprocess_img"]
|
| 25 |
|
| 26 |
input_img = preprocess_img(img).unsqueeze(0)
|
| 27 |
|
|
|
|
| 30 |
|
| 31 |
# extract image features
|
| 32 |
with torch.no_grad():
|
| 33 |
+
image_features = model_dict["model"].encode_image(input_img)
|
| 34 |
|
| 35 |
# extract text features
|
| 36 |
+
user_texts = user_text.split(",")
|
| 37 |
+
if user_text == "" or user_text.isspace():
|
| 38 |
user_texts = []
|
| 39 |
|
| 40 |
+
input_texts = model_dict["tokenizer"].encode(user_texts)
|
| 41 |
if torch.cuda.is_available():
|
| 42 |
input_texts = input_texts.cuda()
|
| 43 |
+
text_features = model_dict["model"].encode_text(input_texts)
|
| 44 |
|
| 45 |
# l2 normalize
|
| 46 |
image_features /= image_features.norm(dim=-1, keepdim=True)
|
|
|
|
| 53 |
|
| 54 |
return result
|
| 55 |
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
|
| 58 |
global model_dict
|
| 59 |
|
| 60 |
model_dict = load_model()
|
| 61 |
|
| 62 |
+
inputs = [
|
| 63 |
+
gr.inputs.Image(type="pil", label="Image"),
|
| 64 |
+
gr.inputs.Textbox(lines=5, label="Caption"),
|
| 65 |
+
]
|
| 66 |
|
| 67 |
+
outputs = ["label"]
|
| 68 |
|
| 69 |
title = "KELIP"
|
| 70 |
description = "Zero-shot classification with KELIP -- Korean and English bilingual contrastive Language-Image Pre-training model that is trained with collected 1.1 billion image-text pairs (708 million Korean and 476 million English).<br> <br><a href='https://arxiv.org/abs/2203.14463' target='_blank'>Arxiv</a> | <a href='https://github.com/navervision/KELIP' target='_blank'>Github</a>"
|
| 71 |
+
|
| 72 |
article = ""
|
| 73 |
|
| 74 |
+
iface = gr.Interface(
|
| 75 |
fn=classify,
|
| 76 |
inputs=inputs,
|
| 77 |
outputs=outputs,
|
| 78 |
title=title,
|
| 79 |
description=description,
|
| 80 |
+
article=article,
|
| 81 |
)
|
| 82 |
+
iface.launch()
|