aswin-raghavan committed on
Commit
c32f3ac
·
1 Parent(s): ca4b933

CLIP feature extractor

Browse files
Files changed (1) hide show
  1. app.py +15 -9
app.py CHANGED
@@ -3,26 +3,32 @@ import gradio as gr
3
  from transformers import pipeline
4
  import numpy as np
5
  from PIL import Image
 
 
 
6
 
7
-
8
- pipe = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
9
- images="dog.jpg"
10
 
11
  def shot(image, labels_text):
12
  PIL_image = Image.fromarray(np.uint8(image)).convert('RGB')
13
  labels = labels_text.split(",")
14
- res = pipe(images=PIL_image,
15
- candidate_labels=labels,
16
- hypothesis_template= "This is a photo of a {}")
17
- return {dic["label"]: dic["score"] for dic in res}
 
 
 
 
18
 
19
  iface = gr.Interface(shot,
20
  ["image", "text"],
21
- "label",
22
  examples=[["dog.jpg", "dog,cat,bird,animal"],
23
  # ["germany.jpg", "germany,belgium,colombia"],
24
  ["colombia.jpg", "germany,belgium,colombia"]],
25
  description="Add a picture and a list of labels separated by commas",
26
- title="Zero-shot Image Classification")
27
 
28
  iface.launch()
 
3
  from transformers import pipeline
4
  import numpy as np
5
  from PIL import Image
6
+ from transformers import CLIPProcessor, CLIPModel
7
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
8
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
9
 
10
+ # pipe = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
11
+ # images="dog.jpg"
 
12
 
13
  def shot(image, labels_text):
14
  PIL_image = Image.fromarray(np.uint8(image)).convert('RGB')
15
  labels = labels_text.split(",")
16
+ inputs = clip_processor(text=["a photo of a cat", "a photo of a dog"], images=PIL_image, return_tensors="pt", padding=True)
17
+ outputs = clip_model(**inputs)
18
+ print(outputs)
19
+ return outputs.image_embeds
20
+ # res = pipe(images=PIL_image,
21
+ # candidate_labels=labels,
22
+ # hypothesis_template= "This is a photo of a {}")
23
+ # return {dic["label"]: dic["score"] for dic in res}
24
 
25
  iface = gr.Interface(shot,
26
  ["image", "text"],
27
+ "text",
28
  examples=[["dog.jpg", "dog,cat,bird,animal"],
29
  # ["germany.jpg", "germany,belgium,colombia"],
30
  ["colombia.jpg", "germany,belgium,colombia"]],
31
  description="Add a picture and a list of labels separated by commas",
32
+ title="CLIP feature extractor")
33
 
34
  iface.launch()