Spaces:

GSMK
/

zero-shot

Running

App Files Community

GSMK committed on Mar 10

Commit

00fa93d

verified ·

1 Parent(s): 58150c4

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -21

app.py CHANGED Viewed

@@ -4,16 +4,16 @@ from PIL import Image
 import open_clip
 from transformers import (
     BlipProcessor,
-    BlipForConditionalGeneration,
-    AutoProcessor,
-    AutoModel
 )
-st.set_page_config(page_title="Zero Shot Image Classification", layout="wide")
-st.title("Zero Shot Image Classification")
-st.write("BiomedCLIP + RemoteCLIP + AgriCLIP + BLIP")
 device = "cpu"
@@ -25,7 +25,7 @@ device = "cpu"
 @st.cache_resource
 def load_models():
-    # -------- BIOMEDCLIP --------
     biomed_model, _, biomed_preprocess = open_clip.create_model_and_transforms(
         "hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224"
     )
@@ -36,7 +36,8 @@ def load_models():
     biomed_model = biomed_model.to(device).eval()
-    # -------- REMOTECLIP --------
     remote_model, _, remote_preprocess = open_clip.create_model_and_transforms(
         "ViT-B-32",
         pretrained="laion2b_s34b_b79k"
@@ -45,11 +46,18 @@ def load_models():
     remote_tokenizer = open_clip.get_tokenizer("ViT-B-32")
     remote_model = remote_model.to(device).eval()
-    # -------- AGRICLIP --------
-    agri_processor = AutoProcessor.from_pretrained("hmellor/agriclip")
-    agri_model = AutoModel.from_pretrained("hmellor/agriclip").to(device).eval()
-    # -------- BLIP --------
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-base"
     )
@@ -58,6 +66,7 @@ def load_models():
         "Salesforce/blip-image-captioning-base"
     ).to(device).eval()
     return (
         biomed_model,
         biomed_preprocess,
@@ -65,8 +74,8 @@ def load_models():
         remote_model,
         remote_preprocess,
         remote_tokenizer,
-        agri_model,
-        agri_processor,
         blip_processor,
         blip_model
     )
@@ -79,15 +88,15 @@ def load_models():
     remote_model,
     remote_preprocess,
     remote_tokenizer,
-    agri_model,
-    agri_processor,
     blip_processor,
     blip_model
 ) = load_models()
 # --------------------------------------------------
-# DATASETS
 # --------------------------------------------------
 DATASETS = {
@@ -99,7 +108,7 @@ DATASETS = {
 # --------------------------------------------------
-# PROMPT TEMPLATES WITH SUBCLASSES
 # --------------------------------------------------
 templates = {
@@ -291,6 +300,7 @@ if uploaded_file:
         similarity = (image_features @ text_features.T).softmax(dim=-1)
     elif dataset_key == "satellite":
         img = remote_preprocess(image).unsqueeze(0).to(device)
@@ -302,9 +312,10 @@ if uploaded_file:
         similarity = (image_features @ text_features.T).softmax(dim=-1)
     elif dataset_key == "agriculture":
-        inputs = agri_processor(
             text=text_queries,
             images=image,
             return_tensors="pt",
@@ -312,7 +323,7 @@ if uploaded_file:
         ).to(device)
         with torch.no_grad():
-            outputs = agri_model(**inputs)
         similarity = outputs.logits_per_image.softmax(dim=1)
@@ -326,7 +337,7 @@ if uploaded_file:
     # --------------------------------------------------
-    # BLIP CAPTION
     # --------------------------------------------------
     blip_inputs = blip_processor(images=image, return_tensors="pt").to(device)

 import open_clip
 from transformers import (
+    CLIPProcessor,
+    CLIPModel,
     BlipProcessor,
+    BlipForConditionalGeneration
 )
+st.set_page_config(page_title="Multi-Domain Zero Shot AI", layout="wide")
+st.title("Multi-Domain Zero Shot Image Classification")
+st.write("BiomedCLIP + RemoteCLIP + CLIP + BLIP")
 device = "cpu"
 @st.cache_resource
 def load_models():
+    # ---------- BIOMEDCLIP ----------
     biomed_model, _, biomed_preprocess = open_clip.create_model_and_transforms(
         "hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224"
     )
     biomed_model = biomed_model.to(device).eval()
+    # ---------- REMOTECLIP ----------
     remote_model, _, remote_preprocess = open_clip.create_model_and_transforms(
         "ViT-B-32",
         pretrained="laion2b_s34b_b79k"
     remote_tokenizer = open_clip.get_tokenizer("ViT-B-32")
     remote_model = remote_model.to(device).eval()
+    # ---------- CLIP (AGRICULTURE) ----------
+    clip_model = CLIPModel.from_pretrained(
+        "openai/clip-vit-base-patch32"
+    ).to(device).eval()
+    clip_processor = CLIPProcessor.from_pretrained(
+        "openai/clip-vit-base-patch32"
+    )
+    # ---------- BLIP ----------
     blip_processor = BlipProcessor.from_pretrained(
         "Salesforce/blip-image-captioning-base"
     )
         "Salesforce/blip-image-captioning-base"
     ).to(device).eval()
     return (
         biomed_model,
         biomed_preprocess,
         remote_model,
         remote_preprocess,
         remote_tokenizer,
+        clip_model,
+        clip_processor,
         blip_processor,
         blip_model
     )
     remote_model,
     remote_preprocess,
     remote_tokenizer,
+    clip_model,
+    clip_processor,
     blip_processor,
     blip_model
 ) = load_models()
 # --------------------------------------------------
+# DATASET CLASSES
 # --------------------------------------------------
 DATASETS = {
 # --------------------------------------------------
+# PROMPT TEMPLATES (SUBCLASS PROMPTS)
 # --------------------------------------------------
 templates = {
         similarity = (image_features @ text_features.T).softmax(dim=-1)
     elif dataset_key == "satellite":
         img = remote_preprocess(image).unsqueeze(0).to(device)
         similarity = (image_features @ text_features.T).softmax(dim=-1)
     elif dataset_key == "agriculture":
+        inputs = clip_processor(
             text=text_queries,
             images=image,
             return_tensors="pt",
         ).to(device)
         with torch.no_grad():
+            outputs = clip_model(**inputs)
         similarity = outputs.logits_per_image.softmax(dim=1)
     # --------------------------------------------------
+    # BLIP IMAGE CAPTION
     # --------------------------------------------------
     blip_inputs = blip_processor(images=image, return_tensors="pt").to(device)