Update app.py
app.py CHANGED
@@ -8,12 +8,37 @@ import clip
 import pickle
 import requests
 import torch
+import os
+from huggingface_hub import hf_hub_download
+from torch import nn
+import torch.nn.functional as nnf
+import sys
+from typing import Tuple, List, Union, Optional
+from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
+
+
+N = type(None)
+V = np.array
+ARRAY = np.ndarray
+ARRAYS = Union[Tuple[ARRAY, ...], List[ARRAY]]
+VS = Union[Tuple[V, ...], List[V]]
+VN = Union[V, N]
+VNS = Union[VS, N]
+T = torch.Tensor
+TS = Union[Tuple[T, ...], List[T]]
+TN = Optional[T]
+TNS = Union[Tuple[TN, ...], List[TN]]
+TSN = Optional[TS]
+TA = Union[T, ARRAY]
+
+D = torch.device
+CPU = torch.device('cpu')
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # # Load the pre-trained model and processor
-
-
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 #orig_clip_model, orig_clip_processor = clip.load("ViT-B/32", device=device, jit=False)
 
@@ -21,12 +46,19 @@ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 # Load the Unsplash dataset
 dataset = load_dataset("jamescalam/unsplash-25k-photos", split="train") # all 25K images are in train split
 
+# Load gpt and modifed weights for captions
+gpt = GPT2LMHeadModel.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+conceptual_weight = hf_hub_download(repo_id="akhaliq/CLIP-prefix-captioning-conceptual-weights", filename="conceptual_weights.pt")
+coco_weight = hf_hub_download(repo_id="akhaliq/CLIP-prefix-captioning-COCO-weights", filename="coco_weights.pt")
+
+
 height = 256 # height for resizing images
 
 def predict(image, labels):
     with torch.no_grad():
-        inputs =
-        outputs =
+        inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True)
+        outputs = clip_model(**inputs)
         logits_per_image = outputs.logits_per_image # this is the image-text similarity score
         probs = logits_per_image.softmax(dim=1).cpu().numpy() # we can take the softmax to get the label probabilities
         return {k: float(v) for k, v in zip(labels, probs[0])}
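The hunk above switches predict() from the remote calls to the locally loaded HF CLIP model: the labels are turned into "a photo of ..." prompts, encoded together with the image, and the image-text logits are softmaxed into per-label probabilities. For reference, a minimal self-contained sketch of that same flow; the image file and label list below are placeholders, not values taken from the app.

from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("example.jpg")            # placeholder image
labels = ["a cat", "a dog", "a mountain"]    # placeholder labels

inputs = clip_processor(text=[f"a photo of {c}" for c in labels], images=image, return_tensors="pt", padding=True)
outputs = clip_model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)[0]   # image-text similarity scores -> probabilities
print({label: float(p) for label, p in zip(labels, probs)})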
@@ -50,11 +82,103 @@ def rand_image():
 def set_labels(text):
     return text.split(",")
 
-get_caption = gr.load("ryaalbr/caption", src="spaces", hf_token=environ["api_key"])
-def generate_text(image, model_name):
-
+# get_caption = gr.load("ryaalbr/caption", src="spaces", hf_token=environ["api_key"])
+# def generate_text(image, model_name):
+#     return get_caption(image, model_name)
+
+
+class MLP(nn.Module):
+
+    def forward(self, x: T) -> T:
+        return self.model(x)
+
+    def __init__(self, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
+        super(MLP, self).__init__()
+        layers = []
+        for i in range(len(sizes) -1):
+            layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=bias))
+            if i < len(sizes) - 2:
+                layers.append(act())
+        self.model = nn.Sequential(*layers)
+
+
+class ClipCaptionModel(nn.Module):
+
+    def get_dummy_token(self, batch_size: int, device: D) -> T:
+        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)
+
+    def forward(self, tokens: T, prefix: T, mask: Optional[T] = None, labels: Optional[T] = None):
+        embedding_text = self.gpt.transformer.wte(tokens)
+        prefix_projections = self.clip_project(prefix).view(-1, self.prefix_length, self.gpt_embedding_size)
+        embedding_cat = torch.cat((prefix_projections, embedding_text), dim=1)
+        if labels is not None:
+            dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
+            labels = torch.cat((dummy_token, tokens), dim=1)
+        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
+        return out
+
+    def __init__(self, prefix_length: int, prefix_size: int = 512):
+        super(ClipCaptionModel, self).__init__()
+        self.prefix_length = prefix_length
+        self.gpt = gpt
+        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
+        if prefix_length > 10:  # not enough memory
+            self.clip_project = nn.Linear(prefix_size, self.gpt_embedding_size * prefix_length)
+        else:
+            self.clip_project = MLP((prefix_size, (self.gpt_embedding_size * prefix_length) // 2, self.gpt_embedding_size * prefix_length))
+
+#clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+
+
+def get_caption(img,model_name):
+    prefix_length = 10
+
+    model = ClipCaptionModel(prefix_length)
+
+    if model_name == "COCO":
+        model_path = coco_weight
+    else:
+        model_path = conceptual_weight
+    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+    model = model.eval()
+    model = model.to(device)
+
+    input = clip_processor(images=img, return_tensors="pt").to(device)
+    with torch.no_grad():
+        prefix = clip_model.get_image_features(**input)
+
+    # image = preprocess(img).unsqueeze(0).to(device)
+    # with torch.no_grad():
+    #     prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
+    prefix_embed = model.clip_project(prefix).reshape(1, prefix_length, -1)
+    output = model.gpt.generate(inputs_embeds=prefix_embed,
+                                num_beams=1,
+                                do_sample=False,
+                                num_return_sequences=1,
+                                no_repeat_ngram_size=1,
+                                max_new_tokens = 67,
+                                pad_token_id = tokenizer.eos_token_id,
+                                eos_token_id = tokenizer.encode('.')[0],
+                                renormalize_logits = True)
+    generated_text_prefix = tokenizer.decode(output[0], skip_special_tokens=True)
+    return generated_text_prefix[:-1] if generated_text_prefix[-1] == "." else generated_text_prefix #remove period at end if present
 
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# get_images = gr.load("ryaalbr/ImageSearch", src="spaces", hf_token=environ["api_key"])
 # def search_images(text):
 #     return get_images(text, api_name="images")
 
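This hunk replaces the remote ryaalbr/caption Space call with local CLIP-prefix captioning (the ClipCap approach, using the akhaliq/CLIP-prefix-captioning weights): clip_project maps one CLIP image embedding to prefix_length GPT-2 token embeddings, and GPT-2 generates text conditioned on that prefix via inputs_embeds, stopping at the first period because eos_token_id is the token for ".". A rough usage sketch, assuming the module above has already been executed (models and weights loaded); the image path is a placeholder.

from PIL import Image

img = Image.open("example.jpg")          # placeholder image
print(get_caption(img, "COCO"))          # caption from the COCO-trained prefix weights
print(get_caption(img, "Conceptual"))    # any other value selects the Conceptual Captions weights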
@@ -68,8 +192,8 @@ def search(search_query):
     with torch.no_grad():
 
         # Encode and normalize the description using CLIP (HF CLIP)
-        inputs =
-        text_encoded =
+        inputs = clip_processor(text=search_query, images=None, return_tensors="pt", padding=True)
+        text_encoded = clip_model.get_text_features(**inputs)
 
         # # Encode and normalize the description using CLIP (original CLIP)
         # text_encoded = orig_clip_model.encode_text(clip.tokenize(search_query))
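Only the text-encoding lines of search() change here; the rest of the function is not shown in this diff. For orientation, text-to-image retrieval with such features is typically done by L2-normalizing both sides and ranking by cosine similarity. The helper below is a sketch under that assumption; photo_features (precomputed image embeddings) and the top-k value are illustrative, not code from the app.

import torch

def rank_photos(text_encoded: torch.Tensor, photo_features: torch.Tensor, k: int = 5):
    # normalize the text query and the precomputed image features, then rank by cosine similarity
    text_encoded = text_encoded / text_encoded.norm(dim=-1, keepdim=True)
    photo_features = photo_features / photo_features.norm(dim=-1, keepdim=True)
    similarities = (photo_features @ text_encoded.T).squeeze(-1)
    return similarities.topk(k).indices.tolist()   # indices of the best-matching images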
@@ -163,7 +287,7 @@ with gr.Blocks() as demo:
             caption = gr.Textbox(label='Caption', elem_classes="caption-text")
             get_btn_cap.click(fn=rand_image, outputs=im_cap)
             #im_cap.change(generate_text, inputs=im_cap, outputs=caption)
-            caption_btn.click(
+            caption_btn.click(get_caption, inputs=[im_cap, model_name], outputs=caption)
 
     with gr.Tab("Search"):
         instructions = """## Instructions:
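The click handler now calls the local get_caption directly instead of the previously loaded remote Space. For context, a sketch of how the components it references could be declared inside the Blocks layout; apart from the names that appear in the diff (im_cap, model_name, caption, caption_btn, get_btn_cap), the widget choices and layout here are assumptions and may differ from app.py.

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Caption"):
        model_name = gr.Radio(choices=["COCO", "Conceptual"], value="COCO", label="Caption weights")
        get_btn_cap = gr.Button("Get random image")
        im_cap = gr.Image(type="pil", label="Image")
        caption_btn = gr.Button("Generate caption")
        caption = gr.Textbox(label="Caption", elem_classes="caption-text")

        get_btn_cap.click(fn=rand_image, outputs=im_cap)
        caption_btn.click(get_caption, inputs=[im_cap, model_name], outputs=caption)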