Spaces:

ncoop57
/

clifs

Build error

App Files Files Community

ncoop57 committed on Sep 30, 2021

Commit

021b099

1 Parent(s): f400687

add initial code

Browse files

Files changed (3) hide show

app.py +87 -0
clip.py +80 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from torch._C import device
+import ffmpeg
+import youtube_dl
+import numpy as np
+from PIL import Image
+import requests
+import torch
+from sentence_transformers import SentenceTransformer, util, models
+from clip import CLIPModel
+# from sentence_transformers.models import CLIPModel
+from PIL import Image
+clip = CLIPModel()
+model = SentenceTransformer(modules=[clip]).to(dtype=torch.float32, device=torch.device('cpu'))
+def get_embedding(query, video):
+    text_emb = model.encode(query, device='cpu')
+    # Encode an image:
+    images = []
+    for img in video:
+        images.append(Image.fromarray(img))
+    img_embs = model.encode(images, device='cpu')
+    return text_emb, img_embs
+# # Encode an image:
+# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+# img = Image.fromarray(np.array(Image.open(requests.get(url, stream=True).raw))).convert('RGB')
+# img_emb = model.encode([img, img], device='cpu')
+# # Encode text descriptions
+# text_emb = model.encode(['Two dogs in the snow', 'Two cats laying on a sofa',
+#                          'A picture of London at night'], device='cpu')
+# # Compute cosine similarities
+# cos_scores = util.cos_sim(img_emb, text_emb)
+# print(cos_scores)
+def my_hook(d):
+    if d['status'] == 'finished':
+        print(d)
+        print('Done downloading, now extracting frames ...')
+        probe = ffmpeg.probe(d["filename"])
+        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
+        width = int(video_stream['width'])
+        height = int(video_stream['height'])
+        out, _ = (
+            ffmpeg
+            .input(d["filename"])
+            .output('pipe:', format='rawvideo', pix_fmt='rgb24')
+            .run(capture_stdout=True)
+        )
+        video = (
+            np
+            .frombuffer(out, np.uint8)
+            .reshape([-1, height, width, 3])
+        )[::10]
+        print(video.shape)
+        txt_embd, img_embds = get_embedding("two white puppies", video)
+        cos_scores = util.cos_sim(txt_embd, img_embds)
+        print(cos_scores)
+ydl_opts = {"format": "mp4", "progress_hooks": [my_hook], }
+with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+    ydl.download(['https://youtu.be/I3AaW9ZevIU'])
+# # out, _ = (
+# #     ffmpeg
+# #     .input('in.mp4')
+# #     .output('pipe:', format='rawvideo', pix_fmt='rgb24')
+# #     .run(capture_stdout=True)
+# # )
+# # video = (
+# #     np
+# #     .frombuffer(out, np.uint8)
+# #     .reshape([-1, height, width, 3])
+# )

clip.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from torch import nn
+import transformers
+import torch
+from PIL import Image
+class CLIPModel(nn.Module):
+    def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name=None):
+        super(CLIPModel, self).__init__()
+        if processor_name is None:
+            processor_name = model_name
+        self.model = transformers.CLIPModel.from_pretrained(model_name)
+        self.processor = transformers.CLIPProcessor.from_pretrained(processor_name)
+    def __repr__(self):
+        return "CLIPModel()"
+    def forward(self, features):
+        image_embeds = []
+        text_embeds = []
+        if 'pixel_values' in features:
+            vision_outputs = self.model.vision_model(pixel_values=features['pixel_values'])
+            image_embeds = self.model.visual_projection(vision_outputs[1])
+        if 'input_ids' in features:
+            text_outputs = self.model.text_model(
+                input_ids=features.get('input_ids'),
+                attention_mask=features.get('attention_mask', None),
+                position_ids=features.get('position_ids', None),
+                output_attentions=features.get('output_attentions', None),
+                output_hidden_states=features.get('output_hidden_states', None),
+            )
+            text_embeds = self.model.text_projection(text_outputs[1])
+        sentence_embedding = []
+        image_features = iter(image_embeds)
+        text_features = iter(text_embeds)
+        for idx, input_type in enumerate(features['image_text_info']):
+            if input_type == 0:
+                sentence_embedding.append(next(image_features))
+            else:
+                sentence_embedding.append(next(text_features))
+        features['sentence_embedding'] = torch.stack(sentence_embedding).float()
+        return features
+    def tokenize(self, texts):
+        images = []
+        texts_values = []
+        image_text_info = []
+        for idx, data in enumerate(texts):
+            if isinstance(data, Image.Image):  # An Image
+                images.append(data)
+                image_text_info.append(0)
+            else:  # A text
+                texts_values.append(data)
+                image_text_info.append(1)
+        if len(texts_values) == 0:
+            texts_values = None
+        if len(images) == 0:
+            images = None
+        inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True)
+        inputs['image_text_info'] = image_text_info
+        return inputs
+    def save(self, output_path: str):
+        self.model.save_pretrained(output_path)
+        self.processor.save_pretrained(output_path)
+    @staticmethod
+    def load(input_path: str):
+        return CLIPModel(model_name=input_path)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+ffmpeg-python
+numpy
+pillow
+torch
+git+https://github.com/ncoop57/sentence-transformers@clip-image-check
+youtube_dl