ncoop57 committed on
Commit ·
021b099
1
Parent(s): f400687
add initial code
Browse files- app.py +87 -0
- clip.py +80 -0
- requirements.txt +6 -0
app.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch._C import device
|
| 2 |
+
import ffmpeg
|
| 3 |
+
import youtube_dl
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import requests
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
from sentence_transformers import SentenceTransformer, util, models
|
| 12 |
+
from clip import CLIPModel
|
| 13 |
+
# from sentence_transformers.models import CLIPModel
|
| 14 |
+
from PIL import Image
|
| 15 |
+
|
| 16 |
+
# Wrap the local CLIP module in a SentenceTransformer pipeline, then pin the
# whole pipeline to float32 on the CPU.
clip = CLIPModel()
model = SentenceTransformer(modules=[clip])
model = model.to(dtype=torch.float32, device=torch.device('cpu'))
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_embedding(query, video):
    """Embed a text query and a sequence of video frames with the shared model.

    Args:
        query: Text passed straight to ``model.encode``.
        video: Iterable of frame arrays; each one is converted with
            ``Image.fromarray`` before encoding.

    Returns:
        A ``(text_emb, img_embs)`` tuple holding the results of
        ``model.encode`` for the query and for the list of frames.
    """
    text_emb = model.encode(query, device='cpu')

    # Encode every frame as a PIL image in a single batch call.
    images = [Image.fromarray(img) for img in video]
    img_embs = model.encode(images, device='cpu')

    return text_emb, img_embs
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# # Encode an image:
|
| 33 |
+
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 34 |
+
# img = Image.fromarray(np.array(Image.open(requests.get(url, stream=True).raw))).convert('RGB')
|
| 35 |
+
# img_emb = model.encode([img, img], device='cpu')
|
| 36 |
+
|
| 37 |
+
# # Encode text descriptions
|
| 38 |
+
# text_emb = model.encode(['Two dogs in the snow', 'Two cats laying on a sofa',
|
| 39 |
+
# 'A picture of London at night'], device='cpu')
|
| 40 |
+
|
| 41 |
+
# # Compute cosine similarities
|
| 42 |
+
# cos_scores = util.cos_sim(img_emb, text_emb)
|
| 43 |
+
# print(cos_scores)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def my_hook(d):
    """Progress hook: once a download finishes, score its frames against a query.

    When ``d['status']`` is ``'finished'`` the file at ``d['filename']`` is
    probed with ffmpeg, decoded to raw RGB frames, sub-sampled to every 10th
    frame, and compared with the text "two white puppies" using cosine
    similarity. The scores are printed; nothing is returned.

    Args:
        d: Status dictionary; this function reads the ``'status'`` and
            ``'filename'`` keys.
    """
    if d['status'] != 'finished':
        return

    print(d)
    print('Done downloading, now extracting frames ...')
    filename = d["filename"]
    probe = ffmpeg.probe(filename)
    video_stream = next(
        (stream for stream in probe['streams'] if stream['codec_type'] == 'video'),
        None,
    )
    if video_stream is None:
        # Without this guard the width/height lookup below would fail with an
        # opaque "'NoneType' object is not subscriptable" TypeError.
        print('No video stream found in', filename)
        return

    width = int(video_stream['width'])
    height = int(video_stream['height'])

    # Decode the whole file to raw 24-bit RGB on stdout.
    # NOTE(review): this holds every decoded frame in memory at once.
    out, _ = (
        ffmpeg
        .input(filename)
        .output('pipe:', format='rawvideo', pix_fmt='rgb24')
        .run(capture_stdout=True)
    )
    # View the byte buffer as (frames, height, width, 3) and keep every
    # 10th frame.
    video = (
        np
        .frombuffer(out, np.uint8)
        .reshape([-1, height, width, 3])
    )[::10]

    print(video.shape)
    txt_embd, img_embds = get_embedding("two white puppies", video)
    cos_scores = util.cos_sim(txt_embd, img_embds)
    print(cos_scores)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Download the video as mp4; my_hook receives the progress callbacks and
# handles the finished file.
ydl_opts = dict(format="mp4", progress_hooks=[my_hook])
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://youtu.be/I3AaW9ZevIU'])
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# # out, _ = (
|
| 78 |
+
# # ffmpeg
|
| 79 |
+
# # .input('in.mp4')
|
| 80 |
+
# # .output('pipe:', format='rawvideo', pix_fmt='rgb24')
|
| 81 |
+
# # .run(capture_stdout=True)
|
| 82 |
+
# # )
|
| 83 |
+
# # video = (
|
| 84 |
+
# # np
|
| 85 |
+
# # .frombuffer(out, np.uint8)
|
| 86 |
+
# # .reshape([-1, height, width, 3])
|
| 87 |
+
# )
|
clip.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import nn
|
| 2 |
+
import transformers
|
| 3 |
+
import torch
|
| 4 |
+
from PIL import Image
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CLIPModel(nn.Module):
    """Module wrapping a Hugging Face CLIP model for mixed image/text input.

    ``tokenize`` turns a list of PIL images and other items (treated as
    text) into processor inputs, and ``forward`` embeds them, writing one
    embedding per input, in the original input order, under
    ``'sentence_embedding'``.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name=None):
        """Load the CLIP weights and the matching processor.

        Args:
            model_name: Name or path given to
                ``transformers.CLIPModel.from_pretrained``.
            processor_name: Name or path given to
                ``transformers.CLIPProcessor.from_pretrained``; falls back to
                ``model_name`` when ``None``.
        """
        super().__init__()

        if processor_name is None:
            processor_name = model_name

        self.model = transformers.CLIPModel.from_pretrained(model_name)
        self.processor = transformers.CLIPProcessor.from_pretrained(processor_name)

    def __repr__(self):
        return "CLIPModel()"

    def forward(self, features):
        """Embed the tokenized inputs and store them in ``features``.

        Args:
            features: Mapping produced by ``tokenize``. ``'pixel_values'`` and
                ``'input_ids'`` are optional; ``'image_text_info'`` is
                required and lists 0 for each image and any other value
                (``tokenize`` writes 1) for each text, in input order.

        Returns:
            The same ``features`` mapping with ``'sentence_embedding'`` set to
            a float tensor of the stacked embeddings.
        """
        image_embeds = []
        text_embeds = []

        if 'pixel_values' in features:
            vision_outputs = self.model.vision_model(pixel_values=features['pixel_values'])
            # Element 1 of the encoder output is projected
            # (presumably the pooled output -- confirm against transformers).
            image_embeds = self.model.visual_projection(vision_outputs[1])

        if 'input_ids' in features:
            text_outputs = self.model.text_model(
                input_ids=features.get('input_ids'),
                attention_mask=features.get('attention_mask', None),
                position_ids=features.get('position_ids', None),
                output_attentions=features.get('output_attentions', None),
                output_hidden_states=features.get('output_hidden_states', None),
            )
            text_embeds = self.model.text_projection(text_outputs[1])

        # Images and texts were embedded as two separate batches; interleave
        # them back into the caller's original order.
        image_features = iter(image_embeds)
        text_features = iter(text_embeds)
        sentence_embedding = [
            next(image_features) if input_type == 0 else next(text_features)
            for input_type in features['image_text_info']
        ]

        features['sentence_embedding'] = torch.stack(sentence_embedding).float()

        return features

    def tokenize(self, texts):
        """Split mixed inputs into images and texts and run the processor.

        Args:
            texts: Iterable whose items are either ``PIL.Image.Image``
                instances or anything else, which is treated as text.

        Returns:
            The processor output with an extra ``'image_text_info'`` entry:
            a list holding 0 for each image and 1 for each text, in input
            order.
        """
        images = []
        texts_values = []
        image_text_info = []

        for data in texts:
            if isinstance(data, Image.Image):  # An Image
                images.append(data)
                image_text_info.append(0)
            else:  # A text
                texts_values.append(data)
                image_text_info.append(1)

        # The processor receives None, not an empty list, for a missing modality.
        inputs = self.processor(
            text=texts_values or None,
            images=images or None,
            return_tensors="pt",
            padding=True,
        )
        inputs['image_text_info'] = image_text_info
        return inputs

    def save(self, output_path: str):
        """Save both the model and the processor to ``output_path``."""
        self.model.save_pretrained(output_path)
        self.processor.save_pretrained(output_path)

    @staticmethod
    def load(input_path: str):
        """Build a ``CLIPModel`` whose model and processor load from ``input_path``."""
        return CLIPModel(model_name=input_path)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ffmpeg-python
|
| 2 |
+
numpy
|
| 3 |
+
pillow
|
| 4 |
+
torch
|
| 5 |
+
git+https://github.com/ncoop57/sentence-transformers@clip-image-check
|
| 6 |
+
youtube_dl
|