switch to open_clip
Files changed:
- __pycache__/clip_transform.cpython-39.pyc +0 -0
- app.py +80 -57
- clip_transform.py +16 -11
- debug.py +4 -0
- requirements.txt +1 -1
__pycache__/clip_transform.cpython-39.pyc
CHANGED
Binary files a/__pycache__/clip_transform.cpython-39.pyc and b/__pycache__/clip_transform.cpython-39.pyc differ
app.py
CHANGED
@@ -29,56 +29,81 @@ system_one = {
     "vision_embeddings_fps": 2,
 }
 
+
 system_one["video_detection_emotions"] = [
-    "Happiness",
-    "Sadness",
-    "Fear",
-    "Disgust",
-    "Anger",
-    "Surprise",
-    "Boredom",
-    "Interest",
-    "Excitement",
-    "Guilt",
-    "Shame",
-    "Relief",
-    "Love",
-    "Embarrassment",
-    "Pride",
-    "Envy",
-    "Jealousy",
-    "Anxiety",
-    "Hope",
-    "Despair",
-    "Frustration",
-    "Confusion",
-    "Curiosity",
-    "Contentment",
-    "Indifference",
-    "Anticipation",
-    "Gratitude",
-    "Bitterness"
+    "a happy person",
+    "the person is happy",
+    "the person's emotional state is happy",
+    "a sad person",
+    "a scared person",
+    "a disgusted person",
+    "an angry person",
+    "a suprised person",
+    "a bored person",
+    "an interested person",
+    "a guilty person",
+    "an indiffert person",
+    "a distracted person",
 ]
+
+
+# system_one["video_detection_emotions"] = [
+#     "Happiness",
+#     "Sadness",
+#     "Fear",
+#     "Disgust",
+#     "Anger",
+#     "Surprise",
+#     "Boredom",
+#     "Interest",
+#     "Excitement",
+#     "Guilt",
+#     "Shame",
+#     "Relief",
+#     "Love",
+#     "Embarrassment",
+#     "Pride",
+#     "Envy",
+#     "Jealousy",
+#     "Anxiety",
+#     "Hope",
+#     "Despair",
+#     "Frustration",
+#     "Confusion",
+#     "Curiosity",
+#     "Contentment",
+#     "Indifference",
+#     "Anticipation",
+#     "Gratitude",
+#     "Bitterness"
+# ]
 system_one["video_detection_engement"] = [
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "Engaged_Language",
-    "Short_Responses",
-    "Distraction_Signs"
+    "the person is engaged in the conversation",
+    "the person is not engaged in the conversation",
+    "the person is looking at me",
+    "the person is not looking at me",
+    "the person is talking to me",
+    "the person is not talking to me",
+    "the person is engaged",
+    "the person is talking",
+    "the person is listening",
 ]
 system_one["video_detection_present"] = [
-    "a
-    "
-    " ",
-    "
-    "
+    "the view from a webcam",
+    "the view from a webcam we see a person",
+    # "the view from a webcam. I see a person",
+    # "the view from a webcam. The person is looking at the camera",
+    # "i am a webcam",
+    # "i am a webcam and i see a person",
+    # "i am a webcam and i see a person. The person is looking at me",
+    # "a person",
+    # "a person on a Zoom call",
+    # "a person on a FaceTime call",
+    # "a person on a WebCam call",
+    # "no one",
+    # " ",
+    # "multiple people",
+    # "a group of people",
 ]
 
 system_one_audio_status = st.empty()
@@ -203,6 +228,13 @@ def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
     similarity_image_label.sort(reverse=True)
     return similarity_image_label
 
+def get_top_3_similarities_as_a_string(video_embedding, embeddings, embeddings_labels):
+    similarities = get_dot_similarities(video_embedding, embeddings, embeddings_labels)
+    top_3 = ""
+    range_len = 3 if len(similarities) > 3 else len(similarities)
+    for i in range(range_len):
+        top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+    return top_3
 
 while True:
     if webrtc_ctx.state.playing:
@@ -221,18 +253,9 @@ while True:
             current_video_embedding_timestamp = current_time
             current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
 
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
-            emotions_top_3 = ""
-            for i in range(3):
-                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
-            engagement_top_3 = ""
-            for i in range(3):
-                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
-            present_top_3 = ""
-            for i in range(3):
-                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+            emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
 
             # table_content = "**System 1 Video:**\n\n"
             table_content = "| System 1 Video | |\n| --- | --- |\n"
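Taken together, the app.py changes swap bare emotion words for full-sentence prompts (sentence-style prompts generally sit closer to CLIP's training captions than single words) and collapse three copy-pasted scoring loops into the shared get_top_3_similarities_as_a_string helper. The sketch below illustrates the underlying zero-shot scoring pattern end to end; it is a minimal illustration, not the app's exact code — the app precomputes the prompt embeddings elsewhere (the system_one["..._embeddings"] entries), and top_3_for_frame is a hypothetical name.

import torch
import open_clip

# Model and pretrained tag as selected in clip_transform.py in this commit.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14", pretrained="datacomp_xl_s13b_b90k")
tokenizer = open_clip.get_tokenizer("ViT-L-14")

prompts = ["a happy person", "a sad person", "a bored person"]
with torch.no_grad():
    text_features = model.encode_text(tokenizer(prompts))
    text_features /= text_features.norm(dim=-1, keepdim=True)  # L2-normalize

def top_3_for_frame(image_embedding):
    # image_embedding: an L2-normalized CLIP image vector, e.g. the value
    # CLIPTransform.image_to_embeddings returns for a video frame. With
    # normalized vectors the dot product is cosine similarity, which is
    # what get_dot_similarities sorts on.
    scores = (text_features @ torch.as_tensor(image_embedding)).tolist()
    ranked = sorted(zip(scores, prompts), reverse=True)
    return ranked[:3]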
clip_transform.py
CHANGED
@@ -3,8 +3,7 @@ import os
 import numpy as np
 import torch
 from PIL import Image
-
-# from clip_retrieval.clip_client import ClipClient, Modality
+import open_clip
 
 class CLIPTransform:
     def __init__(self):
@@ -14,15 +13,21 @@ class CLIPTransform:
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         # if self.device == "cpu" and torch.backends.mps.is_available():
         #     self.device = torch.device("mps")
-
-
-        # self._clip_model="
-        # self.
-
-        #
-        #
-
-
+
+        # # ViT-H-14
+        # self._clip_model="ViT-H-14"
+        # self._pretrained='laion2B-s32B-b79K'
+
+        # # ViT-B-32
+        # self._clip_model="ViT-B-32"
+        # self._pretrained='laion2b_s34b_b79k'
+
+        # ViT-L/14 1.71gb
+        self._clip_model="ViT-L-14"
+        self._pretrained='datacomp_xl_s13b_b90k'
+
+        self.model, _, self.preprocess = open_clip.create_model_and_transforms(self._clip_model, pretrained=self._pretrained)
+        self.tokenizer = open_clip.get_tokenizer(self._clip_model)
 
         print ("using device", self.device)
 
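The new __init__ builds the model and preprocessing pipeline, but the diff does not show image_to_embeddings, which app.py calls on every sampled frame. A plausible reconstruction following the standard open_clip recipe (ndarray to PIL, preprocess, encode_image, L2-normalize) is sketched below; the method body is an assumption, not part of the commit.

import numpy as np
import torch
from PIL import Image

class CLIPTransform:
    ...

    def image_to_embeddings(self, image_np: np.ndarray) -> np.ndarray:
        # Hypothetical reconstruction: convert the webcam frame to PIL and
        # run the open_clip preprocessing pipeline created in __init__.
        image = Image.fromarray(image_np)
        image_t = self.preprocess(image).unsqueeze(0)  # add batch dimension
        with torch.no_grad():
            # Note: __init__ as shown never moves the model to self.device,
            # so inference would run on CPU here.
            features = self.model.encode_image(image_t)
            features /= features.norm(dim=-1, keepdim=True)  # L2-normalize
        return features[0].cpu().numpy()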
debug.py
ADDED
@@ -0,0 +1,4 @@
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+print ("Initializing CLIP templates")
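debug.py reads as a minimal smoke test: constructing CLIPTransform on its own triggers the open_clip checkpoint download and model load, so a plain "python debug.py" run reproduces any weight-download or import failure without starting the full Streamlit app.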
requirements.txt
CHANGED
@@ -13,4 +13,4 @@ watchdog
 pydub
 torch
 numpy
-
+open_clip_torch
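Note: open_clip_torch is the package's PyPI name; installing it provides the open_clip module imported in clip_transform.py.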