switch to open_clip
Files changed:
- __pycache__/clip_transform.cpython-39.pyc +0 -0
- app.py +80 -57
- clip_transform.py +16 -11
- debug.py +4 -0
- requirements.txt +1 -1
__pycache__/clip_transform.cpython-39.pyc
CHANGED
Binary files a/__pycache__/clip_transform.cpython-39.pyc and b/__pycache__/clip_transform.cpython-39.pyc differ
app.py
CHANGED
@@ -29,56 +29,81 @@ system_one = {
     "vision_embeddings_fps": 2,
 }
 
+
 system_one["video_detection_emotions"] = [
-    "Happiness",
-    "Sadness",
-    "Fear",
-    "Disgust",
-    "Anger",
-    "Surprise",
-    "Boredom",
-    "Interest",
-    "Excitement",
-    "Guilt",
-    "Shame",
-    "Relief",
-    "Love",
-    "Embarrassment",
-    "Pride",
-    "Envy",
-    "Jealousy",
-    "Anxiety",
-    "Hope",
-    "Despair",
-    "Frustration",
-    "Confusion",
-    "Curiosity",
-    "Contentment",
-    "Indifference",
-    "Anticipation",
-    "Gratitude",
-    "Bitterness"
+    "a happy person",
+    "the person is happy",
+    "the person's emotional state is happy",
+    "a sad person",
+    "a scared person",
+    "a disgusted person",
+    "an angry person",
+    "a suprised person",
+    "a bored person",
+    "an interested person",
+    "a guilty person",
+    "an indiffert person",
+    "a distracted person",
 ]
+
+
+# system_one["video_detection_emotions"] = [
+#     "Happiness",
+#     "Sadness",
+#     "Fear",
+#     "Disgust",
+#     "Anger",
+#     "Surprise",
+#     "Boredom",
+#     "Interest",
+#     "Excitement",
+#     "Guilt",
+#     "Shame",
+#     "Relief",
+#     "Love",
+#     "Embarrassment",
+#     "Pride",
+#     "Envy",
+#     "Jealousy",
+#     "Anxiety",
+#     "Hope",
+#     "Despair",
+#     "Frustration",
+#     "Confusion",
+#     "Curiosity",
+#     "Contentment",
+#     "Indifference",
+#     "Anticipation",
+#     "Gratitude",
+#     "Bitterness"
+# ]
 system_one["video_detection_engement"] = [
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "
-    "Engaged_Language",
-    "Short_Responses",
-    "Distraction_Signs"
+    "the person is engaged in the conversation",
+    "the person is not engaged in the conversation",
+    "the person is looking at me",
+    "the person is not looking at me",
+    "the person is talking to me",
+    "the person is not talking to me",
+    "the person is engaged",
+    "the person is talking",
+    "the person is listening",
 ]
 system_one["video_detection_present"] = [
-    "a
-    "
-    " ",
-    "
-    "
+    "the view from a webcam",
+    "the view from a webcam we see a person",
+    # "the view from a webcam. I see a person",
+    # "the view from a webcam. The person is looking at the camera",
+    # "i am a webcam",
+    # "i am a webcam and i see a person",
+    # "i am a webcam and i see a person. The person is looking at me",
+    # "a person",
+    # "a person on a Zoom call",
+    # "a person on a FaceTime call",
+    # "a person on a WebCam call",
+    # "no one",
+    # " ",
+    # "multiple people",
+    # "a group of people",
 ]
 
 system_one_audio_status = st.empty()
@@ -203,6 +228,13 @@ def get_dot_similarities(video_embedding, embeddings, embeddings_labels):
     similarity_image_label.sort(reverse=True)
     return similarity_image_label
 
+def get_top_3_similarities_as_a_string(video_embedding, embeddings, embeddings_labels):
+    similarities = get_dot_similarities(video_embedding, embeddings, embeddings_labels)
+    top_3 = ""
+    range_len = 3 if len(similarities) > 3 else len(similarities)
+    for i in range(range_len):
+        top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
+    return top_3
 
 while True:
     if webrtc_ctx.state.playing:
@@ -221,18 +253,9 @@ while True:
             current_video_embedding_timestamp = current_time
             current_video_embedding = clip_transform.image_to_embeddings(video_frames[-1].to_ndarray())
 
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
-            emotions_top_3 = ""
-            for i in range(3):
-                emotions_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
-            engagement_top_3 = ""
-            for i in range(3):
-                engagement_top_3 += f"{similarities[i][1]} ({similarities[i][0]}) "
-            similarities = get_dot_similarities(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
-            present_top_3 = ""
-            for i in range(3):
-                present_top_3 += f"'{similarities[i][1]}' ({similarities[i][0]}), "
+            emotions_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_emotions_embeddings"], system_one["video_detection_emotions"])
+            engagement_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_engement_embeddings"], system_one["video_detection_engement"])
+            present_top_3 = get_top_3_similarities_as_a_string(current_video_embedding, system_one["video_detection_present_embeddings"], system_one["video_detection_present"])
 
             # table_content = "**System 1 Video:**\n\n"
             table_content = "| System 1 Video | |\n| --- | --- |\n"
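Taken together, the app.py changes swap bare emotion words for full-sentence prompts (sentence-style prompts generally sit closer to CLIP's training captions than single words) and collapse three copy-pasted scoring loops into the shared get_top_3_similarities_as_a_string helper. The sketch below illustrates the underlying zero-shot scoring pattern end to end; it is a minimal illustration, not the app's exact code — the app precomputes the prompt embeddings elsewhere (the system_one["..._embeddings"] entries), and top_3_for_frame is a hypothetical name.

import torch
import open_clip

# Model and pretrained tag as selected in clip_transform.py in this commit.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14", pretrained="datacomp_xl_s13b_b90k")
tokenizer = open_clip.get_tokenizer("ViT-L-14")

prompts = ["a happy person", "a sad person", "a bored person"]
with torch.no_grad():
    text_features = model.encode_text(tokenizer(prompts))
    text_features /= text_features.norm(dim=-1, keepdim=True)  # L2-normalize

def top_3_for_frame(image_embedding):
    # image_embedding: an L2-normalized CLIP image vector, e.g. the value
    # CLIPTransform.image_to_embeddings returns for a video frame. With
    # normalized vectors the dot product is cosine similarity, which is
    # what get_dot_similarities sorts on.
    scores = (text_features @ torch.as_tensor(image_embedding)).tolist()
    ranked = sorted(zip(scores, prompts), reverse=True)
    return ranked[:3]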
clip_transform.py
CHANGED
@@ -3,8 +3,7 @@ import os
 import numpy as np
 import torch
 from PIL import Image
-
-# from clip_retrieval.clip_client import ClipClient, Modality
+import open_clip
 
 class CLIPTransform:
     def __init__(self):
@@ -14,15 +13,21 @@ class CLIPTransform:
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         # if self.device == "cpu" and torch.backends.mps.is_available():
         #     self.device = torch.device("mps")
-
-
-        # self._clip_model="
-        # self.
-
-        #
-        #
-
-
+
+        # # ViT-H-14
+        # self._clip_model="ViT-H-14"
+        # self._pretrained='laion2B-s32B-b79K'
+
+        # # ViT-B-32
+        # self._clip_model="ViT-B-32"
+        # self._pretrained='laion2b_s34b_b79k'
+
+        # ViT-L/14 1.71gb
+        self._clip_model="ViT-L-14"
+        self._pretrained='datacomp_xl_s13b_b90k'
+
+        self.model, _, self.preprocess = open_clip.create_model_and_transforms(self._clip_model, pretrained=self._pretrained)
+        self.tokenizer = open_clip.get_tokenizer(self._clip_model)
 
         print ("using device", self.device)
 
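The new __init__ builds the model and preprocessing pipeline, but the diff does not show image_to_embeddings, which app.py calls on every sampled frame. A plausible reconstruction following the standard open_clip recipe (ndarray to PIL, preprocess, encode_image, L2-normalize) is sketched below; the method body is an assumption, not part of the commit.

import numpy as np
import torch
from PIL import Image

class CLIPTransform:
    ...

    def image_to_embeddings(self, image_np: np.ndarray) -> np.ndarray:
        # Hypothetical reconstruction: convert the webcam frame to PIL and
        # run the open_clip preprocessing pipeline created in __init__.
        image = Image.fromarray(image_np)
        image_t = self.preprocess(image).unsqueeze(0)  # add batch dimension
        with torch.no_grad():
            # Note: __init__ as shown never moves the model to self.device,
            # so inference would run on CPU here.
            features = self.model.encode_image(image_t)
            features /= features.norm(dim=-1, keepdim=True)  # L2-normalize
        return features[0].cpu().numpy()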
debug.py
ADDED
@@ -0,0 +1,4 @@
+from clip_transform import CLIPTransform
+clip_transform = CLIPTransform()
+
+print ("Initializing CLIP templates")
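debug.py reads as a minimal smoke test: constructing CLIPTransform on its own triggers the open_clip checkpoint download and model load, so a plain "python debug.py" run reproduces any weight-download or import failure without starting the full Streamlit app.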
requirements.txt
CHANGED
@@ -13,4 +13,4 @@ watchdog
 pydub
 torch
 numpy
-
+open_clip_torch
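Note: open_clip_torch is the package's PyPI name; installing it provides the open_clip module imported in clip_transform.py.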