Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
449a2b2
1
Parent(s):
e997409
Update app
Browse files
app.py
CHANGED
|
@@ -1075,7 +1075,7 @@ def extract_audio(video, result_folder):
|
|
| 1075 |
return wav_file, "success"
|
| 1076 |
|
| 1077 |
@spaces.GPU(duration=100)
|
| 1078 |
-
def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
|
| 1079 |
|
| 1080 |
'''
|
| 1081 |
This function extracts the video and audio embeddings from the input frames and audio sequences
|
|
@@ -1090,30 +1090,64 @@ def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
|
|
| 1090 |
- audio_emb (array) : Audio embedding
|
| 1091 |
'''
|
| 1092 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1093 |
video_emb = []
|
| 1094 |
audio_emb = []
|
| 1095 |
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
for i in tqdm(range(0, len(video_sequences), batch_size)):
|
| 1099 |
video_inp = video_sequences[i:i+batch_size, ]
|
| 1100 |
vid_emb = model.forward_vid(video_inp, return_feats=False)
|
| 1101 |
vid_emb = torch.mean(vid_emb, axis=-1)
|
|
|
|
|
|
|
| 1102 |
|
| 1103 |
-
video_emb.append(vid_emb.detach()
|
|
|
|
| 1104 |
|
| 1105 |
if calc_aud_emb:
|
| 1106 |
audio_inp = audio_sequences[i:i+batch_size, ]
|
| 1107 |
aud_emb = model.forward_aud(audio_inp)
|
| 1108 |
-
audio_emb.append(aud_emb.detach()
|
|
|
|
| 1109 |
|
| 1110 |
-
|
| 1111 |
|
| 1112 |
-
|
|
|
|
|
|
|
| 1113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1114 |
|
| 1115 |
-
if calc_aud_emb==True:
|
| 1116 |
-
print("returning audio and video embeddings...")
|
| 1117 |
return video_emb, audio_emb
|
| 1118 |
|
| 1119 |
return video_emb
|
|
@@ -1135,11 +1169,11 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
|
|
| 1135 |
|
| 1136 |
cos = nn.CosineSimilarity(dim=1)
|
| 1137 |
|
| 1138 |
-
audio_embedding = audio_embedding.squeeze(2)
|
| 1139 |
|
| 1140 |
scores = []
|
| 1141 |
for i in range(len(all_video_embeddings)):
|
| 1142 |
-
video_embedding = all_video_embeddings[i]
|
| 1143 |
|
| 1144 |
# Compute the similarity of each speaker's video embeddings with the audio embedding
|
| 1145 |
sim = cos(video_embedding, audio_embedding)
|
|
@@ -1332,15 +1366,16 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
|
|
| 1332 |
|
| 1333 |
# Extract embeddings
|
| 1334 |
print("Obtaining audio and video embeddings...")
|
| 1335 |
-
video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
|
| 1336 |
print("Obtained embeddings")
|
| 1337 |
-
video_emb = torch.cat(video_emb, dim=0)
|
| 1338 |
-
audio_emb = torch.cat(audio_emb, dim=0)
|
| 1339 |
-
print("Successfully extracted GestSync embeddings")
|
| 1340 |
|
| 1341 |
# L2 normalize embeddings
|
| 1342 |
print("Normalizing embeddings")
|
|
|
|
| 1343 |
video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
|
|
|
|
| 1344 |
audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)
|
| 1345 |
|
| 1346 |
audio_emb = torch.split(audio_emb, B, dim=0)
|
|
@@ -1351,6 +1386,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
|
|
| 1351 |
video_emb = torch.split(video_emb, B, dim=0)
|
| 1352 |
video_emb = torch.stack(video_emb, dim=2)
|
| 1353 |
video_emb = video_emb.squeeze(3)
|
|
|
|
| 1354 |
|
| 1355 |
|
| 1356 |
# Calculate sync offset
|
|
@@ -1484,11 +1520,11 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
|
|
| 1484 |
video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)
|
| 1485 |
|
| 1486 |
if idx==0:
|
| 1487 |
-
video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
|
| 1488 |
video_emb = torch.cat(video_emb, dim=0)
|
| 1489 |
audio_emb = torch.cat(audio_emb, dim=0)
|
| 1490 |
else:
|
| 1491 |
-
video_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=False)
|
| 1492 |
video_emb = torch.cat(video_emb, dim=0)
|
| 1493 |
all_video_embs.append(video_emb)
|
| 1494 |
print("Successfully extracted GestSync embeddings")
|
|
|
|
| 1075 |
return wav_file, "success"
|
| 1076 |
|
| 1077 |
@spaces.GPU(duration=100)
def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True):

    '''
    This function extracts the video and audio embeddings from the input frames and audio sequences

    Args:
        - video_sequences : Video input, sliced along its first dimension in chunks of `batch_size`
        - audio_sequences : Audio input, sliced with the same indices as the video input
                            (only read when calc_aud_emb is True)
        - model : Model exposing `forward_vid(inp, return_feats=False)` and `forward_aud(inp)`
        - asd (bool) : If False, a trailing singleton dimension is appended to every video
                       embedding batch; if True, the averaged embedding is kept as is
        - calc_aud_emb (bool) : Flag to calculate the audio embedding
    Returns:
        - video_emb (array) : Video embedding (NumPy array)
        - audio_emb (array) : Audio embedding (NumPy array), only returned if calc_aud_emb is True

    NOTE: the outputs are NumPy arrays, not torch tensors. Callers that need tensors
    must convert them (e.g. with torch.tensor / torch.from_numpy) before applying torch ops.
    '''

    video_emb = []
    audio_emb = []

    # Inference only: the outputs are detached right away, so there is no reason
    # to let autograd record the forward passes (saves GPU memory).
    with torch.no_grad():
        for i in range(0, len(video_sequences), batch_size):
            video_inp = video_sequences[i:i+batch_size, ]
            vid_emb = model.forward_vid(video_inp, return_feats=False)
            # Average the video features over the last dimension
            vid_emb = torch.mean(vid_emb, dim=-1)
            if not asd:
                vid_emb = vid_emb.unsqueeze(-1)

            # Move each batch to host memory immediately; extend() stores one
            # entry per sample so np.array() below stacks them along axis 0.
            video_emb.extend(vid_emb.detach().cpu().numpy())

            if calc_aud_emb:
                audio_inp = audio_sequences[i:i+batch_size, ]
                aud_emb = model.forward_aud(audio_inp)
                audio_emb.extend(aud_emb.detach().cpu().numpy())

            # Release cached GPU blocks between batches
            torch.cuda.empty_cache()

    video_emb = np.array(video_emb)
    print("Video Embedding Shape: ", video_emb.shape)

    if calc_aud_emb:
        audio_emb = np.array(audio_emb)
        print("Audio Embedding Shape: ", audio_emb.shape)
        return video_emb, audio_emb

    return video_emb
|
|
|
|
| 1169 |
|
| 1170 |
cos = nn.CosineSimilarity(dim=1)
|
| 1171 |
|
| 1172 |
+
audio_embedding = torch.tensor(audio_embedding).squeeze(2)
|
| 1173 |
|
| 1174 |
scores = []
|
| 1175 |
for i in range(len(all_video_embeddings)):
|
| 1176 |
+
video_embedding = torch.tensor(all_video_embeddings[i])
|
| 1177 |
|
| 1178 |
# Compute the similarity of each speaker's video embeddings with the audio embedding
|
| 1179 |
sim = cos(video_embedding, audio_embedding)
|
|
|
|
| 1366 |
|
| 1367 |
# Extract embeddings
|
| 1368 |
print("Obtaining audio and video embeddings...")
|
| 1369 |
+
video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)
|
| 1370 |
print("Obtained embeddings")
|
| 1371 |
+
# video_emb = torch.cat(video_emb, dim=0)
|
| 1372 |
+
# audio_emb = torch.cat(audio_emb, dim=0)
|
|
|
|
| 1373 |
|
| 1374 |
# L2 normalize embeddings
|
| 1375 |
print("Normalizing embeddings")
|
| 1376 |
+
video_emb = torch.tensor(video_emb)
|
| 1377 |
video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
|
| 1378 |
+
audio_emb = torch.tensor(audio_emb)
|
| 1379 |
audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)
|
| 1380 |
|
| 1381 |
audio_emb = torch.split(audio_emb, B, dim=0)
|
|
|
|
| 1386 |
video_emb = torch.split(video_emb, B, dim=0)
|
| 1387 |
video_emb = torch.stack(video_emb, dim=2)
|
| 1388 |
video_emb = video_emb.squeeze(3)
|
| 1389 |
+
print("Successfully extracted GestSync embeddings")
|
| 1390 |
|
| 1391 |
|
| 1392 |
# Calculate sync offset
|
|
|
|
| 1520 |
video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)
|
| 1521 |
|
| 1522 |
if idx==0:
|
| 1523 |
+
video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=True)
|
| 1524 |
video_emb = torch.cat(video_emb, dim=0)
|
| 1525 |
audio_emb = torch.cat(audio_emb, dim=0)
|
| 1526 |
else:
|
| 1527 |
+
video_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=False)
|
| 1528 |
video_emb = torch.cat(video_emb, dim=0)
|
| 1529 |
all_video_embs.append(video_emb)
|
| 1530 |
print("Successfully extracted GestSync embeddings")
|