sindhuhegde committed on
Commit
449a2b2
·
1 Parent(s): e997409

Update app

Browse files
Files changed (1) hide show
  1. app.py +54 -18
app.py CHANGED
@@ -1075,7 +1075,7 @@ def extract_audio(video, result_folder):
1075
  return wav_file, "success"
1076
 
1077
  @spaces.GPU(duration=100)
1078
- def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
1079
 
1080
  '''
1081
  This function extracts the video and audio embeddings from the input frames and audio sequences
@@ -1090,30 +1090,64 @@ def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
1090
  - audio_emb (array) : Audio embedding
1091
  '''
1092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
  video_emb = []
1094
  audio_emb = []
1095
 
1096
- # model = model.to(device)
1097
-
1098
- for i in tqdm(range(0, len(video_sequences), batch_size)):
1099
  video_inp = video_sequences[i:i+batch_size, ]
1100
  vid_emb = model.forward_vid(video_inp, return_feats=False)
1101
  vid_emb = torch.mean(vid_emb, axis=-1)
 
 
1102
 
1103
- video_emb.append(vid_emb.detach().cpu())
 
1104
 
1105
  if calc_aud_emb:
1106
  audio_inp = audio_sequences[i:i+batch_size, ]
1107
  aud_emb = model.forward_aud(audio_inp)
1108
- audio_emb.append(aud_emb.detach().cpu())
 
1109
 
1110
- # torch.cuda.empty_cache()
1111
 
1112
- print("Extracted embeddings: ", len(video_emb), len(audio_emb))
 
 
1113
 
 
 
 
 
1114
 
1115
- if calc_aud_emb==True:
1116
- print("returning audio and video embeddings...")
1117
  return video_emb, audio_emb
1118
 
1119
  return video_emb
@@ -1135,11 +1169,11 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
1135
 
1136
  cos = nn.CosineSimilarity(dim=1)
1137
 
1138
- audio_embedding = audio_embedding.squeeze(2)
1139
 
1140
  scores = []
1141
  for i in range(len(all_video_embeddings)):
1142
- video_embedding = all_video_embeddings[i]
1143
 
1144
  # Compute the similarity of each speaker's video embeddings with the audio embedding
1145
  sim = cos(video_embedding, audio_embedding)
@@ -1332,15 +1366,16 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
1332
 
1333
  # Extract embeddings
1334
  print("Obtaining audio and video embeddings...")
1335
- video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
1336
  print("Obtained embeddings")
1337
- video_emb = torch.cat(video_emb, dim=0)
1338
- audio_emb = torch.cat(audio_emb, dim=0)
1339
- print("Successfully extracted GestSync embeddings")
1340
 
1341
  # L2 normalize embeddings
1342
  print("Normalizing embeddings")
 
1343
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
 
1344
  audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)
1345
 
1346
  audio_emb = torch.split(audio_emb, B, dim=0)
@@ -1351,6 +1386,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
1351
  video_emb = torch.split(video_emb, B, dim=0)
1352
  video_emb = torch.stack(video_emb, dim=2)
1353
  video_emb = video_emb.squeeze(3)
 
1354
 
1355
 
1356
  # Calculate sync offset
@@ -1484,11 +1520,11 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1484
  video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)
1485
 
1486
  if idx==0:
1487
- video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
1488
  video_emb = torch.cat(video_emb, dim=0)
1489
  audio_emb = torch.cat(audio_emb, dim=0)
1490
  else:
1491
- video_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=False)
1492
  video_emb = torch.cat(video_emb, dim=0)
1493
  all_video_embs.append(video_emb)
1494
  print("Successfully extracted GestSync embeddings")
 
1075
  return wav_file, "success"
1076
 
1077
@spaces.GPU(duration=100)
def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True):

    '''
    This function extracts the video and audio embeddings from the input frames and audio sequences.

    Args:
        - video_sequences (tensor) : Batched input video frames.
        - audio_sequences (tensor) : Batched input audio sequences; only read
                                     when calc_aud_emb is True.
        - model (object) : Model exposing forward_vid() and forward_aud().
        - asd (bool) : Active-speaker-detection mode. When False, a trailing
                       singleton dimension is added to each video embedding
                       (the sync-offset caller later torch.split/stack's on it).
        - calc_aud_emb (bool) : Whether to also compute audio embeddings.
    Returns:
        - video_emb (array) : Video embeddings, stacked along axis 0.
        - audio_emb (array) : Audio embeddings (returned only when
                              calc_aud_emb is True).
    '''

    video_emb = []
    audio_emb = []

    # Pure inference: everything is moved to CPU and detached right away,
    # so skip autograd graph bookkeeping entirely to keep GPU memory flat.
    with torch.no_grad():
        # NOTE: batch_size is a module-level constant defined elsewhere in app.py.
        for i in range(0, len(video_sequences), batch_size):
            video_inp = video_sequences[i:i+batch_size, ]
            vid_emb = model.forward_vid(video_inp, return_feats=False)
            vid_emb = torch.mean(vid_emb, axis=-1)
            if not asd:
                # Sync-offset path expects a trailing singleton frame axis.
                vid_emb = vid_emb.unsqueeze(-1)

            # Offload each batch to CPU immediately so peak GPU usage stays
            # bounded by a single batch (ZeroGPU-friendly).
            video_emb.extend(vid_emb.detach().cpu().numpy())

            if calc_aud_emb:
                audio_inp = audio_sequences[i:i+batch_size, ]
                aud_emb = model.forward_aud(audio_inp)
                audio_emb.extend(aud_emb.detach().cpu().numpy())

            # Release cached GPU blocks between batches; guarded so the
            # function also runs on CPU-only hosts.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    video_emb = np.array(video_emb)
    print("Video Embedding Shape: ", video_emb.shape)

    if calc_aud_emb:
        audio_emb = np.array(audio_emb)
        print("Audio Embedding Shape: ", audio_emb.shape)
        return video_emb, audio_emb

    return video_emb
 
1169
 
1170
  cos = nn.CosineSimilarity(dim=1)
1171
 
1172
+ audio_embedding = torch.tensor(audio_embedding).squeeze(2)
1173
 
1174
  scores = []
1175
  for i in range(len(all_video_embeddings)):
1176
+ video_embedding = torch.tensor(all_video_embeddings[i])
1177
 
1178
  # Compute the similarity of each speaker's video embeddings with the audio embedding
1179
  sim = cos(video_embedding, audio_embedding)
 
1366
 
1367
  # Extract embeddings
1368
  print("Obtaining audio and video embeddings...")
1369
+ video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)
1370
  print("Obtained embeddings")
1371
+ # video_emb = torch.cat(video_emb, dim=0)
1372
+ # audio_emb = torch.cat(audio_emb, dim=0)
 
1373
 
1374
  # L2 normalize embeddings
1375
  print("Normalizing embeddings")
1376
+ video_emb = torch.tensor(video_emb)
1377
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
1378
+ audio_emb = torch.tensor(audio_emb)
1379
  audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)
1380
 
1381
  audio_emb = torch.split(audio_emb, B, dim=0)
 
1386
  video_emb = torch.split(video_emb, B, dim=0)
1387
  video_emb = torch.stack(video_emb, dim=2)
1388
  video_emb = video_emb.squeeze(3)
1389
+ print("Successfully extracted GestSync embeddings")
1390
 
1391
 
1392
  # Calculate sync offset
 
1520
  video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)
1521
 
1522
  if idx==0:
1523
+ video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=True)
1524
  video_emb = torch.cat(video_emb, dim=0)
1525
  audio_emb = torch.cat(audio_emb, dim=0)
1526
  else:
1527
+ video_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=False)
1528
  video_emb = torch.cat(video_emb, dim=0)
1529
  all_video_embs.append(video_emb)
1530
  print("Successfully extracted GestSync embeddings")