Spaces:

sindhuhegde
/

gestsync

Running on Zero

App Files Files Community

sindhuhegde committed on Sep 1, 2024

Commit

449a2b2

1 Parent(s): e997409

Update app

Browse files

Files changed (1) hide show

app.py +54 -18

app.py CHANGED Viewed

@@ -1075,7 +1075,7 @@ def extract_audio(video, result_folder):
 	return wav_file, "success"
 @spaces.GPU(duration=100)
-def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
 	'''
 	This function extracts the video and audio embeddings from the input frames and audio sequences
@@ -1090,30 +1090,64 @@ def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
 		- audio_emb (array) : Audio embedding
 	'''
 	video_emb = []
 	audio_emb = []
-	# model = model.to(device)
-	for i in tqdm(range(0, len(video_sequences), batch_size)):
 		video_inp = video_sequences[i:i+batch_size, ]
 		vid_emb = model.forward_vid(video_inp, return_feats=False)
 		vid_emb = torch.mean(vid_emb, axis=-1)
-		video_emb.append(vid_emb.detach().cpu())
 		if calc_aud_emb:
 			audio_inp = audio_sequences[i:i+batch_size, ]
 			aud_emb = model.forward_aud(audio_inp)
-			audio_emb.append(aud_emb.detach().cpu())
-		# torch.cuda.empty_cache()
-	print("Extracted embeddings: ", len(video_emb), len(audio_emb))
-	if calc_aud_emb==True:
-		print("returning audio and video embeddings...")
 		return video_emb, audio_emb
 	return video_emb
@@ -1135,11 +1169,11 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
 	cos = nn.CosineSimilarity(dim=1)
-	audio_embedding = audio_embedding.squeeze(2)
 	scores = []
 	for i in range(len(all_video_embeddings)):
-		video_embedding = all_video_embeddings[i]
 		# Compute the similarity of each speaker's video embeddings with the audio embedding
 		sim = cos(video_embedding, audio_embedding)
@@ -1332,15 +1366,16 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 		# Extract embeddings
 		print("Obtaining audio and video embeddings...")
-		video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
 		print("Obtained embeddings")
-		video_emb = torch.cat(video_emb, dim=0)
-		audio_emb = torch.cat(audio_emb, dim=0)
-		print("Successfully extracted GestSync embeddings")
 		# L2 normalize embeddings
 		print("Normalizing embeddings")
 		video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
 		audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)
 		audio_emb = torch.split(audio_emb, B, dim=0)
@@ -1351,6 +1386,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 		video_emb = torch.split(video_emb, B, dim=0)
 		video_emb = torch.stack(video_emb, dim=2)
 		video_emb = video_emb.squeeze(3)
 		# Calculate sync offset
@@ -1484,11 +1520,11 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 					video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)
 					if idx==0:
-						video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
 						video_emb = torch.cat(video_emb, dim=0)
 						audio_emb = torch.cat(audio_emb, dim=0)
 					else:
-						video_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=False)
 						video_emb = torch.cat(video_emb, dim=0)
 					all_video_embs.append(video_emb)
 			print("Successfully extracted GestSync embeddings")

 	return wav_file, "success"
 @spaces.GPU(duration=100)
def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True):

	'''
	This function extracts the video and audio embeddings from the input frames and audio sequences

	Args:
		- video_sequences (array) : Video input, sliced along its first dimension in chunks of the module-level batch_size
		- audio_sequences (array) : Audio input, sliced with the same indices as video_sequences (only read when calc_aud_emb is True)
		- model (object) : Model object exposing forward_vid and forward_aud
		- asd (bool) : If False, a trailing singleton dimension is appended to each video embedding; if True the averaged embedding is kept as is
		- calc_aud_emb (bool) : Flag to calculate the audio embedding
	Returns:
		- video_emb (array) : Video embedding (NumPy array, one entry per input sample)
		- audio_emb (array) : Audio embedding (NumPy array; only returned when calc_aud_emb is True)

	Note: the results are NumPy arrays, not lists of tensors, so callers that need
	tensors must convert them (e.g. with torch.tensor) instead of using torch.cat.
	'''

	video_emb = []
	audio_emb = []

	# Inference only: no_grad avoids building an autograd graph for every batch
	with torch.no_grad():
		for i in range(0, len(video_sequences), batch_size):
			video_inp = video_sequences[i:i+batch_size, ]
			vid_emb = model.forward_vid(video_inp, return_feats=False)
			# Average over the last dimension of the model output
			vid_emb = torch.mean(vid_emb, axis=-1)
			if not asd:
				vid_emb = vid_emb.unsqueeze(-1)
			# extend() stores one NumPy array per sample of the batch
			video_emb.extend(vid_emb.detach().cpu().numpy())

			if calc_aud_emb:
				audio_inp = audio_sequences[i:i+batch_size, ]
				aud_emb = model.forward_aud(audio_inp)
				audio_emb.extend(aud_emb.detach().cpu().numpy())

			# Release cached GPU memory between batches
			torch.cuda.empty_cache()

	# Stack the per-sample arrays into a single array
	video_emb = np.array(video_emb)
	print("Video Embedding Shape: ", video_emb.shape)

	if calc_aud_emb:
		audio_emb = np.array(audio_emb)
		print("Audio Embedding Shape: ", audio_emb.shape)
		return video_emb, audio_emb

	return video_emb
 	cos = nn.CosineSimilarity(dim=1)
+	audio_embedding = torch.tensor(audio_embedding).squeeze(2)
 	scores = []
 	for i in range(len(all_video_embeddings)):
+		video_embedding = torch.tensor(all_video_embeddings[i])
 		# Compute the similarity of each speaker's video embeddings with the audio embedding
 		sim = cos(video_embedding, audio_embedding)
 		# Extract embeddings
 		print("Obtaining audio and video embeddings...")
+		video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)
 		print("Obtained embeddings")
+		# video_emb = torch.cat(video_emb, dim=0)
+		# audio_emb = torch.cat(audio_emb, dim=0)
 		# L2 normalize embeddings
 		print("Normalizing embeddings")
+		video_emb = torch.tensor(video_emb)
 		video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
+		audio_emb = torch.tensor(audio_emb)
 		audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)
 		audio_emb = torch.split(audio_emb, B, dim=0)
 		video_emb = torch.split(video_emb, B, dim=0)
 		video_emb = torch.stack(video_emb, dim=2)
 		video_emb = video_emb.squeeze(3)
+		print("Successfully extracted GestSync embeddings")
 		# Calculate sync offset
 					video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)
 					if idx==0:
+						video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=True)
 						video_emb = torch.cat(video_emb, dim=0)
 						audio_emb = torch.cat(audio_emb, dim=0)
 					else:
+						video_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=False)
 						video_emb = torch.cat(video_emb, dim=0)
 					all_video_embs.append(video_emb)
 			print("Successfully extracted GestSync embeddings")