sindhuhegde committed on
Commit
af9489c
·
1 Parent(s): a8e8684

Update app

Browse files
Files changed (1) hide show
  1. app.py +35 -21
app.py CHANGED
@@ -882,6 +882,16 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
882
  return video_output, "success"
883
 
884
  @spaces.GPU(duration=150)
 
 
 
 
 
 
 
 
 
 
885
  def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
886
 
887
  try:
@@ -974,26 +984,28 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
974
  model = load_checkpoint(CHECKPOINT_PATH, model)
975
  print("Successfully loaded the model")
976
 
 
 
977
  # Process in batches
978
- batch_size = 12
979
- video_emb = []
980
- audio_emb = []
981
 
982
- for i in tqdm(range(0, len(video_sequences), batch_size)):
983
- video_inp = video_sequences[i:i+batch_size, ]
984
- audio_inp = audio_sequences[i:i+batch_size, ]
985
 
986
- vid_emb = model.forward_vid(video_inp.to(device))
987
- vid_emb = torch.mean(vid_emb, axis=-1).unsqueeze(-1)
988
- aud_emb = model.forward_aud(audio_inp.to(device))
989
 
990
- video_emb.append(vid_emb.detach())
991
- audio_emb.append(aud_emb.detach())
992
 
993
- torch.cuda.empty_cache()
994
 
995
- audio_emb = torch.cat(audio_emb, dim=0)
996
- video_emb = torch.cat(video_emb, dim=0)
997
 
998
  # L2 normalize embeddings
999
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
@@ -1027,7 +1039,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
1027
  return None, f"Error: {str(e)}"
1028
 
1029
 
1030
- @spaces.GPU(duration=150)
1031
  def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1032
  try:
1033
  # Extract the video filename
@@ -1079,12 +1090,15 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1079
  return None, status
1080
 
1081
  # Pre-process and extract per-speaker tracks in each scene
1082
- print("Pre-processing the input video...")
1083
- status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
1084
- if status != 0:
1085
- msg = "Error in pre-processing the input video, please check the input video and try again..."
1086
- return None, msg
1087
-
 
 
 
1088
  # Load the tracks file saved during pre-processing
1089
  with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
1090
  tracks = pickle.load(file)
 
882
  return video_output, "success"
883
 
884
  @spaces.GPU(duration=150)
885
+ def preprocess_asd(video_path, result_folder_input):
886
+
887
+ print("Pre-processing the input video...")
888
+ status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
889
+ if status != 0:
890
+ msg = "Error in pre-processing the input video, please check the input video and try again..."
891
+ return msg
892
+
893
+ return "success"
894
+
895
  def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
896
 
897
  try:
 
984
  model = load_checkpoint(CHECKPOINT_PATH, model)
985
  print("Successfully loaded the model")
986
 
987
+ video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
988
+
989
  # Process in batches
990
+ # batch_size = 12
991
+ # video_emb = []
992
+ # audio_emb = []
993
 
994
+ # for i in tqdm(range(0, len(video_sequences), batch_size)):
995
+ # video_inp = video_sequences[i:i+batch_size, ]
996
+ # audio_inp = audio_sequences[i:i+batch_size, ]
997
 
998
+ # vid_emb = model.forward_vid(video_inp.to(device))
999
+ # vid_emb = torch.mean(vid_emb, axis=-1).unsqueeze(-1)
1000
+ # aud_emb = model.forward_aud(audio_inp.to(device))
1001
 
1002
+ # video_emb.append(vid_emb.detach())
1003
+ # audio_emb.append(aud_emb.detach())
1004
 
1005
+ # torch.cuda.empty_cache()
1006
 
1007
+ # audio_emb = torch.cat(audio_emb, dim=0)
1008
+ # video_emb = torch.cat(video_emb, dim=0)
1009
 
1010
  # L2 normalize embeddings
1011
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
 
1039
  return None, f"Error: {str(e)}"
1040
 
1041
 
 
1042
  def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1043
  try:
1044
  # Extract the video filename
 
1090
  return None, status
1091
 
1092
  # Pre-process and extract per-speaker tracks in each scene
1093
+ # print("Pre-processing the input video...")
1094
+ # status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
1095
+ # if status != 0:
1096
+ # msg = "Error in pre-processing the input video, please check the input video and try again..."
1097
+ # return None, msg
1098
+ status = preprocess_asd(video_path, result_folder_input)
1099
+ if status != "success":
1100
+ return None, status
1101
+
1102
  # Load the tracks file saved during pre-processing
1103
  with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
1104
  tracks = pickle.load(file)