sindhuhegde committed on
Commit
af9489c
·
1 Parent(s): a8e8684

Update app

Browse files
Files changed (1) hide show
  1. app.py +35 -21
app.py CHANGED
@@ -882,6 +882,16 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
882
  return video_output, "success"
883
 
884
  @spaces.GPU(duration=150)
 
 
 
 
 
 
 
 
 
 
885
  def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
886
 
887
  try:
@@ -974,26 +984,28 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
974
  model = load_checkpoint(CHECKPOINT_PATH, model)
975
  print("Successfully loaded the model")
976
 
 
 
977
  # Process in batches
978
- batch_size = 12
979
- video_emb = []
980
- audio_emb = []
981
 
982
- for i in tqdm(range(0, len(video_sequences), batch_size)):
983
- video_inp = video_sequences[i:i+batch_size, ]
984
- audio_inp = audio_sequences[i:i+batch_size, ]
985
 
986
- vid_emb = model.forward_vid(video_inp.to(device))
987
- vid_emb = torch.mean(vid_emb, axis=-1).unsqueeze(-1)
988
- aud_emb = model.forward_aud(audio_inp.to(device))
989
 
990
- video_emb.append(vid_emb.detach())
991
- audio_emb.append(aud_emb.detach())
992
 
993
- torch.cuda.empty_cache()
994
 
995
- audio_emb = torch.cat(audio_emb, dim=0)
996
- video_emb = torch.cat(video_emb, dim=0)
997
 
998
  # L2 normalize embeddings
999
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
@@ -1027,7 +1039,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
1027
  return None, f"Error: {str(e)}"
1028
 
1029
 
1030
- @spaces.GPU(duration=150)
1031
  def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1032
  try:
1033
  # Extract the video filename
@@ -1079,12 +1090,15 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1079
  return None, status
1080
 
1081
  # Pre-process and extract per-speaker tracks in each scene
1082
- print("Pre-processing the input video...")
1083
- status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
1084
- if status != 0:
1085
- msg = "Error in pre-processing the input video, please check the input video and try again..."
1086
- return None, msg
1087
-
 
 
 
1088
  # Load the tracks file saved during pre-processing
1089
  with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
1090
  tracks = pickle.load(file)
 
882
  return video_output, "success"
883
 
884
  @spaces.GPU(duration=150)
885
+ def preprocess_asd(video_path, result_folder_input):
886
+
887
+ print("Pre-processing the input video...")
888
+ status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
889
+ if status != 0:
890
+ msg = "Error in pre-processing the input video, please check the input video and try again..."
891
+ return msg
892
+
893
+ return "success"
894
+
895
  def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
896
 
897
  try:
 
984
  model = load_checkpoint(CHECKPOINT_PATH, model)
985
  print("Successfully loaded the model")
986
 
987
+ video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
988
+
989
  # Process in batches
990
+ # batch_size = 12
991
+ # video_emb = []
992
+ # audio_emb = []
993
 
994
+ # for i in tqdm(range(0, len(video_sequences), batch_size)):
995
+ # video_inp = video_sequences[i:i+batch_size, ]
996
+ # audio_inp = audio_sequences[i:i+batch_size, ]
997
 
998
+ # vid_emb = model.forward_vid(video_inp.to(device))
999
+ # vid_emb = torch.mean(vid_emb, axis=-1).unsqueeze(-1)
1000
+ # aud_emb = model.forward_aud(audio_inp.to(device))
1001
 
1002
+ # video_emb.append(vid_emb.detach())
1003
+ # audio_emb.append(aud_emb.detach())
1004
 
1005
+ # torch.cuda.empty_cache()
1006
 
1007
+ # audio_emb = torch.cat(audio_emb, dim=0)
1008
+ # video_emb = torch.cat(video_emb, dim=0)
1009
 
1010
  # L2 normalize embeddings
1011
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
 
1039
  return None, f"Error: {str(e)}"
1040
 
1041
 
 
1042
  def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
1043
  try:
1044
  # Extract the video filename
 
1090
  return None, status
1091
 
1092
  # Pre-process and extract per-speaker tracks in each scene
1093
+ # print("Pre-processing the input video...")
1094
+ # status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
1095
+ # if status != 0:
1096
+ # msg = "Error in pre-processing the input video, please check the input video and try again..."
1097
+ # return None, msg
1098
+ status = preprocess_asd(video_path, result_folder_input)
1099
+ if status != "success":
1100
+ return None, status
1101
+
1102
  # Load the tracks file saved during pre-processing
1103
  with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
1104
  tracks = pickle.load(file)