Spaces:
Running on Zero
Running on Zero
Commit ·
360ddab
1
Parent(s): 6828b68
Update app
Browse files
app.py
CHANGED
|
@@ -35,7 +35,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 35 |
use_cuda = torch.cuda.is_available()
|
| 36 |
n_negative_samples = 100
|
| 37 |
|
| 38 |
-
def preprocess_video(path, result_folder, padding=20):
|
| 39 |
|
| 40 |
'''
|
| 41 |
This function preprocesses the input video to extract the audio and crop the frames using YOLO model
|
|
@@ -60,62 +60,10 @@ def preprocess_video(path, result_folder, padding=20):
|
|
| 60 |
msg = "Oops! Could not load the video. Please check the input video and try again."
|
| 61 |
return None, None, None, msg
|
| 62 |
|
| 63 |
-
all_frames = []
|
| 64 |
-
for k in range(len(vr)):
|
| 65 |
-
all_frames.append(vr[k].asnumpy())
|
| 66 |
-
all_frames = np.asarray(all_frames)
|
| 67 |
-
print("Extracted the frames for pre-processing")
|
| 68 |
-
|
| 69 |
-
# Load YOLOv9 model (pre-trained on COCO dataset)
|
| 70 |
-
yolo_model = YOLO("yolov9s.pt")
|
| 71 |
-
print("Loaded the YOLO model")
|
| 72 |
-
|
| 73 |
if frame_count < 25:
|
| 74 |
msg = "Not enough frames to process! Please give a longer video as input"
|
| 75 |
return None, None, None, msg
|
| 76 |
|
| 77 |
-
person_videos = {}
|
| 78 |
-
person_tracks = {}
|
| 79 |
-
|
| 80 |
-
print("Processing the frames...")
|
| 81 |
-
for frame_idx in tqdm(range(frame_count)):
|
| 82 |
-
|
| 83 |
-
frame = all_frames[frame_idx]
|
| 84 |
-
|
| 85 |
-
# Perform person detection
|
| 86 |
-
results = yolo_model(frame, verbose=False)
|
| 87 |
-
detections = results[0].boxes
|
| 88 |
-
|
| 89 |
-
for i, det in enumerate(detections):
|
| 90 |
-
x1, y1, x2, y2 = det.xyxy[0]
|
| 91 |
-
cls = det.cls[0]
|
| 92 |
-
if int(cls) == 0: # Class 0 is 'person' in COCO dataset
|
| 93 |
-
|
| 94 |
-
x1 = max(0, int(x1) - padding)
|
| 95 |
-
y1 = max(0, int(y1) - padding)
|
| 96 |
-
x2 = min(frame.shape[1], int(x2) + padding)
|
| 97 |
-
y2 = min(frame.shape[0], int(y2) + padding)
|
| 98 |
-
|
| 99 |
-
if i not in person_videos:
|
| 100 |
-
person_videos[i] = []
|
| 101 |
-
person_tracks[i] = []
|
| 102 |
-
|
| 103 |
-
person_videos[i].append(frame)
|
| 104 |
-
person_tracks[i].append([x1,y1,x2,y2])
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
num_persons = 0
|
| 108 |
-
for i in person_videos.keys():
|
| 109 |
-
if len(person_videos[i]) >= frame_count//2:
|
| 110 |
-
num_persons+=1
|
| 111 |
-
|
| 112 |
-
if num_persons==0:
|
| 113 |
-
msg = "No person detected in the video! Please give a video with one person as input"
|
| 114 |
-
return None, None, None, msg
|
| 115 |
-
if num_persons>1:
|
| 116 |
-
msg = "More than one person detected in the video! Please give a video with only one person as input"
|
| 117 |
-
return None, None, None, msg
|
| 118 |
-
|
| 119 |
# Extract the audio from the input video file using ffmpeg
|
| 120 |
wav_file = os.path.join(result_folder, "audio.wav")
|
| 121 |
|
|
@@ -125,50 +73,109 @@ def preprocess_video(path, result_folder, padding=20):
|
|
| 125 |
if status != 0:
|
| 126 |
msg = "Oops! Could not load the audio file. Please check the input video and try again."
|
| 127 |
return None, None, None, msg
|
| 128 |
-
|
| 129 |
print("Extracted the audio from the video")
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
|
| 153 |
-
status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
|
| 154 |
-
if status != 0:
|
| 155 |
-
msg = "Oops! Could not preprocess the video. Please check the input video and try again."
|
| 156 |
-
return None, None, None, msg
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
return None, None, None, msg
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
-
os.remove(crop_filename)
|
| 166 |
-
os.remove(no_sound_video)
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
else:
|
| 170 |
-
|
| 171 |
-
return None, None, None, msg
|
| 172 |
|
| 173 |
return wav_file, fps, video_output, "success"
|
| 174 |
|
|
@@ -649,7 +656,7 @@ class Logger:
|
|
| 649 |
return False
|
| 650 |
|
| 651 |
|
| 652 |
-
def process_video(video_path, num_avg_frames):
|
| 653 |
try:
|
| 654 |
# Extract the video filename
|
| 655 |
video_fname = os.path.basename(video_path.split(".")[0])
|
|
@@ -668,7 +675,8 @@ def process_video(video_path, num_avg_frames):
|
|
| 668 |
|
| 669 |
|
| 670 |
# Preprocess the video
|
| 671 |
-
|
|
|
|
| 672 |
if status != "success":
|
| 673 |
return status, None
|
| 674 |
print("Successfully preprocessed the video")
|
|
@@ -902,6 +910,7 @@ if __name__ == "__main__":
|
|
| 902 |
value=75,
|
| 903 |
label="Number of Average Frames",
|
| 904 |
)
|
|
|
|
| 905 |
video_input = gr.Video(label="Upload Video", height=400)
|
| 906 |
|
| 907 |
with gr.Column():
|
|
@@ -914,12 +923,12 @@ if __name__ == "__main__":
|
|
| 914 |
|
| 915 |
submit_button.click(
|
| 916 |
fn=process_video,
|
| 917 |
-
inputs=[video_input, num_avg_frames],
|
| 918 |
outputs=[result_text, output_video]
|
| 919 |
)
|
| 920 |
|
| 921 |
clear_button.click(
|
| 922 |
-
fn=lambda: (None, 75, "", None),
|
| 923 |
inputs=[],
|
| 924 |
outputs=[video_input, num_avg_frames, result_text, output_video]
|
| 925 |
)
|
|
|
|
| 35 |
use_cuda = torch.cuda.is_available()
|
| 36 |
n_negative_samples = 100
|
| 37 |
|
| 38 |
+
def preprocess_video(path, result_folder, apply_preprocess, padding=20):
|
| 39 |
|
| 40 |
'''
|
| 41 |
This function preprocesses the input video to extract the audio and crop the frames using YOLO model
|
|
|
|
| 60 |
msg = "Oops! Could not load the video. Please check the input video and try again."
|
| 61 |
return None, None, None, msg
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
if frame_count < 25:
|
| 64 |
msg = "Not enough frames to process! Please give a longer video as input"
|
| 65 |
return None, None, None, msg
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
# Extract the audio from the input video file using ffmpeg
|
| 68 |
wav_file = os.path.join(result_folder, "audio.wav")
|
| 69 |
|
|
|
|
| 73 |
if status != 0:
|
| 74 |
msg = "Oops! Could not load the audio file. Please check the input video and try again."
|
| 75 |
return None, None, None, msg
|
|
|
|
| 76 |
print("Extracted the audio from the video")
|
| 77 |
|
| 78 |
+
if apply_preprocess=="True":
|
| 79 |
+
all_frames = []
|
| 80 |
+
for k in range(len(vr)):
|
| 81 |
+
all_frames.append(vr[k].asnumpy())
|
| 82 |
+
all_frames = np.asarray(all_frames)
|
| 83 |
+
print("Extracted the frames for pre-processing")
|
| 84 |
+
|
| 85 |
+
# Load YOLOv9 model (pre-trained on COCO dataset)
|
| 86 |
+
yolo_model = YOLO("yolov9s.pt")
|
| 87 |
+
print("Loaded the YOLO model")
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
person_videos = {}
|
| 92 |
+
person_tracks = {}
|
| 93 |
+
|
| 94 |
+
print("Processing the frames...")
|
| 95 |
+
for frame_idx in tqdm(range(frame_count)):
|
| 96 |
+
|
| 97 |
+
frame = all_frames[frame_idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
# Perform person detection
|
| 100 |
+
results = yolo_model(frame, verbose=False)
|
| 101 |
+
detections = results[0].boxes
|
| 102 |
+
|
| 103 |
+
for i, det in enumerate(detections):
|
| 104 |
+
x1, y1, x2, y2 = det.xyxy[0]
|
| 105 |
+
cls = det.cls[0]
|
| 106 |
+
if int(cls) == 0: # Class 0 is 'person' in COCO dataset
|
| 107 |
+
|
| 108 |
+
x1 = max(0, int(x1) - padding)
|
| 109 |
+
y1 = max(0, int(y1) - padding)
|
| 110 |
+
x2 = min(frame.shape[1], int(x2) + padding)
|
| 111 |
+
y2 = min(frame.shape[0], int(y2) + padding)
|
| 112 |
+
|
| 113 |
+
if i not in person_videos:
|
| 114 |
+
person_videos[i] = []
|
| 115 |
+
person_tracks[i] = []
|
| 116 |
+
|
| 117 |
+
person_videos[i].append(frame)
|
| 118 |
+
person_tracks[i].append([x1,y1,x2,y2])
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
num_persons = 0
|
| 122 |
+
for i in person_videos.keys():
|
| 123 |
+
if len(person_videos[i]) >= frame_count//2:
|
| 124 |
+
num_persons+=1
|
| 125 |
+
|
| 126 |
+
if num_persons==0:
|
| 127 |
+
msg = "No person detected in the video! Please give a video with one person as input"
|
| 128 |
return None, None, None, msg
|
| 129 |
+
if num_persons>1:
|
| 130 |
+
msg = "More than one person detected in the video! Please give a video with only one person as input"
|
| 131 |
+
return None, None, None, msg
|
| 132 |
+
|
| 133 |
|
|
|
|
|
|
|
| 134 |
|
| 135 |
+
# For the person detected, crop the frame based on the bounding box
|
| 136 |
+
if len(person_videos[0]) > frame_count-10:
|
| 137 |
+
crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
|
| 138 |
+
fourcc = cv2.VideoWriter_fourcc(*'DIVX')
|
| 139 |
+
|
| 140 |
+
# Get bounding box coordinates based on person_tracks[i]
|
| 141 |
+
max_x1 = min([track[0] for track in person_tracks[0]])
|
| 142 |
+
max_y1 = min([track[1] for track in person_tracks[0]])
|
| 143 |
+
max_x2 = max([track[2] for track in person_tracks[0]])
|
| 144 |
+
max_y2 = max([track[3] for track in person_tracks[0]])
|
| 145 |
+
|
| 146 |
+
max_width = max_x2 - max_x1
|
| 147 |
+
max_height = max_y2 - max_y1
|
| 148 |
+
|
| 149 |
+
out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
|
| 150 |
+
for frame in person_videos[0]:
|
| 151 |
+
crop = frame[max_y1:max_y2, max_x1:max_x2]
|
| 152 |
+
crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
|
| 153 |
+
out.write(crop)
|
| 154 |
+
out.release()
|
| 155 |
+
|
| 156 |
+
no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
|
| 157 |
+
status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
|
| 158 |
+
if status != 0:
|
| 159 |
+
msg = "Oops! Could not preprocess the video. Please check the input video and try again."
|
| 160 |
+
return None, None, None, msg
|
| 161 |
+
|
| 162 |
+
video_output = crop_filename.split('.')[0] + '.mp4'
|
| 163 |
+
status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' %
|
| 164 |
+
(wav_file , no_sound_video, video_output), shell=True)
|
| 165 |
+
if status != 0:
|
| 166 |
+
msg = "Oops! Could not preprocess the video. Please check the input video and try again."
|
| 167 |
+
return None, None, None, msg
|
| 168 |
+
|
| 169 |
+
os.remove(crop_filename)
|
| 170 |
+
os.remove(no_sound_video)
|
| 171 |
+
|
| 172 |
+
print("Successfully saved the pre-processed video: ", video_output)
|
| 173 |
+
else:
|
| 174 |
+
msg = "Could not track the person in the full video! Please give a single-speaker video as input"
|
| 175 |
+
return None, None, None, msg
|
| 176 |
+
|
| 177 |
else:
|
| 178 |
+
video_output = path
|
|
|
|
| 179 |
|
| 180 |
return wav_file, fps, video_output, "success"
|
| 181 |
|
|
|
|
| 656 |
return False
|
| 657 |
|
| 658 |
|
| 659 |
+
def process_video(video_path, num_avg_frames, apply_preprocess):
|
| 660 |
try:
|
| 661 |
# Extract the video filename
|
| 662 |
video_fname = os.path.basename(video_path.split(".")[0])
|
|
|
|
| 675 |
|
| 676 |
|
| 677 |
# Preprocess the video
|
| 678 |
+
print("Applying preprocessing: ", apply_preprocess)
|
| 679 |
+
wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
|
| 680 |
if status != "success":
|
| 681 |
return status, None
|
| 682 |
print("Successfully preprocessed the video")
|
|
|
|
| 910 |
value=75,
|
| 911 |
label="Number of Average Frames",
|
| 912 |
)
|
| 913 |
+
apply_preprocess = gr.Checkbox(label="Apply Preprocessing", value=False)
|
| 914 |
video_input = gr.Video(label="Upload Video", height=400)
|
| 915 |
|
| 916 |
with gr.Column():
|
|
|
|
| 923 |
|
| 924 |
submit_button.click(
|
| 925 |
fn=process_video,
|
| 926 |
+
inputs=[video_input, num_avg_frames, apply_preprocess],
|
| 927 |
outputs=[result_text, output_video]
|
| 928 |
)
|
| 929 |
|
| 930 |
clear_button.click(
|
| 931 |
+
fn=lambda: (None, 75, False, "", None),
|
| 932 |
inputs=[],
|
| 933 |
outputs=[video_input, num_avg_frames, result_text, output_video]
|
| 934 |
)
|