Spaces:

sindhuhegde
/

gestsync

Running on Zero

App Files Files Community

sindhuhegde committed on Aug 23, 2024

Commit

360ddab

1 Parent(s): 6828b68

Update app

Browse files

Files changed (1) hide show

app.py +103 -94

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 use_cuda = torch.cuda.is_available()
 n_negative_samples = 100
-def preprocess_video(path, result_folder, padding=20):
 	'''
 	This function preprocesses the input video to extract the audio and crop the frames using YOLO model
@@ -60,62 +60,10 @@ def preprocess_video(path, result_folder, padding=20):
 		msg = "Oops! Could not load the video. Please check the input video and try again."
 		return None, None, None, msg
-	all_frames = []
-	for k in range(len(vr)):
-		all_frames.append(vr[k].asnumpy())
-	all_frames = np.asarray(all_frames)
-	print("Extracted the frames for pre-processing")
-	# Load YOLOv9 model (pre-trained on COCO dataset)
-	yolo_model = YOLO("yolov9s.pt")
-	print("Loaded the YOLO model")
 	if frame_count < 25:
 		msg = "Not enough frames to process! Please give a longer video as input"
 		return None, None, None, msg
-	person_videos = {}
-	person_tracks = {}
-	print("Processing the frames...")
-	for frame_idx in tqdm(range(frame_count)):
-		frame = all_frames[frame_idx]
-		# Perform person detection
-		results = yolo_model(frame, verbose=False)
-		detections = results[0].boxes
-		for i, det in enumerate(detections):
-			x1, y1, x2, y2 = det.xyxy[0]
-			cls = det.cls[0]
-			if int(cls) == 0:  # Class 0 is 'person' in COCO dataset
-				x1 = max(0, int(x1) - padding)
-				y1 = max(0, int(y1) - padding)
-				x2 = min(frame.shape[1], int(x2) + padding)
-				y2 = min(frame.shape[0], int(y2) + padding)
-				if i not in person_videos:
-					person_videos[i] = []
-					person_tracks[i] = []
-				person_videos[i].append(frame)
-				person_tracks[i].append([x1,y1,x2,y2])
-	num_persons = 0
-	for i in person_videos.keys():
-		if len(person_videos[i]) >= frame_count//2:
-			num_persons+=1
-	if num_persons==0:
-		msg = "No person detected in the video! Please give a video with one person as input"
-		return None, None, None, msg
-	if num_persons>1:
-		msg = "More than one person detected in the video! Please give a video with only one person as input"
-		return None, None, None, msg
 	# Extract the audio from the input video file using ffmpeg
 	wav_file  = os.path.join(result_folder, "audio.wav")
@@ -125,50 +73,109 @@ def preprocess_video(path, result_folder, padding=20):
 	if status != 0:
 		msg = "Oops! Could not load the audio file. Please check the input video and try again."
 		return None, None, None, msg
 	print("Extracted the audio from the video")
-	# For the person detected, crop the frame based on the bounding box
-	if len(person_videos[0]) > frame_count-10:
-		crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
-		fourcc = cv2.VideoWriter_fourcc(*'DIVX')
-		# Get bounding box coordinates based on person_tracks[i]
-		max_x1 = min([track[0] for track in person_tracks[0]])
-		max_y1 = min([track[1] for track in person_tracks[0]])
-		max_x2 = max([track[2] for track in person_tracks[0]])
-		max_y2 = max([track[3] for track in person_tracks[0]])
-		max_width = max_x2 - max_x1
-		max_height = max_y2 - max_y1
-		out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
-		for frame in person_videos[0]:
-			crop = frame[max_y1:max_y2, max_x1:max_x2]
-			crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
-			out.write(crop)
-		out.release()
-		no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
-		status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
-		if status != 0:
-			msg = "Oops! Could not preprocess the video. Please check the input video and try again."
-			return None, None, None, msg
-		video_output = crop_filename.split('.')[0] + '.mp4'
-		status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' %
-						(wav_file , no_sound_video, video_output), shell=True)
-		if status != 0:
-			msg = "Oops! Could not preprocess the video. Please check the input video and try again."
 			return None, None, None, msg
-		os.remove(crop_filename)
-		os.remove(no_sound_video)
-		print("Successfully saved the pre-processed video: ", video_output)
 	else:
-		msg = "Could not track the person in the full video! Please give a single-speaker video as input"
-		return None, None, None, msg
 	return wav_file, fps, video_output, "success"
@@ -649,7 +656,7 @@ class Logger:
 		return False
-def process_video(video_path, num_avg_frames):
 	try:
 		# Extract the video filename
 		video_fname = os.path.basename(video_path.split(".")[0])
@@ -668,7 +675,8 @@ def process_video(video_path, num_avg_frames):
 		# Preprocess the video
-		wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input)
 		if status != "success":
 			return status, None
 		print("Successfully preprocessed the video")
@@ -902,6 +910,7 @@ if __name__ == "__main__":
 						value=75,
 						label="Number of Average Frames",
 					)
 				video_input = gr.Video(label="Upload Video", height=400)
 			with gr.Column():
@@ -914,12 +923,12 @@ if __name__ == "__main__":
 		submit_button.click(
 			fn=process_video,
-			inputs=[video_input, num_avg_frames],
 			outputs=[result_text, output_video]
 		)
 		clear_button.click(
-			fn=lambda: (None, 75, "", None),
 			inputs=[],
 			outputs=[video_input, num_avg_frames, result_text, output_video]
 		)

 use_cuda = torch.cuda.is_available()
 n_negative_samples = 100
+def preprocess_video(path, result_folder, apply_preprocess, padding=20):
 	'''
 	This function preprocesses the input video to extract the audio and crop the frames using YOLO model
 		msg = "Oops! Could not load the video. Please check the input video and try again."
 		return None, None, None, msg
 	if frame_count < 25:
 		msg = "Not enough frames to process! Please give a longer video as input"
 		return None, None, None, msg
 	# Extract the audio from the input video file using ffmpeg
 	wav_file  = os.path.join(result_folder, "audio.wav")
 	if status != 0:
 		msg = "Oops! Could not load the audio file. Please check the input video and try again."
 		return None, None, None, msg
 	print("Extracted the audio from the video")
+	if apply_preprocess=="True":
+		all_frames = []
+		for k in range(len(vr)):
+				all_frames.append(vr[k].asnumpy())
+		all_frames = np.asarray(all_frames)
+		print("Extracted the frames for pre-processing")
+		# Load YOLOv9 model (pre-trained on COCO dataset)
+		yolo_model = YOLO("yolov9s.pt")
+		print("Loaded the YOLO model")
+		person_videos = {}
+		person_tracks = {}
+		print("Processing the frames...")
+		for frame_idx in tqdm(range(frame_count)):
+			frame = all_frames[frame_idx]
+			# Perform person detection
+			results = yolo_model(frame, verbose=False)
+			detections = results[0].boxes
+			for i, det in enumerate(detections):
+				x1, y1, x2, y2 = det.xyxy[0]
+				cls = det.cls[0]
+				if int(cls) == 0:  # Class 0 is 'person' in COCO dataset
+					x1 = max(0, int(x1) - padding)
+					y1 = max(0, int(y1) - padding)
+					x2 = min(frame.shape[1], int(x2) + padding)
+					y2 = min(frame.shape[0], int(y2) + padding)
+					if i not in person_videos:
+						person_videos[i] = []
+						person_tracks[i] = []
+					person_videos[i].append(frame)
+					person_tracks[i].append([x1,y1,x2,y2])
+		num_persons = 0
+		for i in person_videos.keys():
+			if len(person_videos[i]) >= frame_count//2:
+				num_persons+=1
+		if num_persons==0:
+			msg = "No person detected in the video! Please give a video with one person as input"
 			return None, None, None, msg
+		if num_persons>1:
+			msg = "More than one person detected in the video! Please give a video with only one person as input"
+			return None, None, None, msg
+		# For the person detected, crop the frame based on the bounding box
+		if len(person_videos[0]) > frame_count-10:
+			crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
+			fourcc = cv2.VideoWriter_fourcc(*'DIVX')
+			# Get bounding box coordinates based on person_tracks[i]
+			max_x1 = min([track[0] for track in person_tracks[0]])
+			max_y1 = min([track[1] for track in person_tracks[0]])
+			max_x2 = max([track[2] for track in person_tracks[0]])
+			max_y2 = max([track[3] for track in person_tracks[0]])
+			max_width = max_x2 - max_x1
+			max_height = max_y2 - max_y1
+			out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
+			for frame in person_videos[0]:
+				crop = frame[max_y1:max_y2, max_x1:max_x2]
+				crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
+				out.write(crop)
+			out.release()
+			no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
+			status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
+			if status != 0:
+				msg = "Oops! Could not preprocess the video. Please check the input video and try again."
+				return None, None, None, msg
+			video_output = crop_filename.split('.')[0] + '.mp4'
+			status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' %
+							(wav_file , no_sound_video, video_output), shell=True)
+			if status != 0:
+				msg = "Oops! Could not preprocess the video. Please check the input video and try again."
+				return None, None, None, msg
+			os.remove(crop_filename)
+			os.remove(no_sound_video)
+			print("Successfully saved the pre-processed video: ", video_output)
+		else:
+			msg = "Could not track the person in the full video! Please give a single-speaker video as input"
+			return None, None, None, msg
 	else:
+		video_output = path
 	return wav_file, fps, video_output, "success"
 		return False
+def process_video(video_path, num_avg_frames, apply_preprocess):
 	try:
 		# Extract the video filename
 		video_fname = os.path.basename(video_path.split(".")[0])
 		# Preprocess the video
+		print("Applying preprocessing: ", apply_preprocess)
+		wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
 		if status != "success":
 			return status, None
 		print("Successfully preprocessed the video")
 						value=75,
 						label="Number of Average Frames",
 					)
+				apply_preprocess = gr.Checkbox(label="Apply Preprocessing", value=False)
 				video_input = gr.Video(label="Upload Video", height=400)
 			with gr.Column():
 		submit_button.click(
 			fn=process_video,
+			inputs=[video_input, num_avg_frames, apply_preprocess],
 			outputs=[result_text, output_video]
 		)
 		clear_button.click(
+			fn=lambda: (None, 75, False, "", None),
 			inputs=[],
 			outputs=[video_input, num_avg_frames, result_text, output_video]
 		)