Spaces:
Running on Zero
Commit · 342ecda
Parent(s): 4de346a
Update app
app.py CHANGED
@@ -357,8 +357,62 @@ def process_video_asd(file, sd_root, work_root, data_root, avi_dir, tmp_dir, wor
     return "success"
 
 
-
 @spaces.GPU(duration=60)
+def get_person_detection(all_frames, frame_count, padding=20):
+
+    try:
+        # Load YOLOv9 model (pre-trained on COCO dataset)
+        yolo_model = YOLO("yolov9s.pt")
+        print("Loaded the YOLO model")
+
+        person_videos = {}
+        person_tracks = {}
+
+        print("Processing the frames...")
+        for frame_idx in tqdm(range(frame_count)):
+
+            frame = all_frames[frame_idx]
+
+            # Perform person detection
+            results = yolo_model(frame, verbose=False)
+            detections = results[0].boxes
+
+            for i, det in enumerate(detections):
+                x1, y1, x2, y2 = det.xyxy[0]
+                cls = det.cls[0]
+                if int(cls) == 0: # Class 0 is 'person' in COCO dataset
+
+                    x1 = max(0, int(x1) - padding)
+                    y1 = max(0, int(y1) - padding)
+                    x2 = min(frame.shape[1], int(x2) + padding)
+                    y2 = min(frame.shape[0], int(y2) + padding)
+
+                    if i not in person_videos:
+                        person_videos[i] = []
+                        person_tracks[i] = []
+
+                    person_videos[i].append(frame)
+                    person_tracks[i].append([x1,y1,x2,y2])
+
+
+        num_persons = 0
+        for i in person_videos.keys():
+            if len(person_videos[i]) >= frame_count//2:
+                num_persons+=1
+
+        if num_persons==0:
+            msg = "No person detected in the video! Please give a video with one person as input"
+            return None, None, msg
+        if num_persons>1:
+            msg = "More than one person detected in the video! Please give a video with only one person as input"
+            return None, None, msg
+
+    except:
+        msg = "Error in detecting person in the video, please check the input video and try again"
+        return None, None, msg
+
+    return person_videos, person_tracks, "success"
+
 def preprocess_video(path, result_folder, apply_preprocess, padding=20):
 
     '''
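Note: the new `get_person_detection` helper takes `all_frames` as an array of decoded frames and relies on the module-level `YOLO` and `tqdm` imports in app.py. A minimal stand-alone sketch of how it could be driven, assuming frames are read with OpenCV and `YOLO` comes from the `ultralytics` package (the input path and `read_all_frames` helper are illustrative, not part of the commit):

import cv2
import numpy as np

def read_all_frames(path):
    # Illustrative helper: decode every frame of the input video into a numpy array.
    cap = cv2.VideoCapture(path)
    frames = []
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        frames.append(frame)
    cap.release()
    return np.asarray(frames)

all_frames = read_all_frames("input.mp4")  # hypothetical input path
person_videos, person_tracks, msg = get_person_detection(all_frames, len(all_frames), padding=20)
if msg != "success":
    print(msg)  # e.g. "No person detected in the video! ..."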
@@ -406,53 +460,57 @@ def preprocess_video(path, result_folder, apply_preprocess, padding=20):
     all_frames = np.asarray(all_frames)
     print("Extracted the frames for pre-processing")
 
-    # Load YOLOv9 model (pre-trained on COCO dataset)
-    yolo_model = YOLO("yolov9s.pt")
-    print("Loaded the YOLO model")
+    person_videos, person_tracks, msg = get_person_detection(all_frames, frame_count, padding)
+    if msg != "success":
+        return None, None, None, msg
+
+    # # Load YOLOv9 model (pre-trained on COCO dataset)
+    # yolo_model = YOLO("yolov9s.pt")
+    # print("Loaded the YOLO model")
 
 
 
-    person_videos = {}
-    person_tracks = {}
+    # person_videos = {}
+    # person_tracks = {}
 
-    print("Processing the frames...")
-    for frame_idx in tqdm(range(frame_count)):
+    # print("Processing the frames...")
+    # for frame_idx in tqdm(range(frame_count)):
 
-        frame = all_frames[frame_idx]
+    #     frame = all_frames[frame_idx]
 
-        # Perform person detection
-        results = yolo_model(frame, verbose=False)
-        detections = results[0].boxes
+    #     # Perform person detection
+    #     results = yolo_model(frame, verbose=False)
+    #     detections = results[0].boxes
 
-        for i, det in enumerate(detections):
-            x1, y1, x2, y2 = det.xyxy[0]
-            cls = det.cls[0]
-            if int(cls) == 0: # Class 0 is 'person' in COCO dataset
+    #     for i, det in enumerate(detections):
+    #         x1, y1, x2, y2 = det.xyxy[0]
+    #         cls = det.cls[0]
+    #         if int(cls) == 0: # Class 0 is 'person' in COCO dataset
 
-                x1 = max(0, int(x1) - padding)
-                y1 = max(0, int(y1) - padding)
-                x2 = min(frame.shape[1], int(x2) + padding)
-                y2 = min(frame.shape[0], int(y2) + padding)
+    #             x1 = max(0, int(x1) - padding)
+    #             y1 = max(0, int(y1) - padding)
+    #             x2 = min(frame.shape[1], int(x2) + padding)
+    #             y2 = min(frame.shape[0], int(y2) + padding)
 
-                if i not in person_videos:
-                    person_videos[i] = []
-                    person_tracks[i] = []
+    #             if i not in person_videos:
+    #                 person_videos[i] = []
+    #                 person_tracks[i] = []
 
-                person_videos[i].append(frame)
-                person_tracks[i].append([x1,y1,x2,y2])
+    #             person_videos[i].append(frame)
+    #             person_tracks[i].append([x1,y1,x2,y2])
 
 
-    num_persons = 0
-    for i in person_videos.keys():
-        if len(person_videos[i]) >= frame_count//2:
-            num_persons+=1
-
-    if num_persons==0:
-        msg = "No person detected in the video! Please give a video with one person as input"
-        return None, None, None, msg
-    if num_persons>1:
-        msg = "More than one person detected in the video! Please give a video with only one person as input"
-        return None, None, None, msg
+    # num_persons = 0
+    # for i in person_videos.keys():
+    #     if len(person_videos[i]) >= frame_count//2:
+    #         num_persons+=1
+
+    # if num_persons==0:
+    #     msg = "No person detected in the video! Please give a video with one person as input"
+    #     return None, None, None, msg
+    # if num_persons>1:
+    #     msg = "More than one person detected in the video! Please give a video with only one person as input"
+    #     return None, None, None, msg
 
 
 
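On the data handed back to `preprocess_video`: each entry of `person_tracks[i]` is one padded, frame-clamped `[x1, y1, x2, y2]` box, aligned index-for-index with `person_videos[i]`. A small illustrative sketch of consuming the returned structures (the helper name is an assumption, not code from the commit):

def crop_person_region(person_videos, person_tracks, person_id=0):
    # Crop each stored frame to its padded person box; boxes are already ints clamped to the frame.
    crops = []
    for frame, (x1, y1, x2, y2) in zip(person_videos[person_id], person_tracks[person_id]):
        crops.append(frame[y1:y2, x1:x2])
    return crops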
@@ -1100,7 +1158,7 @@ def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
         aud_emb = model.forward_aud(audio_inp.to(device))
         audio_emb.append(aud_emb.detach())
 
-        torch.cuda.empty_cache()
+        # torch.cuda.empty_cache()
 
     video_emb = torch.cat(video_emb, dim=0)
 
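The per-batch `torch.cuda.empty_cache()` call is commented out in this hunk; flushing the cache on every batch forces the allocator to re-request memory from the driver and can slow down embedding extraction. If an occasional flush is still wanted, a guarded sketch could look like the following (an illustrative alternative, not what the commit does):

import torch

def maybe_empty_cache():
    # Only touch the CUDA allocator when a GPU is actually available.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()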
@@ -1323,6 +1381,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
     # Extract embeddings
     print("Obtaining audio and video embeddings...")
     video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
+    print("Successfully extracted GestSync embeddings")
 
     # L2 normalize embeddings
     video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
@@ -1336,9 +1395,10 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
     video_emb = torch.split(video_emb, B, dim=0)
     video_emb = torch.stack(video_emb, dim=2)
     video_emb = video_emb.squeeze(3)
-
+
 
     # Calculate sync offset
+    print("Calculating sync offset...")
     pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
     if status != "success":
         return None, status
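For the tensor bookkeeping around the split/stack/squeeze lines above, a worked shape example (the dimensions below are illustrative assumptions; the real values depend on the model and on `num_avg_frames`):

import torch

# Assume 4 windows of batch size B = 2, each a 512-dim embedding with a trailing singleton dim.
B, D, num_windows = 2, 512, 4
video_emb = torch.randn(B * num_windows, D, 1)

chunks = torch.split(video_emb, B, dim=0)   # num_windows tensors of shape (B, D, 1)
stacked = torch.stack(chunks, dim=2)        # (B, D, num_windows, 1)
result = stacked.squeeze(3)                 # (B, D, num_windows)
print(result.shape)                         # torch.Size([2, 512, 4])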