sindhuhegde committed on
Commit
342ecda
·
1 Parent(s): 4de346a

Update app

Browse files
Files changed (1) hide show
  1. app.py +98 -38
app.py CHANGED
@@ -357,8 +357,62 @@ def process_video_asd(file, sd_root, work_root, data_root, avi_dir, tmp_dir, wor
357
  return "success"
358
 
359
 
360
-
361
  @spaces.GPU(duration=60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  def preprocess_video(path, result_folder, apply_preprocess, padding=20):
363
 
364
  '''
@@ -406,53 +460,57 @@ def preprocess_video(path, result_folder, apply_preprocess, padding=20):
406
  all_frames = np.asarray(all_frames)
407
  print("Extracted the frames for pre-processing")
408
 
409
- # Load YOLOv9 model (pre-trained on COCO dataset)
410
- yolo_model = YOLO("yolov9s.pt")
411
- print("Loaded the YOLO model")
 
 
 
 
412
 
413
 
414
 
415
- person_videos = {}
416
- person_tracks = {}
417
 
418
- print("Processing the frames...")
419
- for frame_idx in tqdm(range(frame_count)):
420
 
421
- frame = all_frames[frame_idx]
422
 
423
- # Perform person detection
424
- results = yolo_model(frame, verbose=False)
425
- detections = results[0].boxes
426
 
427
- for i, det in enumerate(detections):
428
- x1, y1, x2, y2 = det.xyxy[0]
429
- cls = det.cls[0]
430
- if int(cls) == 0: # Class 0 is 'person' in COCO dataset
431
 
432
- x1 = max(0, int(x1) - padding)
433
- y1 = max(0, int(y1) - padding)
434
- x2 = min(frame.shape[1], int(x2) + padding)
435
- y2 = min(frame.shape[0], int(y2) + padding)
436
 
437
- if i not in person_videos:
438
- person_videos[i] = []
439
- person_tracks[i] = []
440
 
441
- person_videos[i].append(frame)
442
- person_tracks[i].append([x1,y1,x2,y2])
443
 
444
 
445
- num_persons = 0
446
- for i in person_videos.keys():
447
- if len(person_videos[i]) >= frame_count//2:
448
- num_persons+=1
449
-
450
- if num_persons==0:
451
- msg = "No person detected in the video! Please give a video with one person as input"
452
- return None, None, None, msg
453
- if num_persons>1:
454
- msg = "More than one person detected in the video! Please give a video with only one person as input"
455
- return None, None, None, msg
456
 
457
 
458
 
@@ -1100,7 +1158,7 @@ def get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True):
1100
  aud_emb = model.forward_aud(audio_inp.to(device))
1101
  audio_emb.append(aud_emb.detach())
1102
 
1103
- torch.cuda.empty_cache()
1104
 
1105
  video_emb = torch.cat(video_emb, dim=0)
1106
 
@@ -1323,6 +1381,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
1323
  # Extract embeddings
1324
  print("Obtaining audio and video embeddings...")
1325
  video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
 
1326
 
1327
  # L2 normalize embeddings
1328
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
@@ -1336,9 +1395,10 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
1336
  video_emb = torch.split(video_emb, B, dim=0)
1337
  video_emb = torch.stack(video_emb, dim=2)
1338
  video_emb = video_emb.squeeze(3)
1339
- print("Successfully extracted GestSync embeddings")
1340
 
1341
  # Calculate sync offset
 
1342
  pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
1343
  if status != "success":
1344
  return None, status
 
357
  return "success"
358
 
359
 
 
360
  @spaces.GPU(duration=60)
361
+ def get_person_detection(all_frames, frame_count, padding=20):
362
+
363
+ try:
364
+ # Load YOLOv9 model (pre-trained on COCO dataset)
365
+ yolo_model = YOLO("yolov9s.pt")
366
+ print("Loaded the YOLO model")
367
+
368
+ person_videos = {}
369
+ person_tracks = {}
370
+
371
+ print("Processing the frames...")
372
+ for frame_idx in tqdm(range(frame_count)):
373
+
374
+ frame = all_frames[frame_idx]
375
+
376
+ # Perform person detection
377
+ results = yolo_model(frame, verbose=False)
378
+ detections = results[0].boxes
379
+
380
+ for i, det in enumerate(detections):
381
+ x1, y1, x2, y2 = det.xyxy[0]
382
+ cls = det.cls[0]
383
+ if int(cls) == 0: # Class 0 is 'person' in COCO dataset
384
+
385
+ x1 = max(0, int(x1) - padding)
386
+ y1 = max(0, int(y1) - padding)
387
+ x2 = min(frame.shape[1], int(x2) + padding)
388
+ y2 = min(frame.shape[0], int(y2) + padding)
389
+
390
+ if i not in person_videos:
391
+ person_videos[i] = []
392
+ person_tracks[i] = []
393
+
394
+ person_videos[i].append(frame)
395
+ person_tracks[i].append([x1,y1,x2,y2])
396
+
397
+
398
+ num_persons = 0
399
+ for i in person_videos.keys():
400
+ if len(person_videos[i]) >= frame_count//2:
401
+ num_persons+=1
402
+
403
+ if num_persons==0:
404
+ msg = "No person detected in the video! Please give a video with one person as input"
405
+ return None, None, msg
406
+ if num_persons>1:
407
+ msg = "More than one person detected in the video! Please give a video with only one person as input"
408
+ return None, None, msg
409
+
410
+ except:
411
+ msg = "Error in detecting person in the video, please check the input video and try again"
412
+ return None, None, msg
413
+
414
+ return person_videos, person_tracks, "success"
415
+
416
  def preprocess_video(path, result_folder, apply_preprocess, padding=20):
417
 
418
  '''
 
460
  all_frames = np.asarray(all_frames)
461
  print("Extracted the frames for pre-processing")
462
 
463
+ person_videos, person_tracks, msg = get_person_detection(all_frames, frame_count, padding)
464
+ if msg != "success":
465
+ return None, None, None, msg
466
+
467
+ # # Load YOLOv9 model (pre-trained on COCO dataset)
468
+ # yolo_model = YOLO("yolov9s.pt")
469
+ # print("Loaded the YOLO model")
470
 
471
 
472
 
473
+ # person_videos = {}
474
+ # person_tracks = {}
475
 
476
+ # print("Processing the frames...")
477
+ # for frame_idx in tqdm(range(frame_count)):
478
 
479
+ # frame = all_frames[frame_idx]
480
 
481
+ # # Perform person detection
482
+ # results = yolo_model(frame, verbose=False)
483
+ # detections = results[0].boxes
484
 
485
+ # for i, det in enumerate(detections):
486
+ # x1, y1, x2, y2 = det.xyxy[0]
487
+ # cls = det.cls[0]
488
+ # if int(cls) == 0: # Class 0 is 'person' in COCO dataset
489
 
490
+ # x1 = max(0, int(x1) - padding)
491
+ # y1 = max(0, int(y1) - padding)
492
+ # x2 = min(frame.shape[1], int(x2) + padding)
493
+ # y2 = min(frame.shape[0], int(y2) + padding)
494
 
495
+ # if i not in person_videos:
496
+ # person_videos[i] = []
497
+ # person_tracks[i] = []
498
 
499
+ # person_videos[i].append(frame)
500
+ # person_tracks[i].append([x1,y1,x2,y2])
501
 
502
 
503
+ # num_persons = 0
504
+ # for i in person_videos.keys():
505
+ # if len(person_videos[i]) >= frame_count//2:
506
+ # num_persons+=1
507
+
508
+ # if num_persons==0:
509
+ # msg = "No person detected in the video! Please give a video with one person as input"
510
+ # return None, None, None, msg
511
+ # if num_persons>1:
512
+ # msg = "More than one person detected in the video! Please give a video with only one person as input"
513
+ # return None, None, None, msg
514
 
515
 
516
 
 
1158
  aud_emb = model.forward_aud(audio_inp.to(device))
1159
  audio_emb.append(aud_emb.detach())
1160
 
1161
+ # torch.cuda.empty_cache()
1162
 
1163
  video_emb = torch.cat(video_emb, dim=0)
1164
 
 
1381
  # Extract embeddings
1382
  print("Obtaining audio and video embeddings...")
1383
  video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, calc_aud_emb=True)
1384
+ print("Successfully extracted GestSync embeddings")
1385
 
1386
  # L2 normalize embeddings
1387
  video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
 
1395
  video_emb = torch.split(video_emb, B, dim=0)
1396
  video_emb = torch.stack(video_emb, dim=2)
1397
  video_emb = video_emb.squeeze(3)
1398
+
1399
 
1400
  # Calculate sync offset
1401
+ print("Calculating sync offset...")
1402
  pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
1403
  if status != "success":
1404
  return None, status