"""Locate the speaker's lips for every subtitle so a speech bubble can point at them.

For subtitles whose keyframe shows a single face, the lip position is read
straight from the keyframe.  When several faces are visible, the video segment
covered by the subtitle is sampled and the face whose lips move most often is
taken to be the speaker.
"""
import os
import re
from math import floor, hypot, sqrt

import dlib
import cv2
import srt

from backend.utils import convert_to_css_pixel

# Minimum change of the average inner-lip gap between two consecutive sampled
# frames of the same face for the change to count as lip motion.
THETA1 = 1.2
# Minimum ratio (lip-motion count / number of frame transitions) for the most
# active face to be accepted as the speaker.
THETA2 = 0.4
# Number of frames sampled per second of video.
SAMPLE_RATE = 5
# Faces smaller than this fraction of the largest face in the frame are ignored.
FACE_AREA = 0.6

# Sentinel returned/stored when no lip position could be determined.
_NO_LIPS = (-1, -1)

# Face detector and landmark detector (loaded once at import time).
face_detector = dlib.get_frontal_face_detector()
landmark_detector = dlib.shape_predictor(
    "backend/speech_bubble/shape_predictor_68_face_landmarks.dat"
)


def dist(p1, p2):
    """Return the Euclidean distance between two ``(x, y)`` points."""
    return hypot(p2[0] - p1[0], p2[1] - p1[1])


def similar_to_keyframe(face_rects, keyframe_face_rects, tolerance=0.2):
    """Tell whether the first face of a frame has roughly the keyframe's face area.

    Only the first rectangle of each collection is compared.  The areas are
    computed from the top-left and bottom-right corners, and the allowed
    difference is ``tolerance`` times the area of ``face_rects[0]``.

    Args:
        face_rects: Face rectangles detected in the frame under test.
        keyframe_face_rects: Face rectangles detected in the keyframe.
        tolerance: Allowed relative area difference (default 0.2).

    Returns:
        True when the two areas differ by at most the tolerance.
    """

    def calculate_area(rect):
        top_left = rect.tl_corner()
        bottom_right = rect.br_corner()
        width = abs(bottom_right.x - top_left.x)
        height = abs(bottom_right.y - top_left.y)
        return width * height

    frame_area = calculate_area(face_rects[0])
    keyframe_area = calculate_area(keyframe_face_rects[0])
    return abs(frame_area - keyframe_area) <= frame_area * tolerance


def get_lips(video, crop_coords, black_x, black_y, srt_path="test1.srt"):
    """Compute a lip position for every subtitle in the subtitle file.

    Args:
        video: Path of the video, passed on to ``cv2.VideoCapture`` when a
            keyframe contains more than one face.
        crop_coords: Per-frame crop boxes (left, right, top, bottom); the entry
            for a subtitle is ``crop_coords[sub.index - 1]``.
        black_x: Horizontal offset subtracted, together with the crop's left
            edge, from coordinates measured on the full video frame
            (presumably the width of a black border -- verify against caller).
        black_y: Vertical counterpart of ``black_x``.
        srt_path: Subtitle file to read (defaults to the previously
            hard-coded ``test1.srt``).

    Returns:
        dict mapping subtitle index to the ``(x, y)`` pair produced by
        ``convert_to_css_pixel``, or ``(-1, -1)`` when no lips were found
        (action scene, unreadable keyframe, no face, or no clear speaker).
    """
    print(crop_coords)
    with open(srt_path) as f:
        data = f.read()

    lips = {}
    for sub in srt.parse(data):
        print("\nsub:", sub.index)

        # Action scenes never get a bubble tail, so skip them before doing
        # any image loading or face detection.
        if sub.content == "((action-scene))":
            print("skipping action scene")
            lips[sub.index] = _NO_LIPS
            continue

        keyframe_path = f"frames/final/frame{sub.index:03}.png"
        keyframe = cv2.imread(keyframe_path)
        if keyframe is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            print("Keyframe could not be read: ", keyframe_path)
            lips[sub.index] = _NO_LIPS
            continue

        gray = cv2.cvtColor(keyframe, cv2.COLOR_BGR2GRAY)
        face_rects = face_detector(gray, 1)
        crop = crop_coords[sub.index - 1]

        if len(face_rects) < 1:
            print("No face detected: ", sub)
            lips[sub.index] = _NO_LIPS
        elif len(face_rects) == 1:
            # Single face: take the lip landmark from the keyframe itself.
            landmark = landmark_detector(gray, face_rects[0])
            lips[sub.index] = convert_to_css_pixel(
                landmark.part(65).x, landmark.part(65).y, crop
            )
        else:
            # Several faces: decide who is speaking from the video segment.
            print("Too many face: sub_", sub.index, ": ", len(face_rects))
            origin = (crop[0], crop[2])  # (left, top)
            lip_coords = get_multi_speaker_lips(sub, video, face_rects)
            if lip_coords == _NO_LIPS:
                lips[sub.index] = _NO_LIPS
            else:
                # Video-frame coordinates -> coordinates inside the cropped keyframe.
                x = lip_coords[0] - (origin[0] + black_x)
                y = lip_coords[1] - (origin[1] + black_y)
                lips[sub.index] = convert_to_css_pixel(x, y, crop)

    print(lips)
    return lips


def _sample_frames(video, start_time, end_time):
    """Return grayscale frames sampled from ``video`` between two timestamps.

    NOTE(review): the body of the original sampling loop was lost in the
    source (the text between ``while(current_frame`` and ``= 1:`` is missing).
    This is a reconstruction from the surrounding variables: read frames from
    ``start_frame`` through ``end_frame`` and keep every ``select_index``-th
    one.  Only grayscale frames are kept because nothing in the surviving
    analysis code reads the colour frames.  Confirm against version control.
    """
    vid = cv2.VideoCapture(video)
    try:
        frames_per_sec = vid.get(cv2.CAP_PROP_FPS)
        # Keep every select_index-th frame to get about SAMPLE_RATE frames per
        # second.  Clamp to 1 so a low or unreadable FPS cannot yield a zero
        # stride (which would divide by zero below).
        select_index = max(1, floor(frames_per_sec / SAMPLE_RATE))
        start_frame = int(start_time * frames_per_sec)
        end_frame = int(end_time * frames_per_sec)
        vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        print("FPS, select index = ", frames_per_sec, select_index)

        frame_buffer = []
        current_frame = start_frame
        while current_frame <= end_frame:
            ok, frame = vid.read()
            if not ok:  # end of video or read error
                break
            if (current_frame - start_frame) % select_index == 0:
                frame_buffer.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
            current_frame += 1
        return frame_buffer
    finally:
        vid.release()


def _average_lip_gap(landmark):
    """Return the mean distance of the three inner-lip landmark pairs."""

    def point(index):
        part = landmark.part(index)
        return (part.x, part.y)

    gap_a = dist(point(61), point(67))
    gap_b = dist(point(62), point(66))
    gap_c = dist(point(63), point(65))
    return (gap_a + gap_b + gap_c) / 3.0


def get_multi_speaker_lips(sub, video, keyframe_face_rects):
    """Pick the speaking face among several and return its lip position.

    The video segment covered by ``sub`` is sampled; in every sampled frame
    whose first face is similar in area to the keyframe's first face, the
    average inner-lip gap of each sufficiently large face is measured.  A face
    scores one lip motion each time its gap changes by more than ``THETA1``
    relative to the previous frame in which that face was measured.

    Args:
        sub: Subtitle providing ``start``, ``end`` and ``index``.
        video: Path of the video file.
        keyframe_face_rects: Face rectangles detected in the subtitle's keyframe.

    Returns:
        ``(x, y)`` of landmark 65 of the most active face, taken from the
        first frame in which that face was measured, or ``(-1, -1)`` when no
        face moves its lips in more than ``THETA2`` of the frame transitions.
    """
    frame_buffer = _sample_frames(
        video, sub.start.total_seconds(), sub.end.total_seconds()
    )
    total_frames_selected = len(frame_buffer)

    lip_motion_count = {}  # face index -> number of detected lip motions
    prev_lip_dist = {}     # face index -> lip gap in the previous measured frame
    lip_coords = {}        # face index -> landmark 65 when first encountered

    # NOTE(review): the head of this loop (frame iteration and face detection)
    # was also lost in the source and is reconstructed here.
    for i, image in enumerate(frame_buffer):
        face_rects = face_detector(image, 1)
        if len(face_rects) < 1:
            continue

        # Check if area of the first face rectangle is close to keyframe.
        if not similar_to_keyframe(face_rects, keyframe_face_rects):
            print("frame not similar: ", i)
            continue

        largest_face = max(face_rects, key=lambda rect: rect.area())
        print("largest face: ", largest_face)

        for j, rect in enumerate(face_rects):
            # Consider lips only if the face is large enough (region of interest).
            if (rect.area() / largest_face.area()) < FACE_AREA:
                print("Lip skipped: ", j, rect)
                continue

            landmark = landmark_detector(image, rect)
            gap = _average_lip_gap(landmark)

            # Store lip coordinate if encountered for the first time.
            if j not in lip_coords:
                lip_coords[j] = (landmark.part(65).x, landmark.part(65).y)

            # The previous gap is tracked per face across frames.  The former
            # code keyed it by the current frame and reset it to 0 every
            # frame, so it compared each gap against 0 instead of against the
            # previous frame.  A face seen for the first time only records
            # its baseline.
            if j in prev_lip_dist:
                difference = abs(gap - prev_lip_dist[j])
                print("Difference for frame {0}, lip {1}: {2}".format(i, j, difference))
                if difference > THETA1:
                    lip_motion_count[j] = lip_motion_count.get(j, 0) + 1
            prev_lip_dist[j] = gap

    print("Lip motion count, total_frames_selected = ", lip_motion_count, total_frames_selected)

    transitions = total_frames_selected - 1
    if not lip_motion_count or transitions <= 0:
        # No motion recorded or too few frames to form a transition.
        return _NO_LIPS

    max_lip_index = max(lip_motion_count, key=lip_motion_count.get)
    if lip_motion_count[max_lip_index] / transitions > THETA2:
        return lip_coords[max_lip_index]
    return _NO_LIPS