| import dlib |
| import cv2 |
| import os |
| import srt |
| import re |
| from math import floor,sqrt |
| from backend.utils import convert_to_css_pixel |
|
|
| |
| THETA1 = 1.2 |
| THETA2 = 0.4 |
| SAMPLE_RATE = 5 |
| FACE_AREA = 0.6 |
|
|
| |
| face_detector = dlib.get_frontal_face_detector() |
| landmark_detector = dlib.shape_predictor("backend/speech_bubble/shape_predictor_68_face_landmarks.dat") |
|
|
|
|
| def dist(p1, p2): |
| p1_x = p1[0] |
| p2_x = p2[0] |
| p1_y = p1[1] |
| p2_y = p2[1] |
| dist = sqrt((p2_x - p1_x) ** 2 + (p2_y - p1_y) ** 2) |
| return dist |
|
|
| |
| def similar_to_keyframe(face_rects, keyframe_face_rects): |
| rect1_top_left = face_rects[0].tl_corner() |
| rect1_bottom_right = face_rects[0].br_corner() |
| rect2_top_left = keyframe_face_rects[0].tl_corner() |
| rect2_bottom_right = keyframe_face_rects[0].br_corner() |
| tolerance = 0.2 |
| |
| def calculate_area(top_left, bottom_right): |
| width = abs(bottom_right.x - top_left.x) |
| height = abs(bottom_right.y - top_left.y) |
| return width * height |
|
|
| area_rect1 = calculate_area(rect1_top_left, rect1_bottom_right) |
| area_rect2 = calculate_area(rect2_top_left, rect2_bottom_right) |
| |
| area_tolerance = area_rect1 * tolerance |
| |
| if abs(area_rect1 - area_rect2) <= area_tolerance: |
| return True |
| else: |
| return False |
|
|
| |
| def get_lips(video, crop_coords, black_x, black_y): |
| print(crop_coords) |
| data="" |
| with open("test1.srt") as f: |
| data = f.read() |
| subs = srt.parse(data) |
|
|
| lips = {} |
| for sub in subs: |
| keyframe_path = f"frames/final/frame{sub.index:03}.png" |
| keyframe = cv2.imread(keyframe_path) |
| gray = cv2.cvtColor(keyframe,cv2.COLOR_BGR2GRAY) |
| face_rects = face_detector(gray,1) |
| print("\nsub:",sub.index) |
| if sub.content == "((action-scene))": |
| print("skipping action scene") |
| lips[sub.index] = (-1,-1) |
| continue |
|
|
| if len(face_rects) < 1: |
| print("No face detected: ",sub) |
| lips[sub.index] = (-1,-1) |
| continue |
|
|
| if len(face_rects) == 1: |
| rect = face_rects[0] |
| landmark = landmark_detector(gray, rect) |
| x,y = convert_to_css_pixel(landmark.part(65).x, landmark.part(65).y, crop_coords[sub.index - 1]) |
| lips[sub.index] = (x,y) |
| continue |
|
|
| |
| if len(face_rects) > 1: |
| print("Too many face: sub_",sub.index,": ", len(face_rects)) |
| origin = (crop_coords[sub.index - 1][0] , crop_coords[sub.index - 1][2] ) |
| lip_coords = get_multi_speaker_lips(sub,video,face_rects) |
| if lip_coords == (-1,-1): |
| lips[sub.index] = (-1,-1) |
| else: |
| x = lip_coords[0] - (origin[0] + black_x) |
| y = lip_coords[1] - (origin[1] + black_y) |
| x , y = convert_to_css_pixel(x,y,crop_coords[sub.index - 1]) |
| lips[sub.index] = (x,y) |
| continue |
| print(lips) |
| return lips |
|
|
|
|
| def get_multi_speaker_lips(sub,video, keyframe_face_rects): |
| start_time = sub.start.total_seconds() |
| end_time = sub.end.total_seconds() |
| keyframe_path = f"frames/final/frame{sub.index:03}.png" |
|
|
| vid = cv2.VideoCapture(video) |
| frames_per_sec = vid.get(cv2.CAP_PROP_FPS) |
| |
| |
|
|
| |
| select_index = floor(frames_per_sec / SAMPLE_RATE) |
| start_frame = int(start_time * frames_per_sec) |
| end_frame = int(end_time * frames_per_sec) |
|
|
| vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame) |
| print("FPS, select index = ", frames_per_sec, select_index) |
|
|
| |
| current_frame = start_frame |
| total_frames_selected = 0 |
|
|
| |
| frame_buffer = [] |
| frame_buffer_color = [] |
| while(current_frame<end_frame): |
| success, frame = vid.read() |
| if not success: |
| break |
| if current_frame % select_index == 0: |
| gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY) |
| frame_buffer.append(gray) |
| frame_buffer_color.append(frame) |
| total_frames_selected += 1 |
| current_frame += 1 |
| vid.release() |
|
|
| prev_lip_dist = {} |
| lip_motion_count = {} |
| lip_coords = {} |
| avg_gap = {} |
|
|
| start_flag = False |
|
|
| for (i, image) in enumerate(frame_buffer): |
| face_rects = face_detector(image,1) |
| if len(face_rects) < 1: |
| print("No face detected: frame ",i) |
| continue |
| if len(face_rects) >= 1: |
|
|
| |
| if not similar_to_keyframe(face_rects, keyframe_face_rects): |
| print("frame not similar: ",i) |
| continue |
|
|
| largest_face = max(face_rects, key=lambda rect: rect.area()) |
| print("largest face: ", largest_face) |
|
|
| avg_gap[i] = {} |
| prev_lip_dist[i] = {} |
| for (j,rect) in enumerate(face_rects): |
| if (rect.area() / largest_face.area()) < FACE_AREA: |
| print("Lip skipped: ", j, rect) |
| continue |
|
|
| prev_lip_dist[i][j] = 0 |
| landmark = landmark_detector(image, rect) |
| |
|
|
| part_61 = (landmark.part(61).x,landmark.part(61).y) |
| part_67 = (landmark.part(67).x,landmark.part(67).y) |
| part_62 = (landmark.part(62).x,landmark.part(62).y) |
| part_66 = (landmark.part(66).x,landmark.part(66).y) |
| part_63 = (landmark.part(63).x,landmark.part(63).y) |
| part_65 = (landmark.part(65).x,landmark.part(65).y) |
| A = dist(part_61, part_67) |
| B = dist(part_62, part_66) |
| C = dist(part_63, part_65) |
|
|
| avg_gap[i][j] = (A + B + C) / 3.0 |
|
|
| |
| if j not in lip_coords: |
| lip_coords[j] = part_65 |
|
|
| |
| if start_flag==False: |
| prev_lip_dist[i][j] = avg_gap[i][j] |
| start_flag = True |
| continue |
| |
| |
| print("Difference for frame {0}, lip {1}: {2}".format( i, j, abs(avg_gap[i][j] - prev_lip_dist[i][j])) ) |
| if abs(avg_gap[i][j] - prev_lip_dist[i][j]) > THETA1: |
| lip_motion_count[j] = lip_motion_count.get(j,0) + 1 |
| prev_lip_dist[i][j] = avg_gap[i][j] |
|
|
| |
| print("Lip motion count, total_frames_selected = ", lip_motion_count, total_frames_selected) |
| |
| try: |
| max_lip_index = max(lip_motion_count, key=lip_motion_count.get) |
| |
| |
| |
| if lip_motion_count[max_lip_index] / (total_frames_selected-1) > THETA2: |
| return lip_coords[max_lip_index] |
| else: |
| return (-1,-1) |
| except ValueError: |
| return (-1,-1) |
| except ZeroDivisionError: |
| return (-1,-1) |
|
|
| |
|
|
| |
|
|