|
|
import os
import re
from math import floor, hypot, sqrt

import cv2
import dlib
import srt

from backend.utils import convert_to_css_pixel
|
|
|
|
|
|
|
|
# Minimum change in the average inner-lip gap (pixels) between samples for a
# face to register one "lip motion" event.
THETA1 = 1.2

# Fraction of the sampled frames that must show lip motion for a face to be
# accepted as the speaker.
THETA2 = 0.4

# Target number of frames analysed per second of video when sampling.
SAMPLE_RATE = 5

# Faces smaller than this fraction of the largest face's area are ignored.
FACE_AREA = 0.6

# dlib's HOG-based frontal face detector; built once at import time.
face_detector = dlib.get_frontal_face_detector()

# 68-point facial landmark model; points 61-67 trace the inner lip contour.
# NOTE(review): path is relative to the process working directory — confirm
# the app always starts from the repository root.
landmark_detector = dlib.shape_predictor("backend/speech_bubble/shape_predictor_68_face_landmarks.dat")
|
|
|
|
|
|
|
|
def dist(p1, p2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        p1: (x, y) pair (anything indexable with numeric [0] and [1]).
        p2: (x, y) pair.

    Returns:
        The distance as a float.
    """
    # hypot is numerically safer than sqrt(dx**2 + dy**2) (no intermediate
    # overflow) and removes the local variable that shadowed the function name.
    return hypot(p2[0] - p1[0], p2[1] - p1[1])
|
|
|
|
|
|
|
|
def similar_to_keyframe(face_rects, keyframe_face_rects):
    """Return True when the first detected face is close in area to the
    first face of the keyframe.

    Used to decide whether a sampled video frame still shows the same shot
    as the subtitle's keyframe.

    Args:
        face_rects: dlib rectangles detected on the current frame.
        keyframe_face_rects: dlib rectangles detected on the keyframe.

    Returns:
        True when the two first-face areas differ by at most 20% of the
        current frame's face area, else False.
    """
    def _area(rect):
        # Bounding-box area from the rectangle's opposite corners.
        top_left = rect.tl_corner()
        bottom_right = rect.br_corner()
        return abs(bottom_right.x - top_left.x) * abs(bottom_right.y - top_left.y)

    tolerance = 0.2

    area_rect1 = _area(face_rects[0])
    area_rect2 = _area(keyframe_face_rects[0])

    # NOTE: the band is relative to area_rect1 only, so the check is not
    # symmetric in its two arguments (preserved for backward compatibility).
    return abs(area_rect1 - area_rect2) <= area_rect1 * tolerance
|
|
|
|
|
|
|
|
def get_lips(video, crop_coords, black_x, black_y):
    """Locate the speaker's lips for every subtitle in "test1.srt".

    For each subtitle the matching keyframe image is loaded and resolved to
    a lip anchor point:

    * action scenes (content "((action-scene))"), missing keyframes and
      frames with no detectable face map to the sentinel (-1, -1);
    * exactly one face: landmark 65 (inner lower lip) converted to CSS pixels;
    * several faces: get_multi_speaker_lips() picks the moving mouth, whose
      full-frame coordinates are shifted into the crop before conversion.

    Args:
        video: video path/identifier, forwarded to get_multi_speaker_lips.
        crop_coords: per-subtitle crop data, indexed by sub.index - 1;
            elements [0] and [2] are used as the crop origin (x, y) —
            exact layout defined by convert_to_css_pixel; TODO confirm.
        black_x: horizontal letterbox offset subtracted for multi-face subs.
        black_y: vertical letterbox offset subtracted for multi-face subs.

    Returns:
        dict mapping sub.index -> (x, y) CSS-pixel lip position, with
        (-1, -1) as the "no lips" sentinel.
    """
    print(crop_coords)
    # NOTE(review): the subtitle file path is hard-coded; consider passing it in.
    with open("test1.srt") as f:
        data = f.read()
    subs = srt.parse(data)

    lips = {}
    for sub in subs:
        keyframe_path = f"frames/final/frame{sub.index:03}.png"
        keyframe = cv2.imread(keyframe_path)
        print("\nsub:", sub.index)

        if keyframe is None:
            # cv2.imread returns None for a missing/unreadable file; the
            # original code crashed in cvtColor here.  Fall back to the
            # sentinel instead of aborting the whole pass.
            print("Keyframe missing: ", keyframe_path)
            lips[sub.index] = (-1, -1)
            continue

        if sub.content == "((action-scene))":
            print("skipping action scene")
            lips[sub.index] = (-1, -1)
            continue

        # Face detection runs only for subtitles that actually need it
        # (it was computed before the action-scene check and wasted there).
        gray = cv2.cvtColor(keyframe, cv2.COLOR_BGR2GRAY)
        face_rects = face_detector(gray, 1)

        if len(face_rects) < 1:
            print("No face detected: ", sub)
            lips[sub.index] = (-1, -1)
            continue

        if len(face_rects) == 1:
            rect = face_rects[0]
            landmark = landmark_detector(gray, rect)
            # Landmark 65 sits on the inner lower-lip edge.
            x, y = convert_to_css_pixel(landmark.part(65).x, landmark.part(65).y, crop_coords[sub.index - 1])
            lips[sub.index] = (x, y)
            continue

        # More than one face: decide who is speaking by lip motion.
        print("Too many face: sub_", sub.index, ": ", len(face_rects))
        origin = (crop_coords[sub.index - 1][0], crop_coords[sub.index - 1][2])
        lip_coords = get_multi_speaker_lips(sub, video, face_rects)
        if lip_coords == (-1, -1):
            lips[sub.index] = (-1, -1)
        else:
            # Shift full-frame coordinates into the cropped/letterboxed frame
            # before converting to CSS pixels.
            x = lip_coords[0] - (origin[0] + black_x)
            y = lip_coords[1] - (origin[1] + black_y)
            x, y = convert_to_css_pixel(x, y, crop_coords[sub.index - 1])
            lips[sub.index] = (x, y)

    print(lips)
    return lips
|
|
|
|
|
|
|
|
def get_multi_speaker_lips(sub,video, keyframe_face_rects):
    """Pick the speaking face among several and return its lip position.

    Samples roughly SAMPLE_RATE frames per second from `video` over the
    subtitle's time span, measures the inner-lip gap (landmark pairs
    61/67, 62/66, 63/65) for every sufficiently large face, and counts how
    often the gap change exceeds THETA1.  The face with the most motion
    events wins if it moved in more than THETA2 of the comparisons.

    Args:
        sub: an srt subtitle; start/end bound the frames examined, index
            names the keyframe.
        video: path/identifier accepted by cv2.VideoCapture.
        keyframe_face_rects: dlib rectangles detected on the subtitle's
            keyframe; sampled frames whose first face differs too much in
            area are skipped (see similar_to_keyframe).

    Returns:
        (x, y) full-frame pixel coordinates of landmark 65 (inner lower
        lip) of the chosen face, or (-1, -1) when no face qualifies.
    """
    start_time = sub.start.total_seconds()
    end_time = sub.end.total_seconds()
    keyframe_path = f"frames/final/frame{sub.index:03}.png"  # NOTE(review): never used below

    vid = cv2.VideoCapture(video)
    frames_per_sec = vid.get(cv2.CAP_PROP_FPS)

    # Keep every select_index-th frame to approximate SAMPLE_RATE fps.
    # NOTE(review): if frames_per_sec < SAMPLE_RATE this is 0 and the modulo
    # below raises ZeroDivisionError, which the try/except at the bottom does
    # NOT cover — confirm fps is always >= SAMPLE_RATE.
    select_index = floor(frames_per_sec / SAMPLE_RATE)
    start_frame = int(start_time * frames_per_sec)
    end_frame = int(end_time * frames_per_sec)

    vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
    print("FPS, select index = ", frames_per_sec, select_index)

    current_frame = start_frame
    total_frames_selected = 0

    frame_buffer = []        # grayscale sampled frames (used for detection)
    frame_buffer_color = []  # matching BGR frames (collected but unused below)
    while(current_frame<end_frame):
        success, frame = vid.read()
        if not success:
            break
        # Absolute frame index modulo the stride: the phase depends on
        # start_frame, but the effective sampling rate does not.
        if current_frame % select_index == 0:
            gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY)
            frame_buffer.append(gray)
            frame_buffer_color.append(frame)
            total_frames_selected += 1
        current_frame += 1
    vid.release()

    prev_lip_dist = {}     # per-frame {face index: baseline gap} (see NOTE below)
    lip_motion_count = {}  # face index -> number of motion events
    lip_coords = {}        # face index -> first observed landmark-65 position
    avg_gap = {}           # per-frame {face index: mean inner-lip gap}

    start_flag = False  # becomes True after the very first lip measurement

    for (i, image) in enumerate(frame_buffer):
        face_rects = face_detector(image,1)
        if len(face_rects) < 1:
            print("No face detected: frame ",i)
            continue
        # Always true after the guard above; kept as written in the original.
        if len(face_rects) >= 1:
            # Skip frames that cut away from the keyframe's shot.
            if not similar_to_keyframe(face_rects, keyframe_face_rects):
                print("frame not similar: ",i)
                continue

            largest_face = max(face_rects, key=lambda rect: rect.area())
            print("largest face: ", largest_face)

            avg_gap[i] = {}
            prev_lip_dist[i] = {}
            # NOTE(review): j is dlib's detection order, which is not
            # guaranteed stable across frames — confirm faces keep their
            # relative order between sampled frames.
            for (j,rect) in enumerate(face_rects):
                # Ignore background faces much smaller than the largest one.
                if (rect.area() / largest_face.area()) < FACE_AREA:
                    print("Lip skipped: ", j, rect)
                    continue

                prev_lip_dist[i][j] = 0
                landmark = landmark_detector(image, rect)

                # Inner-lip landmark pairs (upper/lower): 61-67, 62-66, 63-65.
                part_61 = (landmark.part(61).x,landmark.part(61).y)
                part_67 = (landmark.part(67).x,landmark.part(67).y)
                part_62 = (landmark.part(62).x,landmark.part(62).y)
                part_66 = (landmark.part(66).x,landmark.part(66).y)
                part_63 = (landmark.part(63).x,landmark.part(63).y)
                part_65 = (landmark.part(65).x,landmark.part(65).y)
                A = dist(part_61, part_67)
                B = dist(part_62, part_66)
                C = dist(part_63, part_65)

                # Mean vertical mouth opening for this face in this frame.
                avg_gap[i][j] = (A + B + C) / 3.0

                # Remember where this face's lips first appeared; this is the
                # coordinate eventually returned for the winning face.
                if j not in lip_coords:
                    lip_coords[j] = part_65

                # Very first measurement (any frame, any face): seed the
                # baseline and skip the comparison.
                if start_flag==False:
                    prev_lip_dist[i][j] = avg_gap[i][j]
                    start_flag = True
                    continue

                # NOTE(review): prev_lip_dist[i] is a fresh dict for every
                # frame and prev_lip_dist[i][j] was reset to 0 above, so apart
                # from the seeded first sample this compares the gap against 0
                # rather than against the previous frame's gap — looks
                # unintended (prev from frame i-1 is never consulted); confirm.
                print("Difference for frame {0}, lip {1}: {2}".format( i, j, abs(avg_gap[i][j] - prev_lip_dist[i][j])) )
                if abs(avg_gap[i][j] - prev_lip_dist[i][j]) > THETA1:
                    lip_motion_count[j] = lip_motion_count.get(j,0) + 1
                prev_lip_dist[i][j] = avg_gap[i][j]

    print("Lip motion count, total_frames_selected = ", lip_motion_count, total_frames_selected)

    try:
        # The face with the most motion events wins, provided it moved in
        # more than THETA2 of the possible frame-to-frame comparisons.
        max_lip_index = max(lip_motion_count, key=lip_motion_count.get)
        if lip_motion_count[max_lip_index] / (total_frames_selected-1) > THETA2:
            return lip_coords[max_lip_index]
        else:
            return (-1,-1)
    except ValueError:
        # max() on an empty dict: no motion was ever recorded.
        return (-1,-1)
    except ZeroDivisionError:
        # total_frames_selected <= 1: nothing to compare against.
        return (-1,-1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|