Test / backend /speech_bubble /lip_detection.py
3v324v23's picture
Update Comic123 with local comic folder files
83e35a7
import dlib
import cv2
import os
import srt
import re
from math import floor,sqrt
from backend.utils import convert_to_css_pixel
# Some constants
THETA1 = 1.2 # Difference between lip distance of prev and curr frame
THETA2 = 0.4 # No. of lips crossed ratio
SAMPLE_RATE = 5
FACE_AREA = 0.6
# Face detector and landmark detector
face_detector = dlib.get_frontal_face_detector()
landmark_detector = dlib.shape_predictor("backend/speech_bubble/shape_predictor_68_face_landmarks.dat")
def dist(p1, p2):
p1_x = p1[0]
p2_x = p2[0]
p1_y = p1[1]
p2_y = p2[1]
dist = sqrt((p2_x - p1_x) ** 2 + (p2_y - p1_y) ** 2)
return dist
# Checks if 2 face rectangles have the same area using their top-left and bottom-right corners
def similar_to_keyframe(face_rects, keyframe_face_rects):
rect1_top_left = face_rects[0].tl_corner()
rect1_bottom_right = face_rects[0].br_corner()
rect2_top_left = keyframe_face_rects[0].tl_corner()
rect2_bottom_right = keyframe_face_rects[0].br_corner()
tolerance = 0.2
def calculate_area(top_left, bottom_right):
width = abs(bottom_right.x - top_left.x)
height = abs(bottom_right.y - top_left.y)
return width * height
area_rect1 = calculate_area(rect1_top_left, rect1_bottom_right)
area_rect2 = calculate_area(rect2_top_left, rect2_bottom_right)
area_tolerance = area_rect1 * tolerance
if abs(area_rect1 - area_rect2) <= area_tolerance:
return True
else:
return False
#crop_coords contain left,right,top,bottom of each frame
def get_lips(video, crop_coords, black_x, black_y):
print(crop_coords)
data=""
with open("test1.srt") as f:
data = f.read()
subs = srt.parse(data)
lips = {}
for sub in subs:
keyframe_path = f"frames/final/frame{sub.index:03}.png"
keyframe = cv2.imread(keyframe_path)
gray = cv2.cvtColor(keyframe,cv2.COLOR_BGR2GRAY) # Convert image into grayscale
face_rects = face_detector(gray,1) # Detect face
print("\nsub:",sub.index)
if sub.content == "((action-scene))":
print("skipping action scene")
lips[sub.index] = (-1,-1)
continue
if len(face_rects) < 1: # No face detected
print("No face detected: ",sub)
lips[sub.index] = (-1,-1)
continue
if len(face_rects) == 1: # 1 face detected: Extract from keyframe itself
rect = face_rects[0]
landmark = landmark_detector(gray, rect) # Detect face landmarks
x,y = convert_to_css_pixel(landmark.part(65).x, landmark.part(65).y, crop_coords[sub.index - 1])
lips[sub.index] = (x,y)
continue
if len(face_rects) > 1: # Too many face detected
print("Too many face: sub_",sub.index,": ", len(face_rects))
origin = (crop_coords[sub.index - 1][0] , crop_coords[sub.index - 1][2] ) # (left,top)
lip_coords = get_multi_speaker_lips(sub,video,face_rects)
if lip_coords == (-1,-1):
lips[sub.index] = (-1,-1)
else:
x = lip_coords[0] - (origin[0] + black_x)
y = lip_coords[1] - (origin[1] + black_y)
x , y = convert_to_css_pixel(x,y,crop_coords[sub.index - 1])
lips[sub.index] = (x,y)
continue
print(lips)
return lips
def get_multi_speaker_lips(sub,video, keyframe_face_rects):
start_time = sub.start.total_seconds()
end_time = sub.end.total_seconds()
keyframe_path = f"frames/final/frame{sub.index:03}.png"
vid = cv2.VideoCapture(video) # Read video
frames_per_sec = vid.get(cv2.CAP_PROP_FPS) # Number of frames per second
# total_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
# frames_count = total_frames // frameRate
# Calculate the frame skip value
select_index = floor(frames_per_sec / SAMPLE_RATE) # Select every (skip_rate)'th position frames to get the SAMPLE_RATE number of frames per second
start_frame = int(start_time * frames_per_sec)
end_frame = int(end_time * frames_per_sec)
vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
print("FPS, select index = ", frames_per_sec, select_index)
# Initialize frame counter
current_frame = start_frame
total_frames_selected = 0
# Parse into frames
frame_buffer = [] # A list to hold frame images
frame_buffer_color = [] # A list to hold original frame images
while(current_frame<end_frame):
success, frame = vid.read() # Read frame
if not success:
break
if current_frame % select_index == 0: # Break if no frame to read left
gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY) # Convert image into grayscale
frame_buffer.append(gray) # Add image to the frame buffer
frame_buffer_color.append(frame)
total_frames_selected += 1
current_frame += 1
vid.release()
prev_lip_dist = {} #2D[i][j]
lip_motion_count = {} #1D[j]
lip_coords = {} #1D[j]
avg_gap = {} #2D[i][j]
start_flag = False #To skip the lip distance calculation for first frame
for (i, image) in enumerate(frame_buffer): # Iterate on frame buffer
face_rects = face_detector(image,1) # Detect face
if len(face_rects) < 1: # No face detected
print("No face detected: frame ",i)
continue
if len(face_rects) >= 1: # Too many face detected
# Check if area of the first face rectangle is close to keyframe
if not similar_to_keyframe(face_rects, keyframe_face_rects):
print("frame not similar: ",i)
continue
largest_face = max(face_rects, key=lambda rect: rect.area())
print("largest face: ", largest_face)
avg_gap[i] = {}
prev_lip_dist[i] = {}
for (j,rect) in enumerate(face_rects):
if (rect.area() / largest_face.area()) < FACE_AREA: #Consider lip only if face area crosses a threshold(ROI)
print("Lip skipped: ", j, rect)
continue
prev_lip_dist[i][j] = 0
landmark = landmark_detector(image, rect) # Detect face landmarks
# landmark = shape_to_list(landmark)
part_61 = (landmark.part(61).x,landmark.part(61).y)
part_67 = (landmark.part(67).x,landmark.part(67).y)
part_62 = (landmark.part(62).x,landmark.part(62).y)
part_66 = (landmark.part(66).x,landmark.part(66).y)
part_63 = (landmark.part(63).x,landmark.part(63).y)
part_65 = (landmark.part(65).x,landmark.part(65).y)
A = dist(part_61, part_67)
B = dist(part_62, part_66)
C = dist(part_63, part_65)
avg_gap[i][j] = (A + B + C) / 3.0
# Store lip coordinate if encountered for first time
if j not in lip_coords:
lip_coords[j] = part_65
# Loop runs for the first time
if start_flag==False:
prev_lip_dist[i][j] = avg_gap[i][j]
start_flag = True
continue
# Check if lip distance between continous frame is above threshold, if so increase lip count
print("Difference for frame {0}, lip {1}: {2}".format( i, j, abs(avg_gap[i][j] - prev_lip_dist[i][j])) )
if abs(avg_gap[i][j] - prev_lip_dist[i][j]) > THETA1:
lip_motion_count[j] = lip_motion_count.get(j,0) + 1
prev_lip_dist[i][j] = avg_gap[i][j]
print("Lip motion count, total_frames_selected = ", lip_motion_count, total_frames_selected)
# print("max lip count ratio = ", lip_motion_count / (total_frames_selected-1))
try:
max_lip_index = max(lip_motion_count, key=lip_motion_count.get)
# max_value = lip_motion_count[max_lip_index]
# if max_lip_count / (total_frames_selected-1) > THETA2:
# print("speaking")
if lip_motion_count[max_lip_index] / (total_frames_selected-1) > THETA2:
return lip_coords[max_lip_index]
else:
return (-1,-1)
except ValueError:
return (-1,-1)
except ZeroDivisionError:
return (-1,-1)