Test

Paused

App Files Files Community

Test / backend /speech_bubble /lip_detection.py

3v324v23

Update Comic123 with local comic folder files

83e35a7 3 months ago

raw

history blame contribute delete

8.7 kB

	import dlib
	import cv2
	import os
	import srt
	import re
	from math import floor,sqrt
	from backend.utils import convert_to_css_pixel

	# Tunable thresholds used by the lip-motion heuristics in this module
	THETA1 = 1.2 # Lip-gap difference between previous and current frame above which a lip counts as having moved
	THETA2 = 0.4 # Minimum ratio of lip-motion count to (sampled frames - 1) for a face to be reported as the speaker
	SAMPLE_RATE = 5 # Target number of frames to sample per second of video
	FACE_AREA = 0.6 # Minimum face area, as a fraction of the largest face in the frame, for its lips to be considered

	# Face detector and landmark detector, both created once when the module is imported.
	# NOTE(review): the model path is relative, so it presumably resolves against the
	# process working directory - confirm how the app is launched.
	face_detector = dlib.get_frontal_face_detector()
	landmark_detector = dlib.shape_predictor("backend/speech_bubble/shape_predictor_68_face_landmarks.dat")


	def dist(p1, p2):
	    """Return the Euclidean distance between two 2-D points.

	    Args:
	        p1: First point as an indexable ``(x, y)`` pair.
	        p2: Second point as an indexable ``(x, y)`` pair.

	    Returns:
	        The straight-line distance between ``p1`` and ``p2`` as a float.
	    """
	    delta_x = p2[0] - p1[0]
	    delta_y = p2[1] - p1[1]
	    return sqrt(delta_x ** 2 + delta_y ** 2)

	# Checks whether the first face rectangle of a frame and of the keyframe have a similar area
	def similar_to_keyframe(face_rects, keyframe_face_rects, tolerance=0.2):
	    """Report whether a frame's first face is close in area to the keyframe's first face.

	    Only the first rectangle of each sequence is compared; any further
	    rectangles are ignored.

	    Args:
	        face_rects: Face rectangles detected in the candidate frame. Each
	            item must provide ``tl_corner()`` and ``br_corner()`` returning
	            points with ``x`` and ``y`` attributes.
	        keyframe_face_rects: Face rectangles detected in the keyframe, with
	            the same interface as ``face_rects``.
	        tolerance: Allowed area difference, expressed as a fraction of the
	            candidate frame's first face area. Defaults to 0.2.

	    Returns:
	        True when the absolute area difference is within the tolerance,
	        False otherwise (including when either sequence is empty).
	    """
	    # Nothing to compare against: treat as "not similar" instead of raising IndexError.
	    if len(face_rects) < 1 or len(keyframe_face_rects) < 1:
	        return False

	    def rect_area(rect):
	        top_left = rect.tl_corner()
	        bottom_right = rect.br_corner()
	        return abs(bottom_right.x - top_left.x) * abs(bottom_right.y - top_left.y)

	    frame_area = rect_area(face_rects[0])
	    keyframe_area = rect_area(keyframe_face_rects[0])

	    # The tolerance is relative to the candidate frame's face, not the keyframe's.
	    return abs(frame_area - keyframe_area) <= frame_area * tolerance

	# crop_coords contains (left, right, top, bottom) for each frame
	def get_lips(video, crop_coords, black_x, black_y):
	    """Locate a lip position for every subtitle listed in ``test1.srt``.

	    For each subtitle the matching keyframe image
	    (``frames/final/frame<index>.png``) is scanned for faces:

	    * action scenes, unreadable keyframes and keyframes without a face map
	      to the sentinel ``(-1, -1)``;
	    * a single face uses landmark 65 of that face directly;
	    * several faces defer to ``get_multi_speaker_lips`` to pick a speaker
	      from the video.

	    Args:
	        video: Video source handed on to ``get_multi_speaker_lips``.
	        crop_coords: Per-frame crop data indexed by ``sub.index - 1``;
	            element 0 is read as left and element 2 as top.
	        black_x: Horizontal offset subtracted, together with the crop's left
	            edge, from multi-speaker lip coordinates.
	            NOTE(review): presumably the width of a black border - confirm
	            against the caller.
	        black_y: Vertical counterpart of ``black_x``.

	    Returns:
	        Dict mapping subtitle index to an ``(x, y)`` pair as produced by
	        ``convert_to_css_pixel``, or ``(-1, -1)`` when no lip was found.
	    """
	    print(crop_coords)
	    with open("test1.srt") as f:
	        subs = srt.parse(f.read())

	    lips = {}
	    for sub in subs:
	        print("\nsub:",sub.index)

	        # Action scenes never get a lip position, so skip the image load and
	        # face detection for them entirely.
	        if sub.content == "((action-scene))":
	            print("skipping action scene")
	            lips[sub.index] = (-1,-1)
	            continue

	        keyframe_path = f"frames/final/frame{sub.index:03}.png"
	        keyframe = cv2.imread(keyframe_path)
	        # cv2.imread signals a missing/unreadable file by returning None
	        # rather than raising, which would otherwise crash cvtColor below.
	        if keyframe is None:
	            print("Keyframe not readable: ", keyframe_path)
	            lips[sub.index] = (-1,-1)
	            continue

	        gray = cv2.cvtColor(keyframe,cv2.COLOR_BGR2GRAY) # Convert image into grayscale
	        face_rects = face_detector(gray,1) # Detect faces
	        frame_crop = crop_coords[sub.index - 1]

	        if len(face_rects) < 1: # No face detected
	            print("No face detected: ",sub)
	            lips[sub.index] = (-1,-1)
	        elif len(face_rects) == 1: # 1 face detected: extract from the keyframe itself
	            landmark = landmark_detector(gray, face_rects[0]) # Detect face landmarks
	            lips[sub.index] = convert_to_css_pixel(landmark.part(65).x, landmark.part(65).y, frame_crop)
	        else: # Several faces detected: decide who is speaking from the video
	            print("Too many face: sub_",sub.index,": ", len(face_rects))
	            origin = (frame_crop[0], frame_crop[2]) # (left, top)
	            lip_coords = get_multi_speaker_lips(sub,video,face_rects)
	            if lip_coords == (-1,-1):
	                lips[sub.index] = (-1,-1)
	            else:
	                # Shift the coordinates by the crop origin and the black_x/black_y
	                # offsets before converting them.
	                x = lip_coords[0] - (origin[0] + black_x)
	                y = lip_coords[1] - (origin[1] + black_y)
	                lips[sub.index] = convert_to_css_pixel(x,y,frame_crop)

	    print(lips)
	    return lips


	def _sample_gray_frames(video, start_time, end_time):
	    """Return grayscale frames sampled from ``video`` between two timestamps in seconds."""
	    vid = cv2.VideoCapture(video) # Read video
	    try:
	        frames_per_sec = vid.get(cv2.CAP_PROP_FPS) # Number of frames per second

	        # Keep every select_index-th frame so that about SAMPLE_RATE frames per
	        # second are kept. Clamp to 1: with fps below SAMPLE_RATE the floor is 0
	        # and the modulo below would divide by zero.
	        select_index = max(1, floor(frames_per_sec / SAMPLE_RATE))
	        start_frame = int(start_time * frames_per_sec)
	        end_frame = int(end_time * frames_per_sec)

	        vid.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
	        print("FPS, select index = ", frames_per_sec, select_index)

	        frames = []
	        for current_frame in range(start_frame, end_frame):
	            success, frame = vid.read() # Read frame
	            if not success: # No frame left to read
	                break
	            if current_frame % select_index == 0:
	                frames.append(cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY)) # Keep a grayscale copy
	        return frames
	    finally:
	        # Release the capture even if decoding or conversion raises.
	        vid.release()


	def _inner_lip_gap(landmark):
	    """Return the mean distance over the landmark pairs 61-67, 62-66 and 63-65."""
	    total = 0.0
	    for first, second in ((61, 67), (62, 66), (63, 65)):
	        point_a = landmark.part(first)
	        point_b = landmark.part(second)
	        total += dist((point_a.x, point_a.y), (point_b.x, point_b.y))
	    return total / 3.0


	def get_multi_speaker_lips(sub,video, keyframe_face_rects):
	    """Pick the speaking face for a subtitle by counting lip movement in the video.

	    Frames between ``sub.start`` and ``sub.end`` are sampled, faces are
	    detected in each sampled frame, and for every sufficiently large face the
	    mean lip gap is compared with the gap measured for the same face index in
	    the previous frame where it was measured. A change larger than ``THETA1``
	    counts as one lip motion. The face with the most motions is returned when
	    its motion ratio exceeds ``THETA2``.

	    NOTE(review): faces are matched across frames by their position in the
	    detector output, which assumes the detector lists faces in a stable
	    order - confirm this holds for the footage in use.

	    Args:
	        sub: Subtitle with ``start`` and ``end`` timedeltas.
	        video: Video source passed to ``cv2.VideoCapture``.
	        keyframe_face_rects: Faces detected in the subtitle's keyframe; frames
	            whose first face differs too much in area are ignored.

	    Returns:
	        ``(x, y)`` of landmark 65 of the chosen face, taken from the first
	        frame in which that face index was measured, in coordinates of the
	        sampled video frame; ``(-1, -1)`` when no face qualifies.
	    """
	    frame_buffer = _sample_gray_frames(video, sub.start.total_seconds(), sub.end.total_seconds())
	    total_frames_selected = len(frame_buffer)

	    prev_lip_dist = {} # face index -> lip gap from the last frame that face was measured in
	    lip_motion_count = {} # face index -> number of detected lip motions
	    lip_coords = {} # face index -> first seen position of landmark 65

	    for (i, image) in enumerate(frame_buffer): # Iterate on frame buffer
	        face_rects = face_detector(image,1) # Detect faces
	        if len(face_rects) < 1: # No face detected
	            print("No face detected: frame ",i)
	            continue

	        # Check if area of the first face rectangle is close to keyframe
	        if not similar_to_keyframe(face_rects, keyframe_face_rects):
	            print("frame not similar: ",i)
	            continue

	        largest_face = max(face_rects, key=lambda rect: rect.area())
	        print("largest face: ", largest_face)
	        largest_area = largest_face.area()

	        for (j,rect) in enumerate(face_rects):
	            if (rect.area() / largest_area) < FACE_AREA: # Consider lip only if face area crosses a threshold (ROI)
	                print("Lip skipped: ", j, rect)
	                continue

	            landmark = landmark_detector(image, rect) # Detect face landmarks
	            gap = _inner_lip_gap(landmark)

	            # Store lip coordinate if encountered for first time
	            if j not in lip_coords:
	                lip_coords[j] = (landmark.part(65).x,landmark.part(65).y)

	            # Carry the gap across frames per face; the first measurement of a
	            # face has nothing to be compared with.
	            previous_gap = prev_lip_dist.get(j)
	            prev_lip_dist[j] = gap
	            if previous_gap is None:
	                continue

	            # Count a lip motion when the gap changed by more than THETA1
	            difference = abs(gap - previous_gap)
	            print("Difference for frame {0}, lip {1}: {2}".format( i, j, difference) )
	            if difference > THETA1:
	                lip_motion_count[j] = lip_motion_count.get(j,0) + 1

	    print("Lip motion count, total_frames_selected = ", lip_motion_count, total_frames_selected)

	    # No motion recorded, or fewer than two sampled frames (no frame pair to
	    # form a ratio from): nobody can be identified as speaking.
	    if not lip_motion_count or total_frames_selected < 2:
	        return (-1,-1)

	    max_lip_index = max(lip_motion_count, key=lip_motion_count.get)
	    if lip_motion_count[max_lip_index] / (total_frames_selected-1) > THETA2:
	        return lip_coords[max_lip_index]
	    return (-1,-1)