Spaces:

JusperLee
/

Dolphin

Running

Dolphin / Inference.py

Hive-Dataset

Change CUDA to CPU

adbe9ab about 2 months ago

26.9 kB

	import warnings
	warnings.filterwarnings("ignore")
	import os
	import argparse
	import face_alignment
	import torch
	import torchaudio
	import numpy as np
	import cv2
	from PIL import Image, ImageDraw
	from moviepy import *
	from collections import deque
	from skimage import transform as tf
	import yaml

	from look2hear.models import Dolphin
	from look2hear.datas.transform import get_preprocessing_pipelines

	from face_detection_utils import detect_faces

	# -- Landmark interpolation:
	def linear_interpolate(landmarks, start_idx, stop_idx):
	start_landmarks = landmarks[start_idx]
	stop_landmarks = landmarks[stop_idx]
	delta = stop_landmarks - start_landmarks
	for idx in range(1, stop_idx-start_idx):
	landmarks[start_idx+idx] = start_landmarks + idx/float(stop_idx-start_idx) * delta
	return landmarks

	# -- Face Transformation
	def warp_img(src, dst, img, std_size):
	tform = tf.estimate_transform('similarity', src, dst) # find the transformation matrix
	warped = tf.warp(img, inverse_map=tform.inverse, output_shape=std_size) # wrap the frame image
	warped = warped * 255 # note output from wrap is double image (value range [0,1])
	warped = warped.astype('uint8')
	return warped, tform

	def apply_transform(transform, img, std_size):
	warped = tf.warp(img, inverse_map=transform.inverse, output_shape=std_size)
	warped = warped * 255 # note output from wrap is double image (value range [0,1])
	warped = warped.astype('uint8')
	return warped

	# -- Crop
	def cut_patch(img, landmarks, height, width, threshold=5):

	center_x, center_y = np.mean(landmarks, axis=0)

	if center_y - height < 0:
	center_y = height
	if center_y - height < 0 - threshold:
	raise Exception('too much bias in height')
	if center_x - width < 0:
	center_x = width
	if center_x - width < 0 - threshold:
	raise Exception('too much bias in width')

	if center_y + height > img.shape[0]:
	center_y = img.shape[0] - height
	if center_y + height > img.shape[0] + threshold:
	raise Exception('too much bias in height')
	if center_x + width > img.shape[1]:
	center_x = img.shape[1] - width
	if center_x + width > img.shape[1] + threshold:
	raise Exception('too much bias in width')

	cutted_img = np.copy(img[ int(round(center_y) - round(height)): int(round(center_y) + round(height)),
	int(round(center_x) - round(width)): int(round(center_x) + round(width))])
	return cutted_img

	# -- RGB to GRAY
	def convert_bgr2gray(data):
	return np.stack([cv2.cvtColor(_, cv2.COLOR_BGR2GRAY) for _ in data], axis=0)


	def save2npz(filename, data=None):
	assert data is not None, "data is {}".format(data)
	if not os.path.exists(os.path.dirname(filename)):
	os.makedirs(os.path.dirname(filename))
	np.savez_compressed(filename, data=data)

	def read_video(filename):
	"""Read video frames using MoviePy for better compatibility"""
	try:
	video_clip = VideoFileClip(filename)
	for frame in video_clip.iter_frames():
	# Convert RGB to BGR to match cv2 format
	frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
	yield frame_bgr
	video_clip.close()
	except Exception as e:
	print(f"Error reading video {filename}: {e}")
	return

	def face2head(boxes, scale=1.5):
	new_boxes = []
	for box in boxes:
	width = box[2] - box[0]
	height= box[3] - box[1]
	width_center = (box[2] + box[0]) / 2
	height_center = (box[3] + box[1]) / 2
	square_width = int(max(width, height) * scale)
	new_box = [width_center - square_width/2, height_center - square_width/2, width_center + square_width/2, height_center + square_width/2]
	new_boxes.append(new_box)
	return new_boxes

	def bb_intersection_over_union(boxA, boxB):
	# determine the (x, y)-coordinates of the intersection rectangle
	xA = max(boxA[0], boxB[0])
	yA = max(boxA[1], boxB[1])
	xB = min(boxA[2], boxB[2])
	yB = min(boxA[3], boxB[3])
	# compute the area of intersection rectangle
	interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
	# compute the area of both the prediction and ground-truth
	# rectangles
	boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
	boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
	# compute the intersection over union by taking the intersection
	# area and dividing it by the sum of prediction + ground-truth
	# areas - the interesection area
	iou = interArea / float(boxAArea + boxBArea - interArea)
	# return the intersection over union value
	return iou

	def detectface(video_input_path, output_path, detect_every_N_frame, scalar_face_detection, number_of_speakers):
	device = torch.device("cpu")
	print('Running on device: {}'.format(device))
	os.makedirs(os.path.join(output_path, 'faces'), exist_ok=True)
	os.makedirs(os.path.join(output_path, 'landmark'), exist_ok=True)

	landmarks_dic = {}
	faces_dic = {}
	boxes_dic = {}

	for i in range(number_of_speakers):
	landmarks_dic[i] = []
	faces_dic[i] = []
	boxes_dic[i] = []

	video_clip = VideoFileClip(video_input_path)
	print("Video statistics: ", video_clip.w, video_clip.h, (video_clip.w, video_clip.h), video_clip.fps)
	frames = [Image.fromarray(frame) for frame in video_clip.iter_frames()]
	print('Number of frames in video: ', len(frames))
	video_clip.close()
	fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=False, device='cpu')

	for i, frame in enumerate(frames):
	print('\rTracking frame: {}'.format(i + 1), end='')

	# Detect faces every N frames
	if i % detect_every_N_frame == 0:
	frame_array = np.array(frame)

	detected_boxes, _ = detect_faces(
	frame_array,
	threshold=0.9,
	allow_upscaling=False,
	)

	if detected_boxes is None or len(detected_boxes) == 0:
	detected_boxes, _ = detect_faces(
	frame_array,
	threshold=0.7,
	allow_upscaling=True,
	)

	if detected_boxes is not None and len(detected_boxes) > 0:
	detected_boxes = detected_boxes[:number_of_speakers]
	detected_boxes = face2head(detected_boxes, scalar_face_detection)
	else:
	detected_boxes = []

	# Process the detection results
	if i == 0:
	# First frame - initialize tracking
	if len(detected_boxes) < number_of_speakers:
	raise ValueError(f"First frame must detect at least {number_of_speakers} faces, but only found {len(detected_boxes)}")

	# Assign first detections to speakers in order
	for j in range(number_of_speakers):
	box = detected_boxes[j]
	face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
	preds = fa.get_landmarks(np.array(face))

	if preds is None:
	raise ValueError(f"Face landmarks not detected in initial frame for speaker {j}")

	faces_dic[j].append(face)
	landmarks_dic[j].append(preds)
	boxes_dic[j].append(box)
	else:
	# For subsequent frames, match detected boxes to speakers
	matched_speakers = set()
	speaker_boxes = [None] * number_of_speakers

	# Match each detected box to the most likely speaker
	for box in detected_boxes:
	iou_scores = []
	for speaker_id in range(number_of_speakers):
	if speaker_id in matched_speakers:
	iou_scores.append(-1) # Already matched
	else:
	last_box = boxes_dic[speaker_id][-1]
	iou_score = bb_intersection_over_union(box, last_box)
	iou_scores.append(iou_score)

	if max(iou_scores) > 0: # Valid match found
	best_speaker = iou_scores.index(max(iou_scores))
	speaker_boxes[best_speaker] = box
	matched_speakers.add(best_speaker)

	# Process each speaker
	for speaker_id in range(number_of_speakers):
	if speaker_boxes[speaker_id] is not None:
	# Use detected box
	box = speaker_boxes[speaker_id]
	else:
	# Use previous box for this speaker
	box = boxes_dic[speaker_id][-1]

	# Extract face and landmarks
	face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
	preds = fa.get_landmarks(np.array(face))

	if preds is None:
	# Use previous landmarks if detection fails
	preds = landmarks_dic[speaker_id][-1]

	faces_dic[speaker_id].append(face)
	landmarks_dic[speaker_id].append(preds)
	boxes_dic[speaker_id].append(box)

	# Verify all speakers have same number of frames
	frame_counts = [len(boxes_dic[s]) for s in range(number_of_speakers)]
	print(f"\nFrame counts per speaker: {frame_counts}")
	assert all(count == len(frames) for count in frame_counts), f"Inconsistent frame counts: {frame_counts}"

	# Continue with saving videos and landmarks...
	for s in range(number_of_speakers):
	frames_tracked = []
	for i, frame in enumerate(frames):
	frame_draw = frame.copy()
	draw = ImageDraw.Draw(frame_draw)
	draw.rectangle(boxes_dic[s][i], outline=(255, 0, 0), width=6)
	frames_tracked.append(frame_draw)

	# Save tracked video
	tracked_frames = [np.array(frame) for frame in frames_tracked]
	if tracked_frames:
	tracked_clip = ImageSequenceClip(tracked_frames, fps=25.0)
	tracked_video_path = os.path.join(output_path, 'video_tracked' + str(s+1) + '.mp4')
	tracked_clip.write_videofile(tracked_video_path, codec='libx264', audio=False, logger=None)
	tracked_clip.close()

	# Save landmarks
	for i in range(number_of_speakers):
	save2npz(os.path.join(output_path, 'landmark', 'speaker' + str(i+1)+'.npz'), data=landmarks_dic[i])

	# Save face video
	face_frames = [np.array(frame) for frame in faces_dic[i]]
	if face_frames:
	face_clip = ImageSequenceClip(face_frames, fps=25.0)
	face_video_path = os.path.join(output_path, 'faces', 'speaker' + str(i+1) + '.mp4')
	face_clip.write_videofile(face_video_path, codec='libx264', audio=False, logger=None)
	face_clip.close()

	# Output video path
	parts = video_input_path.split('/')
	video_name = parts[-1][:-4]
	if not os.path.exists(os.path.join(output_path, 'filename_input')):
	os.mkdir(os.path.join(output_path, 'filename_input'))
	csvfile = open(os.path.join(output_path, 'filename_input', str(video_name) + '.csv'), 'w')
	for i in range(number_of_speakers):
	csvfile.write('speaker' + str(i+1)+ ',0\n')
	csvfile.close()
	return os.path.join(output_path, 'filename_input', str(video_name) + '.csv')


	def crop_patch(mean_face_landmarks, video_pathname, landmarks, window_margin, start_idx, stop_idx, crop_height, crop_width, STD_SIZE=(256, 256)):

	"""Crop mouth patch
	:param str video_pathname: pathname for the video_dieo
	:param list landmarks: interpolated landmarks
	"""

	stablePntsIDs = [33, 36, 39, 42, 45]

	frame_idx = 0
	frame_gen = read_video(video_pathname)
	while True:
	try:
	frame = frame_gen.__next__() ## -- BGR
	except StopIteration:
	break
	if frame_idx == 0:
	q_frame, q_landmarks = deque(), deque()
	sequence = []

	q_landmarks.append(landmarks[frame_idx])
	q_frame.append(frame)
	if len(q_frame) == window_margin:
	smoothed_landmarks = np.mean(q_landmarks, axis=0)
	cur_landmarks = q_landmarks.popleft()
	cur_frame = q_frame.popleft()
	# -- affine transformation
	trans_frame, trans = warp_img( smoothed_landmarks[stablePntsIDs, :],
	mean_face_landmarks[stablePntsIDs, :],
	cur_frame,
	STD_SIZE)
	trans_landmarks = trans(cur_landmarks)
	# -- crop mouth patch
	sequence.append( cut_patch( trans_frame,
	trans_landmarks[start_idx:stop_idx],
	crop_height//2,
	crop_width//2,))
	if frame_idx == len(landmarks)-1:
	#deal with corner case with video too short
	if len(landmarks) < window_margin:
	smoothed_landmarks = np.mean(q_landmarks, axis=0)
	cur_landmarks = q_landmarks.popleft()
	cur_frame = q_frame.popleft()

	# -- affine transformation
	trans_frame, trans = warp_img(smoothed_landmarks[stablePntsIDs, :],
	mean_face_landmarks[stablePntsIDs, :],
	cur_frame,
	STD_SIZE)
	trans_landmarks = trans(cur_landmarks)
	# -- crop mouth patch
	sequence.append(cut_patch( trans_frame,
	trans_landmarks[start_idx:stop_idx],
	crop_height//2,
	crop_width//2,))

	while q_frame:
	cur_frame = q_frame.popleft()
	# -- transform frame
	trans_frame = apply_transform( trans, cur_frame, STD_SIZE)
	# -- transform landmarks
	trans_landmarks = trans(q_landmarks.popleft())
	# -- crop mouth patch
	sequence.append( cut_patch( trans_frame,
	trans_landmarks[start_idx:stop_idx],
	crop_height//2,
	crop_width//2,))
	return np.array(sequence)
	frame_idx += 1
	return None

	def landmarks_interpolate(landmarks):

	"""Interpolate landmarks
	param list landmarks: landmarks detected in raw videos
	"""

	valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]
	if not valid_frames_idx:
	return None
	for idx in range(1, len(valid_frames_idx)):
	if valid_frames_idx[idx] - valid_frames_idx[idx-1] == 1:
	continue
	else:
	landmarks = linear_interpolate(landmarks, valid_frames_idx[idx-1], valid_frames_idx[idx])
	valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]
	# -- Corner case: keep frames at the beginning or at the end failed to be detected.
	if valid_frames_idx:
	landmarks[:valid_frames_idx[0]] = [landmarks[valid_frames_idx[0]]] * valid_frames_idx[0]
	landmarks[valid_frames_idx[-1]:] = [landmarks[valid_frames_idx[-1]]] * (len(landmarks) - valid_frames_idx[-1])
	valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]
	assert len(valid_frames_idx) == len(landmarks), "not every frame has landmark"
	return landmarks

	def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_gray=False, testset_only=False):
	lines = open(filename_path).read().splitlines()
	lines = list(filter(lambda x: 'test' in x, lines)) if testset_only else lines

	for filename_idx, line in enumerate(lines):

	filename, person_id = line.split(',')
	print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))

	video_pathname = os.path.join(video_direc, filename+'.mp4')
	landmarks_pathname = os.path.join(landmark_direc, filename+'.npz')
	dst_pathname = os.path.join( save_direc, filename+'.npz')

	# if os.path.exists(dst_pathname):
	# continue

	multi_sub_landmarks = np.load(landmarks_pathname, allow_pickle=True)['data']
	landmarks = [None] * len(multi_sub_landmarks)
	for frame_idx in range(len(landmarks)):
	try:
	#landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)]['facial_landmarks'] #original for LRW
	landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)] #VOXCELEB2
	except (IndexError, TypeError):
	continue

	# -- pre-process landmarks: interpolate frames not being detected.
	preprocessed_landmarks = landmarks_interpolate(landmarks)
	if not preprocessed_landmarks:
	continue

	# -- crop
	mean_face_landmarks = np.load('assets/20words_mean_face.npy')
	sequence = crop_patch(mean_face_landmarks, video_pathname, preprocessed_landmarks, 12, 48, 68, 96, 96)
	assert sequence is not None, "cannot crop from {}.".format(filename)

	# -- save
	data = convert_bgr2gray(sequence) if convert_gray else sequence[...,::-1]
	save2npz(dst_pathname, data=data)

	def convert_video_fps(input_file, output_file, target_fps=25):
	"""Convert video to target FPS using moviepy"""
	video = VideoFileClip(input_file)
	video_fps = video.fps

	if video_fps != target_fps:
	video.write_videofile(
	output_file,
	fps=target_fps,
	codec='libx264',
	audio_codec='aac',
	temp_audiofile='temp-audio.m4a',
	remove_temp=True,
	)
	else:
	# If already at target fps, just copy
	import shutil
	shutil.copy2(input_file, output_file)

	video.close()
	print(f'Video has been converted to {target_fps} fps and saved to {output_file}')

	def extract_audio(video_file, audio_output_file, sample_rate=16000):
	"""Extract audio from video using moviepy"""
	video = VideoFileClip(video_file)
	audio = video.audio

	# Save audio with specified sample rate
	audio.write_audiofile(audio_output_file, fps=sample_rate, nbytes=2, codec='pcm_s16le')

	video.close()
	audio.close()

	def merge_video_audio(video_file, audio_file, output_file):
	"""Merge video and audio using moviepy"""
	video = VideoFileClip(video_file)
	audio = AudioFileClip(audio_file)

	# Attach audio (MoviePy v2 renamed set_audio to with_audio)
	set_audio_fn = getattr(video, "set_audio", None)
	if callable(set_audio_fn):
	final_video = set_audio_fn(audio)
	else:
	with_audio_fn = getattr(video, "with_audio", None)
	if not callable(with_audio_fn):
	video.close()
	audio.close()
	raise AttributeError("VideoFileClip object lacks both set_audio and with_audio methods")
	final_video = with_audio_fn(audio)

	# Write the result
	final_video.write_videofile(output_file, codec='libx264', audio_codec='aac', temp_audiofile='temp-audio.m4a', remove_temp=True)

	# Clean up
	video.close()
	audio.close()
	final_video.close()

	def process_video(input_file, output_path, number_of_speakers=2,
	detect_every_N_frame=8, scalar_face_detection=1.5,
	config_path="checkpoints/vox2/conf.yml"):
	"""Main processing function for video speaker separation"""

	device = torch.device("cpu")

	# Create output directory
	os.makedirs(output_path, exist_ok=True)

	# Convert video to 25fps
	temp_25fps_file = os.path.join(output_path, 'temp_25fps.mp4')
	convert_video_fps(input_file, temp_25fps_file, target_fps=25)

	# Detect faces
	filename_path = detectface(video_input_path=temp_25fps_file,
	output_path=output_path,
	detect_every_N_frame=detect_every_N_frame,
	scalar_face_detection=scalar_face_detection,
	number_of_speakers=number_of_speakers)

	# Extract audio
	audio_output = os.path.join(output_path, 'audio.wav')
	extract_audio(temp_25fps_file, audio_output, sample_rate=16000)

	# Crop mouth
	crop_mouth(video_direc=os.path.join(output_path, "faces"),
	landmark_direc=os.path.join(output_path, "landmark"),
	filename_path=filename_path,
	save_direc=os.path.join(output_path, "mouthroi"),
	convert_gray=True,
	testset_only=False)

	# Load model
	audiomodel = Dolphin.from_pretrained("JusperLee/Dolphin").to(device)
	audiomodel.eval()

	# Process each speaker
	with torch.no_grad():
	for i in range(number_of_speakers):
	mouth_roi = np.load(os.path.join(output_path, "mouthroi", f"speaker{i+1}.npz"))["data"]
	mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)

	mix, sr = torchaudio.load(audio_output)
	mix = mix.mean(dim=0).to(device)

	window_size = 4 * sr
	hop_size = 4 * sr

	all_estimates = []

	# 滑动窗口处理
	start_idx = 0
	while start_idx < len(mix):
	end_idx = min(start_idx + window_size, len(mix))
	window_mix = mix[start_idx:end_idx]

	start_frame = int(start_idx / sr * 25)
	end_frame = int(end_idx / sr * 25)
	end_frame = min(end_frame, len(mouth_roi))
	window_mouth_roi = mouth_roi[start_frame:end_frame]

	est_sources = audiomodel(
	window_mix[None],
	torch.from_numpy(window_mouth_roi[None, None]).float().to(device)
	)

	all_estimates.append({
	'start': start_idx,
	'end': end_idx,
	'estimate': est_sources[0].cpu()
	})

	start_idx += hop_size

	if start_idx >= len(mix):
	break

	output_length = len(mix)
	merged_output = torch.zeros(1, output_length, device=device)
	weights = torch.zeros(output_length, device=device)

	for est in all_estimates:
	window_len = est['end'] - est['start']
	hann_window = torch.hann_window(window_len, device=device)

	merged_output[0, est['start']:est['end']] += est['estimate'][0, :window_len] * hann_window
	weights[est['start']:est['end']] += hann_window

	merged_output[:, weights > 0] /= weights[weights > 0]

	torchaudio.save(os.path.join(output_path, f"speaker{i+1}_est.wav"), merged_output, sr)


	# Merge video with separated audio for each speaker
	output_files = []
	for i in range(number_of_speakers):
	video_input = os.path.join(output_path, f"video_tracked{i+1}.mp4")
	audio_input = os.path.join(output_path, f"speaker{i+1}_est.wav")
	video_output = os.path.join(output_path, f"s{i+1}.mp4")

	merge_video_audio(video_input, audio_input, video_output)
	output_files.append(video_output)

	# Clean up temporary file
	if os.path.exists(temp_25fps_file):
	os.remove(temp_25fps_file)

	return output_files

	if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Video Speaker Separation using Dolphin model')
	parser.add_argument('--input', '-i', type=str, required=True,
	help='Path to input video file')
	parser.add_argument('--output', '-o', type=str, default=None,
	help='Output directory path (default: creates directory based on input filename)')
	parser.add_argument('--speakers', '-s', type=int, default=2,
	help='Number of speakers to separate (default: 2)')
	parser.add_argument('--detect-every-n', type=int, default=8,
	help='Detect faces every N frames (default: 8)')
	parser.add_argument('--face-scale', type=float, default=1.5,
	help='Face detection bounding box scale factor (default: 1.5)')
	parser.add_argument('--config', type=str, default="checkpoints/vox2/conf.yml",
	help='Path to model configuration file')

	args = parser.parse_args()

	# 验证输入文件是否存在
	if not os.path.exists(args.input):
	print(f"Error: Input file '{args.input}' does not exist")
	exit(1)

	# 如果没有指定输出路径，基于输入文件名创建输出目录
	if args.output is None:
	input_basename = os.path.splitext(os.path.basename(args.input))[0]
	args.output = os.path.join(os.path.dirname(args.input), input_basename + "_output")

	print(f"Processing video: {args.input}")
	print(f"Output directory: {args.output}")
	print(f"Number of speakers: {args.speakers}")
	print("Running in CPU mode")

	# 处理视频
	output_files = process_video(
	input_file=args.input,
	output_path=args.output,
	number_of_speakers=args.speakers,
	detect_every_N_frame=args.detect_every_n,
	scalar_face_detection=args.face_scale,
	config_path=args.config,
	)

	print("\nProcessing completed!")
	print("Output files:")
	for i, output_file in enumerate(output_files):
	print(f" Speaker {i+1}: {output_file}")