| | import warnings |
| | warnings.filterwarnings("ignore") |
| | import os |
| | import argparse |
| | import face_alignment |
| | import torch |
| | import torchaudio |
| | import numpy as np |
| | import cv2 |
| | from PIL import Image, ImageDraw |
| | from moviepy import * |
| | from collections import deque |
| | from skimage import transform as tf |
| | import yaml |
| |
|
| | from look2hear.models import Dolphin |
| | from look2hear.datas.transform import get_preprocessing_pipelines |
| |
|
| | from face_detection_utils import detect_faces |
| |
|
| | |
def linear_interpolate(landmarks, start_idx, stop_idx):
    """Fill the entries strictly between two anchor frames by linear interpolation.

    The entries at ``start_idx`` and ``stop_idx`` are read but not modified;
    every index in between is overwritten in place with a point on the
    straight line joining the two anchors.

    :param landmarks: indexable, mutable sequence whose anchor entries support
        subtraction, addition and scalar multiplication
    :param int start_idx: index of the first anchor
    :param int stop_idx: index of the second anchor
    :return: the same ``landmarks`` object that was passed in
    """
    first = landmarks[start_idx]
    last = landmarks[stop_idx]
    span = stop_idx - start_idx
    step = last - first
    for offset in range(1, span):
        landmarks[start_idx + offset] = first + offset / float(span) * step
    return landmarks
| |
|
| | |
def warp_img(src, dst, img, std_size):
    """Estimate a similarity transform from ``src`` to ``dst`` and warp ``img`` with it.

    :param src: source points handed to ``tf.estimate_transform``
    :param dst: destination points handed to ``tf.estimate_transform``
    :param img: image to warp
    :param std_size: ``output_shape`` passed to ``tf.warp``
    :return: ``(warped, tform)`` where ``warped`` is the warp result multiplied
        by 255 and cast to ``uint8``, and ``tform`` is the estimated transform
    """
    similarity = tf.estimate_transform('similarity', src, dst)
    resampled = tf.warp(img, inverse_map=similarity.inverse, output_shape=std_size)
    # The warp result is rescaled by 255 before the uint8 cast.
    return (resampled * 255).astype('uint8'), similarity
| |
|
def apply_transform(transform, img, std_size):
    """Warp ``img`` with an already-estimated transform.

    :param transform: object exposing ``.inverse``, used as the ``inverse_map``
        of ``tf.warp``
    :param img: image to warp
    :param std_size: ``output_shape`` passed to ``tf.warp``
    :return: the warp result multiplied by 255 and cast to ``uint8``
    """
    resampled = tf.warp(img, inverse_map=transform.inverse, output_shape=std_size)
    return (resampled * 255).astype('uint8')
| |
|
| | |
def cut_patch(img, landmarks, height, width, threshold=5):
    """Copy a window centred on the mean landmark position out of ``img``.

    The window extends ``height`` rows above/below and ``width`` columns
    left/right of the centre, so the result has ``2 * height`` rows and
    ``2 * width`` columns. A centre too close to a border is moved inwards so
    that the window stays inside the image.

    :param img: image array indexed as ``img[row, col, ...]``
    :param landmarks: points with x in column 0 and y in column 1
    :param height: half of the window height
    :param width: half of the window width
    :param threshold: tolerance used by the bias checks
    :return: a copy of the selected region
    :raises Exception: 'too much bias in height' / 'too much bias in width'

    NOTE(review): every bias check runs *after* the clamp directly above it,
    so with a non-negative ``threshold`` none of the ``raise`` statements can
    fire. The original statement order is kept on purpose to preserve behaviour.
    """
    center_x, center_y = np.mean(landmarks, axis=0)
    n_rows = img.shape[0]
    n_cols = img.shape[1]

    # Top and left borders.
    if center_y - height < 0:
        center_y = height
    if center_y - height < 0 - threshold:
        raise Exception('too much bias in height')
    if center_x - width < 0:
        center_x = width
    if center_x - width < 0 - threshold:
        raise Exception('too much bias in width')

    # Bottom and right borders.
    if center_y + height > n_rows:
        center_y = n_rows - height
    if center_y + height > n_rows + threshold:
        raise Exception('too much bias in height')
    if center_x + width > n_cols:
        center_x = n_cols - width
    if center_x + width > n_cols + threshold:
        raise Exception('too much bias in width')

    row_from = int(round(center_y) - round(height))
    row_to = int(round(center_y) + round(height))
    col_from = int(round(center_x) - round(width))
    col_to = int(round(center_x) + round(width))
    return np.copy(img[row_from:row_to, col_from:col_to])
| |
|
| | |
def convert_bgr2gray(data):
    """Convert each BGR frame in ``data`` to grayscale and stack the results.

    :param data: iterable of frames accepted by ``cv2.cvtColor``
    :return: array with the grayscale frames stacked along a new first axis
    """
    gray_frames = []
    for bgr_frame in data:
        gray_frames.append(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2GRAY))
    return np.stack(gray_frames, axis=0)
| |
|
| |
|
def save2npz(filename, data=None):
    """Save ``data`` under the key ``'data'`` in a compressed ``.npz`` archive.

    The parent directory of ``filename`` is created when it does not exist.

    :param str filename: destination path; may be a bare file name
    :param data: object to store; must not be ``None``
    :raises AssertionError: if ``data`` is ``None``
    """
    assert data is not None, "data is {}".format(data)
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs('') raises;
    # exist_ok avoids the check-then-create race of the exists() test.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    np.savez_compressed(filename, data=data)
| | |
def read_video(filename):
    """Read video frames using MoviePy for better compatibility.

    Generator that yields every frame of ``filename`` after converting it from
    RGB to BGR with OpenCV. Errors while opening or decoding are reported on
    stdout and end the iteration instead of propagating (best-effort reader).

    :param str filename: path of the video to read
    :return: generator of BGR frames
    """
    try:
        video_clip = VideoFileClip(filename)
    except Exception as e:
        print(f"Error reading video {filename}: {e}")
        return

    try:
        for frame in video_clip.iter_frames():
            yield cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    except Exception as e:
        print(f"Error reading video {filename}: {e}")
    finally:
        # Runs on normal exhaustion, on decode errors, and when the consumer
        # abandons the generator early, so the reader is always released.
        video_clip.close()
| |
|
def face2head(boxes, scale=1.5):
    """Turn each face box into a square box around the same centre.

    Boxes are indexed as ``[x1, y1, x2, y2]``. The side of the square is the
    longer box side multiplied by ``scale`` and truncated to an integer.

    :param boxes: iterable of boxes
    :param float scale: enlargement factor applied to the longer side
    :return: list of ``[x1, y1, x2, y2]`` lists, one per input box
    """
    def _square_around(box):
        left, top, right, bottom = box[0], box[1], box[2], box[3]
        side = int(max(right - left, bottom - top) * scale)
        mid_x = (right + left) / 2
        mid_y = (bottom + top) / 2
        return [mid_x - side / 2, mid_y - side / 2, mid_x + side / 2, mid_y + side / 2]

    return [_square_around(box) for box in boxes]
| |
|
def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Widths and heights are computed with a ``+ 1`` on each extent, i.e. both
    corner coordinates are counted as belonging to the box.

    :param boxA: first box
    :param boxB: second box
    :return: IoU as a float; ``0.0`` when the boxes do not overlap
    """
    def _area(box):
        return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)

    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Negative extents mean no overlap along that axis.
    overlap = max(0, right - left + 1) * max(0, bottom - top + 1)

    union = _area(boxA) + _area(boxB) - overlap
    return overlap / float(union)
| |
|
def detectface(video_input_path, output_path, detect_every_N_frame, scalar_face_detection, number_of_speakers):
    """Track ``number_of_speakers`` faces through a video and save per-speaker outputs.

    Files written below ``output_path`` (``k`` is the 1-based speaker number):

    * ``video_tracked{k}.mp4`` - the input frames with the speaker's box drawn
    * ``faces/speaker{k}.mp4`` - the 224x224 face crops
    * ``landmark/speaker{k}.npz`` - the per-frame landmark predictions
    * ``filename_input/<video name>.csv`` - one ``speaker{k},0`` line per speaker

    Output clips are written at a fixed 25 fps.

    :param str video_input_path: video to analyse
    :param str output_path: directory receiving the outputs
    :param int detect_every_N_frame: face detection runs on frames whose index
        is a multiple of this value; other frames reuse the latest detections
    :param float scalar_face_detection: ``scale`` forwarded to ``face2head``
    :param int number_of_speakers: number of faces to track
    :return: path of the CSV file
    :raises ValueError: if the first frame yields fewer boxes than
        ``number_of_speakers`` or no landmarks for one of them
    """
    device = torch.device("cpu")
    print('Running on device: {}'.format(device))
    os.makedirs(os.path.join(output_path, 'faces'), exist_ok=True)
    os.makedirs(os.path.join(output_path, 'landmark'), exist_ok=True)

    landmarks_dic = {speaker: [] for speaker in range(number_of_speakers)}
    faces_dic = {speaker: [] for speaker in range(number_of_speakers)}
    boxes_dic = {speaker: [] for speaker in range(number_of_speakers)}

    video_clip = VideoFileClip(video_input_path)
    try:
        print("Video statistics: ", video_clip.w, video_clip.h, (video_clip.w, video_clip.h), video_clip.fps)
        frames = [Image.fromarray(frame) for frame in video_clip.iter_frames()]
    finally:
        # Release the reader even if decoding fails part-way through.
        video_clip.close()
    print('Number of frames in video: ', len(frames))

    fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=False, device='cpu')

    for i, frame in enumerate(frames):
        print('\rTracking frame: {}'.format(i + 1), end='')

        # Frame 0 always satisfies this test, so detected_boxes is bound
        # before any frame that reuses it.
        if i % detect_every_N_frame == 0:
            frame_array = np.array(frame)

            detected_boxes, _ = detect_faces(
                frame_array,
                threshold=0.9,
                allow_upscaling=False,
            )

            # Second attempt with a lower threshold and upscaling allowed.
            if detected_boxes is None or len(detected_boxes) == 0:
                detected_boxes, _ = detect_faces(
                    frame_array,
                    threshold=0.7,
                    allow_upscaling=True,
                )

            if detected_boxes is not None and len(detected_boxes) > 0:
                detected_boxes = detected_boxes[:number_of_speakers]
                detected_boxes = face2head(detected_boxes, scalar_face_detection)
            else:
                detected_boxes = []

        if i == 0:
            # The first frame seeds the tracks: speaker j gets the j-th box.
            if len(detected_boxes) < number_of_speakers:
                raise ValueError(f"First frame must detect at least {number_of_speakers} faces, but only found {len(detected_boxes)}")

            for j in range(number_of_speakers):
                box = detected_boxes[j]
                face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
                preds = fa.get_landmarks(np.array(face))

                if preds is None:
                    raise ValueError(f"Face landmarks not detected in initial frame for speaker {j}")

                faces_dic[j].append(face)
                landmarks_dic[j].append(preds)
                boxes_dic[j].append(box)
        else:
            matched_speakers = set()
            speaker_boxes = [None] * number_of_speakers

            # Greedy matching: each box goes to the not-yet-matched speaker
            # whose previous box overlaps it most; boxes with no overlap at
            # all are dropped.
            for box in detected_boxes:
                iou_scores = []
                for speaker_id in range(number_of_speakers):
                    if speaker_id in matched_speakers:
                        iou_scores.append(-1)
                    else:
                        last_box = boxes_dic[speaker_id][-1]
                        iou_scores.append(bb_intersection_over_union(box, last_box))

                best_score = max(iou_scores)
                if best_score > 0:
                    best_speaker = iou_scores.index(best_score)
                    speaker_boxes[best_speaker] = box
                    matched_speakers.add(best_speaker)

            for speaker_id in range(number_of_speakers):
                # An unmatched speaker keeps the previous box.
                box = speaker_boxes[speaker_id]
                if box is None:
                    box = boxes_dic[speaker_id][-1]

                face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
                preds = fa.get_landmarks(np.array(face))

                # Missing landmarks fall back to the previous frame's landmarks.
                if preds is None:
                    preds = landmarks_dic[speaker_id][-1]

                faces_dic[speaker_id].append(face)
                landmarks_dic[speaker_id].append(preds)
                boxes_dic[speaker_id].append(box)

    frame_counts = [len(boxes_dic[s]) for s in range(number_of_speakers)]
    print(f"\nFrame counts per speaker: {frame_counts}")
    assert all(count == len(frames) for count in frame_counts), f"Inconsistent frame counts: {frame_counts}"

    # One annotated copy of the video per speaker.
    for s in range(number_of_speakers):
        tracked_frames = []
        for i, frame in enumerate(frames):
            frame_draw = frame.copy()
            ImageDraw.Draw(frame_draw).rectangle(boxes_dic[s][i], outline=(255, 0, 0), width=6)
            tracked_frames.append(np.array(frame_draw))

        if tracked_frames:
            tracked_clip = ImageSequenceClip(tracked_frames, fps=25.0)
            tracked_video_path = os.path.join(output_path, 'video_tracked' + str(s+1) + '.mp4')
            tracked_clip.write_videofile(tracked_video_path, codec='libx264', audio=False, logger=None)
            tracked_clip.close()

    for i in range(number_of_speakers):
        save2npz(os.path.join(output_path, 'landmark', 'speaker' + str(i+1)+'.npz'), data=landmarks_dic[i])

        face_frames = [np.array(frame) for frame in faces_dic[i]]
        if face_frames:
            face_clip = ImageSequenceClip(face_frames, fps=25.0)
            face_video_path = os.path.join(output_path, 'faces', 'speaker' + str(i+1) + '.mp4')
            face_clip.write_videofile(face_video_path, codec='libx264', audio=False, logger=None)
            face_clip.close()

    # basename/splitext instead of split('/') and [:-4]: works with any path
    # separator and any extension length.
    video_name = os.path.splitext(os.path.basename(video_input_path))[0]
    csv_dir = os.path.join(output_path, 'filename_input')
    os.makedirs(csv_dir, exist_ok=True)
    csv_path = os.path.join(csv_dir, str(video_name) + '.csv')
    with open(csv_path, 'w') as csvfile:
        for i in range(number_of_speakers):
            csvfile.write('speaker' + str(i+1)+ ',0\n')
    return csv_path
| |
|
| |
|
def crop_patch(mean_face_landmarks, video_pathname, landmarks, window_margin, start_idx, stop_idx, crop_height, crop_width, STD_SIZE=(256, 256)):
    """Crop mouth patch

    Each frame is warped so that its stable landmark points line up with the
    same points of ``mean_face_landmarks``; a patch around the landmarks
    ``start_idx:stop_idx`` is then cut out. The warp of a frame is estimated
    from the mean of the landmarks of the ``window_margin`` frames starting
    at it; the frames left in the queue at the end reuse the last estimate.

    :param mean_face_landmarks: reference landmarks, indexed ``[point, coord]``
    :param str video_pathname: pathname for the video
    :param list landmarks: interpolated landmarks, one entry per frame
    :param int window_margin: number of frames in the smoothing window
    :param int start_idx: first landmark index of the patch region
    :param int stop_idx: end (exclusive) landmark index of the patch region
    :param int crop_height: patch height (halved before ``cut_patch``)
    :param int crop_width: patch width (halved before ``cut_patch``)
    :param STD_SIZE: output shape of the warped frame
    :return: array of patches once the frame with index ``len(landmarks) - 1``
        has been read, or ``None`` if the video ends before that frame
    """
    stablePntsIDs = [33, 36, 39, 42, 45]

    q_frame, q_landmarks = deque(), deque()
    sequence = []

    def _mouth_patch(aligned_frame, aligned_landmarks):
        return cut_patch(aligned_frame,
                         aligned_landmarks[start_idx:stop_idx],
                         crop_height//2,
                         crop_width//2,)

    def _align_oldest():
        # Estimate the warp from the landmark mean over the queue, apply it
        # to the oldest queued frame, and record that frame's patch.
        smoothed = np.mean(q_landmarks, axis=0)
        oldest_landmarks = q_landmarks.popleft()
        oldest_frame = q_frame.popleft()
        aligned, tform = warp_img(smoothed[stablePntsIDs, :],
                                  mean_face_landmarks[stablePntsIDs, :],
                                  oldest_frame,
                                  STD_SIZE)
        sequence.append(_mouth_patch(aligned, tform(oldest_landmarks)))
        return tform

    for frame_idx, frame in enumerate(read_video(video_pathname)):
        q_landmarks.append(landmarks[frame_idx])
        q_frame.append(frame)

        if len(q_frame) == window_margin:
            trans = _align_oldest()

        if frame_idx == len(landmarks)-1:
            # Fewer frames than one full window: estimate once from what is queued.
            if len(landmarks) < window_margin:
                trans = _align_oldest()

            # Flush the remaining frames with the most recent transform.
            while q_frame:
                pending_frame = q_frame.popleft()
                aligned = apply_transform(trans, pending_frame, STD_SIZE)
                sequence.append(_mouth_patch(aligned, trans(q_landmarks.popleft())))
            return np.array(sequence)

    return None
| |
|
def landmarks_interpolate(landmarks):
    """Interpolate landmarks

    Gaps between two detected frames are filled with ``linear_interpolate``;
    missing frames before the first / after the last detection are filled
    with a repeat of the nearest detected entry. The list is modified in place.

    :param list landmarks: landmarks detected in raw videos, ``None`` where
        detection failed
    :return: the filled list, or ``None`` when no frame has landmarks
    """
    def _present(seq):
        return [pos for pos, entry in enumerate(seq) if entry is not None]

    known = _present(landmarks)
    if not known:
        return None

    # Fill every gap between consecutive detections.
    for prev_pos, next_pos in zip(known, known[1:]):
        if next_pos - prev_pos != 1:
            landmarks = linear_interpolate(landmarks, prev_pos, next_pos)

    known = _present(landmarks)
    if known:
        head, tail = known[0], known[-1]
        landmarks[:head] = [landmarks[head]] * head
        landmarks[tail:] = [landmarks[tail]] * (len(landmarks) - tail)

    known = _present(landmarks)
    assert len(known) == len(landmarks), "not every frame has landmark"
    return landmarks
| |
|
def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_gray=False, testset_only=False):
    """Cut mouth patches for every entry of a file list and save them as ``.npz``.

    Each line of ``filename_path`` has the form ``<filename>,<person_id>``.
    For every line the video ``<video_direc>/<filename>.mp4`` and the landmarks
    ``<landmark_direc>/<filename>.npz`` are read, and the patch sequence is
    written to ``<save_direc>/<filename>.npz``. Entries without any usable
    landmark are skipped.

    :param str video_direc: directory containing the ``.mp4`` files
    :param str landmark_direc: directory containing the landmark ``.npz`` files
    :param str filename_path: path of the file list
    :param str save_direc: directory receiving the patch archives
    :param bool convert_gray: store grayscale patches; otherwise the last axis
        of the patches is reversed before saving
    :param bool testset_only: only process lines containing ``'test'``
    :raises AssertionError: if no patch sequence could be cut for an entry
    """
    with open(filename_path) as list_file:
        lines = list_file.read().splitlines()
    if testset_only:
        lines = [line for line in lines if 'test' in line]

    # Loop-invariant; loaded on first use so an empty list needs no asset.
    mean_face_landmarks = None

    for filename_idx, line in enumerate(lines):
        filename, person_id = line.split(',')
        print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))

        video_pathname = os.path.join(video_direc, filename+'.mp4')
        landmarks_pathname = os.path.join(landmark_direc, filename+'.npz')
        dst_pathname = os.path.join( save_direc, filename+'.npz')

        # NOTE(security): allow_pickle=True executes pickled content on load;
        # only point this at landmark archives produced by this pipeline.
        with np.load(landmarks_pathname, allow_pickle=True) as archive:
            multi_sub_landmarks = archive['data']

        landmarks = [None] * len(multi_sub_landmarks)
        for frame_idx in range(len(landmarks)):
            try:
                landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)]
            except (IndexError, TypeError):
                # Frame without an entry for this person: left as None and
                # filled by interpolation below.
                continue

        preprocessed_landmarks = landmarks_interpolate(landmarks)
        if not preprocessed_landmarks:
            continue

        if mean_face_landmarks is None:
            mean_face_landmarks = np.load('assets/20words_mean_face.npy')
        sequence = crop_patch(mean_face_landmarks, video_pathname, preprocessed_landmarks, 12, 48, 68, 96, 96)
        assert sequence is not None, "cannot crop from {}.".format(filename)

        data = convert_bgr2gray(sequence) if convert_gray else sequence[...,::-1]
        save2npz(dst_pathname, data=data)
| |
|
def convert_video_fps(input_file, output_file, target_fps=25):
    """Convert video to target FPS using moviepy.

    When the input already runs at ``target_fps`` the file is copied
    unchanged instead of being re-encoded.

    :param str input_file: source video
    :param str output_file: destination video
    :param target_fps: desired frame rate
    """
    import shutil

    video = VideoFileClip(input_file)
    try:
        if video.fps != target_fps:
            video.write_videofile(
                output_file,
                fps=target_fps,
                codec='libx264',
                audio_codec='aac',
                temp_audiofile='temp-audio.m4a',
                remove_temp=True,
            )
        else:
            shutil.copy2(input_file, output_file)
    finally:
        # Release the reader even when encoding or copying fails.
        video.close()
    print(f'Video has been converted to {target_fps} fps and saved to {output_file}')
| |
|
def extract_audio(video_file, audio_output_file, sample_rate=16000):
    """Extract audio from video using moviepy.

    The audio is written with ``codec='pcm_s16le'`` and ``nbytes=2`` at
    ``sample_rate``.

    :param str video_file: source video
    :param str audio_output_file: destination audio file
    :param int sample_rate: sample rate of the written audio
    :raises ValueError: if the video has no audio track
    """
    video = VideoFileClip(video_file)
    try:
        audio = video.audio
        if audio is None:
            # Without this check the failure is an opaque AttributeError on None.
            raise ValueError(f"Video '{video_file}' has no audio track to extract")
        try:
            audio.write_audiofile(audio_output_file, fps=sample_rate, nbytes=2, codec='pcm_s16le')
        finally:
            audio.close()
    finally:
        video.close()
| |
|
def merge_video_audio(video_file, audio_file, output_file):
    """Merge video and audio using moviepy.

    The audio is attached with ``set_audio`` when the clip provides it and
    with ``with_audio`` otherwise.

    :param str video_file: video whose picture is kept
    :param str audio_file: audio to use as the soundtrack
    :param str output_file: destination video
    :raises AttributeError: if the clip offers neither ``set_audio`` nor
        ``with_audio``
    """
    video = VideoFileClip(video_file)
    try:
        audio = AudioFileClip(audio_file)
        try:
            attach_audio = getattr(video, "set_audio", None)
            if not callable(attach_audio):
                attach_audio = getattr(video, "with_audio", None)
            if not callable(attach_audio):
                raise AttributeError("VideoFileClip object lacks both set_audio and with_audio methods")

            final_video = attach_audio(audio)
            try:
                final_video.write_videofile(output_file, codec='libx264', audio_codec='aac', temp_audiofile='temp-audio.m4a', remove_temp=True)
            finally:
                final_video.close()
        finally:
            audio.close()
    finally:
        # Nested finally blocks release every clip even when encoding fails.
        video.close()
| |
|
def process_video(input_file, output_path, number_of_speakers=2,
                  detect_every_N_frame=8, scalar_face_detection=1.5,
                  config_path="checkpoints/vox2/conf.yml"):
    """Main processing function for video speaker separation.

    Steps: convert the input to 25 fps, track the speakers' faces, extract
    the 16 kHz audio, cut the mouth regions, run the Dolphin model on
    consecutive 4-second windows for each speaker, and mux every estimated
    track with the matching tracked video.

    :param str input_file: video to process
    :param str output_path: directory receiving all outputs (created if needed)
    :param int number_of_speakers: number of speakers to separate
    :param int detect_every_N_frame: forwarded to ``detectface``
    :param float scalar_face_detection: forwarded to ``detectface``
    :param str config_path: accepted for compatibility; not read by this function
    :return: list of the ``s{k}.mp4`` output paths, one per speaker
    """
    device = torch.device("cpu")

    os.makedirs(output_path, exist_ok=True)

    temp_25fps_file = os.path.join(output_path, 'temp_25fps.mp4')
    try:
        convert_video_fps(input_file, temp_25fps_file, target_fps=25)

        filename_path = detectface(video_input_path=temp_25fps_file,
                                   output_path=output_path,
                                   detect_every_N_frame=detect_every_N_frame,
                                   scalar_face_detection=scalar_face_detection,
                                   number_of_speakers=number_of_speakers)

        audio_output = os.path.join(output_path, 'audio.wav')
        extract_audio(temp_25fps_file, audio_output, sample_rate=16000)

        crop_mouth(video_direc=os.path.join(output_path, "faces"),
                   landmark_direc=os.path.join(output_path, "landmark"),
                   filename_path=filename_path,
                   save_direc=os.path.join(output_path, "mouthroi"),
                   convert_gray=True,
                   testset_only=False)

        audiomodel = Dolphin.from_pretrained("JusperLee/Dolphin").to(device)
        audiomodel.eval()

        # The mixture is the same for every speaker: load and downmix it once.
        mix, sr = torchaudio.load(audio_output)
        mix = mix.mean(dim=0).to(device)

        # 4-second windows; hop equals window size, so windows do not overlap.
        window_size = 4 * sr
        hop_size = 4 * sr

        with torch.no_grad():
            for i in range(number_of_speakers):
                mouth_roi = np.load(os.path.join(output_path, "mouthroi", f"speaker{i+1}.npz"))["data"]
                mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)

                all_estimates = []

                start_idx = 0
                while start_idx < len(mix):
                    end_idx = min(start_idx + window_size, len(mix))
                    window_mix = mix[start_idx:end_idx]

                    # Map the sample range to frame indices at 25 fps.
                    start_frame = int(start_idx / sr * 25)
                    end_frame = int(end_idx / sr * 25)
                    end_frame = min(end_frame, len(mouth_roi))
                    # NOTE(review): this slice is empty when the audio runs
                    # past the last video frame - confirm the model accepts that.
                    window_mouth_roi = mouth_roi[start_frame:end_frame]

                    est_sources = audiomodel(
                        window_mix[None],
                        torch.from_numpy(window_mouth_roi[None, None]).float().to(device)
                    )

                    all_estimates.append({
                        'start': start_idx,
                        'end': end_idx,
                        'estimate': est_sources[0].cpu()
                    })

                    start_idx += hop_size

                output_length = len(mix)
                merged_output = torch.zeros(1, output_length, device=device)
                weights = torch.zeros(output_length, device=device)

                # Weighted accumulation followed by normalisation by the
                # summed weights; samples with zero weight stay at zero.
                for est in all_estimates:
                    window_len = est['end'] - est['start']
                    hann_window = torch.hann_window(window_len, device=device)

                    merged_output[0, est['start']:est['end']] += est['estimate'][0, :window_len] * hann_window
                    weights[est['start']:est['end']] += hann_window

                merged_output[:, weights > 0] /= weights[weights > 0]

                torchaudio.save(os.path.join(output_path, f"speaker{i+1}_est.wav"), merged_output, sr)

        output_files = []
        for i in range(number_of_speakers):
            video_input = os.path.join(output_path, f"video_tracked{i+1}.mp4")
            audio_input = os.path.join(output_path, f"speaker{i+1}_est.wav")
            video_output = os.path.join(output_path, f"s{i+1}.mp4")

            merge_video_audio(video_input, audio_input, video_output)
            output_files.append(video_output)
    finally:
        # Remove the intermediate file on failure as well as on success.
        if os.path.exists(temp_25fps_file):
            os.remove(temp_25fps_file)

    return output_files
| |
|
if __name__ == '__main__':
    # Command-line entry point: parse the options, derive a default output
    # directory next to the input file, run the pipeline and list the results.
    parser = argparse.ArgumentParser(description='Video Speaker Separation using Dolphin model')
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='Path to input video file')
    parser.add_argument('--output', '-o', type=str, default=None,
                        help='Output directory path (default: creates directory based on input filename)')
    parser.add_argument('--speakers', '-s', type=int, default=2,
                        help='Number of speakers to separate (default: 2)')
    parser.add_argument('--detect-every-n', type=int, default=8,
                        help='Detect faces every N frames (default: 8)')
    parser.add_argument('--face-scale', type=float, default=1.5,
                        help='Face detection bounding box scale factor (default: 1.5)')
    parser.add_argument('--config', type=str, default="checkpoints/vox2/conf.yml",
                        help='Path to model configuration file')

    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(f"Error: Input file '{args.input}' does not exist")
        # exit() is injected by the site module and can be missing (e.g.
        # python -S); raising SystemExit always works.
        raise SystemExit(1)

    # Default output directory: "<input name>_output" beside the input file.
    if args.output is None:
        input_basename = os.path.splitext(os.path.basename(args.input))[0]
        args.output = os.path.join(os.path.dirname(args.input), input_basename + "_output")

    print(f"Processing video: {args.input}")
    print(f"Output directory: {args.output}")
    print(f"Number of speakers: {args.speakers}")
    print("Running in CPU mode")

    output_files = process_video(
        input_file=args.input,
        output_path=args.output,
        number_of_speakers=args.speakers,
        detect_every_N_frame=args.detect_every_n,
        scalar_face_detection=args.face_scale,
        config_path=args.config,
    )

    print("\nProcessing completed!")
    print("Output files:")
    for i, output_file in enumerate(output_files):
        print(f"  Speaker {i+1}: {output_file}")
| |
|