| | import os, glob, subprocess, argparse, sys, numpy, random, math, cv2
|
| | from itertools import repeat
|
| | from multiprocessing import Pool
|
| | from scipy.io import wavfile
|
| | from pydub import AudioSegment
|
| | from tqdm import tqdm
|
| |
|
def get_length(input_video):
    """Return the duration of *input_video* in seconds, as reported by ffprobe.

    The original merged stderr into stdout (stderr=STDOUT), so any ffprobe
    warning corrupted the float parse below; stderr is now discarded
    separately so stdout contains only the duration value.
    """
    result = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', input_video],
        stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    return float(result.stdout)
|
| |
|
def read_Vox_lines(file):
    """Split a Vox list file into true/false lines by the leading digit.

    Each line is expected to start with '1' (true pair) or '0' (false pair).
    Returns (Tlines, Flines): two lists of raw lines, newlines preserved.
    """
    Tlines, Flines = [], []
    with open(file) as f_in:
        # Iterate the file object directly instead of a readline/break loop.
        for line in f_in:
            if int(line[0]):
                Tlines.append(line)
            else:
                Flines.append(line)
    return Tlines, Flines
|
| |
|
def read_LRS3_ST(file):
    """Return at most the first 30000 raw lines of *file* (newlines kept).

    Replaces the manual readline/break loop with readlines(); the original
    also read the whole file before slicing, so memory behavior is unchanged.
    """
    with open(file) as f_in:
        return f_in.readlines()[:30000]
|
| |
|
def read_LRS3_S(file):
    """Return up to 30000 lines whose segment lasts at most 3 seconds.

    Fields 1 and 2 of each line are start/end times in centiseconds.
    The duplicated line.split() call is hoisted; as in the original, all
    matching lines are collected before the [:30000] cut.
    """
    lines = []
    with open(file) as f_in:
        for line in f_in:
            fields = line.split()
            start = int(fields[1]) / 100
            end = int(fields[2]) / 100
            if end - start <= 3:
                lines.append(line)
    return lines[:30000]
|
| |
|
def generate_TAudio(line, args):
    """Build one 'TAudio' (all-true-speech) sample: pair a VoxCeleb2 audio
    track with a video track, trim both to the shorter duration, and mux
    them with ffmpeg.

    line: "<flag> <audio_rel>.wav <video_rel>.mp4 ..." from the Vox list.
    Returns an annotation line:
    "TAudio audio video length start_T end_T start_F end_F\n".
    """
    parts = line.split()
    audio_name = parts[1][:-4]   # strip ".wav"
    video_name = parts[2][:-4]   # strip ".mp4"
    id1 = audio_name.split('/')[0]
    name1 = '_'.join(audio_name.split('/')[:3])
    name2 = '_'.join(video_name.split('/')[:3])
    name = name1 + '_' + name2
    audio_path = os.path.join(args.Vox_audio, audio_name + '.wav')
    video_path = os.path.join(args.Vox_video, video_name + '.mp4')
    out_audio_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'TAudio', id1), exist_ok=True)

    audio = AudioSegment.from_file(audio_path, format="wav")
    length_audio = len(audio) / 1000.0
    length_video = get_length(video_path)
    # Common length of both streams, rounded down to 10 ms resolution.
    length_data = int(min(length_video, length_audio) * 100) / 100
    audio = audio[:int(length_data * 1000)]

    start, end = 0, length_data
    audio.export(out_audio_path, format="wav")
    # List-form argv with shell=False: paths containing spaces or shell
    # metacharacters can no longer break (or inject into) the command.
    cmd = ['ffmpeg', '-y', '-ss', '%.3f' % start, '-t', '%.3f' % (end - start),
           '-i', video_path, '-i', out_audio_path,
           '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0',
           '-shortest', out_video_path, '-loglevel', 'panic']
    subprocess.call(cmd)

    # The whole clip is true speech: T spans [0, length], F span is empty.
    start_T, end_T = 0, length_data
    start_F, end_F = 0, 0
    return ' '.join(['TAudio', str(audio_name), str(video_name), str(length_data),
                     str(start_T), str(end_T), str(start_F), str(end_F)]) + '\n'
|
| |
|
def generate_FAudio(line, args):
    """Build one 'FAudio' (all-false-speech) sample: pair a VoxCeleb2 audio
    track from a different speaker with a video track, trim both to the
    shorter duration, and mux them with ffmpeg.

    line: "<flag> <audio_rel>.wav <video_rel>.mp4 ..." from the Vox list.
    Returns an annotation line:
    "FAudio audio video length start_T end_T start_F end_F\n".
    """
    parts = line.split()
    audio_name = parts[1][:-4]   # strip ".wav"
    video_name = parts[2][:-4]   # strip ".mp4"
    id1 = audio_name.split('/')[0]
    name1 = '_'.join(audio_name.split('/')[:3])
    name2 = '_'.join(video_name.split('/')[:3])
    name = name1 + '_' + name2
    audio_path = os.path.join(args.Vox_audio, audio_name + '.wav')
    video_path = os.path.join(args.Vox_video, video_name + '.mp4')
    out_audio_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'FAudio', id1), exist_ok=True)

    audio = AudioSegment.from_file(audio_path, format="wav")
    length_audio = len(audio) / 1000.0
    length_video = get_length(video_path)
    # Common length of both streams, rounded down to 10 ms resolution.
    length_data = int(min(length_video, length_audio) * 100) / 100
    audio = audio[:int(length_data * 1000)]

    start, end = 0, length_data
    audio.export(out_audio_path, format="wav")
    # List-form argv with shell=False: paths containing spaces or shell
    # metacharacters can no longer break (or inject into) the command.
    cmd = ['ffmpeg', '-y', '-ss', '%.3f' % start, '-t', '%.3f' % (end - start),
           '-i', video_path, '-i', out_audio_path,
           '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0',
           '-shortest', out_video_path, '-loglevel', 'panic']
    subprocess.call(cmd)

    # The whole clip is false speech: F spans [0, length], T span is empty.
    start_T, end_T = 0, 0
    start_F, end_F = 0, length_data
    return ' '.join(['FAudio', str(audio_name), str(video_name), str(length_data),
                     str(start_T), str(end_T), str(start_F), str(end_F)]) + '\n'
|
| |
|
def generate_TFAudio(line, args):
    """Build one 'TFAudio' sample: one part of the clip carries the video's
    own (true) audio and the other part a different speaker's (false)
    audio, switching at a random point; which half is true is also random.

    line: "<flag> <false_audio_rel>.wav <video_rel>.mp4 ..." from the Vox list.
    Returns an annotation line:
    "TFAudio audio video length start_T end_T start_F end_F\n".
    """
    parts = line.split()
    audio_name = parts[1][:-4]   # strip ".wav"
    video_name = parts[2][:-4]   # strip ".mp4"
    id1 = audio_name.split('/')[0]
    name1 = '_'.join(audio_name.split('/')[:3])
    name2 = '_'.join(video_name.split('/')[:3])
    name = name1 + '_' + name2
    # True audio is the video's own track; false audio comes from parts[1].
    audio_T_path = os.path.join(args.Vox_audio, video_name + '.wav')
    audio_F_path = os.path.join(args.Vox_audio, audio_name + '.wav')
    video_path = os.path.join(args.Vox_video, video_name + '.mp4')
    out_audio_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'TFAudio', id1), exist_ok=True)

    audio_T = AudioSegment.from_file(audio_T_path, format="wav")
    audio_F = AudioSegment.from_file(audio_F_path, format="wav")
    length_audio_T = len(audio_T) / 1000.0
    length_audio_F = len(audio_F) / 1000.0
    length_video = get_length(video_path)
    # Common length of the three streams, rounded down to 10 ms resolution.
    length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100
    audio_T = audio_T[:int(length_data * 1000)]
    audio_F = audio_F[:int(length_data * 1000)]

    # Switch point drawn uniformly from the middle 50% of the clip, 10 ms
    # resolution; RNG call order (random() then randint) matches the original.
    changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100
    seed = random.randint(0, 1)
    if seed == 1:
        audio = audio_T[:changepoint * 1000] + audio_F[changepoint * 1000:]
    else:
        audio = audio_F[:changepoint * 1000] + audio_T[changepoint * 1000:]

    start, end = 0, length_data
    audio.export(out_audio_path, format="wav")
    # List-form argv with shell=False: paths containing spaces or shell
    # metacharacters can no longer break (or inject into) the command.
    cmd = ['ffmpeg', '-y', '-ss', '%.3f' % start, '-t', '%.3f' % (end - start),
           '-i', video_path, '-i', out_audio_path,
           '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0',
           '-shortest', out_video_path, '-loglevel', 'panic']
    subprocess.call(cmd)

    if seed == 1:
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    else:
        start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data
    return ' '.join(['TFAudio', str(audio_name), str(video_name), str(length_data),
                     str(start_T), str(end_T), str(start_F), str(end_F)]) + '\n'
|
| |
|
def generate_TSilence(line, args):
    """Build one 'TSilence' sample from an LRS3 pretrain segment that
    switches between speech and silence at *mid*.

    line format: "<type> <rel_path> <start_cs> <mid_cs> <end_cs> <index>",
    times in centiseconds; <type> "10" puts the true part first, "01" second.
    Returns an annotation line:
    "TSilence audio video length start_T end_T start_F end_F\n".
    """
    parts = line.split()
    type_change = parts[0]
    audio_name = parts[1]
    video_name = parts[1]
    id1 = audio_name.split('/')[0]
    name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + parts[5]
    name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + parts[5]
    name = name1 + '_' + name2
    start = int(parts[2]) / 100
    mid = int(parts[3]) / 100
    end = int(parts[4]) / 100
    # [8:] drops a fixed 8-character prefix from the list path — presumably
    # "pretrain"; TODO confirm against the actual list file contents.
    audio_path = os.path.join(args.lrs3_audio, 'pretrain', audio_name[8:] + '.wav')
    video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:] + '.mp4')
    out_audio_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'TSilence', id1), exist_ok=True)

    audio = AudioSegment.from_file(audio_path, format="wav")
    # Segment length rounded down to 10 ms resolution.
    length_data = int((end - start) * 100) / 100
    audio = audio[int(start * 1000):int(end * 1000)]
    audio.export(out_audio_path, format="wav")
    # List-form argv with shell=False: paths containing spaces or shell
    # metacharacters can no longer break (or inject into) the command.
    cmd = ['ffmpeg', '-y', '-ss', '%.3f' % start, '-t', '%.3f' % (end - start),
           '-i', video_path, '-i', out_audio_path,
           '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0',
           '-shortest', out_video_path, '-loglevel', 'panic']
    subprocess.call(cmd)

    changepoint = int((mid - start) * 100) / 100
    if type_change == "10":
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    elif type_change == "01":
        start_T, end_T, start_F, end_F = changepoint, length_data, 0, changepoint
    else:
        # The original would die later with UnboundLocalError; fail loudly
        # with a clear message instead.
        raise ValueError('unexpected type field %r in line: %r' % (type_change, line))

    # Replace the 5-character suffix of the path with the list index.
    audio_name = audio_name[:-5] + parts[5]
    video_name = video_name[:-5] + parts[5]
    return ' '.join(['TSilence', str(audio_name), str(video_name), str(length_data),
                     str(start_T), str(end_T), str(start_F), str(end_F)]) + '\n'
|
| |
|
def generate_FSilence(line, Flines, args):
    """Build one 'FSilence' sample: mix an LRS3 segment (true track) with a
    randomly drawn VoxCeleb2 utterance (false track), switching at a random
    point; which half is true is also random.

    line: "<rel_path> <start_cs> <end_cs> ... <index>" from the LRS3_S list.
    Flines: candidate Vox list lines whose last field is the utterance length.
    Returns an annotation line:
    "FSilence audio video length start_T end_T start_F end_F\n".
    """
    parts = line.split()
    audio_T_name = parts[0]
    video_name = parts[0]
    start = int(parts[1]) / 100
    end = int(parts[2]) / 100
    length_data = int((end - start) * 100) / 100
    # Switch point drawn uniformly from the middle 50% of the clip, 10 ms
    # resolution.
    changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100
    # Rejection-sample a Vox utterance at least as long as the segment.
    # NOTE(review): loops forever if no candidate is long enough; the LRS3_S
    # list is pre-filtered to <= 3 s segments, which makes that unlikely.
    speech_line = random.choice(Flines)
    length_speech = float(speech_line.split()[-1])
    while length_speech < length_data:
        speech_line = random.choice(Flines)
        length_speech = float(speech_line.split()[-1])
    audio_F_name = speech_line.split()[1][:-4]   # strip ".wav"
    id1 = audio_F_name.split('/')[0]
    name1 = '_'.join(audio_F_name.split('/')[:3])
    name2 = audio_T_name.split('/')[0] + '_' + audio_T_name.split('/')[1] + '_' + parts[-1]
    name = name1 + '_' + name2

    # [8:] drops a fixed 8-character prefix from the list path — presumably
    # "pretrain"; TODO confirm against the actual list file contents.
    video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:] + '.mp4')
    audio_T_path = os.path.join(args.lrs3_audio, 'pretrain', audio_T_name[8:] + '.wav')
    audio_F_path = os.path.join(args.Vox_audio, audio_F_name + '.wav')
    out_audio_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'FSilence', id1), exist_ok=True)

    audio_T = AudioSegment.from_file(audio_T_path, format="wav")
    audio_T = audio_T[int(start * 1000):int(end * 1000)]
    audio_F = AudioSegment.from_file(audio_F_path, format="wav")
    length_audio_T = len(audio_T) / 1000.0
    length_audio_F = len(audio_F) / 1000.0
    length_video = get_length(video_path)
    # NOTE(review): length_data may shrink here while changepoint was drawn
    # from the original segment length, so changepoint can exceed the final
    # length_data — preserved as-is; confirm whether this is intended.
    length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100
    audio_T = audio_T[:int(length_data * 1000)]
    audio_F = audio_F[:int(length_data * 1000)]

    seed = random.randint(0, 1)
    if seed == 1:
        audio = audio_T[:changepoint * 1000] + audio_F[changepoint * 1000:]
    else:
        audio = audio_F[:changepoint * 1000] + audio_T[changepoint * 1000:]

    audio.export(out_audio_path, format="wav")
    # List-form argv with shell=False: paths containing spaces or shell
    # metacharacters can no longer break (or inject into) the command.
    cmd = ['ffmpeg', '-y', '-ss', '%.3f' % start, '-t', '%.3f' % (end - start),
           '-i', video_path, '-i', out_audio_path,
           '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0',
           '-shortest', out_video_path, '-loglevel', 'panic']
    subprocess.call(cmd)

    if seed == 1:
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    else:
        start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data

    # Replace the 5-character suffix of the path with the list index.
    video_name = video_name[:-5] + parts[-1]
    return ' '.join(['FSilence', str(audio_F_name), str(video_name), str(length_data),
                     str(start_T), str(end_T), str(start_F), str(end_F)]) + '\n'
|
| |
|
| |
|
# Command-line configuration for TalkSet generation.
parser = argparse.ArgumentParser(description="generate_Dataset")
parser.add_argument('--List_folder', type=str, default='lists')
parser.add_argument('--out_path', type=str, default='/data07/ruijie/database/TalkSet_final')
parser.add_argument('--Vox_audio', type=str, default='/home/ruijie/database/VoxCeleb2/audio/audio_clean/clean/train')
parser.add_argument('--Vox_video', type=str, default='/home/ruijie/database/VoxCeleb2/video/orig/train')
parser.add_argument('--lrs3_audio', type=str, default='/data07/ruijie/database/LRS3/audio/orig_audio/clean')
parser.add_argument('--lrs3_video', type=str, default='/data07/ruijie/database/LRS3/video/orig_video')
parser.add_argument('--task', type=str, default='TAudio')
parser.add_argument('--num_cpu', type=int, default=10)
args = parser.parse_args()

# One output folder per sample category (same creation order as before).
for category in ['TAudio', 'FAudio', 'TFAudio', 'FSilence', 'TSilence']:
    os.makedirs(os.path.join(args.out_path, category), exist_ok=True)

# Derived list-file locations, stored on args for the task blocks below.
args.list_Vox = os.path.join(args.List_folder, 'lists_in', 'Vox_list.txt')
args.list_LRS3_S = os.path.join(args.List_folder, 'lists_in', 'LRS3_S_list.txt')
args.list_LRS3_ST = os.path.join(args.List_folder, 'lists_in', 'LRS3_ST_list.txt')
args.list_out = os.path.join(args.List_folder, 'lists_out')
args.list_out_train = os.path.join(args.list_out, 'train.txt')
args.list_out_test = os.path.join(args.list_out, 'test.txt')
|
| |
|
if args.task == 'TAudio':
    # Generate all 'TAudio' samples in parallel, one worker call per list line.
    Tlines, _ = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        Tlines_new = p.starmap(generate_TAudio, zip(Tlines, repeat(args)))
    # 'with' guarantees the list file is flushed and closed — the original
    # opened it and never closed it.
    with open(os.path.join(args.list_out, 'TAudio.txt'), "w") as out_Tlist_file:
        out_Tlist_file.writelines(Tlines_new)
    print('TAudio Finish')
|
| |
|
if args.task == 'FAudio':
    # Generate all 'FAudio' samples in parallel, one worker call per list line.
    _, Flines = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        Flines_new = p.starmap(generate_FAudio, zip(Flines, repeat(args)))
    # 'with' guarantees the list file is flushed and closed — the original
    # opened it and never closed it.
    with open(os.path.join(args.list_out, 'FAudio.txt'), "w") as out_Flist_file:
        out_Flist_file.writelines(Flines_new)
    print('FAudio Finish')
|
| |
|
if args.task == 'TFAudio':
    # Generate all 'TFAudio' samples in parallel from the false-pair lines.
    _, Flines = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        TFlines_new = p.starmap(generate_TFAudio, zip(Flines, repeat(args)))
    # 'with' guarantees the list file is flushed and closed — the original
    # opened it and never closed it.
    with open(os.path.join(args.list_out, 'TFAudio.txt'), "w") as out_TFlist_file:
        out_TFlist_file.writelines(TFlines_new)
    print('TFAudio Finish')
|
| |
|
if args.task == 'TSilence':
    # Generate all 'TSilence' samples in parallel from the LRS3 ST list.
    Slines = read_LRS3_ST(args.list_LRS3_ST)
    with Pool(args.num_cpu) as p:
        TSlines_new = p.starmap(generate_TSilence, zip(Slines, repeat(args)))
    # 'with' guarantees the list file is flushed and closed — the original
    # opened it and never closed it.
    with open(os.path.join(args.list_out, 'TSilence.txt'), "w") as out_TSlist_file:
        out_TSlist_file.writelines(TSlines_new)
    print('TSilence Finish')
|
| |
|
if args.task == 'FSilence':
    # Generate all 'FSilence' samples in parallel; each worker also receives
    # the full true-pair line list to rejection-sample a false track from.
    Tlines, _ = read_Vox_lines(args.list_Vox)
    Slines = read_LRS3_S(args.list_LRS3_S)
    with Pool(args.num_cpu) as p:
        FSlines_new = p.starmap(generate_FSilence, zip(Slines, repeat(Tlines), repeat(args)))
    # 'with' guarantees the list file is flushed and closed — the original
    # opened it and never closed it.
    with open(os.path.join(args.list_out, 'FSilence.txt'), "w") as out_FSlist_file:
        out_FSlist_file.writelines(FSlines_new)
    print('FSilence Finish')
|
| |
|
if args.task == 'Fusion':
    # Merge the five per-category lists, clamp all time fields to 6 s, append
    # a zero-padded sample index, and split into train/test (27k of every 30k
    # shuffled lines go to train).
    lines = []
    # Tuple (not the original set literal) gives a deterministic read order.
    for name in ('TAudio', 'FAudio', 'TFAudio', 'TSilence', 'FSilence'):
        with open(os.path.join(args.list_out, name + '.txt')) as f:
            lines.extend(f)
    random.shuffle(lines)
    # 'with' guarantees both output files are flushed and closed — the
    # original never closed them, risking a lost buffered tail.
    with open(args.list_out_train, "w") as train_file, \
            open(args.list_out_test, "w") as test_file:
        for num, line in enumerate(lines):
            data = line.split()
            if float(data[3]) > 6:
                # Clamp length and all four T/F boundaries to 6 seconds.
                fields = [str(min(float(d), 6)) for d in data[3:8]]
            else:
                fields = data[3:8]
            line = ' '.join([data[0], data[1], data[2]] + list(fields)
                            + ["%06d" % num]) + '\n'
            if num % 30000 < 27000:
                train_file.write(line)
            else:
                test_file.write(line)