diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e27fe35e7f7a5a6efc6fdb72fa0a8fd398c879e4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.jsonl filter=lfs diff=lfs merge=lfs -text +*.tsv filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 7b95401dc46245ac339fc25059d4a56d90b4cde5..41653912482ec0b40e1d70a49cadc3e1b03bae7e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,21 @@ ---- -license: apache-2.0 ---- +## 代码 +https://github.com/XiaoshanHsj/speechllm/tree/multispk_lingmeng + +看代码的README_lingmeng.md + +## 数据: +英文部分: `./data/Whisper-Sidecar-data-metadata/data_for_wavllm` + +德文相关: `./data/de-en-mix` + +targetASR (target-talker ASR)用到的reference audio: `./data/reference_enroll_audio/all` + +英文部分只有metadata,可以从librispeech生成。德文部分还备份了测试集音频。 + +## 模型: +tokenizer: `./llama_model/llama/tokenizer.model` + +llama-2-chat: `./llama_model/llama-2-7b-chat/consolidated.00.pth` + +训练好的MT-LLM模型目录: `./lingmeng_multispk_multitask_retrain_speechllm_v0.1_llama2_chat_wavlm_weighted_update_lora_32_32_prompt_build_multispk_multitask_de.yaml_16gpu_1accum` + diff --git a/data/Whisper-Sidecar-data-metadata/convert_to_wavllm_data_format.py b/data/Whisper-Sidecar-data-metadata/convert_to_wavllm_data_format.py new file mode 100644 index 0000000000000000000000000000000000000000..173d1318dcb07c6abb7c63b88bbce3d38fcc487a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/convert_to_wavllm_data_format.py @@ -0,0 +1,42 @@ +import soundfile as sf +import json +from tqdm import tqdm + +input_jsonl = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl" +output_tsv = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data_for_wavllm/" + input_jsonl.split("/")[-1].replace(".jsonl", "1.tsv") +print(output_tsv) +head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"]) +prompts = ['Transcribe the given audio into text. If multiple speakers are speaking, transcribe the utterances of multiple speakers in the order of their start times, separated by "".'] + + + +with open("/home/v-lingmeng/datasets/LibriSpeech/SPEAKERS.TXT", "r") as f: + speaker_info = f.readlines() + speaker_gender = {l.split("|")[0].strip():l.split("|")[1].strip() for l in speaker_info if not l.startswith(";")} +with open(input_jsonl, "r") as f: + lines = f.readlines() +new_lines = [] +for line in tqdm(lines): + line = json.loads(line.strip()) + audio = line['audio']['path'].replace("./dataset", "/valleblob/v-lingmeng/speech/data") + wav_id = audio.split("/")[-1] + n_frames = str(sf.read(audio)[0].shape[0]) + prompt = prompts[0] + tgt_text = line["sentence"] + codec = "None" + with_speech = "True" + language = "en" + if "speakers" in line: + speakers = "|".join(line["speakers"]) + else: + speakers = "|".join([_id.split("-")[0] for _id in wav_id.split("_")]) + genders = "|".join([speaker_gender[spk] for spk in speakers.split("|")]) + + new_line = "\t".join([wav_id, audio, n_frames, prompt, tgt_text, codec, with_speech, language, speakers, genders]) + new_lines.append(new_line) + +with open(output_tsv, "w") as f: + new_lines.insert(0, head) + f.write("\n".join(new_lines)) + +# print(speaker_gender) diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_dev.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_dev.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83e81f045cd3345de800d77b0419edec68e127a8 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_dev.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:132bd6f6c6e353bca38831088bbe2eae65ce67635b5cc204249bd070f9c56e2a +size 2460550 diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16ca7580a2b83fd20ca49bfefd71f71b1c8d7ea9 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b0d3301937ee8affd9a7da43c8c7940dc6f632b1594efa7de4b7faa4524f5b +size 1483980 diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test20.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test20.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16ca7580a2b83fd20ca49bfefd71f71b1c8d7ea9 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test20.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b0d3301937ee8affd9a7da43c8c7940dc6f632b1594efa7de4b7faa4524f5b +size 1483980 diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74f74207936f22f133339863bd01c47d6b41381a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix2_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a612cd6616cdb756e6c2ad587eebad584e76a1101e754d220bb01b22224c0221 +size 27055103 diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_dev.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_dev.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f352c8ed15b75a197a85ad6ce05c819b7b0f9a0 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_dev.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bbe7c57288ad1565ffd0d630004508a26b48204b189cf30dd8a0f8f295e01a1 +size 2870838 diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82b89a70d11a7ff8df5e25bba3572b09635611b2 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2d2fb69a6fbe3cd6c89f89a002e28e01944d96c3d85a6cd240fcb52bfbd8ec2 +size 1442387 diff --git a/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7a8222593fd269b0d63f2cf6371ffad30d7dd42 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/aishell1mix3_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f583f9f8ff84df2ed1eb4da4dfd3a3d615d87c91bd4a3fda34dcf565028e76c +size 23867370 diff --git a/data/Whisper-Sidecar-data-metadata/data/data_prepare_aishellmix.py b/data/Whisper-Sidecar-data-metadata/data/data_prepare_aishellmix.py new file mode 100644 index 0000000000000000000000000000000000000000..3a6f0fa5a20a930c64120eb26317d1790df6f633 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/data_prepare_aishellmix.py @@ -0,0 +1,71 @@ +# prepare from msra-dev-node + +import jsonlines +import soundfile as sf +import glob +import numpy as np +import os +import pandas as pd + +def generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=False): + splits = ['test', 'dev', 'train'] + num_spks = ["2", "3"] + # convert transcript file to two list, ID and text + transcript_path = '/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/aishell1/data_aishell/transcript/aishell_transcript_v0.8.txt' + transcripts = open(transcript_path, 'r').readlines() # ID\ttext\n for each line + ID_list = [] + text_list = [] + for line in transcripts: + line = line.strip() + _id, *text = line.split(' ') + ID_list.append(_id) + text_list.append(''.join(text)) + + id_text_dict = dict(zip(ID_list, text_list)) + + for num_spk in num_spks: + for split in splits: + # data_dir = os.path,join(root_data_dir, 'Aishell1Mix', "data", f'Aishell1Mix{num_spk}', 'wav16k', 'max', split, 'mix_clean') + + metadata = os.path.join(root_data_dir, 'Aishell1Mix', "data", f'Aishell1Mix{num_spk}', 'wav16k', 'max', 'metadata', f'mixture_{split}_mix_clean.csv') + df = pd.read_csv(metadata) + mix_id_list = df['mixture_ID'].tolist() + mix_path_list = df['mixture_path'].tolist() + source_wav_root = os.path.join("/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/aishell1/data_aishell/wav", split) + new_jsonl = os.path.join(output_dir, f'aishell1mix{num_spk}_{split}.jsonl') + if os.path.exists(new_jsonl): + os.remove(new_jsonl) + + + for mix_id, mix_path in zip(mix_id_list, mix_path_list): + source_ids = mix_id.split('_') + source_texts = [id_text_dict[source_id] for source_id in source_ids] + source_text = ''.join(source_texts) + + speakers = [source_id.split("S")[1].split("W")[0] for source_id in source_ids] + + source_paths = [os.path.join(source_wav_root, "S"+str(speakers[i]), source_id + '.wav') for i, source_id in enumerate(source_ids)] + source_durations = [sf.info(source_path).duration for source_path in source_paths] + + duration = max(source_durations) + dic = {"audio": {"path": mix_path}, + "language": "zh", + "duration": duration, + "speakers": speakers, + "sentence": source_text} + + starts = [0] * len(source_durations) + ends = source_durations + sentences_dict = [] + for i in range(len(starts)): + sentences_dict.append({"start": starts[i], "end": ends[i], "text": source_texts[i]}) + dic["sentences"] = sentences_dict + + with jsonlines.open(new_jsonl, mode='a') as writer: + writer.write(dic) + + +if __name__ == '__main__': + root_data_dir = '/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset' + output_dir = '/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/' + generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=True) diff --git a/data/Whisper-Sidecar-data-metadata/data/data_prepare_librimix.py b/data/Whisper-Sidecar-data-metadata/data/data_prepare_librimix.py new file mode 100644 index 0000000000000000000000000000000000000000..085d52d6faf2c4cbbcd3864a075e7e0394a52a7e --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/data_prepare_librimix.py @@ -0,0 +1,49 @@ +# prepare from SEPC + +import jsonlines +# import soundfile as sf +import glob +import numpy as np +import os + +def generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir): + splits = ['test', 'dev', 'train'] + num_spks = ["2", "3"] + + for num_spk in num_spks: + for split in splits: + data_dir = root_data_dir + f'Libri{num_spk}Mix_wav16k_max/' + wrd = data_dir + split + '.wrd' + fairseq_jsonl = data_dir + split + '_clean.jsonl' + new_jsonl = output_dir + f"libri{num_spk}mix_" + split + '.jsonl' + if os.path.exists(new_jsonl): + os.remove(new_jsonl) + + with jsonlines.open(fairseq_jsonl) as reader: + with open(wrd, 'r') as f: + for meta, text in zip(reader, f.readlines()): + # print(meta, text) + dic = {"audio": {"path": f"./dataset/LibriMix/data/Libri{num_spk}Mix/wav16k/max/" + meta['mixed_wav']}, + "language": "en", + "duration": max(meta['durations']), + "speakers": meta['speakers'], + "sentence": text.strip().lower()} + + starts = meta['delays'] + durations = meta['durations'] + ends = list(map(lambda x, y: x + y, starts, durations)) + texts = text.strip().lower().split(" ") + sentences_dict = [] + for i in range(len(starts)): + sentences_dict.append({"start": starts[i], "end": ends[i], "text": texts[i].strip().lower()}) + dic["sentences"] = sentences_dict + + with jsonlines.open(new_jsonl, mode='a') as writer: + writer.write(dic) + + + +if __name__ == '__main__': + root_data_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/egs_wav2vec/data/' + output_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/Whisper-Finetune-ovlp/dataset/' + generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir) diff --git a/data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeech.py b/data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeech.py new file mode 100644 index 0000000000000000000000000000000000000000..b6fdab36f30879d7ee983cec0bf27760a9840246 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeech.py @@ -0,0 +1,35 @@ +# prepare from SEPC + +import jsonlines +# import soundfile as sf +import glob +import numpy as np + +def generate_jsonl_from_fairseq_datafile(data_dir, output_dir): + splits = ['test'] + + for split in splits: + wrd = data_dir + split + '.wrd' + # fairseq_jsonl = data_dir + split + '_clean.jsonl' + tsv = data_dir + split + '.tsv' + new_jsonl = output_dir + split + '.jsonl' + + with open(tsv, 'r') as flac_path_f: + # remove the first line + flac_path_f.readline() + with open(wrd, 'r') as trans_f: + for flac_path, trans in zip(flac_path_f.readlines(), trans_f.readlines()): + # print(meta, text) + flac_path, duration = flac_path.strip().split('\t') + dic = {"audio": {"path": "./dataset/librispeech/" + flac_path}, + "language": "en", + "duration": int(duration)/16000.0, + "sentence": trans.strip().lower()} + with jsonlines.open(new_jsonl, mode='a') as writer: + writer.write(dic) + + +if __name__ == '__main__': + data_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/egs_wav2vec/data/LibriSpeech/' + output_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/Whisper-Finetune/dataset/' + generate_jsonl_from_fairseq_datafile(data_dir, output_dir) diff --git a/data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeechmix.py b/data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeechmix.py new file mode 100644 index 0000000000000000000000000000000000000000..738765008dbbbf44b3d5e92130b93544a97df30b --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeechmix.py @@ -0,0 +1,53 @@ +# prepare from SEPC + +import jsonlines +# import soundfile as sf +import glob +import numpy as np +import os + +def generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=False): + splits = ['test', 'dev', 'train'] + num_spks = ["2", "3"] + for num_spk in num_spks: + for split in splits: + data_dir = root_data_dir + f'LibriSpeechMix-{num_spk}spkr/' + wrd = data_dir + split + '.wrd' + fairseq_jsonl = data_dir + split + '_clean.jsonl' + if with_timestamps: + new_jsonl = output_dir + f"librispeech{num_spk}mix_timestamps_" + split + '.jsonl' + else: + new_jsonl = output_dir + f"librispeech{num_spk}mix_" + split + '.jsonl' + + if os.path.exists(new_jsonl): + os.remove(new_jsonl) + + with jsonlines.open(fairseq_jsonl) as reader: + with open(wrd, 'r') as f: + for meta, text in zip(reader, f.readlines()): + # print(meta, text) + dic = {"audio": {"path": "./dataset/LibriSpeechMix/" + meta['mixed_wav']}, + "language": "en", + "duration": max(list(map(lambda x, y: x + y, meta['durations'], meta['delays']))), + "speakers": meta['speakers'], + "sentence": text.strip().lower(), + } + if with_timestamps: + starts = meta['delays'] + durations = meta['durations'] + ends = list(map(lambda x, y: x + y, starts, durations)) + texts = meta['texts'] + sentences_dict = [] + for i in range(len(starts)): + sentences_dict.append({"start": starts[i], "end": ends[i], "text": texts[i].strip().lower()}) + dic["sentences"] = sentences_dict + + with jsonlines.open(new_jsonl, mode='a') as writer: + writer.write(dic) + + + +if __name__ == '__main__': + root_data_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/egs_wav2vec/data/' + output_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/Whisper-Finetune-ovlp/dataset/' + generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=True) diff --git a/data/Whisper-Sidecar-data-metadata/data/generate_librimix_wav_from_jsonl.py b/data/Whisper-Sidecar-data-metadata/data/generate_librimix_wav_from_jsonl.py new file mode 100644 index 0000000000000000000000000000000000000000..7e15ad17148fa84b51342a78150b27c44bafc989 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/generate_librimix_wav_from_jsonl.py @@ -0,0 +1,60 @@ +import json +import os +import glob +import soundfile +import librosa +from tqdm import tqdm +import numpy as np + +def get_delayed_audio(wav_file, delay, sampling_rate=16000): + audio, _ = soundfile.read(wav_file) + delay_frame = int(delay * sampling_rate) + if delay_frame != 0: + audio = np.append(np.zeros(delay_frame), audio) + return audio + +def mix_audio(wav_files, delays): + for i, wav_file in enumerate(wav_files): + if i == 0: + audio = get_delayed_audio(wav_file, delays[i]) + else: + additional_audio = get_delayed_audio(wav_file, delays[i]) + # tune length & sum up to audio + target_length = max(len(audio), len(additional_audio)) + # print(additional_audio.shape) + audio = librosa.util.fix_length(audio, size=target_length) + additional_audio = librosa.util.fix_length(additional_audio, size=target_length) + audio = audio + additional_audio + return audio + + +jsonl_path = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/libri3mix_train.jsonl" +if "test" in jsonl_path: + subset = "test" +elif "train" in jsonl_path: + subset = "train" +else: + subset = "dev" +librispeech_dir = f"/home/v-lingmeng/datasets/LibriSpeech/{subset}*/" +output_dir = "/home/v-lingmeng/datasets" + + +with open(jsonl_path, 'r', encoding='utf-8') as file: + json_list = [json.loads(line.strip()) for line in file] + +for line in tqdm(json_list): + audio_name = line['audio']['path'].replace("./dataset", output_dir) + if not os.path.exists(os.path.split(audio_name)[0]): + os.makedirs(os.path.split(audio_name)[0]) + + source_ids = os.path.split(audio_name)[1].split(".")[0].split("_") + source_files = [glob.glob(librispeech_dir + "/".join(i.split("-")[:-1]) + f"/{i}*")[0] for i in source_ids] + delays = [l["start"] for l in line['sentences']] + # ends = [l["end"] for l in line['sentences']] + + # for source_audio, start, end in zip(source_auidos, starts, ends): + mixed_audio = mix_audio(source_files, delays) + soundfile.write(audio_name, mixed_audio, samplerate=16000) + print(audio_name) + +# print(delays) diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_dev-both.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_dev-both.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83b65d65629a73073f8fb593751ba4405c25194d --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_dev-both.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a00333b0e46f84e4389e63df5045f8a762487eb29bb89eb7de9f70941e1434c +size 2224222 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_dev.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_dev.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1eee09ef5626ef4b7c435d05977f6dfdf151fbe --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_dev.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c08740dd203b33312ea9793a8859302c2ff41f44de77ffb404c6d5859df2287 +size 2227222 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_test-both.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_test-both.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6baa9637f0698ae31ed313a5a0523ce84bd61fce --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_test-both.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88e089c638beff65237045f526348e50341026b0324e0230e1ee17568b3173c1 +size 2094572 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a47d98dafb58428c7795ec3a0d811aa69809c19c --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf70cb2bca21af6465f534a1910756cce13eb410815eccc90fc74b24567e4b6 +size 2097572 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_test20.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_test20.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4d135d44aff07cf1778f12556f12c568c1eb187 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_test20.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9831f61da77ba9a17367845c6288c9a7b24858cf69dcbb53e4377cb23640820b +size 14417 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100-both.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100-both.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f90410f9a56f116be4dcc06fc122f73bff438787 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100-both.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0eb2462fa951da717d18e172a5b781b6149d3483e56022bcea7459d163b76e +size 14778639 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..441f03f6b1c925766eb2c33d33474aebc7b46abf --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10ac8db4f8c3851c7e34f4643268355c2d34a7f8952aa316b0b70d2019733f70 +size 14792539 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-200.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-200.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d19d9a5d339f7e1d6243bea87e2ebd0c7ec8be89 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-200.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e70011854f573bf05586a9793f19510d6be7188be89c032459910ad1d3e11d +size 29571178 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-both.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-both.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7968254be4f1bbccdef8adfbc65e2ad78859bbc --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train-both.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72957a086f85d8fee3aea616c471dd7c7baf465d355db361f796f13c7e6478e +size 68694922 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e23d8c7e8a0da190244b386c1e5a6222f04b1dc --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9bd7eceb100a8f842e1b708066e5da7ac77e5008ef013266c567528ab1f8c4f +size 68759621 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train20.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train20.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d741f2a61c4fcbb37a5d19b20973cd701343acf --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train20.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d2502e124c4cdf60adb5a83f8942f2fb5d710634485b5a0382e45873570048 +size 22316 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri2mix_train_remove_enroll.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train_remove_enroll.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..826701e00a17189a20e0867eb83a85cc21c19af3 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri2mix_train_remove_enroll.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d3c34060e44c767cd70ebb53aa18d53db4a3828fa92037addeed9c04e9f8a5 +size 68268344 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri3mix_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri3mix_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23a0a11ce26a1c669db47883a503b35b785bec87 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri3mix_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daf3f6411bb717e86d77b4f17a455323c5d9df5fdb37fe6513a11859d50f41e9 +size 2855611 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri3mix_test20.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri3mix_test20.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07e5e0ea13d078ac93f23952be8a308cf6d668c2 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri3mix_test20.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f66ad5735f3e3d1bd24bf960439142e181dbe4ab50a951a3807b4050040568c +size 19612 diff --git a/data/Whisper-Sidecar-data-metadata/data/libri3mix_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/libri3mix_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3afe0e0834935b8c65279c6474eb36636fee9ba --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/libri3mix_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba19b8a28a4430dbab099d5450ae075f6096c869ca318846f1ed219b471c997 +size 65232138 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..475876d0b322c33cdc4731b37ed3ed8aea32f418 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dfd13ad50aeb92257fc7c8c4d721fad1d2e3901be95eed0aa4d58211027d471 +size 1945661 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test_30s.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test_30s.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ab4cb9957c6628322651c187063360b76d63b98 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test_30s.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d07d099fe7d64cfec16efcc06b5dcac825984ca8668909152882adef2edcb9 +size 1945138 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdf4685fa7abc23e3d97a9db791fb0995d608dd6 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech2mix_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f629fb7f89d02d937e5f4d1ced117c70904fc5d97d520706758a3e41010c1c2 +size 287742080 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5940d92a012f113d43c1cbe2c775fff75a2c1ada --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4019227f5a2f827f2517e942c8c6f72b7a6d80fe8fe32db2909393254d7af771 +size 2730857 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test_temp.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test_temp.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5399b2d7979284e66140c07c0d6c3bc61cb2cd9b --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test_temp.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be1fafb707b168a23d3c8225e3f04407da63360b27b99d435d11c8a7560dee9c +size 2729249 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c145ab4c837c491785c00d860a2c3738c1f40b7 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech3mix_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47917569ed5a933e6ad66ccef5ccefded8f4c117a0296dafcbde9fc564e835e6 +size 410988677 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech_dev.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech_dev.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e1bb7c08abc3a7a164d0ca915366f11fe5aad52 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech_dev.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca54eb22dd7f421bd9c8b8e60a5df32946a7823d5547e49eaa030769c48de0da +size 2194617 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech_test.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e22bcd6c7a34d74b15ff8eab829cc5b691e809ad --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8421fbe45b046b2a5644782b3e982538bd8483d72964b6f08fc208e5e7059648 +size 2197082 diff --git a/data/Whisper-Sidecar-data-metadata/data/librispeech_train.jsonl b/data/Whisper-Sidecar-data-metadata/data/librispeech_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3698674e4af767643af27d1f1cd77099cd008a42 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/librispeech_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc923ead53a1248037e622bbeb75899a0c2637720600cc8c386530d5cdfa95b8 +size 91088243 diff --git a/data/Whisper-Sidecar-data-metadata/data/long_wav_resample.py b/data/Whisper-Sidecar-data-metadata/data/long_wav_resample.py new file mode 100644 index 0000000000000000000000000000000000000000..d0dd08ca20c1e87dd4d66a6b26e4dcd729c4e994 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/long_wav_resample.py @@ -0,0 +1,52 @@ +# resamle long wav (>30s) to 16k 30s, and update the jsonl file + + +import jsonlines +import os + +from pydub import AudioSegment +from pydub.playback import play +import soundfile as sf + + + +jsonl_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/librispeech3mix_test.jsonl" +temp_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/librispeech3mix_test_temp.jsonl" +# time strench long wav (>30s) to 16k 30s, and update the jsonl line in-place + +with jsonlines.open(jsonl_file, 'r') as reader, jsonlines.open(temp_file, 'w') as writer: + for obj in reader: + wav_path = obj['audio']['path'] + duration = obj['duration'] + sentences = obj['sentences'] + # resample wav + if duration > 30.1: + print(wav_path, duration) + wav = AudioSegment.from_file(wav_path) + target_len = 30.0 * 1000 + speed_up_rate = len(wav) / target_len + wav = wav.speedup(playback_speed=speed_up_rate) + wav = wav[:target_len] + wav.export(wav_path, format="wav") + print(speed_up_rate) + obj['duration'] = 30 + for sentence in sentences: + sentence['start'] = sentence['start'] / speed_up_rate + sentence['end'] = sentence['end'] / speed_up_rate + if sentence['start'] > 30: + sentence['start'] = 30 + if sentence['end'] > 30: + sentence['end'] = 30 + obj['sentences'] = sentences + elif duration > 30.0: + wav, sr = sf.read(wav_path) + wav = wav[:int(16000 * 30)] + obj['duration'] = 30 if duration > 30 else duration + for sentence in sentences: + sentence['start'] = sentence['start'] if sentence['start'] < 30 else 30 + sentence['end'] = sentence['end'] if sentence['end'] < 30 else 30 + obj['sentences'] = sentences + sf.write(wav_path, wav, sr) + + + writer.write(obj) \ No newline at end of file diff --git a/data/Whisper-Sidecar-data-metadata/data/select_prompt_wav.py b/data/Whisper-Sidecar-data-metadata/data/select_prompt_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..3eacbb47281212d29581296a59dfe9324214c647 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/select_prompt_wav.py @@ -0,0 +1,132 @@ +import os +import glob +import random + +import jsonlines + +# backup_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/removed_mix_wav" +# librispeech_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriSpeech/train-clean-360" +# enroll_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/new_enroll" + +# # 1. 获取所有的wav文件 +# wav_files = glob.glob("/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriMix/data/Libri2Mix/wav16k/max/train-360/mix_clean/*.wav") + +# # 2. 每个wav文件有两个说话人,记录所有wav文件出现的说话人 +# all_speakers = set() +# for wav_file in wav_files: +# speakers = [f.split('-')[0] for f in os.path.basename(wav_file).split("_")] +# all_speakers.update(speakers) +# # random.shuffle(wav_files) + +# len_all_speakers = len(all_speakers) +# # 3. 对每个说话人,复制且只复制一个具有它的语音文件 +# count = 0 +# for wav_file in wav_files: +# source_wavs = os.path.basename(wav_file).split("_") +# speakers = [f.split('-')[0] for f in source_wavs] +# # 如果有任意一个说话人不在all_speakers中,跳过这个文件 +# if not all(s in all_speakers for s in speakers): +# continue +# else: +# # 从all_speakers中删除这两个说话人 +# all_speakers.difference_update(speakers) +# # 复制这个文件 +# os.system(f"cp {wav_file} {backup_dir}") +# # print(f"cp {wav_file} {backup_dir}") +# # 复制source_wavs中的每个说话人的语音文件 +# for source_wav in source_wavs: +# count+=1 +# source_wav_path = os.path.join(librispeech_dir, source_wav.split("-")[0], source_wav.split("-")[1], source_wav.split('.')[0] + ".flac") +# # 判断是否存在 +# if not os.path.exists(source_wav_path): +# print(f"source_wav_path: {source_wav_path} not exists") +# continue +# os.system(f"cp {source_wav_path} {enroll_dir}") + +# print(all_speakers) +# print(count, len_all_speakers) + + + +# source = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/new_enroll" +# enroll_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriMix_enroll_audio/train-360" + +# flac_files = glob.glob(f"{source}/*.flac") + +# for flac_files in flac_files: +# # mkdir enroll_dir/spk_id +# spk_id = os.path.basename(flac_files).split("-")[0] +# spk_dir = os.path.join(enroll_dir, spk_id) +# if os.path.exists(spk_dir): +# # 删除 +# os.system(f"rm -rf {spk_dir}") +# os.makedirs(spk_dir, exist_ok=True) +# # convert flac to wav, move to spk_dir +# wav_file = os.path.join(spk_dir, spk_id+ ".wav") +# os.system(f"ffmpeg -i {flac_files} {wav_file}") + + +# jsonl_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/libri2mix_train_remove_enroll.jsonl" +# enrolled_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/removed_mix_wav" + +# # remove wav file in enrolled_dir from jsonl_file +# with jsonlines.open(jsonl_file) as reader: +# lines = list(reader) +# print(len(lines)) +# for line in lines: +# mix_wav = line['audio']['path'] +# if os.path.exists(os.path.join(enrolled_dir, os.path.basename(mix_wav))): +# lines.remove(line) +# print(len(lines)) +# # write to new jsonl file +# new_jsonl_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/libri2mix_train_remove_enroll.jsonl" +# with jsonlines.open(new_jsonl_file, "w") as writer: +# for line in lines: +# writer.write(line) +# print("done") + + +librispeech_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriSpeech/train-other-500" +enroll_path ="/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriMix_enroll_audio/train-500" + +# 检查librispeech_dir中每个speaker是否存在在enroll_path中 +speaker_dirs = glob.glob(f"{librispeech_dir}/*") +new_file = [] +for speaker_dir in speaker_dirs: + if ".TXT" in speaker_dir: + continue + speaker_id = os.path.basename(speaker_dir) + enroll_speaker_dir = os.path.join(enroll_path, speaker_id) + if not os.path.exists(enroll_speaker_dir): + print(f"{enroll_speaker_dir} not exists") + os.makedirs(enroll_speaker_dir, exist_ok=True) + + if len(glob.glob(f"{enroll_speaker_dir}/*.wav")) == 0: + # 从librispeech_dir中复制一个语音文件到enroll_speaker_dir + flac_files = glob.glob(f"{speaker_dir}/*/*.flac") + # 从flac_files中随机选择一个, 并复制到enroll_speaker_dir + flac_file = random.choice(flac_files) + # 复制到enroll_speaker_dir + new_flac_file = os.path.join(enroll_speaker_dir, os.path.basename(flac_file)) + print(new_flac_file) + os.system(f"cp {flac_file} {new_flac_file}") + # 记录flac_file的名字 + new_file.append(flac_file) + else: + print(glob.glob(f"{enroll_speaker_dir}/*.wav")) + + # 检查enroll_path中speaker的语音命名是否为{speaker_id}.wav + enroll_wav_files = glob.glob(f"{enroll_speaker_dir}/*") + for enroll_wav_file in enroll_wav_files: + if os.path.basename(enroll_wav_file).split(".")[0] != speaker_id: + print(f"{enroll_wav_file} not match") + # 转为wav格式, 名字只保留speaker_id + wav_file = os.path.join(enroll_speaker_dir, speaker_id + ".wav") + os.system(f"ffmpeg -i {enroll_wav_file} {wav_file}") + # 删除原来的文件 + os.system(f"rm -rf {enroll_wav_file}") + +# 记录新的flac文件 +with open("/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/new_flac_files.txt", "w") as f: + for flac_file in new_file: + f.write(flac_file + "\n") \ No newline at end of file diff --git a/data/Whisper-Sidecar-data-metadata/data/test_examples.jsonl b/data/Whisper-Sidecar-data-metadata/data/test_examples.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2849ffb4362d2ce19b8d8d38b84fe0805c1c0fc2 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data/test_examples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf048b062b475972c1e6dad258244680a88a9acfc8841dd8ea284ad56efc3dc +size 794 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..4e331b020395b5af712ff4de4b05c6bc6af5d945 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53fb93ebd24db3e32a7dd7fd5908222f2f0a498a7d0ede6835140c2d4b1f3552 +size 8892480 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350.tsv new file mode 100644 index 0000000000000000000000000000000000000000..b85f5e8558a95858275292f9cb6f65146fc93ea5 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec64b8b840655de399e374931c056dd4b22d4070c8b97cb197863182599af765 +size 889784 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350_targetLingual.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350_targetLingual.tsv new file mode 100644 index 0000000000000000000000000000000000000000..312cc13add57e5b63a0240911efd2f1140d9b559 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350_targetLingual.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:971baf65ee1603b22483ef12a1040d884419077069b68dad7931a305ffe2f7aa +size 632636 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv new file mode 100644 index 0000000000000000000000000000000000000000..c8e6e1a99cd79d9679cc0a2fe11a04520506aef6 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12249076735100be44176352c65ab89a30751ae7ac5621583130674529774d57 +size 6361360 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual_1350.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual_1350.tsv new file mode 100644 index 0000000000000000000000000000000000000000..b4a4679c8eea26a376f6a8c6dd2a71f46f52036e --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual_1350.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:968fd7a78bdca0783e5a2baf7db53f621ed274ba1295f5bd4d2553794a4718da +size 636201 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..620b99f0cb43303f40ba78427fe9739463e79885 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21cb5e6b2243c0ce5a2cf6e431dd1c9c5dc5efb79fb3c2a9d548dbbc335c212f +size 93936028 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv new file mode 100644 index 0000000000000000000000000000000000000000..a400badf704aa3ec62c2c14c994046325c8c108c --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aad7b8718ed09f7e98742caa6c85e496f68e332f73e65a557ecdac06cec8f2a +size 65029852 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1e95569820361328578d040ee3a172938f3f6fba --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e84678170247fd7ef2a06622a32d9d42abf3e1b83e8c4426e4a9d03e44ce72 +size 10851471 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_1350.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_1350.tsv new file mode 100644 index 0000000000000000000000000000000000000000..67193a21e8df640db4654367ca52a17e12248da5 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_1350.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa506c275e319dccec8357afa545cb04a77fbe8db48579248546658351c2590 +size 1085187 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_1350_targetLingual.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_1350_targetLingual.tsv new file mode 100644 index 0000000000000000000000000000000000000000..7cf60b20efad4d614a12f9d4666375964d80ec01 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_1350_targetLingual.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d01a55200a1413693e8c8bb5d24ed36e11e56a8a49f1607613fc1f6b9bc3120 +size 759724 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1910051665cf56ff82fe3c94f9dfbbf66bcd0946 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c109b3299a7540bb54a336176f0b5bf14ae719a79270b674561786dcd3d2fd2c +size 7570211 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual_1350.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual_1350.tsv new file mode 100644 index 0000000000000000000000000000000000000000..bcd04743241d84cc93fbe85f79f788092a95cc92 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual_1350.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4e78e235c28b2942eb1a198e97f19aa4ac2c8bbf134fee2c72cc4740bfe44e +size 757435 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..cb6bed6f2e05799e8d95b5c165958f4d006e87f7 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9c5519a7a236abf99102ee6b29291494e4dcb2af55d343e9db1db106ed19455 +size 122416553 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train_targetLingual.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train_targetLingual.tsv new file mode 100644 index 0000000000000000000000000000000000000000..c71d25f0ec4677f3b3217bdede392f15d931bd37 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train_targetLingual.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4eae93d3edada6ce3273b5b37561a76575f157b16070c39ad071859e206d3bc +size 81906653 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..341bbab03dd9583f4973fc8d9f4fcfbaa53cef9a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f7251d28b23392256aaa607566237762247ff81fa40dd5a90ae6d780a47b41c +size 5227950 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test_1350.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test_1350.tsv new file mode 100644 index 0000000000000000000000000000000000000000..c646302b7da3836ffbf0cf422b3b51e5772c7f88 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test_1350.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b0220a62885edee96adcd238931abd7ae2a4af288ee1e0d15bc67b0685ddbf +size 522097 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..36e2d35545edf13c0cb8654a75c9f386240d7cf2 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3911cbb91e28758c7380c6987c5896f00f0e46308b6fc5a7207d521d65265692 +size 48866960 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/dict.txt b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..d328f2152d8b37a96b3e5470c56fcd6755a7a7ad --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/dict.txt @@ -0,0 +1,5 @@ +1 1 +2 2 +3 3 +4 4 +5 5 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/fix_data.py b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/fix_data.py new file mode 100644 index 0000000000000000000000000000000000000000..e99bbb922d3d39c1fdfa80a4a1bdfced4e97275b --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/fix_data.py @@ -0,0 +1,12 @@ +# change the separation symbol from <\s> or to + +import glob + +for filename in glob.glob('*.tsv'): + with open(filename, 'r') as f: + lines = f.readlines() + with open(filename, 'w') as f: + for line in lines: + line = line.replace('', '') + line = line.replace('<\s>', '') + f.write(line) \ No newline at end of file diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..dfd915af128477b0e041338beffb4444e5288bc0 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd782476290544d5874dadeafa39b8aa215f2c415a2fe1f77bd6cfdb9a4d7d4 +size 1707193 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..a61cb05f48e8d8214376c8983147335d99d17469 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd2227e6677c0d60d1d3c98e179dc800ee6688785ac04cb9c4f92ef137f3a52 +size 554102 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..2a1c019db2337a1a038e2e89c5534895c00287ca --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac66eef4cf3828a4d6947dd98bbe838a3b440b73255117385d001017861197f4 +size 1215967 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..9377fd4f83000e061fad98f1c941df1ea1ed8a37 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3edb59b3c35638832449c9c359cdb35d758fef61d9e93413f7fe04593fce47 +size 1731193 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..c0748346b5c4bbae88d4e5396bf37955876831ed --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d20598993fee48384704a561940f32213e50136d7e67c13446255c4d456a27 +size 48753757 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..e68d701ee681eb66e7fd729ddbad92e44ee37330 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07645227eb5c5b1c2079d9383fc972339fe41faedf5ec8985227c85fb8f88ef +size 14997756 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..80ddbb7f7e712ace45c35874d928794cc747061d --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ae5a2c6bca8656559e2a1df923c5c675e7e880fdae1252e83ecfe1730dd7307 +size 32300627 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..fa2d8d5b23ca5ee52a4e6aaf4d4dac09640da3d5 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:406b6cf55a146f26de8fcd39f997c059b80c24a87190cdb8a46182383b6b7fad +size 49271349 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..43471d732706273d89753b1178d54b16df7f7cb4 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a4e9e2713d4edfa639f7c948c44dd7473246538044487a21f27bb37e9c5057 +size 2122593 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1c14598794235e5fe8505c9c74f908cfeb305b47 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:580a5ec72ecea86a4cdbf4289552033f699d351ae9b4b28c9871df2b8d15bf73 +size 1043335 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5a46772be39c9062e3610b3a7ba46bc14de8fa78 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b069534584568efe90d3c90f522a5423cb1b0196ef43774cdfecdf8e539818f +size 1335726 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..7148a2add22aa6777797235c3e8c43aad0c8e134 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:866e8cd48cbc4a1e4f4bdbbe47677ef26388887b76cee1b98e9ab4d40c37cc7d +size 2137593 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..86f7be4c7213a83220956c8274ccaefdf2e6101a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad0fbaba6567615ed2e71a4fd123800d0c1687103f7cdcea54d5e3147de1581 +size 42611054 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..f9d33a32bc1a931de06c3ec60bb97fdb0e11114d --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84151ac531719332cc40c810748f5b1f5618d5d5a5ebdb3df869f0057ebcf85a +size 19381505 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..2a0ae6d92b15628594f8633d097c5bb8c257fd2f --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0701d55a886c38a652798761074c945cc0b7c49266e514324057436f30617f17 +size 23397918 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..886c976bcb9033414b84f17902c7ba4bbd447ce3 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b552729e38a4f68730dcf254e892228bd5a28d9482269769319592dbc42c3c +size 42956654 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..19d84c7680fc927d96efdba74b2afa6597ed4934 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a7da83295679a995a07db1775afe73e820db8d0e4a7b2a8fa4cfb7b44694d54 +size 1429201 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..4df887bf6aea65b4e9ac9229b25b01389889f7fb --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa83c5427d1658509f7c88476a93a8e71c6b28e06f91c2364c2ce0999b7c30ba +size 443151 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..240f3ea716fbb81154dd96f8ce8732b1df395619 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ecdb004f7f813b33febd551fa047f14c51ea79db38b9e983dbc4538bcc5575c +size 1002591 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_order.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_order.tsv new file mode 100644 index 0000000000000000000000000000000000000000..38023789179139cb20ce8524f4239dbb4ff8563a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_order.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa60e033db5970c6d7d7dd8213322a685b528430c6f622ffb29e1e148f0c05d +size 939769 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..c24ab598b0f838f79ec40969b26d18de4ebb7ed3 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90cb5be6066df8f49d1a60d52b386887e4738bfcc7f297c28355ffc299c68e3a +size 1450145 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..cd3eeeb428e2a29e65b7dbba2c2edf969d25641b --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc631089c416cb1f8f85cb7fac368fc2154ad881821103ab6168c7200e40914 +size 193134514 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1cc74f859fcb822624d7c89bc7fbc646722b4d92 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a39a5e94f1dc225559717e5ea18630824f9fb55a4ed893b45a0d619fefd790f +size 57154620 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..fcda79e59427e3260ba3e3db3bdd98ddfce90f21 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8e3a4235b3d8911f38e6dda8d600fa5731fa94156fe3eea12e91d1982cab43d +size 125481936 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_order.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_order.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5b58d45523414cc2fc7385fbb6224c105e38565d --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_order.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d3e376e9f8861738f6658f1e3a46f3143a3be0693369d9977d4ae37d43563c7 +size 121260123 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5d0c54402fe802c33b24f0b7e9c31332b76c2204 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4020aa3de361138ac6aa9e6b2bda5d5b0a9a5435d67ee85996afc003eedba08 +size 195384426 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..40b3851bd751fbfa40066761c8372ef97de05269 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:315e1c33aeeaaf8e15e6880c23b6fd0c7d3b751b9edf512aa7f83dd9fd8defec +size 1726904 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..6c309aeee035a28a47228a0c4609d525229c0f3c --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f736fe7baf5c4d74dba9814c54c3a59781f0d20a0e4f97095908080ec55d1a84 +size 805156 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..22cdd96afcafbe1a7c571da4320643f49098df3a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54bd8b064f2a4367f8703ae2b48bbb7d67294a9fb9e7b48fa28dce14d71be84b +size 1037649 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_order.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_order.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5957d33591d937d7cdac1da3085b608d25f76c78 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_order.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e476b6d9164638f8f4fb985663518963131a80da9e5798e1ca74c410d007d047 +size 955658 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..f74b36fe825b2a9d69c4f762af6d7458e1217df7 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1408cd311290ea46bb706ce83b277b74371296cdc1fbbddc5529f0d49955eef3 +size 1747728 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..038f7c89825d5ff80569b5396d228a28a3eb5d28 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5beee364fb409e50032387cd6695504c342c2b7005fb0be7685281f70a7e41e8 +size 242713442 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_gender.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_gender.tsv new file mode 100644 index 0000000000000000000000000000000000000000..bf51fe311ad804fb830b44168062258b9350d1db --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_gender.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50912112bea06dffcb59c00ea8d429b3ccce7f121fdffeba94cc51c0c7a944b +size 104772496 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_keyword.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_keyword.tsv new file mode 100644 index 0000000000000000000000000000000000000000..549a6da65ef2dfe0d114dbf3eb9ad7e4b2aa2660 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_keyword.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffeb0f22aa8cdf06d49bf3b8758574802fbab3783e9aa0afd96421091b57cca2 +size 126509909 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_order.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_order.tsv new file mode 100644 index 0000000000000000000000000000000000000000..f4cc662479bc38603cf93e6fc24d446c6ef24eb0 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_order.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5abe1fe2b83fec9db0494715ffa3ca573641103ee630e577384ad51c943d1f1c +size 121342102 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_targetASR.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_targetASR.tsv new file mode 100644 index 0000000000000000000000000000000000000000..654258796fdfe40fffe3236317df9d4ea86e9db9 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_targetASR.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94576fbc9162c6e664c1fd3a01a9a9d8327e08b59dae32b61594ad7132bf5f04 +size 244932674 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5492d510cc55e72cf7d8813d13bcd3824819cb3a --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0753839e606effdc7b480b00adbb5622dbf067d62c9e70f7664215f0a44afbc +size 2366608 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test_clean.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test_clean.tsv new file mode 100644 index 0000000000000000000000000000000000000000..1e8e1fb140f763d5666f4958cf7cba439a1203b0 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test_clean.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6608467e6bbe61d3459e60233f75c26be62fc29eb2e20d43d4f9ebbc192855 +size 1133930 diff --git a/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_train.tsv b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..857d1e829e9d565d88561277eb0355bd5201a0e9 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55fac030be50f6c70fe72089cd75cd5b6916395a5d626ff109e075b57cd3d656 +size 143035279 diff --git a/data/Whisper-Sidecar-data-metadata/generate_multitask_data.py b/data/Whisper-Sidecar-data-metadata/generate_multitask_data.py new file mode 100644 index 0000000000000000000000000000000000000000..3cef92ff7923c07246940afcc72f703e5d75b829 --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/generate_multitask_data.py @@ -0,0 +1,141 @@ +import glob +import random + +tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv") + + + +head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"]) +for tsv in tsv_files: + gender_lines = [] + order_lines = [] + keyword_lines = [] + target_lines = [] + targetLingual_lines = [] + if "gender" in tsv or "order" in tsv or "keyword" in tsv or "librispeech_" in tsv or "target" in tsv: + continue + with open(tsv, "r") as f: + + _ = f.readline() + lines = f.readlines() + # # gender task + # for line in lines: + # genders = line.split("\t")[-1].strip().split("|") + # targets = line.split("\t")[4].split(" ") + # if "M" not in genders or "F" not in genders: + # continue + # else: + # target_gen = "male" if random.random() > 0.5 else "female" + # prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech." + + # new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")] + # new_targets = " ".join(new_targets) + + # new_line = line.strip().split("\t") + # new_line[3] = prompt + # new_line[4] = new_targets + # new_line = "\t".join(new_line) + # gender_lines.append(new_line) + # output_file_gender = tsv.replace(".tsv", "_gender.tsv") + # with open(output_file_gender, "w") as f: + # f.write(head + "\n" + "\n".join(gender_lines)) + + # # order task + # for line in lines: + # if "2mix" in tsv: + # num_spk = 2 + # elif "3mix" in tsv: + # num_spk = 3 + # else: + # continue + # target_idx = random.randint(0, num_spk-1) + # targets = line.split("\t")[4].split(" ") + # new_targets = targets[target_idx] + + # _ = ['first', "second", "third"][target_idx] + # prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text." + + # new_line = line.strip().split("\t") + # new_line[3] = prompt + # new_line[4] = new_targets + # new_line = "\t".join(new_line) + # order_lines.append(new_line) + # output_file_order = tsv.replace(".tsv", "_order.tsv") + # with open(output_file_order, "w") as f: + # f.write(head + "\n" + "\n".join(order_lines)) + + + # # keyword task + # for line in lines: + # # 获取 targets 并将它们拆成单词集合 + # targets = line.split("\t")[4].split(" ") + # # 筛掉长度小于等 + # targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets] + # # 求所有 targets 中的单词集合并 + # all_word_set = set.union(*targets_list) + # if len(all_word_set) == 0: + # continue + # # 找出每个 target 中独有的单词 + # unique_targets_list = [] + # for target_set in targets_list: + # other_targets_union = set.union(*[s for s in targets_list if s != target_set]) + # unique_words = target_set - other_targets_union + # unique_targets_list.append(unique_words) + + # all_unique_word_set = set.union(*unique_targets_list) + # if len(all_unique_word_set) == 0: + # continue + + # sampled_word = random.choice(list(all_unique_word_set)) + # # 找到这个单词属于的集合索引 + # for i, unique_words in enumerate(unique_targets_list): + # if sampled_word in unique_words: + # set_index = i + # break + + + # new_targets = targets[set_index] + # prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.' + + # new_line = line.strip().split("\t") + # new_line[3] = prompt + # new_line[4] = new_targets + # new_line = "\t".join(new_line) + # keyword_lines.append(new_line) + # output_file_keyword = tsv.replace(".tsv", "_keyword.tsv") + # with open(output_file_keyword, "w") as f: + # f.write(head + "\n" + "\n".join(keyword_lines)) + + # # target talker ASR + # this part is processed by the model dataloader + # for line in lines: + # prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section." + # new_line = line.strip().split("\t") + # new_line[3] = prompt + # new_line = "\t".join(new_line) + # target_lines.append(new_line) + # output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv") + # with open(output_file_targetASR, "w") as f: + # f.write(head + "\n" + "\n".join(target_lines)) + + + # target talker lingual + if "de-en-" not in tsv: + continue + for line in lines: + target_lingual = random.choice(["English", "German"]) + prompt = f"Please transcribe the person speaking {target_lingual} from the overlapping speech audio." + new_line = line.strip().split("\t") + new_line[3] = prompt + + target_lingual = "en" if target_lingual=="English" else "de" + tgt_texts = new_line[4].split(" ") + langs = new_line[7].split("|") + tgt_text = [text for text, lang in zip(tgt_texts, langs) if lang == target_lingual] + new_line[4] = " ".join(tgt_text) + + new_line = "\t".join(new_line) + targetLingual_lines.append(new_line) + output_file_targetLingual = tsv.replace(".tsv", "_targetLingual.tsv") + with open(output_file_targetLingual, "w") as f: + f.write(head + "\n" + "\n".join(targetLingual_lines)) \ No newline at end of file diff --git a/data/Whisper-Sidecar-data-metadata/select_prompt_wav.py b/data/Whisper-Sidecar-data-metadata/select_prompt_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc666b085a6f23197efc0ae7940d6082ff91a7e --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/select_prompt_wav.py @@ -0,0 +1,50 @@ +import os +import glob +import random + +import jsonlines + +librispeech_dir = "./dataset/LibriSpeech/*" +enroll_path = "./prompt_enroll_audio/all" + +if not os.path.exists(enroll_path): + os.makedirs(enroll_path) + +speaker_dirs = glob.glob(f"{librispeech_dir}/*") +new_files = [] +for speaker_dir in speaker_dirs: + if not "train-" in speaker_dir and not "test-" in speaker_dir and not "dev-" in speaker_dir: + continue + speaker_id = os.path.basename(speaker_dir) + enroll_speaker_dir = os.path.join(enroll_path, speaker_id) + if not os.path.exists(enroll_speaker_dir): + os.makedirs(enroll_speaker_dir, exist_ok=True) + # Check whether each speaker in librispeech_dir exists in enroll_path + if len(glob.glob(f"{enroll_speaker_dir}/*.wav")) == 0: + # Copy an audio file from librispeech_dir to enroll_speaker_dir + flac_files = glob.glob(f"{speaker_dir}/*/*.flac") + try: + flac_files = random.sample(flac_files, 5) + except: + print() + print(flac_files) + print(speaker_dir) + for flac_file in flac_files: + new_flac_file = os.path.join(enroll_speaker_dir, os.path.basename(flac_file)) + # print(new_flac_file) + os.system(f"cp {flac_file} {new_flac_file}") + # Record the name of the flac_file + new_files.append(flac_file) + else: + print(glob.glob(f"{enroll_speaker_dir}/*.wav")) + + # Check whether the voice naming of the speaker in enroll_path is {speaker_id}.wav + enroll_wav_files = glob.glob(f"{enroll_speaker_dir}/*") + for i, enroll_wav_file in enumerate(enroll_wav_files): + wav_file = os.path.join(enroll_speaker_dir, speaker_id + f"_{i}.wav") + os.system(f"ffmpeg -i {enroll_wav_file} -t 3 {wav_file}") + os.system(f"rm -rf {enroll_wav_file}") + +# Record the original name of the enroll wav +with open(f"{enroll_path}/enrolled_wavs.txt", "w") as f: + f.write("\n".join(new_files)) \ No newline at end of file diff --git a/data/Whisper-Sidecar-data-metadata/stat.py b/data/Whisper-Sidecar-data-metadata/stat.py new file mode 100644 index 0000000000000000000000000000000000000000..1465dd38c1b5d2bd0e4ceae15f31ef845edcb2de --- /dev/null +++ b/data/Whisper-Sidecar-data-metadata/stat.py @@ -0,0 +1,56 @@ +import glob + +tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv") +head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"]) + +tsv_files = [ + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_train.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train_targetLingual.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_train_fixed.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_fixed.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_fixed.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_fixed.tsv", + "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_fixed.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test_fixed.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_fixed.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_fixed.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_fixed.tsv", + # "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_fixed.tsv", +] + +all_count = 0.0 +for tsv in tsv_files: + if "test" in tsv: + continue + with open(tsv, "r") as f: + lines = f.readline() + lines = f.readlines() + print(tsv) + print(len(lines)) + count = 0.0 + max_sec = 0 + new_lines = [] + for line in lines: + new_line = line.strip().split("\t") + # if float(new_line[2])/16000 > 40: + # continue + new_lines.append("\t".join(new_line)) + # max_sec = max_sec if max_sec > float(new_line[2])/16000 else float(new_line[2])/16000 + count += float(new_line[2])/16000 + # with open(tsv, "w") as f: + # f.write(head + "\n" + "\n".join(new_lines)) + + print(count/60/60) + print(max_sec) + print() + all_count += count/60/60 + +print(all_count) \ No newline at end of file diff --git a/data/de-en-mix/README.md b/data/de-en-mix/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ca891fb064be75ad3dd329a5ddc4c8d109489254 --- /dev/null +++ b/data/de-en-mix/README.md @@ -0,0 +1,5 @@ +de is from commonvoice, `/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_train.tsv` + +en is from librispeech, `/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_train.tsv` + +only kept 1350 test samples. \ No newline at end of file diff --git a/data/de-en-mix/de-en-2mix_test.tar.gz b/data/de-en-mix/de-en-2mix_test.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..94ac841220cf090ccfb3f4a902bd5c47dce36b5b --- /dev/null +++ b/data/de-en-mix/de-en-2mix_test.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821f2704b552598bf8add0f62675a65e4738782a2d5d73b17248fe845cd6612f +size 319824887 diff --git a/data/de-en-mix/de-en-3mix_test.tar.gz b/data/de-en-mix/de-en-3mix_test.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..5ffc0198718565a8db72094be1b0c237e8f42592 --- /dev/null +++ b/data/de-en-mix/de-en-3mix_test.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c6a49566217d47b00f084a97330315077765d623fdc79f639d2472409092ab +size 452491757 diff --git a/data/de-en-mix/de_test.tar.gz b/data/de-en-mix/de_test.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..f9cc76b36cd34f0874ff46c8bfed508e865746f4 --- /dev/null +++ b/data/de-en-mix/de_test.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2173a5c71ddb6ed276cf5f4a46e7ffb5a1bf531c2c277de62861fdd9e73ed680 +size 179623462 diff --git a/data/de-en-mix/generate_metadate.py b/data/de-en-mix/generate_metadate.py new file mode 100644 index 0000000000000000000000000000000000000000..2abae4824e00c39515fd7acb87e2c1347f8f6764 --- /dev/null +++ b/data/de-en-mix/generate_metadate.py @@ -0,0 +1,92 @@ +import os +import json +import random +import soundfile as sf +from tqdm import tqdm + +# # # de-en-2mix +# subset = "test" +# tsv_path = f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_{subset}.tsv" +# tsv_librispeech_path = f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_{subset}.tsv" +# output_jsonl_path = f"/valleblob/v-lingmeng/speech/data/de-en-mix/metadata/de-en-2mix_{subset}.jsonl" + +# with open(tsv_path, "r") as f1, open(tsv_librispeech_path, "r") as f2: +# lines = f1.readline() +# lines = f1.readlines() +# ls_lines = f2.readline() +# ls_lines = f2.readlines() + +# new_lines = [] +# count = 0 +# for i, line in enumerate(lines): +# new_line = {} +# ls_line = random.choice(ls_lines).strip().split("\t") +# group = [line.strip().split("\t"), ls_line] +# idx = random.randint(0, 1) +# new_line["mixed_wav"] = f'/valleblob/v-lingmeng/speech/data/de-en-mix/de-en-2mix_{subset}/{group[idx][0].split(".")[0].replace("_", "-")}_{group[1-idx][0].split(".")[0].replace("_", "-")}.wav' +# new_line["wavs"] = [group[idx][1], group[1-idx][1]] +# wav_path = group[idx][1] +# wav, _ = sf.read(wav_path) +# assert _ == 16000 +# delay = round(random.uniform(0, int(len(wav)/16000)), 3) +# new_line["delays"] = [0.0, delay] +# # print(str(new_line)) +# new_lines.append(json.dumps(new_line)) +# # count += 1 +# # if count == 3000: +# # break + + +# with open(output_jsonl_path, "w") as f: +# f.write("\n".join(new_lines)) + + +################## + + + +# de-en-3mix +subset = "train" +tsv_path = f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_{subset}.tsv" +tsv_librispeech_path = f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_{subset}.tsv" +output_jsonl_path = f"/valleblob/v-lingmeng/speech/data/de-en-mix/metadata/de-en-3mix_{subset}.jsonl" + +with open(tsv_path, "r") as f1, open(tsv_librispeech_path, "r") as f2: + lines = f1.readline() + lines = f1.readlines() + ls_lines = f2.readline() + ls_lines = f2.readlines() + +new_lines = [] +count = 0 +for i, line in tqdm(enumerate(lines)): + new_line = {} + ls_line_ = random.sample(ls_lines, 2) + group = [line.strip().split("\t"), ls_line_[0].strip().split("\t"), ls_line_[1].strip().split("\t")] + random.shuffle(group) + + new_line["mixed_wav"] = f'/valleblob/v-lingmeng/speech/data/de-en-mix/de-en-3mix_{subset}/{group[0][0].split(".")[0].replace("_", "-")}_{group[1][0].split(".")[0].replace("_", "-")}_{group[2][0].split(".")[0].replace("_", "-")}.wav' + new_line["wavs"] = [group[0][1], group[1][1], group[2][1]] + wav_path = group[0][1] + wav, _ = sf.read(wav_path) + wav1_len = int(len(wav)/16000) + assert _ == 16000 + delay1 = round(random.uniform(0, wav1_len), 3) + + wav_path = group[1][1] + wav, _ = sf.read(wav_path) + assert _ == 16000 + wav2_len = int(len(wav)/16000) + max_wav_len = max(wav2_len+delay1, wav1_len) + delay2 = round(random.uniform(delay1+0.5, max_wav_len), 3) + + new_line["delays"] = [0.0, delay1, delay2] + # print(str(new_line)) + new_lines.append(json.dumps(new_line)) + # count += 1 + # if count == 3000: + # break + + +with open(output_jsonl_path, "w") as f: + f.write("\n".join(new_lines)) \ No newline at end of file diff --git a/data/de-en-mix/generate_tsv.py b/data/de-en-mix/generate_tsv.py new file mode 100644 index 0000000000000000000000000000000000000000..7a38949fbe4440debb3c02ff23f385db9b876cc3 --- /dev/null +++ b/data/de-en-mix/generate_tsv.py @@ -0,0 +1,51 @@ +import json +import soundfile as sf +import pandas as pd +from tqdm import tqdm + +n_spks = 3 +subset = "train" +jsonl_file = f"/valleblob/v-lingmeng/speech/data/de-en-mix/metadata/de-en-{n_spks}mix_{subset}.jsonl" +en_df = pd.read_csv(f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_{subset}.tsv", delimiter='\t', encoding='utf-8') +de_df = pd.read_csv(f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_{subset}.tsv", delimiter='\t', encoding='utf-8') +output_tsv = f"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-{n_spks}mix_{subset}.tsv" + +head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"]) + +with open(jsonl_file, "r") as f: + lines = f.readlines() + +new_lines = [] +for line in tqdm(lines): + json_dict = json.loads(line) + + id_ = json_dict['mixed_wav'].split("/")[-1].split(".wav")[0] + audio = json_dict['mixed_wav'] + n_frames = str(len(sf.read(audio)[0])) + prompt = '"Transcribe the given audio into text, capturing the utterances of multiple speakers in English or German. List the utterances in the order of their start times, separated by "".' + source_ids = id_.split("_") + # print(de_df["id"].apply(lambda x: x.split(".")[0].replace("-","_")) ) + # print(source_ids[1].split(".")[0]) + # print(de_df[de_df["id"].apply(lambda x: x.split(".")[0].replace("_","-")) == source_ids[1].split(".")[0]]["tgt_text"]) + + tgt_texts = [ + de_df[de_df["id"] == source_id.replace("-", "_")]["tgt_text"].iloc[0] if "common" in source_id else en_df[en_df["id"] == source_id+".flac"]["tgt_text"].iloc[0] + for source_id in source_ids + ] + tgt_text = " ".join(tgt_texts) + codec = "None" + with_speech = "True" + language = "|".join(["de" if "common" in source_id else "en" for source_id in source_ids ]) + speakers = [ + de_df[de_df["id"] == source_id.replace("-", "_")]["speakers"].iloc[0] if "common" in source_id else en_df[en_df["id"] == source_id+".flac"]["speakers"].iloc[0] + for source_id in source_ids + ] + speakers = "|".join([str(s) for s in speakers]) + genders = "None" + + new_line = "\t".join([id_, audio, n_frames, prompt, tgt_text, codec, with_speech, language, speakers, genders]) + new_lines.append(new_line) + +with open(output_tsv, "w") as f: + new_lines.insert(0, head) + f.write("\n".join(new_lines)) diff --git a/data/de-en-mix/generate_wav_from_meta_jsonl.py b/data/de-en-mix/generate_wav_from_meta_jsonl.py new file mode 100644 index 0000000000000000000000000000000000000000..7c676e81b0c4fe795cd9c469e127c96ca56078cf --- /dev/null +++ b/data/de-en-mix/generate_wav_from_meta_jsonl.py @@ -0,0 +1,60 @@ +import json +import os +import glob +import soundfile +import librosa +from tqdm import tqdm +import numpy as np + +def get_delayed_audio(wav_file, delay, sampling_rate=16000): + audio, _ = soundfile.read(wav_file) + delay_frame = int(delay * sampling_rate) + if delay_frame != 0: + audio = np.append(np.zeros(delay_frame), audio) + return audio + +def mix_audio(wav_files, delays, weight): + for i, wav_file in enumerate(wav_files): + if i == 0: + audio = get_delayed_audio(wav_file, delays[i]) + audio = weight[i] * audio + + else: + additional_audio = get_delayed_audio(wav_file, delays[i]) + additional_audio = weight[i] * additional_audio + # tune length & sum up to audio + target_length = max(len(audio), len(additional_audio)) + # print(additional_audio.shape) + audio = librosa.util.fix_length(audio, size=target_length) + additional_audio = librosa.util.fix_length(additional_audio, size=target_length) + audio = audio + additional_audio + return audio + + +jsonl_path = "/valleblob/v-lingmeng/speech/data/de-en-mix/metadata/de-en-3mix_train.jsonl" +librispeech_dir = f"" +output_dir = "" + + +with open(jsonl_path, 'r', encoding='utf-8') as file: + json_list = [json.loads(line.strip()) for line in file] + +for line in tqdm(json_list): + audio_name = output_dir + line['mixed_wav'] + _audio_name = audio_name.split("/")[-1].split("_") + weight = [1.0 if "common" in _name else 0.3 for _name in _audio_name] # 德语声音非常小 + + if not os.path.exists(os.path.split(audio_name)[0]): + os.makedirs(os.path.split(audio_name)[0]) + + source_ids = line['wavs'] + source_files = [librispeech_dir + i.replace(".wav", ".flac") for i in source_ids] + delays = line["delays"] + # ends = [l["end"] for l in line['sentences']] + + # for source_audio, start, end in zip(source_auidos, starts, ends): + mixed_audio = mix_audio(source_files, delays, weight) + soundfile.write(audio_name, mixed_audio, samplerate=16000) + # print(audio_name) + +# print(delays) diff --git a/data/de-en-mix/metadata/de-en-2mix_test.jsonl b/data/de-en-mix/metadata/de-en-2mix_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef4192a77333813df22bd6d918be70fa56a548d8 --- /dev/null +++ b/data/de-en-mix/metadata/de-en-2mix_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dfc774785b5a6f4a886c993b9150c6b43cd4fdc8f578db0b9fe2adb832aa08a +size 4911835 diff --git a/data/de-en-mix/metadata/de-en-2mix_train.jsonl b/data/de-en-mix/metadata/de-en-2mix_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..55635d9602120feec6529bf719765ada00cb7912 --- /dev/null +++ b/data/de-en-mix/metadata/de-en-2mix_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51190146b9cdcd9f450f4644f619f972ea498e74a672733e14d14a809f1abe12 +size 47206360 diff --git a/data/de-en-mix/metadata/de-en-3mix_test.jsonl b/data/de-en-mix/metadata/de-en-3mix_test.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e05fbedf3420fbfe6bed0f0622d17bd581237563 --- /dev/null +++ b/data/de-en-mix/metadata/de-en-3mix_test.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2e30c4cc607151c514d4716fcdf1da5c8ee51deb79548d4a4d7e0fb81126ee +size 6637883 diff --git a/data/de-en-mix/metadata/de-en-3mix_train.jsonl b/data/de-en-mix/metadata/de-en-3mix_train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3be404e1f39bd0861a1cd6cdad3435084f18b19b --- /dev/null +++ b/data/de-en-mix/metadata/de-en-3mix_train.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7fbad74fc6a0ca52783b0498d1556a57e0bc3162ae28028485ff4096ee5b6b6 +size 64220827 diff --git a/data/reference_enroll_audio.tar.gz b/data/reference_enroll_audio.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..c0114e9ff8e555bb1ce2c7191bfd641a3be60927 --- /dev/null +++ b/data/reference_enroll_audio.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a481aa7bcbc9eb8c27c48ac8d1a8f69d2bd1f6f41558547d4a3511a25de203 +size 968316801 diff --git a/llama_model/llama-2-7b-chat/checklist.chk b/llama_model/llama-2-7b-chat/checklist.chk new file mode 100644 index 0000000000000000000000000000000000000000..150da5509a7461cf6c16a6e1acb0ca6329249b66 --- /dev/null +++ b/llama_model/llama-2-7b-chat/checklist.chk @@ -0,0 +1,2 @@ +0c4837f3ef97f648452f91faed308a07 consolidated.00.pth +1c39bc3c6b51079fd807cc105b86c9df params.json diff --git a/llama_model/llama-2-7b-chat/consolidated.00.pth b/llama_model/llama-2-7b-chat/consolidated.00.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1e414306a4aa4a7d54985c55a655307299da8c4 --- /dev/null +++ b/llama_model/llama-2-7b-chat/consolidated.00.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6234f92a9191a4887b65a7f14a9692b4af3beffa2a26359869daf36bdf71b8d8 +size 13476925163 diff --git a/llama_model/llama-2-7b-chat/params.json b/llama_model/llama-2-7b-chat/params.json new file mode 100644 index 0000000000000000000000000000000000000000..e4cef7be9cf058c8a44bb166ea0adbcf94173f97 --- /dev/null +++ b/llama_model/llama-2-7b-chat/params.json @@ -0,0 +1 @@ +{"dim": 4096, "multiple_of": 256, "n_heads": 32, "n_layers": 32, "norm_eps": 1e-06, "vocab_size": -1} diff --git a/llama_model/llama/tokenizer.model b/llama_model/llama/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/llama_model/llama/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/llama_model/llama/tokenizer_checklist.chk b/llama_model/llama/tokenizer_checklist.chk new file mode 100644 index 0000000000000000000000000000000000000000..4531f05cde0f2f2cb2d44055cf08e1d467d40196 --- /dev/null +++ b/llama_model/llama/tokenizer_checklist.chk @@ -0,0 +1 @@ +eeec4125e9c7560836b4873b6f8e3025 tokenizer.model