Lingwei Meng committed on
Commit ·
c52df1b
1
Parent(s): c3e45ef
add data
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +2 -0
- README.md +21 -3
- data/Whisper-Sidecar-data-metadata/convert_to_wavllm_data_format.py +42 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix2_dev.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test20.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix2_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix3_dev.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix3_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/aishell1mix3_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/data_prepare_aishellmix.py +71 -0
- data/Whisper-Sidecar-data-metadata/data/data_prepare_librimix.py +49 -0
- data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeech.py +35 -0
- data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeechmix.py +53 -0
- data/Whisper-Sidecar-data-metadata/data/generate_librimix_wav_from_jsonl.py +60 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_dev-both.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_dev.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_test-both.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_test20.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100-both.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train-200.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train-both.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train20.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri2mix_train_remove_enroll.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri3mix_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri3mix_test20.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/libri3mix_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test_30s.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech2mix_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test_temp.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech3mix_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech_dev.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech_test.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/librispeech_train.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data/long_wav_resample.py +52 -0
- data/Whisper-Sidecar-data-metadata/data/select_prompt_wav.py +132 -0
- data/Whisper-Sidecar-data-metadata/data/test_examples.jsonl +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350_targetLingual.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual_1350.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv +3 -0
- data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.tsv filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,21 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## 代码
|
| 2 |
+
https://github.com/XiaoshanHsj/speechllm/tree/multispk_lingmeng
|
| 3 |
+
|
| 4 |
+
看代码的README_lingmeng.md
|
| 5 |
+
|
| 6 |
+
## 数据:
|
| 7 |
+
英文部分: `./data/Whisper-Sidecar-data-metadata/data_for_wavllm`
|
| 8 |
+
|
| 9 |
+
德文相关: `./data/de-en-mix`
|
| 10 |
+
|
| 11 |
+
targetASR (target-talker ASR)用到的reference audio: `./data/reference_enroll_audio/all`
|
| 12 |
+
|
| 13 |
+
英文部分只有metadata,可以从librispeech生成。德文部分还备份了测试集音频。
|
| 14 |
+
|
| 15 |
+
## 模型:
|
| 16 |
+
tokenizer: `./llama_model/llama/tokenizer.model`
|
| 17 |
+
|
| 18 |
+
llama-2-chat: `./llama_model/llama-2-7b-chat/consolidated.00.pth`
|
| 19 |
+
|
| 20 |
+
训练好的MT-LLM模型目录: `./lingmeng_multispk_multitask_retrain_speechllm_v0.1_llama2_chat_wavlm_weighted_update_lora_32_32_prompt_build_multispk_multitask_de.yaml_16gpu_1accum`
|
| 21 |
+
|
data/Whisper-Sidecar-data-metadata/convert_to_wavllm_data_format.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Convert a Whisper-Sidecar jsonl metadata file into the WavLLM tsv format.
#
# Output columns: id, audio, n_frames, prompt, tgt_text, codec, with_speech,
# language, speakers, genders (tab-separated, one header row).

import json

import soundfile as sf
from tqdm import tqdm

input_jsonl = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl"
output_tsv = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data_for_wavllm/" + input_jsonl.split("/")[-1].replace(".jsonl", "1.tsv")
print(output_tsv)
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])
prompts = ['Transcribe the given audio into text. If multiple speakers are speaking, transcribe the utterances of multiple speakers in the order of their start times, separated by "<sc>".']


# Speaker id -> gender ("M"/"F") from the official LibriSpeech SPEAKERS.TXT
# ("|"-separated fields; lines starting with ";" are comments).
with open("/home/v-lingmeng/datasets/LibriSpeech/SPEAKERS.TXT", "r") as f:
    speaker_info = f.readlines()
speaker_gender = {l.split("|")[0].strip(): l.split("|")[1].strip() for l in speaker_info if not l.startswith(";")}

with open(input_jsonl, "r") as f:
    lines = f.readlines()

new_lines = []
for line in tqdm(lines):
    line = json.loads(line.strip())
    audio = line['audio']['path'].replace("./dataset", "/valleblob/v-lingmeng/speech/data")
    wav_id = audio.split("/")[-1]
    # fixed: sf.info reads only the header to get the frame count; the
    # original sf.read(audio)[0].shape[0] decoded every sample of every file.
    n_frames = str(sf.info(audio).frames)
    prompt = prompts[0]
    tgt_text = line["sentence"]
    codec = "None"
    with_speech = "True"
    language = "en"
    if "speakers" in line:
        speakers = "|".join(line["speakers"])
    else:
        # Fall back to parsing speaker ids out of the mixture wav name,
        # e.g. "1089-134686-0000_121-121726-0000.wav" -> "1089|121".
        speakers = "|".join([_id.split("-")[0] for _id in wav_id.split("_")])
    genders = "|".join([speaker_gender[spk] for spk in speakers.split("|")])

    new_line = "\t".join([wav_id, audio, n_frames, prompt, tgt_text, codec, with_speech, language, speakers, genders])
    new_lines.append(new_line)

with open(output_tsv, "w") as f:
    new_lines.insert(0, head)
    f.write("\n".join(new_lines))

# print(speaker_gender)
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix2_dev.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:132bd6f6c6e353bca38831088bbe2eae65ce67635b5cc204249bd070f9c56e2a
|
| 3 |
+
size 2460550
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53b0d3301937ee8affd9a7da43c8c7940dc6f632b1594efa7de4b7faa4524f5b
|
| 3 |
+
size 1483980
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix2_test20.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53b0d3301937ee8affd9a7da43c8c7940dc6f632b1594efa7de4b7faa4524f5b
|
| 3 |
+
size 1483980
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix2_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a612cd6616cdb756e6c2ad587eebad584e76a1101e754d220bb01b22224c0221
|
| 3 |
+
size 27055103
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix3_dev.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bbe7c57288ad1565ffd0d630004508a26b48204b189cf30dd8a0f8f295e01a1
|
| 3 |
+
size 2870838
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix3_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2d2fb69a6fbe3cd6c89f89a002e28e01944d96c3d85a6cd240fcb52bfbd8ec2
|
| 3 |
+
size 1442387
|
data/Whisper-Sidecar-data-metadata/data/aishell1mix3_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f583f9f8ff84df2ed1eb4da4dfd3a3d615d87c91bd4a3fda34dcf565028e76c
|
| 3 |
+
size 23867370
|
data/Whisper-Sidecar-data-metadata/data/data_prepare_aishellmix.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build Whisper-Finetune style jsonl metadata for Aishell1Mix{2,3}
# from the LibriMix-style mixture-metadata csv files.
# (prepared from msra-dev-node)

import glob
import os

import jsonlines
import numpy as np
import pandas as pd
import soundfile as sf


def generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=False):
    """Write one jsonl per (num_spk, split) for Aishell1Mix.

    Each record carries the mixture wav path, language, duration (longest
    source), speaker ids, the "</s>"-joined transcripts, and per-source
    "sentences" entries (start/end/text).  ``with_timestamps`` is accepted
    for interface parity with the sibling scripts; sentences are always
    emitted here (every source starts at t=0 in these mixtures).
    """
    splits = ['test', 'dev', 'train']
    num_spks = ["2", "3"]

    # Load the AISHELL-1 transcript once: each line is "<utt_id> <tokens...>".
    transcript_path = '/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/aishell1/data_aishell/transcript/aishell_transcript_v0.8.txt'
    # fixed: use a context manager (the original leaked the open file handle)
    with open(transcript_path, 'r') as f:
        id_text_dict = {}
        for line in f:
            _id, *text = line.strip().split(' ')
            id_text_dict[_id] = ''.join(text)

    for num_spk in num_spks:
        for split in splits:
            metadata = os.path.join(root_data_dir, 'Aishell1Mix', "data", f'Aishell1Mix{num_spk}', 'wav16k', 'max', 'metadata', f'mixture_{split}_mix_clean.csv')
            df = pd.read_csv(metadata)
            mix_id_list = df['mixture_ID'].tolist()
            mix_path_list = df['mixture_path'].tolist()
            source_wav_root = os.path.join("/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/aishell1/data_aishell/wav", split)
            new_jsonl = os.path.join(output_dir, f'aishell1mix{num_spk}_{split}.jsonl')
            if os.path.exists(new_jsonl):
                os.remove(new_jsonl)  # records are appended below; start fresh

            # fixed: open the writer once per output file instead of
            # re-opening it for every single record.
            with jsonlines.open(new_jsonl, mode='a') as writer:
                for mix_id, mix_path in zip(mix_id_list, mix_path_list):
                    source_ids = mix_id.split('_')
                    source_texts = [id_text_dict[source_id] for source_id in source_ids]
                    source_text = '</s>'.join(source_texts)

                    # Utterance ids look like "BAC009S0002W0122" -> speaker "0002".
                    speakers = [source_id.split("S")[1].split("W")[0] for source_id in source_ids]

                    source_paths = [os.path.join(source_wav_root, "S" + str(speakers[i]), source_id + '.wav') for i, source_id in enumerate(source_ids)]
                    # sf.info only reads the wav header, so this stays cheap.
                    source_durations = [sf.info(source_path).duration for source_path in source_paths]

                    duration = max(source_durations)
                    dic = {"audio": {"path": mix_path},
                           "language": "zh",
                           "duration": duration,
                           "speakers": speakers,
                           "sentence": source_text}

                    # All sources start at t=0; each ends after its own duration.
                    starts = [0] * len(source_durations)
                    ends = source_durations
                    dic["sentences"] = [{"start": starts[i], "end": ends[i], "text": source_texts[i]}
                                        for i in range(len(starts))]

                    writer.write(dic)


if __name__ == '__main__':
    root_data_dir = '/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset'
    output_dir = '/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/'
    generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=True)
|
data/Whisper-Sidecar-data-metadata/data/data_prepare_librimix.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build Whisper-Finetune style jsonl metadata for Libri{2,3}Mix from
# fairseq-style per-split jsonl metadata plus .wrd transcript files.
# (prepared from SEPC)

import glob
import os

import jsonlines
# import soundfile as sf
import numpy as np


def generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir):
    """Write one jsonl per (num_spk, split) for LibriMix.

    Each record carries the mixture wav path, language, duration (longest
    source), speaker ids, the lower-cased "</s>"-joined transcript, and
    per-source "sentences" entries (start/end/text) computed from the
    fairseq delays and durations.
    """
    splits = ['test', 'dev', 'train']
    num_spks = ["2", "3"]

    for num_spk in num_spks:
        for split in splits:
            data_dir = root_data_dir + f'Libri{num_spk}Mix_wav16k_max/'
            wrd = data_dir + split + '.wrd'
            fairseq_jsonl = data_dir + split + '_clean.jsonl'
            new_jsonl = output_dir + f"libri{num_spk}mix_" + split + '.jsonl'
            if os.path.exists(new_jsonl):
                os.remove(new_jsonl)  # records are appended below; start fresh

            # fixed: open the output writer once per file instead of
            # re-opening it for every single record.
            with jsonlines.open(fairseq_jsonl) as reader, \
                    open(wrd, 'r') as f, \
                    jsonlines.open(new_jsonl, mode='a') as writer:
                # .wrd lines pair 1:1 with the fairseq jsonl records.
                for meta, text in zip(reader, f):
                    dic = {"audio": {"path": f"./dataset/LibriMix/data/Libri{num_spk}Mix/wav16k/max/" + meta['mixed_wav']},
                           "language": "en",
                           "duration": max(meta['durations']),
                           "speakers": meta['speakers'],
                           "sentence": text.strip().lower()}

                    starts = meta['delays']
                    durations = meta['durations']
                    ends = list(map(lambda x, y: x + y, starts, durations))
                    # Per-source transcripts are "</s>"-separated in the .wrd line.
                    texts = text.strip().lower().split(" </s> ")
                    dic["sentences"] = [{"start": starts[i], "end": ends[i], "text": texts[i].strip().lower()}
                                        for i in range(len(starts))]

                    writer.write(dic)


if __name__ == '__main__':
    root_data_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/egs_wav2vec/data/'
    output_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/Whisper-Finetune-ovlp/dataset/'
    generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir)
|
data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeech.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build Whisper-Finetune style jsonl metadata for (single-speaker)
# LibriSpeech from fairseq .tsv (path\tnum_samples) + .wrd transcript files.
# (prepared from SEPC)

import glob
import os

import jsonlines
# import soundfile as sf
import numpy as np


def generate_jsonl_from_fairseq_datafile(data_dir, output_dir):
    """Write one jsonl per split with flac path, duration (seconds at 16 kHz)
    and the lower-cased transcript."""
    splits = ['test']

    for split in splits:
        wrd = data_dir + split + '.wrd'
        # fairseq_jsonl = data_dir + split + '_clean.jsonl'
        tsv = data_dir + split + '.tsv'
        new_jsonl = output_dir + split + '.jsonl'
        # fixed: records are appended below, so remove any stale output first
        # (the original duplicated every record on re-runs; the sibling
        # prepare scripts already do this).
        if os.path.exists(new_jsonl):
            os.remove(new_jsonl)

        # fixed: open the writer once per file instead of per record.
        with open(tsv, 'r') as flac_path_f, \
                open(wrd, 'r') as trans_f, \
                jsonlines.open(new_jsonl, mode='a') as writer:
            # skip the tsv header line (the audio root directory)
            flac_path_f.readline()
            for flac_path, trans in zip(flac_path_f, trans_f):
                flac_path, duration = flac_path.strip().split('\t')
                dic = {"audio": {"path": "./dataset/librispeech/" + flac_path},
                       "language": "en",
                       # the tsv stores a sample count; LibriSpeech is 16 kHz
                       "duration": int(duration) / 16000.0,
                       "sentence": trans.strip().lower()}
                writer.write(dic)


if __name__ == '__main__':
    data_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/egs_wav2vec/data/LibriSpeech/'
    output_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/Whisper-Finetune/dataset/'
    generate_jsonl_from_fairseq_datafile(data_dir, output_dir)
|
data/Whisper-Sidecar-data-metadata/data/data_prepare_librispeechmix.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build Whisper-Finetune style jsonl metadata for LibriSpeechMix-{2,3}spkr
# from fairseq-style per-split jsonl metadata plus .wrd transcript files.
# (prepared from SEPC)

import glob
import os

import jsonlines
# import soundfile as sf
import numpy as np


def generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=False):
    """Write one jsonl per (num_spk, split) for LibriSpeechMix.

    Each record carries the mixture wav path, language, duration (latest
    delay+duration over all sources), speaker ids, and the lower-cased
    transcript.  When ``with_timestamps`` is true, per-source "sentences"
    entries (start/end/text) are added and "_timestamps_" goes into the
    output filename.
    """
    splits = ['test', 'dev', 'train']
    num_spks = ["2", "3"]
    for num_spk in num_spks:
        for split in splits:
            data_dir = root_data_dir + f'LibriSpeechMix-{num_spk}spkr/'
            wrd = data_dir + split + '.wrd'
            fairseq_jsonl = data_dir + split + '_clean.jsonl'
            if with_timestamps:
                new_jsonl = output_dir + f"librispeech{num_spk}mix_timestamps_" + split + '.jsonl'
            else:
                new_jsonl = output_dir + f"librispeech{num_spk}mix_" + split + '.jsonl'

            if os.path.exists(new_jsonl):
                os.remove(new_jsonl)  # records are appended below; start fresh

            # fixed: open the output writer once per file instead of
            # re-opening it for every single record.
            with jsonlines.open(fairseq_jsonl) as reader, \
                    open(wrd, 'r') as f, \
                    jsonlines.open(new_jsonl, mode='a') as writer:
                # .wrd lines pair 1:1 with the fairseq jsonl records.
                for meta, text in zip(reader, f):
                    dic = {"audio": {"path": "./dataset/LibriSpeechMix/" + meta['mixed_wav']},
                           "language": "en",
                           # mixture length = latest source end (delay + duration)
                           "duration": max(list(map(lambda x, y: x + y, meta['durations'], meta['delays']))),
                           "speakers": meta['speakers'],
                           "sentence": text.strip().lower(),
                           }
                    if with_timestamps:
                        starts = meta['delays']
                        durations = meta['durations']
                        ends = list(map(lambda x, y: x + y, starts, durations))
                        texts = meta['texts']
                        dic["sentences"] = [{"start": starts[i], "end": ends[i], "text": texts[i].strip().lower()}
                                            for i in range(len(starts))]

                    writer.write(dic)


if __name__ == '__main__':
    root_data_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/egs_wav2vec/data/'
    output_dir = '/mnt/users/hccl.local/lmeng/workspaces/overlapASR/Whisper-Finetune-ovlp/dataset/'
    generate_jsonl_from_fairseq_datafile(root_data_dir, output_dir, with_timestamps=True)
|
data/Whisper-Sidecar-data-metadata/data/generate_librimix_wav_from_jsonl.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Regenerate LibriMix mixture wavs from a jsonl metadata file by mixing the
# original LibriSpeech source utterances with the recorded per-source delays.

import glob
import json
import os

import librosa
import numpy as np
import soundfile
from tqdm import tqdm


def get_delayed_audio(wav_file, delay, sampling_rate=16000):
    """Read ``wav_file`` and left-pad it with ``delay`` seconds of silence."""
    audio, _ = soundfile.read(wav_file)
    delay_frame = int(delay * sampling_rate)
    if delay_frame != 0:
        audio = np.append(np.zeros(delay_frame), audio)
    return audio


def mix_audio(wav_files, delays):
    """Sum the delayed source signals, zero-padding all to the longest one.

    ``wav_files`` and ``delays`` pair 1:1 and must be non-empty.
    """
    for i, wav_file in enumerate(wav_files):
        if i == 0:
            audio = get_delayed_audio(wav_file, delays[i])
        else:
            additional_audio = get_delayed_audio(wav_file, delays[i])
            # pad both to a common length, then sum
            target_length = max(len(audio), len(additional_audio))
            audio = librosa.util.fix_length(audio, size=target_length)
            additional_audio = librosa.util.fix_length(additional_audio, size=target_length)
            audio = audio + additional_audio
    return audio


jsonl_path = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/libri3mix_train.jsonl"
if "test" in jsonl_path:
    subset = "test"
elif "train" in jsonl_path:
    subset = "train"
else:
    subset = "dev"
# glob pattern: matches e.g. test-clean/, train-clean-100/, dev-other/ ...
librispeech_dir = f"/home/v-lingmeng/datasets/LibriSpeech/{subset}*/"
output_dir = "/home/v-lingmeng/datasets"


with open(jsonl_path, 'r', encoding='utf-8') as file:
    json_list = [json.loads(line.strip()) for line in file]

for line in tqdm(json_list):
    audio_name = line['audio']['path'].replace("./dataset", output_dir)
    # fixed: exist_ok avoids the exists()/makedirs() race of the original
    os.makedirs(os.path.split(audio_name)[0], exist_ok=True)

    # mixture name encodes its sources: "<utt1>_<utt2>[_<utt3>].wav",
    # each utt id being "<spk>-<chapter>-<seq>"
    source_ids = os.path.split(audio_name)[1].split(".")[0].split("_")
    source_files = [glob.glob(librispeech_dir + "/".join(i.split("-")[:-1]) + f"/{i}*")[0] for i in source_ids]
    delays = [l["start"] for l in line['sentences']]

    mixed_audio = mix_audio(source_files, delays)
    soundfile.write(audio_name, mixed_audio, samplerate=16000)
    print(audio_name)
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_dev-both.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a00333b0e46f84e4389e63df5045f8a762487eb29bb89eb7de9f70941e1434c
|
| 3 |
+
size 2224222
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_dev.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c08740dd203b33312ea9793a8859302c2ff41f44de77ffb404c6d5859df2287
|
| 3 |
+
size 2227222
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_test-both.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88e089c638beff65237045f526348e50341026b0324e0230e1ee17568b3173c1
|
| 3 |
+
size 2094572
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbf70cb2bca21af6465f534a1910756cce13eb410815eccc90fc74b24567e4b6
|
| 3 |
+
size 2097572
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_test20.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9831f61da77ba9a17367845c6288c9a7b24858cf69dcbb53e4377cb23640820b
|
| 3 |
+
size 14417
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100-both.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f0eb2462fa951da717d18e172a5b781b6149d3483e56022bcea7459d163b76e
|
| 3 |
+
size 14778639
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train-100.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10ac8db4f8c3851c7e34f4643268355c2d34a7f8952aa316b0b70d2019733f70
|
| 3 |
+
size 14792539
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train-200.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94e70011854f573bf05586a9793f19510d6be7188be89c032459910ad1d3e11d
|
| 3 |
+
size 29571178
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train-both.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f72957a086f85d8fee3aea616c471dd7c7baf465d355db361f796f13c7e6478e
|
| 3 |
+
size 68694922
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9bd7eceb100a8f842e1b708066e5da7ac77e5008ef013266c567528ab1f8c4f
|
| 3 |
+
size 68759621
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train20.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:41d2502e124c4cdf60adb5a83f8942f2fb5d710634485b5a0382e45873570048
|
| 3 |
+
size 22316
|
data/Whisper-Sidecar-data-metadata/data/libri2mix_train_remove_enroll.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3d3c34060e44c767cd70ebb53aa18d53db4a3828fa92037addeed9c04e9f8a5
|
| 3 |
+
size 68268344
|
data/Whisper-Sidecar-data-metadata/data/libri3mix_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:daf3f6411bb717e86d77b4f17a455323c5d9df5fdb37fe6513a11859d50f41e9
|
| 3 |
+
size 2855611
|
data/Whisper-Sidecar-data-metadata/data/libri3mix_test20.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f66ad5735f3e3d1bd24bf960439142e181dbe4ab50a951a3807b4050040568c
|
| 3 |
+
size 19612
|
data/Whisper-Sidecar-data-metadata/data/libri3mix_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ba19b8a28a4430dbab099d5450ae075f6096c869ca318846f1ed219b471c997
|
| 3 |
+
size 65232138
|
data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6dfd13ad50aeb92257fc7c8c4d721fad1d2e3901be95eed0aa4d58211027d471
|
| 3 |
+
size 1945661
|
data/Whisper-Sidecar-data-metadata/data/librispeech2mix_test_30s.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1d07d099fe7d64cfec16efcc06b5dcac825984ca8668909152882adef2edcb9
|
| 3 |
+
size 1945138
|
data/Whisper-Sidecar-data-metadata/data/librispeech2mix_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f629fb7f89d02d937e5f4d1ced117c70904fc5d97d520706758a3e41010c1c2
|
| 3 |
+
size 287742080
|
data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4019227f5a2f827f2517e942c8c6f72b7a6d80fe8fe32db2909393254d7af771
|
| 3 |
+
size 2730857
|
data/Whisper-Sidecar-data-metadata/data/librispeech3mix_test_temp.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be1fafb707b168a23d3c8225e3f04407da63360b27b99d435d11c8a7560dee9c
|
| 3 |
+
size 2729249
|
data/Whisper-Sidecar-data-metadata/data/librispeech3mix_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47917569ed5a933e6ad66ccef5ccefded8f4c117a0296dafcbde9fc564e835e6
|
| 3 |
+
size 410988677
|
data/Whisper-Sidecar-data-metadata/data/librispeech_dev.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca54eb22dd7f421bd9c8b8e60a5df32946a7823d5547e49eaa030769c48de0da
|
| 3 |
+
size 2194617
|
data/Whisper-Sidecar-data-metadata/data/librispeech_test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8421fbe45b046b2a5644782b3e982538bd8483d72964b6f08fc208e5e7059648
|
| 3 |
+
size 2197082
|
data/Whisper-Sidecar-data-metadata/data/librispeech_train.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc923ead53a1248037e622bbeb75899a0c2637720600cc8c386530d5cdfa95b8
|
| 3 |
+
size 91088243
|
data/Whisper-Sidecar-data-metadata/data/long_wav_resample.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shrink long wavs (>30 s) to at most 30 s and rewrite the jsonl metadata:
#   - wavs longer than 30.1 s are time-stretched (sped up) to exactly 30 s,
#     with sentence timestamps scaled by the same factor;
#   - wavs in (30.0, 30.1] s are simply truncated to 30 s.
# Audio files are overwritten in place; the updated jsonl goes to temp_file.


import os

import jsonlines
import soundfile as sf
from pydub import AudioSegment
from pydub.playback import play


jsonl_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/librispeech3mix_test.jsonl"
temp_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/librispeech3mix_test_temp.jsonl"

with jsonlines.open(jsonl_file, 'r') as reader, jsonlines.open(temp_file, 'w') as writer:
    for obj in reader:
        wav_path = obj['audio']['path']
        duration = obj['duration']
        sentences = obj['sentences']
        if duration > 30.1:
            print(wav_path, duration)
            wav = AudioSegment.from_file(wav_path)
            target_len = 30.0 * 1000  # pydub lengths are in milliseconds
            speed_up_rate = len(wav) / target_len
            wav = wav.speedup(playback_speed=speed_up_rate)
            wav = wav[:target_len]  # guard against rounding past 30 s
            wav.export(wav_path, format="wav")
            print(speed_up_rate)
            obj['duration'] = 30
            # scale timestamps by the speed-up and clamp into [0, 30]
            for sentence in sentences:
                sentence['start'] = min(sentence['start'] / speed_up_rate, 30)
                sentence['end'] = min(sentence['end'] / speed_up_rate, 30)
            obj['sentences'] = sentences
        elif duration > 30.0:
            wav, sr = sf.read(wav_path)
            # fixed: truncate using the file's actual sample rate; the
            # original hardcoded 16000 samples/s, which cut the wrong
            # length for any non-16 kHz file (it is written back at sr).
            wav = wav[:int(sr * 30)]
            obj['duration'] = 30
            for sentence in sentences:
                sentence['start'] = min(sentence['start'], 30)
                sentence['end'] = min(sentence['end'], 30)
            obj['sentences'] = sentences
            sf.write(wav_path, wav, sr)

        writer.write(obj)
|
data/Whisper-Sidecar-data-metadata/data/select_prompt_wav.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import glob
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
import jsonlines
|
| 6 |
+
|
| 7 |
+
# backup_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/removed_mix_wav"
|
| 8 |
+
# librispeech_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriSpeech/train-clean-360"
|
| 9 |
+
# enroll_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/new_enroll"
|
| 10 |
+
|
| 11 |
+
# # 1. 获取所有的wav文件
|
| 12 |
+
# wav_files = glob.glob("/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriMix/data/Libri2Mix/wav16k/max/train-360/mix_clean/*.wav")
|
| 13 |
+
|
| 14 |
+
# # 2. 每个wav文件有两个说话人,记录所有wav文件出现的说话人
|
| 15 |
+
# all_speakers = set()
|
| 16 |
+
# for wav_file in wav_files:
|
| 17 |
+
# speakers = [f.split('-')[0] for f in os.path.basename(wav_file).split("_")]
|
| 18 |
+
# all_speakers.update(speakers)
|
| 19 |
+
# # random.shuffle(wav_files)
|
| 20 |
+
|
| 21 |
+
# len_all_speakers = len(all_speakers)
|
| 22 |
+
# # 3. 对每个说话人,复制且只复制一个具有它的语音文件
|
| 23 |
+
# count = 0
|
| 24 |
+
# for wav_file in wav_files:
|
| 25 |
+
# source_wavs = os.path.basename(wav_file).split("_")
|
| 26 |
+
# speakers = [f.split('-')[0] for f in source_wavs]
|
| 27 |
+
# # 如果有任意一个说话人不在all_speakers中,跳过这个文件
|
| 28 |
+
# if not all(s in all_speakers for s in speakers):
|
| 29 |
+
# continue
|
| 30 |
+
# else:
|
| 31 |
+
# # 从all_speakers中删除这两个说话人
|
| 32 |
+
# all_speakers.difference_update(speakers)
|
| 33 |
+
# # 复制这个文件
|
| 34 |
+
# os.system(f"cp {wav_file} {backup_dir}")
|
| 35 |
+
# # print(f"cp {wav_file} {backup_dir}")
|
| 36 |
+
# # 复制source_wavs中的每个说话人的语音文件
|
| 37 |
+
# for source_wav in source_wavs:
|
| 38 |
+
# count+=1
|
| 39 |
+
# source_wav_path = os.path.join(librispeech_dir, source_wav.split("-")[0], source_wav.split("-")[1], source_wav.split('.')[0] + ".flac")
|
| 40 |
+
# # 判断是否存在
|
| 41 |
+
# if not os.path.exists(source_wav_path):
|
| 42 |
+
# print(f"source_wav_path: {source_wav_path} not exists")
|
| 43 |
+
# continue
|
| 44 |
+
# os.system(f"cp {source_wav_path} {enroll_dir}")
|
| 45 |
+
|
| 46 |
+
# print(all_speakers)
|
| 47 |
+
# print(count, len_all_speakers)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# source = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/new_enroll"
|
| 52 |
+
# enroll_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriMix_enroll_audio/train-360"
|
| 53 |
+
|
| 54 |
+
# flac_files = glob.glob(f"{source}/*.flac")
|
| 55 |
+
|
| 56 |
+
# for flac_files in flac_files:
|
| 57 |
+
# # mkdir enroll_dir/spk_id
|
| 58 |
+
# spk_id = os.path.basename(flac_files).split("-")[0]
|
| 59 |
+
# spk_dir = os.path.join(enroll_dir, spk_id)
|
| 60 |
+
# if os.path.exists(spk_dir):
|
| 61 |
+
# # 删除
|
| 62 |
+
# os.system(f"rm -rf {spk_dir}")
|
| 63 |
+
# os.makedirs(spk_dir, exist_ok=True)
|
| 64 |
+
# # convert flac to wav, move to spk_dir
|
| 65 |
+
# wav_file = os.path.join(spk_dir, spk_id+ ".wav")
|
| 66 |
+
# os.system(f"ffmpeg -i {flac_files} {wav_file}")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# jsonl_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/libri2mix_train_remove_enroll.jsonl"
|
| 70 |
+
# enrolled_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/removed_mix_wav"
|
| 71 |
+
|
| 72 |
+
# # remove wav file in enrolled_dir from jsonl_file
|
| 73 |
+
# with jsonlines.open(jsonl_file) as reader:
|
| 74 |
+
# lines = list(reader)
|
| 75 |
+
# print(len(lines))
|
| 76 |
+
# for line in lines:
|
| 77 |
+
# mix_wav = line['audio']['path']
|
| 78 |
+
# if os.path.exists(os.path.join(enrolled_dir, os.path.basename(mix_wav))):
|
| 79 |
+
# lines.remove(line)
|
| 80 |
+
# print(len(lines))
|
| 81 |
+
# # write to new jsonl file
|
| 82 |
+
# new_jsonl_file = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/libri2mix_train_remove_enroll.jsonl"
|
| 83 |
+
# with jsonlines.open(new_jsonl_file, "w") as writer:
|
| 84 |
+
# for line in lines:
|
| 85 |
+
# writer.write(line)
|
| 86 |
+
# print("done")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
import shutil
import subprocess

librispeech_dir = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriSpeech/train-other-500"
enroll_path = "/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/LibriMix_enroll_audio/train-500"

# Check that every speaker in librispeech_dir has an enrollment audio in
# enroll_path; if missing, pick a random utterance of that speaker as the
# enrollment clip, and normalize its name to {speaker_id}.wav.
speaker_dirs = glob.glob(f"{librispeech_dir}/*")
new_file = []  # source .flac files newly selected as enrollment audio
for speaker_dir in speaker_dirs:
    # Skip metadata files (e.g. SPEAKERS.TXT) that live next to speaker dirs.
    if ".TXT" in speaker_dir:
        continue
    speaker_id = os.path.basename(speaker_dir)
    enroll_speaker_dir = os.path.join(enroll_path, speaker_id)
    if not os.path.exists(enroll_speaker_dir):
        print(f"{enroll_speaker_dir} not exists")
        os.makedirs(enroll_speaker_dir, exist_ok=True)

    if len(glob.glob(f"{enroll_speaker_dir}/*.wav")) == 0:
        # No enrollment wav yet: copy one random utterance of this speaker.
        flac_files = glob.glob(f"{speaker_dir}/*/*.flac")
        if not flac_files:
            # Robustness: a speaker dir with no .flac would crash random.choice.
            print(f"no flac files found under {speaker_dir}, skipping")
            continue
        flac_file = random.choice(flac_files)
        new_flac_file = os.path.join(enroll_speaker_dir, os.path.basename(flac_file))
        print(new_flac_file)
        # shutil.copy instead of os.system("cp ...") — no shell, safe for
        # paths containing spaces or shell metacharacters.
        shutil.copy(flac_file, new_flac_file)
        # Record the chosen source file.
        new_file.append(flac_file)
    else:
        print(glob.glob(f"{enroll_speaker_dir}/*.wav"))

    # Ensure the enrollment audio is named {speaker_id}.wav.
    enroll_wav_files = glob.glob(f"{enroll_speaker_dir}/*")
    for enroll_wav_file in enroll_wav_files:
        if os.path.basename(enroll_wav_file).split(".")[0] != speaker_id:
            print(f"{enroll_wav_file} not match")
            # Convert to wav format, keeping only the speaker_id in the name.
            wav_file = os.path.join(enroll_speaker_dir, speaker_id + ".wav")
            # List-form subprocess.run (shell=False) instead of os.system —
            # avoids shell interpolation of the file paths.
            subprocess.run(["ffmpeg", "-i", enroll_wav_file, wav_file])
            # Remove the mis-named original file.
            os.remove(enroll_wav_file)

# Record the newly selected source flac files.
with open("/home/v-lingmeng/codebase/Whisper-Finetune-ovlp/dataset/temp/new_flac_files.txt", "w") as f:
    for flac_file in new_file:
        f.write(flac_file + "\n")
|
data/Whisper-Sidecar-data-metadata/data/test_examples.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcf048b062b475972c1e6dad258244680a88a9acfc8841dd8ea284ad56efc3dc
|
| 3 |
+
size 794
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53fb93ebd24db3e32a7dd7fd5908222f2f0a498a7d0ede6835140c2d4b1f3552
|
| 3 |
+
size 8892480
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec64b8b840655de399e374931c056dd4b22d4070c8b97cb197863182599af765
|
| 3 |
+
size 889784
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_1350_targetLingual.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:971baf65ee1603b22483ef12a1040d884419077069b68dad7931a305ffe2f7aa
|
| 3 |
+
size 632636
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12249076735100be44176352c65ab89a30751ae7ac5621583130674529774d57
|
| 3 |
+
size 6361360
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual_1350.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:968fd7a78bdca0783e5a2baf7db53f621ed274ba1295f5bd4d2553794a4718da
|
| 3 |
+
size 636201
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21cb5e6b2243c0ce5a2cf6e431dd1c9c5dc5efb79fb3c2a9d548dbbc335c212f
|
| 3 |
+
size 93936028
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8aad7b8718ed09f7e98742caa6c85e496f68e332f73e65a557ecdac06cec8f2a
|
| 3 |
+
size 65029852
|
data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2e84678170247fd7ef2a06622a32d9d42abf3e1b83e8c4426e4a9d03e44ce72
|
| 3 |
+
size 10851471
|