# speechllm_multispk/data/Whisper-Sidecar-data-metadata/convert_to_wavllm_data_format.py
# Author: Lingwei Meng
# Commit: c52df1b ("add data")
import soundfile as sf
import json
from tqdm import tqdm
# --- Configuration -------------------------------------------------------
# Input: one JSON object per line, each describing a (possibly multi-speaker)
# utterance. Output: a WavLLM-style TSV with one row per utterance.
input_jsonl = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl"
output_tsv = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data_for_wavllm/" + input_jsonl.split("/")[-1].replace(".jsonl", "1.tsv")
print(output_tsv)

# TSV column header expected by the WavLLM data loader.
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])

# Fixed instruction prompt; multi-speaker transcripts are joined with "<sc>".
prompts = ['Transcribe the given audio into text. If multiple speakers are speaking, transcribe the utterances of multiple speakers in the order of their start times, separated by "<sc>".']

# Build a speaker-id -> gender map from the LibriSpeech SPEAKERS.TXT manifest.
# Lines starting with ";" are comments; fields are "|"-separated:
# <speaker_id> | <gender> | ...
with open("/home/v-lingmeng/datasets/LibriSpeech/SPEAKERS.TXT", "r") as f:
    speaker_gender = {
        l.split("|")[0].strip(): l.split("|")[1].strip()
        for l in f
        if not l.startswith(";")
    }

with open(input_jsonl, "r") as f:
    lines = f.readlines()

new_lines = [head]  # header row first (avoids the O(n) insert(0, ...) later)
for line in tqdm(lines):
    line = json.loads(line.strip())
    # Remap the dataset-relative audio path to its blob-storage mount point.
    audio = line['audio']['path'].replace("./dataset", "/valleblob/v-lingmeng/speech/data")
    wav_id = audio.split("/")[-1]
    # sf.info() reads only the file header -- avoids decoding the whole
    # waveform (the original used sf.read) just to obtain the frame count.
    n_frames = str(sf.info(audio).frames)
    prompt = prompts[0]
    tgt_text = line["sentence"]
    codec = "None"
    with_speech = "True"
    language = "en"
    if "speakers" in line:
        speakers = "|".join(line["speakers"])
    else:
        # Fall back to parsing speaker ids out of the mixture file name,
        # e.g. "1089-134686-0000_121-127105-0031.wav" -> "1089|121".
        speakers = "|".join([_id.split("-")[0] for _id in wav_id.split("_")])
    genders = "|".join([speaker_gender[spk] for spk in speakers.split("|")])
    new_lines.append("\t".join([wav_id, audio, n_frames, prompt, tgt_text, codec, with_speech, language, speakers, genders]))

with open(output_tsv, "w") as f:
    f.write("\n".join(new_lines))
# print(speaker_gender)