"""Convert a multi-speaker LibriSpeech JSONL manifest into a WavLLM-style TSV.

For every JSONL record the script resolves the audio path, measures the
audio length in frames, determines the speaker list (taken from the record
when present, otherwise parsed out of the wav filename), maps each speaker
to a gender via LibriSpeech's SPEAKERS.TXT, and writes one tab-separated
row per utterance under a fixed header.
"""
import json

import soundfile as sf
from tqdm import tqdm

# Hard-coded I/O locations for this one-off conversion script.
INPUT_JSONL = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl"
OUTPUT_TSV = (
    "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data_for_wavllm/"
    + INPUT_JSONL.split("/")[-1].replace(".jsonl", "1.tsv")
)
SPEAKERS_TXT = "/home/v-lingmeng/datasets/LibriSpeech/SPEAKERS.TXT"

# TSV column header expected by the downstream WavLLM data loader.
HEADER = "\t".join(
    ["id", "audio", "n_frames", "prompt", "tgt_text", "codec",
     "with_speech", "language", "speakers", "genders"]
)

# Single transcription prompt used for every row (adjacent literals
# concatenate to exactly the original string).
PROMPT = (
    'Transcribe the given audio into text. If multiple speakers are speaking, '
    'transcribe the utterances of multiple speakers in the order of their '
    'start times, separated by "<sc>".'
)


def _load_speaker_genders(path):
    """Parse SPEAKERS.TXT into a {speaker_id: gender} dict.

    Lines starting with ';' are comments. Lines without a '|' separator
    (e.g. blank lines) are skipped instead of raising IndexError.
    """
    genders = {}
    with open(path, "r") as f:
        for line in f:
            if line.startswith(";") or "|" not in line:
                continue
            fields = line.split("|")
            genders[fields[0].strip()] = fields[1].strip()
    return genders


def _convert(input_jsonl, output_tsv, speakers_txt):
    """Read *input_jsonl* and write the converted TSV manifest to *output_tsv*."""
    speaker_gender = _load_speaker_genders(speakers_txt)

    with open(input_jsonl, "r") as f:
        records = [json.loads(line) for line in f]

    rows = [HEADER]
    for rec in tqdm(records):
        # Remap the dataset-relative path onto the blob-storage mount.
        audio = rec["audio"]["path"].replace(
            "./dataset", "/valleblob/v-lingmeng/speech/data"
        )
        wav_id = audio.split("/")[-1]
        # sf.info reads only the file header -- much faster than decoding
        # the whole waveform with sf.read just to count frames.
        n_frames = str(sf.info(audio).frames)
        if "speakers" in rec:
            speakers = list(rec["speakers"])
        else:
            # Mixture wav ids join per-speaker utterance ids with '_';
            # the speaker id is the part before the first '-' of each
            # segment -- assumes LibriSpeech "spk-chap-utt" naming.
            speakers = [seg.split("-")[0] for seg in wav_id.split("_")]
        genders = [speaker_gender[spk] for spk in speakers]
        rows.append("\t".join([
            wav_id, audio, n_frames, PROMPT, rec["sentence"], "None",
            "True", "en", "|".join(speakers), "|".join(genders),
        ]))

    with open(output_tsv, "w") as f:
        f.write("\n".join(rows))


if __name__ == "__main__":
    print(OUTPUT_TSV)
    _convert(INPUT_JSONL, OUTPUT_TSV, SPEAKERS_TXT)