"""Convert a multi-speaker LibriSpeech JSONL manifest into a WavLLM-style TSV.

For every JSONL record the script resolves the audio path, measures the
audio length in frames, determines the speaker list (taken from the record
when present, otherwise parsed out of the wav filename), maps each speaker
to a gender via LibriSpeech's SPEAKERS.TXT, and writes one tab-separated
row per utterance under a fixed header.
"""
import json

import soundfile as sf
from tqdm import tqdm

# Hard-coded I/O locations for this one-off conversion script.
INPUT_JSONL = "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data/librispeech3mix_test.jsonl"
OUTPUT_TSV = (
    "/home/v-lingmeng/datasets/Whisper-Sidecar-data-metadata/data_for_wavllm/"
    + INPUT_JSONL.split("/")[-1].replace(".jsonl", "1.tsv")
)
SPEAKERS_TXT = "/home/v-lingmeng/datasets/LibriSpeech/SPEAKERS.TXT"

# TSV column header expected by the downstream WavLLM data loader.
HEADER = "\t".join(
    ["id", "audio", "n_frames", "prompt", "tgt_text", "codec",
     "with_speech", "language", "speakers", "genders"]
)

# Single transcription prompt used for every row (adjacent literals
# concatenate to exactly the original string).
PROMPT = (
    'Transcribe the given audio into text. If multiple speakers are speaking, '
    'transcribe the utterances of multiple speakers in the order of their '
    'start times, separated by "<sc>".'
)


def _load_speaker_genders(path):
    """Parse SPEAKERS.TXT into a {speaker_id: gender} dict.

    Lines starting with ';' are comments. Lines without a '|' separator
    (e.g. blank lines) are skipped instead of raising IndexError.
    """
    genders = {}
    with open(path, "r") as f:
        for line in f:
            if line.startswith(";") or "|" not in line:
                continue
            fields = line.split("|")
            genders[fields[0].strip()] = fields[1].strip()
    return genders


def _convert(input_jsonl, output_tsv, speakers_txt):
    """Read *input_jsonl* and write the converted TSV manifest to *output_tsv*."""
    speaker_gender = _load_speaker_genders(speakers_txt)

    with open(input_jsonl, "r") as f:
        records = [json.loads(line) for line in f]

    rows = [HEADER]
    for rec in tqdm(records):
        # Remap the dataset-relative path onto the blob-storage mount.
        audio = rec["audio"]["path"].replace(
            "./dataset", "/valleblob/v-lingmeng/speech/data"
        )
        wav_id = audio.split("/")[-1]
        # sf.info reads only the file header -- much faster than decoding
        # the whole waveform with sf.read just to count frames.
        n_frames = str(sf.info(audio).frames)
        if "speakers" in rec:
            speakers = list(rec["speakers"])
        else:
            # Mixture wav ids join per-speaker utterance ids with '_';
            # the speaker id is the part before the first '-' of each
            # segment -- assumes LibriSpeech "spk-chap-utt" naming.
            speakers = [seg.split("-")[0] for seg in wav_id.split("_")]
        genders = [speaker_gender[spk] for spk in speakers]
        rows.append("\t".join([
            wav_id, audio, n_frames, PROMPT, rec["sentence"], "None",
            "True", "en", "|".join(speakers), "|".join(genders),
        ]))

    with open(output_tsv, "w") as f:
        f.write("\n".join(rows))


if __name__ == "__main__":
    print(OUTPUT_TSV)
    _convert(INPUT_JSONL, OUTPUT_TSV, SPEAKERS_TXT)