Lingwei Meng

add data

c52df1b about 1 year ago

6.33 kB

	import glob
	import random

	# Derive task-specific ASR manifests from the overlapped-speech TSV manifests.
#
# Only the "target talker lingual" task is currently enabled: for every manifest
# whose path contains "de-en-", each row gets a prompt asking for the English or
# the German speaker (picked at random per row) and its tgt_text column is
# reduced to the utterances whose language matches. The result is written next
# to the input as "<name>_targetLingual.tsv". The other task generators are kept
# below as commented-out code.

tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")

# Header row written at the top of every generated manifest.
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])

# Column positions inside a manifest row (they follow the order of `head`).
PROMPT_COL = 3
TGT_TEXT_COL = 4
LANGUAGE_COL = 7

# Separator between the per-speaker transcripts inside the tgt_text column.
UTTERANCE_SEP = " <sc> "

# Language name used in the prompt -> language code compared against the
# "language" column.
LANGUAGE_CODES = {"English": "en", "German": "de"}
PROMPT_LANGUAGES = list(LANGUAGE_CODES)

# Manifests whose path contains one of these substrings are not used as input.
# The generated "_gender", "_order", "_keyword", "_targetASR" and
# "_targetLingual" files all match, so a re-run does not process its own output.
SKIP_TAGS = ("gender", "order", "keyword", "librispeech_", "target")


def split_multi_value(field):
    """Split a per-speaker column such as ``en|de`` into a list of values.

    The script used to split on the two-character string backslash + pipe,
    which never matches a plain ``|`` and therefore left the column unsplit.
    Both spellings of the separator are accepted here, so rows that worked
    before still produce the same result.

    Args:
        field: Raw column text, e.g. ``"en|de"``.

    Returns:
        The individual values, in their original order.
    """
    return field.replace("\\|", "|").split("|")


def build_target_lingual_line(line, target_language):
    """Rewrite one manifest row for the target-language transcription task.

    The prompt column is replaced by an instruction naming ``target_language``
    and the tgt_text column keeps only the utterances whose entry in the
    language column equals the matching code (joined again with " <sc> ").
    If no utterance matches, tgt_text becomes an empty string.

    Args:
        line: One tab-separated data row of a manifest (a trailing newline is
            allowed). It must have at least eight columns.
        target_language: A key of ``LANGUAGE_CODES`` ("English" or "German").

    Returns:
        The rewritten row as a tab-separated string without trailing newline.

    Raises:
        IndexError: If the row has fewer columns than expected.
        KeyError: If ``target_language`` is not a key of ``LANGUAGE_CODES``.
    """
    columns = line.strip().split("\t")
    columns[PROMPT_COL] = f"Please transcribe the person speaking {target_language} from the overlapping speech audio."

    wanted_code = LANGUAGE_CODES[target_language]
    utterances = columns[TGT_TEXT_COL].split(UTTERANCE_SEP)
    languages = split_multi_value(columns[LANGUAGE_COL])
    # Utterances and language codes are paired by position.
    kept = [text for text, lang in zip(utterances, languages) if lang == wanted_code]
    columns[TGT_TEXT_COL] = UTTERANCE_SEP.join(kept)

    return "\t".join(columns)


for tsv in tsv_files:
    if any(tag in tsv for tag in SKIP_TAGS):
        continue

    # Transcripts contain non-ASCII (German) text, so do not rely on the
    # platform default encoding.
    with open(tsv, "r", encoding="utf-8") as f:
        f.readline()  # discard the header row
        lines = f.readlines()

    # # gender task
    # gender_lines = []
    # for line in lines:
    #     genders = split_multi_value(line.split("\t")[-1].strip())
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     if "M" not in genders or "F" not in genders:
    #         continue
    #     else:
    #         target_gen = "male" if random.random() > 0.5 else "female"
    #         prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech."
    #
    #         new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")]
    #         new_targets = " <sc> ".join(new_targets)
    #
    #         new_line = line.strip().split("\t")
    #         new_line[3] = prompt
    #         new_line[4] = new_targets
    #         new_line = "\t".join(new_line)
    #         gender_lines.append(new_line)
    # output_file_gender = tsv.replace(".tsv", "_gender.tsv")
    # with open(output_file_gender, "w") as f:
    #     f.write(head + "\n" + "\n".join(gender_lines))

    # # order task
    # order_lines = []
    # for line in lines:
    #     if "2mix" in tsv:
    #         num_spk = 2
    #     elif "3mix" in tsv:
    #         num_spk = 3
    #     else:
    #         continue
    #     target_idx = random.randint(0, num_spk-1)
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     new_targets = targets[target_idx]
    #
    #     _ = ['first', "second", "third"][target_idx]
    #     prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text."
    #
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     order_lines.append(new_line)
    # output_file_order = tsv.replace(".tsv", "_order.tsv")
    # with open(output_file_order, "w") as f:
    #     f.write(head + "\n" + "\n".join(order_lines))

    # # keyword task
    # keyword_lines = []
    # for line in lines:
    #     # Get the targets and split each of them into a set of words,
    #     # dropping words shorter than 6 characters.
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets]
    #     # Union of the word sets of all targets.
    #     all_word_set = set.union(*targets_list)
    #     if len(all_word_set) == 0:
    #         continue
    #     # Find the words that are unique to each target.
    #     unique_targets_list = []
    #     for target_set in targets_list:
    #         other_targets_union = set.union(*[s for s in targets_list if s != target_set])
    #         unique_words = target_set - other_targets_union
    #         unique_targets_list.append(unique_words)
    #
    #     all_unique_word_set = set.union(*unique_targets_list)
    #     if len(all_unique_word_set) == 0:
    #         continue
    #
    #     sampled_word = random.choice(list(all_unique_word_set))
    #     # Find the index of the set this word belongs to.
    #     for i, unique_words in enumerate(unique_targets_list):
    #         if sampled_word in unique_words:
    #             set_index = i
    #             break
    #
    #     new_targets = targets[set_index]
    #     prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.'
    #
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     keyword_lines.append(new_line)
    # output_file_keyword = tsv.replace(".tsv", "_keyword.tsv")
    # with open(output_file_keyword, "w") as f:
    #     f.write(head + "\n" + "\n".join(keyword_lines))

    # # target talker ASR
    # this part is processed by the model dataloader
    # target_lines = []
    # for line in lines:
    #     prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section."
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line = "\t".join(new_line)
    #     target_lines.append(new_line)
    # output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv")
    # with open(output_file_targetASR, "w") as f:
    #     f.write(head + "\n" + "\n".join(target_lines))

    # target talker lingual
    if "de-en-" not in tsv:
        continue
    target_lingual_lines = [
        build_target_lingual_line(line, random.choice(PROMPT_LANGUAGES))
        for line in lines
        if line.strip()  # blank rows have no columns to rewrite
    ]
    output_file_target_lingual = tsv.replace(".tsv", "_targetLingual.tsv")
    with open(output_file_target_lingual, "w", encoding="utf-8") as f:
        # No trailing newline after the last row, as before.
        f.write(head + "\n" + "\n".join(target_lingual_lines))