# speechllm_multispk/data/Whisper-Sidecar-data-metadata/generate_multitask_data.py
# Lingwei Meng
# add data
# c52df1b
import glob
import random
# Generate task-specific TSV variants of the multi-speaker metadata files.
#
# Only the "target talker lingual" task is currently active: for every
# "de-en-" source file, each row gets a prompt asking for either the English
# or the German speaker, and its target text is reduced to the segments whose
# language tag matches. The result is written next to the source file as
# "<name>_targetLingual.tsv". The other task recipes are kept below as
# disabled (commented-out) code.
tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])

# Column positions inside a row; they follow the field order used in `head`.
PROMPT_COL = 3
TGT_TEXT_COL = 4
LANGUAGE_COL = 7

# Paths containing any of these substrings are not processed. "gender",
# "order", "keyword" and "target" match the suffixes of the files this script
# (or its disabled tasks) writes, so a rerun does not re-process its own
# outputs. NOTE(review): the reason for excluding "librispeech_" is not visible
# in this file -- confirm before changing.
SKIP_MARKERS = ("gender", "order", "keyword", "librispeech_", "target")


def build_target_lingual_row(line, target_lingual):
    """Return the "target talker lingual" version of one metadata row.

    Args:
        line: One tab-separated data row, with or without its line ending.
        target_lingual: "English" or "German". It is inserted verbatim into
            the prompt; "English" selects segments tagged "en", any other
            value selects segments tagged "de".

    Returns:
        The row as a tab-joined string (no line ending) with the prompt
        column replaced and the target-text column reduced to the
        " <sc> "-separated segments whose "|"-separated language tag matches.

    Raises:
        IndexError: If the row has fewer columns than the ones accessed.
    """
    # Strip only the line ending. A full strip() would also remove trailing
    # tabs and silently drop empty trailing columns (e.g. an empty "genders").
    fields = line.rstrip("\r\n").split("\t")
    fields[PROMPT_COL] = f"Please transcribe the person speaking {target_lingual} from the overlapping speech audio."
    lang_code = "en" if target_lingual == "English" else "de"
    tgt_texts = fields[TGT_TEXT_COL].split(" <sc> ")
    langs = fields[LANGUAGE_COL].split("|")
    # zip() pairs segments with tags positionally and stops at the shorter
    # list; a row without a matching tag ends up with an empty target text.
    kept = [text for text, lang in zip(tgt_texts, langs) if lang == lang_code]
    fields[TGT_TEXT_COL] = " <sc> ".join(kept)
    return "\t".join(fields)


for tsv in tsv_files:
    if any(marker in tsv for marker in SKIP_MARKERS):
        continue

    # Explicit UTF-8: the transcripts include German text, so the locale's
    # default encoding must not decide how they are decoded.
    with open(tsv, "r", encoding="utf-8") as f:
        _ = f.readline()  # discard the header row
        lines = f.readlines()

    # # gender task
    # gender_lines = []
    # for line in lines:
    #     genders = line.split("\t")[-1].strip().split("|")
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     if "M" not in genders or "F" not in genders:
    #         continue
    #     else:
    #         target_gen = "male" if random.random() > 0.5 else "female"
    #     prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech."
    #     new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")]
    #     new_targets = " <sc> ".join(new_targets)
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     gender_lines.append(new_line)
    # output_file_gender = tsv.replace(".tsv", "_gender.tsv")
    # with open(output_file_gender, "w") as f:
    #     f.write(head + "\n" + "\n".join(gender_lines))

    # # order task
    # order_lines = []
    # for line in lines:
    #     if "2mix" in tsv:
    #         num_spk = 2
    #     elif "3mix" in tsv:
    #         num_spk = 3
    #     else:
    #         continue
    #     target_idx = random.randint(0, num_spk-1)
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     new_targets = targets[target_idx]
    #     _ = ['first', "second", "third"][target_idx]
    #     prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text."
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     order_lines.append(new_line)
    # output_file_order = tsv.replace(".tsv", "_order.tsv")
    # with open(output_file_order, "w") as f:
    #     f.write(head + "\n" + "\n".join(order_lines))

    # # keyword task
    # keyword_lines = []
    # for line in lines:
    #     # Get the targets and turn each one into a set of words,
    #     # dropping words shorter than 6 characters.
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets]
    #     # Union of the word sets of all targets.
    #     all_word_set = set.union(*targets_list)
    #     if len(all_word_set) == 0:
    #         continue
    #     # Find the words that are unique to each target.
    #     unique_targets_list = []
    #     for target_set in targets_list:
    #         other_targets_union = set.union(*[s for s in targets_list if s != target_set])
    #         unique_words = target_set - other_targets_union
    #         unique_targets_list.append(unique_words)
    #     all_unique_word_set = set.union(*unique_targets_list)
    #     if len(all_unique_word_set) == 0:
    #         continue
    #     sampled_word = random.choice(list(all_unique_word_set))
    #     # Find the index of the set this word belongs to.
    #     for i, unique_words in enumerate(unique_targets_list):
    #         if sampled_word in unique_words:
    #             set_index = i
    #             break
    #     new_targets = targets[set_index]
    #     prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.'
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     keyword_lines.append(new_line)
    # output_file_keyword = tsv.replace(".tsv", "_keyword.tsv")
    # with open(output_file_keyword, "w") as f:
    #     f.write(head + "\n" + "\n".join(keyword_lines))

    # # target talker ASR
    # this part is processed by the model dataloader
    # target_lines = []
    # for line in lines:
    #     prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section."
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line = "\t".join(new_line)
    #     target_lines.append(new_line)
    # output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv")
    # with open(output_file_targetASR, "w") as f:
    #     f.write(head + "\n" + "\n".join(target_lines))

    # target talker lingual
    if "de-en-" not in tsv:
        continue
    targetLingual_lines = []
    for line in lines:
        # A blank line has no columns to rewrite; skip it instead of failing
        # with an IndexError on the prompt column.
        if not line.strip():
            continue
        target_lingual = random.choice(["English", "German"])
        targetLingual_lines.append(build_target_lingual_row(line, target_lingual))
    output_file_targetLingual = tsv.replace(".tsv", "_targetLingual.tsv")
    with open(output_file_targetLingual, "w", encoding="utf-8") as f:
        # Rows are newline-joined; no newline is written after the last row.
        f.write(head + "\n" + "\n".join(targetLingual_lines))