"""Derive target-language transcription manifests from overlapped-speech TSVs.

For every manifest matched by ``tsv_files`` whose file name contains
``de-en-`` and none of the skip markers, each data row gets:

* a randomly chosen target language (English or German),
* a prompt asking to transcribe the speaker of that language, and
* a ``tgt_text`` reduced to the " <sc> "-separated segments whose entry in the
  "|"-separated ``language`` column matches the chosen language.

The rewritten rows are written next to the input as
``<name>_targetLingual.tsv`` under the fixed header ``head``.
"""
import glob
import os
import random

tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")

# Column layout shared by the input and output manifests.
_COLUMNS = ["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"]
head = "\t".join(_COLUMNS)

_PROMPT_COL = _COLUMNS.index("prompt")
_TGT_TEXT_COL = _COLUMNS.index("tgt_text")
_LANGUAGE_COL = _COLUMNS.index("language")

_SEGMENT_SEP = " <sc> "  # separates per-speaker transcripts inside tgt_text
_LANGUAGE_SEP = "|"      # separates per-speaker tags inside the language column

# Prompt wording -> tag used in the language column.
_TARGET_LANGUAGES = {"English": "en", "German": "de"}

# File-name substrings that exclude a manifest. "gender", "order", "keyword"
# and "target" match the suffixes of derived task manifests (including this
# script's own "_targetLingual" output), so re-runs do not process them again.
# NOTE(review): the reason for excluding "librispeech_" is not visible here.
_SKIP_MARKERS = ("gender", "order", "keyword", "librispeech_", "target")

_OUTPUT_SUFFIX = "_targetLingual.tsv"


def should_process(tsv_path: str) -> bool:
    """Return True if *tsv_path* is a de-en source manifest to convert.

    Only the file name is inspected, so directory names that happen to
    contain a marker do not affect the decision.
    """
    name = os.path.basename(tsv_path)
    if "de-en-" not in name:
        return False
    return not any(marker in name for marker in _SKIP_MARKERS)


def target_lingual_output_path(tsv_path: str) -> str:
    """Return the output path: the input path with its extension swapped.

    Only the trailing extension is replaced, so the result can never equal
    the input path.
    """
    return os.path.splitext(tsv_path)[0] + _OUTPUT_SUFFIX


def build_target_lingual_row(line: str, language: str) -> str:
    """Rewrite one manifest row for the target-language task.

    Args:
        line: A tab-separated data row, with or without its line terminator.
        language: Key of ``_TARGET_LANGUAGES`` ("English" or "German").

    Returns:
        The row (no line terminator) with the prompt replaced and ``tgt_text``
        limited to the segments tagged with the requested language. The
        target is empty when no segment carries that tag.

    Raises:
        KeyError: If *language* is not a supported target language.
        IndexError: If the row has fewer columns than the language column.
    """
    lang_code = _TARGET_LANGUAGES[language]
    # Strip only the line terminator: a plain strip() would also drop trailing
    # empty columns and change the column count of the row.
    fields = line.rstrip("\r\n").split("\t")
    segments = fields[_TGT_TEXT_COL].split(_SEGMENT_SEP)
    segment_langs = fields[_LANGUAGE_COL].split(_LANGUAGE_SEP)
    fields[_PROMPT_COL] = f"Please transcribe the person speaking {language} from the overlapping speech audio."
    fields[_TGT_TEXT_COL] = _SEGMENT_SEP.join(
        text for text, lang in zip(segments, segment_langs) if lang == lang_code
    )
    return "\t".join(fields)


def build_target_lingual_rows(lines):
    """Rewrite every non-blank row, picking a random target language per row."""
    languages = list(_TARGET_LANGUAGES)
    return [
        build_target_lingual_row(line, random.choice(languages))
        for line in lines
        if line.strip()  # blank lines carry no columns to rewrite
    ]


for tsv in tsv_files:
    # Decide before touching the file so skipped manifests are never read.
    if not should_process(tsv):
        continue

    with open(tsv, "r", encoding="utf-8") as f:
        f.readline()  # discard the input header; `head` is written instead
        lines = f.readlines()

    targetLingual_lines = build_target_lingual_rows(lines)

    output_file_targetLingual = target_lingual_output_path(tsv)
    with open(output_file_targetLingual, "w", encoding="utf-8") as f:
        # Same layout as before: header, then rows, no trailing newline.
        f.write(head + "\n" + "\n".join(targetLingual_lines))


# ---------------------------------------------------------------------------
# Disabled generators for the other task manifests, kept for reference only.
# Each section ran inside `for tsv in tsv_files:` right after `lines` was read
# (before the "de-en-" filter) and needs its own `<task>_lines = []`
# accumulator reset per file.
# NOTE(review): the block structure below was re-indented during cleanup;
# verify it before re-enabling any section.
# ---------------------------------------------------------------------------
# # gender task
# for line in lines:
#     genders = line.split("\t")[-1].strip().split("|")
#     targets = line.split("\t")[4].split(" <sc> ")
#     if "M" not in genders or "F" not in genders:
#         continue
#     else:
#         target_gen = "male" if random.random() > 0.5 else "female"
#         prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech."
#         new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")]
#         new_targets = " <sc> ".join(new_targets)
#         new_line = line.strip().split("\t")
#         new_line[3] = prompt
#         new_line[4] = new_targets
#         new_line = "\t".join(new_line)
#         gender_lines.append(new_line)
# output_file_gender = tsv.replace(".tsv", "_gender.tsv")
# with open(output_file_gender, "w") as f:
#     f.write(head + "\n" + "\n".join(gender_lines))
#
# # order task
# for line in lines:
#     if "2mix" in tsv:
#         num_spk = 2
#     elif "3mix" in tsv:
#         num_spk = 3
#     else:
#         continue
#     target_idx = random.randint(0, num_spk-1)
#     targets = line.split("\t")[4].split(" <sc> ")
#     new_targets = targets[target_idx]
#     _ = ['first', "second", "third"][target_idx]
#     prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text."
#     new_line = line.strip().split("\t")
#     new_line[3] = prompt
#     new_line[4] = new_targets
#     new_line = "\t".join(new_line)
#     order_lines.append(new_line)
# output_file_order = tsv.replace(".tsv", "_order.tsv")
# with open(output_file_order, "w") as f:
#     f.write(head + "\n" + "\n".join(order_lines))
#
# # keyword task
# for line in lines:
#     # Get the targets and split each one into a set of words,
#     targets = line.split("\t")[4].split(" <sc> ")
#     # dropping words shorter than 6 characters.
#     targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets]
#     # Union of the word sets of all targets.
#     all_word_set = set.union(*targets_list)
#     if len(all_word_set) == 0:
#         continue
#     # Find the words that are unique to each target.
#     unique_targets_list = []
#     for target_set in targets_list:
#         other_targets_union = set.union(*[s for s in targets_list if s != target_set])
#         unique_words = target_set - other_targets_union
#         unique_targets_list.append(unique_words)
#     all_unique_word_set = set.union(*unique_targets_list)
#     if len(all_unique_word_set) == 0:
#         continue
#     sampled_word = random.choice(list(all_unique_word_set))
#     # Find the index of the set this word belongs to.
#     for i, unique_words in enumerate(unique_targets_list):
#         if sampled_word in unique_words:
#             set_index = i
#             break
#     new_targets = targets[set_index]
#     prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.'
#     new_line = line.strip().split("\t")
#     new_line[3] = prompt
#     new_line[4] = new_targets
#     new_line = "\t".join(new_line)
#     keyword_lines.append(new_line)
# output_file_keyword = tsv.replace(".tsv", "_keyword.tsv")
# with open(output_file_keyword, "w") as f:
#     f.write(head + "\n" + "\n".join(keyword_lines))
#
# # target talker ASR
# this part is processed by the model dataloader
# for line in lines:
#     prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section."
#     new_line = line.strip().split("\t")
#     new_line[3] = prompt
#     new_line = "\t".join(new_line)
#     target_lines.append(new_line)
# output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv")
# with open(output_file_targetASR, "w") as f:
#     f.write(head + "\n" + "\n".join(target_lines))