"""Generate ``*_targetLingual.tsv`` manifests from ``de-en-`` TSV manifests.

For every source manifest whose path contains ``de-en-`` (and that is not
itself a derived or excluded file), each data row gets a prompt asking for
either the English or the German speaker, picked at random, and its
``tgt_text`` column is reduced to the segments whose language tag matches.

The earlier passes of this script (gender / order / keyword / target-talker
ASR) are disabled and kept at the bottom of the file as reference comments.
"""
import glob
import random

tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])

# Column positions of the fields rewritten below; they follow the order of `head`.
PROMPT_COL = 3
TGT_TEXT_COL = 4
LANGUAGE_COL = 7

# Separator between the per-speaker segments inside the tgt_text column.
# NOTE(review): with a single space, tgt_text is split into individual words,
# so zip() below pairs *words* with language tags. Confirm this is the real
# separator used when the manifests were written.
TGT_TEXT_SEPARATOR = " "

# Path substrings marking manifests that must not be processed: outputs of this
# script's own passes ("gender", "order", "keyword", "target") and the
# "librispeech_" manifests.
SKIP_MARKERS = ("gender", "order", "keyword", "librispeech_", "target")


def should_process(tsv_path):
    """Return True if *tsv_path* is a source manifest for the lingual task.

    A path qualifies when it contains none of ``SKIP_MARKERS`` and does
    contain ``"de-en-"``. The whole path string is matched, not only the
    file name.
    """
    if any(marker in tsv_path for marker in SKIP_MARKERS):
        return False
    return "de-en-" in tsv_path


def build_target_lingual_row(line, target_language):
    """Rewrite one manifest row for the target-language transcription task.

    Args:
        line: A raw, tab-separated data row (a trailing newline is allowed).
        target_language: ``"English"`` or ``"German"``; any value other than
            ``"English"`` selects the ``"de"`` language tag.

    Returns:
        The row as a tab-joined string (no trailing newline) whose prompt
        column names *target_language* and whose tgt_text column keeps only
        the segments tagged with the matching language code.

    Raises:
        IndexError: If the row has fewer than eight tab-separated fields.
    """
    fields = line.strip().split("\t")
    fields[PROMPT_COL] = f"Please transcribe the person speaking {target_language} from the overlapping speech audio."
    lang_code = "en" if target_language == "English" else "de"
    segments = fields[TGT_TEXT_COL].split(TGT_TEXT_SEPARATOR)
    lang_tags = fields[LANGUAGE_COL].split("|")
    # zip() truncates to the shorter sequence, so unmatched extras are dropped.
    kept = [segment for segment, tag in zip(segments, lang_tags) if tag == lang_code]
    fields[TGT_TEXT_COL] = " ".join(kept)
    return "\t".join(fields)


for tsv in tsv_files:
    # Filter on the path first so manifests we will not use are never read.
    if not should_process(tsv):
        continue

    # Explicit UTF-8: the transcripts contain German text, so decoding must not
    # depend on the platform's default encoding.
    with open(tsv, "r", encoding="utf-8") as f:
        f.readline()  # discard the header row; `head` is written instead
        lines = f.readlines()

    # target talker lingual
    targetLingual_lines = []
    for line in lines:
        if not line.strip():
            # A blank row has no columns to rewrite; skip it instead of
            # failing with IndexError.
            continue
        target_lingual = random.choice(["English", "German"])
        targetLingual_lines.append(build_target_lingual_row(line, target_lingual))

    output_file_targetLingual = tsv.replace(".tsv", "_targetLingual.tsv")
    with open(output_file_targetLingual, "w", encoding="utf-8") as f:
        f.write(head + "\n" + "\n".join(targetLingual_lines))


# ---------------------------------------------------------------------------
# Disabled passes, kept for reference.
#
# They ran inside the per-file loop above on `lines`, after the SKIP_MARKERS
# filter but before the "de-en-" restriction, each with its own
# `<task>_lines = []` accumulator created at the top of the loop body.
# ---------------------------------------------------------------------------
#
# # gender task
# for line in lines:
#     genders = line.split("\t")[-1].strip().split("|")
#     targets = line.split("\t")[4].split(" ")
#     if "M" not in genders or "F" not in genders:
#         continue
#     else:
#         target_gen = "male" if random.random() > 0.5 else "female"
#         prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech."
#         new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")]
#         new_targets = " ".join(new_targets)
#         new_line = line.strip().split("\t")
#         new_line[3] = prompt
#         new_line[4] = new_targets
#         new_line = "\t".join(new_line)
#         gender_lines.append(new_line)
# output_file_gender = tsv.replace(".tsv", "_gender.tsv")
# with open(output_file_gender, "w") as f:
#     f.write(head + "\n" + "\n".join(gender_lines))
#
# # order task
# for line in lines:
#     if "2mix" in tsv:
#         num_spk = 2
#     elif "3mix" in tsv:
#         num_spk = 3
#     else:
#         continue
#     target_idx = random.randint(0, num_spk-1)
#     targets = line.split("\t")[4].split(" ")
#     new_targets = targets[target_idx]
#     _ = ['first', "second", "third"][target_idx]
#     prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text."
#     new_line = line.strip().split("\t")
#     new_line[3] = prompt
#     new_line[4] = new_targets
#     new_line = "\t".join(new_line)
#     order_lines.append(new_line)
# output_file_order = tsv.replace(".tsv", "_order.tsv")
# with open(output_file_order, "w") as f:
#     f.write(head + "\n" + "\n".join(order_lines))
#
# # keyword task
# for line in lines:
#     # Get the targets and turn each one into a set of words.
#     targets = line.split("\t")[4].split(" ")
#     # Drop short words: only words of length >= 6 are kept.
#     targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets]
#     # Union of the word sets of all targets.
#     all_word_set = set.union(*targets_list)
#     if len(all_word_set) == 0:
#         continue
#     # Find the words that are unique to each target.
#     unique_targets_list = []
#     for target_set in targets_list:
#         other_targets_union = set.union(*[s for s in targets_list if s != target_set])
#         unique_words = target_set - other_targets_union
#         unique_targets_list.append(unique_words)
#     all_unique_word_set = set.union(*unique_targets_list)
#     if len(all_unique_word_set) == 0:
#         continue
#     sampled_word = random.choice(list(all_unique_word_set))
#     # Find the index of the set this word belongs to.
#     for i, unique_words in enumerate(unique_targets_list):
#         if sampled_word in unique_words:
#             set_index = i
#             break
#     new_targets = targets[set_index]
#     prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.'
#     new_line = line.strip().split("\t")
#     new_line[3] = prompt
#     new_line[4] = new_targets
#     new_line = "\t".join(new_line)
#     keyword_lines.append(new_line)
# output_file_keyword = tsv.replace(".tsv", "_keyword.tsv")
# with open(output_file_keyword, "w") as f:
#     f.write(head + "\n" + "\n".join(keyword_lines))
#
# # target talker ASR
# # this part is processed by the model dataloader
# for line in lines:
#     prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section."
#     new_line = line.strip().split("\t")
#     new_line[3] = prompt
#     new_line = "\t".join(new_line)
#     target_lines.append(new_line)
# output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv")
# with open(output_file_targetASR, "w") as f:
#     f.write(head + "\n" + "\n".join(target_lines))