# File size: 6,334 Bytes
import glob
import random
# Build task-specific variants of the overlapping-speech TSV manifests.
#
# Only the "target talker lingual" task is active; the other task variants are
# kept below as commented-out reference code, in their original position.

tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")
# Header row written at the top of every generated manifest.
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])


def build_target_lingual_row(line, target_lingual=None):
    """Rewrite one manifest row for the target-language transcription task.

    The prompt column (index 3) is replaced with an instruction to transcribe
    the speaker of ``target_lingual``, and the ``tgt_text`` column (index 4) is
    reduced to the `` <sc> ``-separated transcripts whose entry in the
    ``language`` column (index 7, ``|``-separated) matches that language.

    Args:
        line: One tab-separated data row, with or without its line terminator.
        target_lingual: ``"English"`` or ``"German"``. When ``None`` (the
            default) one of the two is drawn with ``random.choice``.

    Returns:
        The rewritten row as a tab-joined string without a line terminator.

    Raises:
        IndexError: If the row has fewer than eight columns.
    """
    if target_lingual is None:
        target_lingual = random.choice(["English", "German"])

    # Strip only the line terminator: a bare ``strip()`` would also remove
    # trailing tabs and silently drop empty trailing columns.
    fields = line.rstrip("\r\n").split("\t")
    fields[3] = f"Please transcribe the person speaking {target_lingual} from the overlapping speech audio."

    lang_code = "en" if target_lingual == "English" else "de"
    tgt_texts = fields[4].split(" <sc> ")
    langs = fields[7].split("|")
    fields[4] = " <sc> ".join(
        text for text, lang in zip(tgt_texts, langs) if lang == lang_code
    )
    return "\t".join(fields)


for tsv in tsv_files:
    # Skip manifests whose path carries one of these markers. The task outputs
    # (*_gender, *_order, *_keyword, *_targetASR, *_targetLingual) all match,
    # so reruns do not reprocess generated files.
    # NOTE(review): the reason for excluding "librispeech_" is not visible
    # here -- confirm.
    if any(tag in tsv for tag in ("gender", "order", "keyword", "librispeech_", "target")):
        continue

    # NOTE(review): UTF-8 is assumed for the manifests (they contain German
    # text); confirm against how they were produced.
    with open(tsv, "r", encoding="utf-8") as f:
        f.readline()  # discard the header row
        lines = f.readlines()

    # # gender task
    # gender_lines = []
    # for line in lines:
    #     genders = line.split("\t")[-1].strip().split("|")
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     if "M" not in genders or "F" not in genders:
    #         continue
    #     else:
    #         target_gen = "male" if random.random() > 0.5 else "female"
    #         prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech."
    #         new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")]
    #         new_targets = " <sc> ".join(new_targets)
    #         new_line = line.strip().split("\t")
    #         new_line[3] = prompt
    #         new_line[4] = new_targets
    #         new_line = "\t".join(new_line)
    #         gender_lines.append(new_line)
    # output_file_gender = tsv.replace(".tsv", "_gender.tsv")
    # with open(output_file_gender, "w") as f:
    #     f.write(head + "\n" + "\n".join(gender_lines))

    # # order task
    # order_lines = []
    # for line in lines:
    #     if "2mix" in tsv:
    #         num_spk = 2
    #     elif "3mix" in tsv:
    #         num_spk = 3
    #     else:
    #         continue
    #     target_idx = random.randint(0, num_spk-1)
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     new_targets = targets[target_idx]
    #     _ = ['first', "second", "third"][target_idx]
    #     prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text."
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     order_lines.append(new_line)
    # output_file_order = tsv.replace(".tsv", "_order.tsv")
    # with open(output_file_order, "w") as f:
    #     f.write(head + "\n" + "\n".join(order_lines))

    # # keyword task
    # keyword_lines = []
    # for line in lines:
    #     # Get the targets and split each one into a set of words
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     # Drop words shorter than 6 characters
    #     targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets]
    #     # Union of the word sets of all targets
    #     all_word_set = set.union(*targets_list)
    #     if len(all_word_set) == 0:
    #         continue
    #     # Find the words that are unique to each target
    #     unique_targets_list = []
    #     for target_set in targets_list:
    #         other_targets_union = set.union(*[s for s in targets_list if s != target_set])
    #         unique_words = target_set - other_targets_union
    #         unique_targets_list.append(unique_words)
    #     all_unique_word_set = set.union(*unique_targets_list)
    #     if len(all_unique_word_set) == 0:
    #         continue
    #     sampled_word = random.choice(list(all_unique_word_set))
    #     # Find the index of the set this word belongs to
    #     for i, unique_words in enumerate(unique_targets_list):
    #         if sampled_word in unique_words:
    #             set_index = i
    #             break
    #     new_targets = targets[set_index]
    #     prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.'
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     keyword_lines.append(new_line)
    # output_file_keyword = tsv.replace(".tsv", "_keyword.tsv")
    # with open(output_file_keyword, "w") as f:
    #     f.write(head + "\n" + "\n".join(keyword_lines))

    # # target talker ASR
    # this part is processed by the model dataloader
    # target_lines = []
    # for line in lines:
    #     prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section."
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line = "\t".join(new_line)
    #     target_lines.append(new_line)
    # output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv")
    # with open(output_file_targetASR, "w") as f:
    #     f.write(head + "\n" + "\n".join(target_lines))

    # target talker lingual
    # Only manifests with "de-en-" in their path are used for this task.
    if "de-en-" not in tsv:
        continue

    # Blank lines are skipped; they carry no columns and would otherwise raise
    # IndexError when the prompt column is assigned.
    targetLingual_lines = [
        build_target_lingual_row(line) for line in lines if line.strip()
    ]

    # Swap only the trailing ".tsv"; str.replace would also rewrite any
    # ".tsv" occurring earlier in the path.
    output_file_targetLingual = tsv[: -len(".tsv")] + "_targetLingual.tsv"
    with open(output_file_targetLingual, "w", encoding="utf-8") as f:
        f.write(head + "\n" + "\n".join(targetLingual_lines))