File size: 6,334 Bytes

c52df1b

import glob
import random

# Derive task-specific manifests from the overlapped-speech TSV manifests.
# Only the "target talker lingual" task is active; the other task variants are
# kept below as commented-out reference for how the sibling manifests
# (`*_gender.tsv`, `*_order.tsv`, `*_keyword.tsv`, `*_targetASR.tsv`) are built.
tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")

# Header row written at the top of every derived manifest.
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])


def make_target_lingual_row(line, target_lingual):
    """Build one "transcribe the speaker of language X" row from a manifest row.

    Args:
        line: One tab-separated manifest row laid out like ``head``; a trailing
            line terminator is allowed.
        target_lingual: Language name inserted into the prompt. ``"English"``
            selects references tagged ``en``; any other value selects ``de``.

    Returns:
        The row re-joined with tabs, with column 3 (prompt) replaced and
        column 4 (tgt_text) reduced to the `` <sc> ``-separated references
        whose ``|``-separated language tag in column 7 matches the selection.

    Raises:
        IndexError: If the row has fewer than 8 columns.
    """
    # Strip only the line terminator: a bare strip() would also eat leading or
    # trailing tabs and silently drop empty first/last columns.
    fields = line.rstrip("\r\n").split("\t")
    fields[3] = f"Please transcribe the person speaking {target_lingual} from the overlapping speech audio."

    lang_code = "en" if target_lingual == "English" else "de"
    references = fields[4].split(" <sc> ")
    languages = fields[7].split("|")
    # References and language tags are paired positionally, one per speaker.
    fields[4] = " <sc> ".join(
        text for text, lang in zip(references, languages) if lang == lang_code
    )
    return "\t".join(fields)


for tsv in tsv_files:
    # Skip manifests that are themselves derived outputs (including the
    # `_targetLingual` files written below, which match "target") and the
    # librispeech_ manifests.
    if "gender" in tsv or "order" in tsv or "keyword" in tsv or "librispeech_" in tsv or "target" in tsv:
        continue

    # Read everything up front so the input is closed before any output is
    # written; the first line is the header and is discarded.
    with open(tsv, "r", encoding="utf-8") as src:
        src.readline()
        # Blank lines carry no row and would otherwise fail on column access.
        lines = [line for line in src.readlines() if line.strip()]

    # # gender task
    # gender_lines = []
    # for line in lines:
    #     genders = line.split("\t")[-1].strip().split("|")
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     if "M" not in genders or "F" not in genders:
    #         continue
    #     else:
    #         target_gen = "male" if random.random() > 0.5 else "female"
    #     prompt = f"Please transcribe the contents spoken by {target_gen} speakers in overlapping speech."

    #     new_targets = [target for gender, target in zip(genders, targets) if (gender == "F" and target_gen == "female") or (gender == "M" and target_gen == "male")]
    #     new_targets = " <sc> ".join(new_targets)

    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     gender_lines.append(new_line)
    # output_file_gender = tsv.replace(".tsv", "_gender.tsv")
    # with open(output_file_gender, "w") as f:
    #     f.write(head + "\n" + "\n".join(gender_lines))

    # # order task
    # order_lines = []
    # for line in lines:
    #     if "2mix" in tsv:
    #         num_spk = 2
    #     elif "3mix" in tsv:
    #         num_spk = 3
    #     else:
    #         continue
    #     target_idx = random.randint(0, num_spk-1)
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     new_targets = targets[target_idx]

    #     _ = ['first', "second", "third"][target_idx]
    #     prompt = f"There are multiple speakers in the audio. Please transcribe the speech of the {_} speaker into text."

    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     order_lines.append(new_line)
    # output_file_order = tsv.replace(".tsv", "_order.tsv")
    # with open(output_file_order, "w") as f:
    #     f.write(head + "\n" + "\n".join(order_lines))

    # # keyword task
    # keyword_lines = []
    # for line in lines:
    #     # Fetch the targets and split each one into a set of words,
    #     # dropping words shorter than 6 characters.
    #     targets = line.split("\t")[4].split(" <sc> ")
    #     targets_list = [set(word for word in t.split(" ") if len(word) >= 6) for t in targets]
    #     # Union of the word sets of all targets.
    #     all_word_set = set.union(*targets_list)
    #     if len(all_word_set) == 0:
    #         continue
    #     # Find the words that are unique to each target.
    #     unique_targets_list = []
    #     for target_set in targets_list:
    #         other_targets_union = set.union(*[s for s in targets_list if s != target_set])
    #         unique_words = target_set - other_targets_union
    #         unique_targets_list.append(unique_words)

    #     all_unique_word_set = set.union(*unique_targets_list)
    #     if len(all_unique_word_set) == 0:
    #         continue

    #     sampled_word = random.choice(list(all_unique_word_set))
    #     # Find the index of the set this word belongs to.
    #     for i, unique_words in enumerate(unique_targets_list):
    #         if sampled_word in unique_words:
    #             set_index = i
    #             break

    #     new_targets = targets[set_index]
    #     prompt = f'Please transcribe the speech of the speaker who said the word "{sampled_word}" in the overlapping speech audio.'

    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line[4] = new_targets
    #     new_line = "\t".join(new_line)
    #     keyword_lines.append(new_line)
    # output_file_keyword = tsv.replace(".tsv", "_keyword.tsv")
    # with open(output_file_keyword, "w") as f:
    #     f.write(head + "\n" + "\n".join(keyword_lines))

    # # target talker ASR
    # this part is processed by the model dataloader
    # target_lines = []
    # for line in lines:
    #     prompt = "The audio file starts with a 3-second reference speech by the target speaker, followed by overlapping speech. Please transcribe the target speaker's part from the overlapping section."
    #     new_line = line.strip().split("\t")
    #     new_line[3] = prompt
    #     new_line = "\t".join(new_line)
    #     target_lines.append(new_line)
    # output_file_targetASR = tsv.replace(".tsv", "_targetASR.tsv")
    # with open(output_file_targetASR, "w") as f:
    #     f.write(head + "\n" + "\n".join(target_lines))

    # target talker lingual: only the German/English mixtures qualify.
    if "de-en-" not in tsv:
        continue
    # One independent random language draw per row, in file order.
    targetLingual_lines = [
        make_target_lingual_row(line, random.choice(["English", "German"]))
        for line in lines
    ]
    # Swap only the trailing ".tsv" (guaranteed by the glob pattern) so that a
    # ".tsv" occurring elsewhere in the path is never rewritten.
    output_file_targetLingual = tsv[: -len(".tsv")] + "_targetLingual.tsv"
    with open(output_file_targetLingual, "w", encoding="utf-8") as out:
        out.write(head + "\n" + "\n".join(targetLingual_lines))