Lingwei Meng
add data
c52df1b
import glob
tsv_files = glob.glob("/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/*.tsv")
head = "\t".join(["id", "audio", "n_frames", "prompt", "tgt_text", "codec", "with_speech", "language", "speakers", "genders"])
tsv_files = [
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_train.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_train_targetLingual.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_train_targetLingual.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_train_fixed.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_train_fixed.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_train_fixed.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_train_fixed.tsv",
"/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_train_fixed.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de_test.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-2mix_test_targetLingual.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/de-en-3mix_test_targetLingual.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech_test_fixed.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech2mix_test_fixed.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/librispeech3mix_test_fixed.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri2mix_test_fixed.tsv",
# "/valleblob/v-lingmeng/speech/data/Whisper-Sidecar-data-metadata/data_for_wavllm/libri3mix_test_fixed.tsv",
]
all_count = 0.0
for tsv in tsv_files:
if "test" in tsv:
continue
with open(tsv, "r") as f:
lines = f.readline()
lines = f.readlines()
print(tsv)
print(len(lines))
count = 0.0
max_sec = 0
new_lines = []
for line in lines:
new_line = line.strip().split("\t")
# if float(new_line[2])/16000 > 40:
# continue
new_lines.append("\t".join(new_line))
# max_sec = max_sec if max_sec > float(new_line[2])/16000 else float(new_line[2])/16000
count += float(new_line[2])/16000
# with open(tsv, "w") as f:
# f.write(head + "\n" + "\n".join(new_lines))
print(count/60/60)
print(max_sec)
print()
all_count += count/60/60
print(all_count)