Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
·
25f0e74
1
Parent(s):
5d90238
Added average audio utterance calculation
Browse files- app.py +3 -3
- reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json +1 -1
- reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json +2 -2
- reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json +1 -1
- reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json +2 -2
- run-analysis.py +12 -5
- utils.py +26 -2
app.py
CHANGED
|
@@ -64,7 +64,7 @@ with analysis_bigos:
|
|
| 64 |
st.dataframe(df_sum_stats_text)
|
| 65 |
|
| 66 |
|
| 67 |
-
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec"]
|
| 68 |
|
| 69 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
| 70 |
|
|
@@ -131,12 +131,12 @@ with analysis_bigos_pelcra:
|
|
| 131 |
st.dataframe(df_sum_stats_text)
|
| 132 |
|
| 133 |
|
| 134 |
-
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec"]
|
| 135 |
|
| 136 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
| 137 |
|
| 138 |
st.subheader("Dataset features (text)")
|
| 139 |
-
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:
|
| 140 |
st.dataframe(df_sum_stats_feats_text)
|
| 141 |
|
| 142 |
st.subheader("Dataset features (audio)")
|
|
|
|
| 64 |
st.dataframe(df_sum_stats_text)
|
| 65 |
|
| 66 |
|
| 67 |
+
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
|
| 68 |
|
| 69 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
| 70 |
|
|
|
|
| 131 |
st.dataframe(df_sum_stats_text)
|
| 132 |
|
| 133 |
|
| 134 |
+
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
|
| 135 |
|
| 136 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
| 137 |
|
| 138 |
st.subheader("Dataset features (text)")
|
| 139 |
+
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features[0:3]]
|
| 140 |
st.dataframe(df_sum_stats_feats_text)
|
| 141 |
|
| 142 |
st.subheader("Dataset features (audio)")
|
reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 46668863
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43e808b081d9b692c2469396565fb967105fd815894a7eaded34e89969dbc890
|
| 3 |
size 46668863
|
reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0acb30a9a215f9c96b567b8753f565f400eac2366df6dba6248ccba859e190e3
|
| 3 |
+
size 23940
|
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 95274266
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cea38447dc7485c0f628eba6e52f45e24d1d467fbe23c065162d6b36455ab1d
|
| 3 |
size 95274266
|
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ab97523e5f4776bb566ed57c38126004bfac43f64bb3177e9ae39f1ee6e51d5
|
| 3 |
+
size 30399
|
run-analysis.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
import json
|
| 3 |
from datasets import load_dataset, get_dataset_config_names, Features, Value
|
| 4 |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
|
| 5 |
-
from utils import
|
| 6 |
#, uniq_utts_per_speaker
|
| 7 |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
|
| 8 |
import argparse
|
|
@@ -58,19 +58,26 @@ for config_name in dataset_configs:
|
|
| 58 |
if(args.secret_test_split):
|
| 59 |
dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
|
| 60 |
|
|
|
|
| 61 |
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
|
| 62 |
-
dataset_statistics[config_name]["audio[h]"] =
|
| 63 |
dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
|
| 64 |
|
|
|
|
| 65 |
# metrics based on transcriptions (references) - requires reading secret repo for test split
|
| 66 |
-
dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 67 |
dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 68 |
-
dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 69 |
dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
|
|
|
|
| 71 |
dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 72 |
dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 73 |
-
|
|
|
|
| 74 |
# metadata coverage per subset in percent - speaker accent
|
| 75 |
dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
|
| 76 |
dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
|
|
|
|
| 2 |
import json
|
| 3 |
from datasets import load_dataset, get_dataset_config_names, Features, Value
|
| 4 |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
|
| 5 |
+
from utils import total_audio_duration_per_split, average_audio_duration_per_split, speakers_per_split, meta_cov_per_split
|
| 6 |
#, uniq_utts_per_speaker
|
| 7 |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
|
| 8 |
import argparse
|
|
|
|
| 58 |
if(args.secret_test_split):
|
| 59 |
dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
|
| 60 |
|
| 61 |
+
#audio content size
|
| 62 |
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
|
| 63 |
+
dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
|
| 64 |
dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)
|
| 65 |
|
| 66 |
+
# text content size
|
| 67 |
# metrics based on transcriptions (references) - requires reading secret repo for test split
|
|
|
|
| 68 |
dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
|
| 69 |
dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 70 |
+
|
| 71 |
+
# text content derived features
|
| 72 |
+
dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 73 |
+
dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 74 |
dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 75 |
+
|
| 76 |
+
# audio content derived features
|
| 77 |
dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 78 |
dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
| 79 |
+
dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)
|
| 80 |
+
|
| 81 |
# metadata coverage per subset in percent - speaker accent
|
| 82 |
dataset_statistics[config_name]["meta_cov_sex"] = meta_cov_per_split(dataset_hf_subset, 'speaker_sex')
|
| 83 |
dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')
|
utils.py
CHANGED
|
@@ -32,7 +32,7 @@ def num_of_samples_per_split(dataset_hf):
|
|
| 32 |
|
| 33 |
return out_dict
|
| 34 |
|
| 35 |
-
def
|
| 36 |
# input - huggingface dataset object
|
| 37 |
# output - dictionary with statistics about audio duration per split
|
| 38 |
out_dict = {}
|
|
@@ -52,6 +52,31 @@ def audio_duration_per_split(dataset_hf):
|
|
| 52 |
out_dict["all_splits"] = sum(out_dict.values())
|
| 53 |
return out_dict
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def speakers_per_split(dataset_hf):
|
| 56 |
# input - huggingface dataset object
|
| 57 |
# output - dictionary with statistics about audio duration per split
|
|
@@ -350,7 +375,6 @@ def meta_distribution_text(dataset_hf, meta_field):
|
|
| 350 |
return out_dict
|
| 351 |
|
| 352 |
|
| 353 |
-
|
| 354 |
def recordings_per_speaker(dataset_hf):
|
| 355 |
recordings_per_speaker_stats_dict = {}
|
| 356 |
|
|
|
|
| 32 |
|
| 33 |
return out_dict
|
| 34 |
|
| 35 |
+
def total_audio_duration_per_split(dataset_hf):
|
| 36 |
# input - huggingface dataset object
|
| 37 |
# output - dictionary with statistics about audio duration per split
|
| 38 |
out_dict = {}
|
|
|
|
| 52 |
out_dict["all_splits"] = sum(out_dict.values())
|
| 53 |
return out_dict
|
| 54 |
|
| 55 |
+
|
| 56 |
+
def average_audio_duration_per_split(dataset_hf):
    """Return the mean utterance duration, in seconds, for every split.

    Args:
        dataset_hf: Mapping of split name to a split object that can be
            indexed with ``"audio_duration_seconds"`` to obtain one numeric
            duration per sample.

    Returns:
        dict: Split name -> mean duration rounded to 2 decimals, plus an
        ``"all_splits"`` entry with the sample-weighted mean over every
        split. A split without samples (and a dataset without any samples)
        reports ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    metric = "average_audio_duration[s]"
    print("Calculating {}".format(metric))

    out_dict = {}
    samples_all = 0
    audio_length_total_seconds = 0
    for split in dataset_hf.keys():
        # Fetch the column once and reuse it for both the sum and the count.
        durations = dataset_hf[split]["audio_duration_seconds"]
        audio_length_split_seconds = sum(durations)
        samples_split = len(durations)

        audio_length_total_seconds += audio_length_split_seconds
        samples_all += samples_split

        # Guard against empty splits: there is no meaningful average, so
        # report 0.0 rather than dividing by zero.
        if samples_split:
            out_dict[split] = round(audio_length_split_seconds / samples_split, 2)
        else:
            out_dict[split] = 0.0

    # Overall average is weighted by sample count, i.e. total seconds divided
    # by total samples, not the mean of the per-split means.
    if samples_all:
        out_dict["all_splits"] = round(audio_length_total_seconds / samples_all, 2)
    else:
        out_dict["all_splits"] = 0.0
    return out_dict
|
| 79 |
+
|
| 80 |
def speakers_per_split(dataset_hf):
|
| 81 |
# input - huggingface dataset object
|
| 82 |
# output - dictionary with statistics about audio duration per split
|
|
|
|
| 375 |
return out_dict
|
| 376 |
|
| 377 |
|
|
|
|
| 378 |
def recordings_per_speaker(dataset_hf):
|
| 379 |
recordings_per_speaker_stats_dict = {}
|
| 380 |
|