Spaces:
Sleeping
Sleeping
"""Generate per-config statistics and content reports for a BIGOS-style
Hugging Face speech dataset, and render per-subset distribution plots.

Outputs (per dataset):
  reports/<dataset>/dataset_statistics.json  -- scalar stats per config/split
  reports/<dataset>/dataset_contents.json    -- unique utts/words/chars etc.
  plots/<config>/...                         -- violin plots of audio duration
"""
import os
import json
import argparse

from datasets import load_dataset, get_dataset_config_names, Features, Value

from utils import (
    num_of_samples_per_split,
    uniq_utts_per_split,
    words_per_split,
    uniq_words_per_split,
    chars_per_split,
    uniq_chars_per_split,
    total_audio_duration_per_split,
    average_audio_duration_per_split,
    average_utterance_length_chars_per_split,
    average_utterance_length_words_per_split,
    speakers_per_split,
    meta_cov_per_split,
    meta_distribution_text,
    meta_distribution_violin_plot,
    recordings_per_speaker,
    speech_rate_words_per_split,
    speech_rate_chars_per_split,
)

# TODO: move to a shared constants module
output_dir_plots = "./plots"
output_dir_reports = "./reports"
os.makedirs(output_dir_plots, exist_ok=True)
# BUGFIX: the original called makedirs on output_dir_plots twice and never
# created output_dir_reports explicitly.
os.makedirs(output_dir_reports, exist_ok=True)

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
# NOTE(review): double negative kept for CLI backward compatibility —
# `store_false` means args.no_secret_test_split defaults to True, and
# PASSING --no_secret_test_split flips it to False, which (via the
# `if not ...` checks below) ENABLES reading the secret test split.
parser.add_argument('--no_secret_test_split', action='store_false', help="Should references for test split be retrieved from the secret distribution?")
args = parser.parse_args()

dataset_name = args.dataset
print("Generating reports for dataset: {}".format(dataset_name))

if not (args.no_secret_test_split):
    # Test-split references live in a companion "<dataset>-secret" repo.
    dataset_name_secret = str.join("-", [dataset_name, "secret"])
    print(dataset_name_secret)
    try:
        # Probe the secret repo up front; failure is deliberately non-fatal
        # (best-effort check — the per-config load below will surface errors).
        dataset_configs_secret = get_dataset_config_names(dataset_name_secret)
    except Exception:  # narrowed from bare `except:` to keep Ctrl-C working
        print("Config for secret dataset {} cannot be retrieved!".format(dataset_name_secret))

output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
os.makedirs(output_dir_reports_dataset, exist_ok=True)

# Config names (subsets) of the public dataset drive the main loop.
dataset_configs = get_dataset_config_names(dataset_name)

# Output accumulators, serialized to JSON after the loop.
dataset_statistics = {}
output_fn_stats = os.path.join(output_dir_reports_dataset, "dataset_statistics.json")
dataset_contents = {}
output_fn_contents = os.path.join(output_dir_reports_dataset, "dataset_contents.json")

# Specify features to load explicitly so the heavy audio arrays are skipped.
features_to_load = Features({'audioname': Value(dtype='string', id=None), 'split': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'speaker_id': Value(dtype='string', id=None), 'ref_orig': Value(dtype='string', id=None), 'audio_duration_samples': Value(dtype='int32', id=None), 'audio_duration_seconds': Value(dtype='float32', id=None), 'samplingrate_orig': Value(dtype='int32', id=None), 'sampling_rate': Value(dtype='int32', id=None), 'audiopath_bigos': Value(dtype='string', id=None), 'audiopath_local': Value(dtype='string', id=None), 'speaker_age': Value(dtype='string', id=None), 'speaker_gender': Value(dtype='string', id=None)})

for config_name in dataset_configs:
    print("Generating stats for {}".format(config_name))
    dataset_statistics[config_name] = {}
    dataset_contents[config_name] = {}

    dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
    if not (args.no_secret_test_split):
        # Secret repo provides the hidden test-split references.
        dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
    else:
        dataset_hf_subset_secret = None

    # Audio content size.
    dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["audio[h]"] = total_audio_duration_per_split(dataset_hf_subset)
    dataset_statistics[config_name]["speakers"] = speakers_per_split(dataset_hf_subset)

    # Text content size — transcription-based metrics need the secret repo
    # for the test split (passed as the second argument, may be None).
    dataset_statistics[config_name]["words"] = words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars"] = chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)

    # Text content derived features.
    dataset_statistics[config_name]["utts_unique"], dataset_contents[config_name]["unique_utts"] = uniq_utts_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)

    # Audio content derived features (speech rate, durations).
    dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
    dataset_statistics[config_name]["average_audio_duration[s]"] = average_audio_duration_per_split(dataset_hf_subset)

    # Metadata coverage per subset (percent of samples with the field set).
    dataset_statistics[config_name]["meta_cov_gender"] = meta_cov_per_split(dataset_hf_subset, 'speaker_gender')
    dataset_statistics[config_name]["meta_cov_age"] = meta_cov_per_split(dataset_hf_subset, 'speaker_age')

    # Metadata value distributions (textual summaries).
    dataset_statistics[config_name]["meta_dist_gender"] = meta_distribution_text(dataset_hf_subset, 'speaker_gender')
    dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
    # TODO: uniq_utts_per_speaker; words per speaker (min/max/med/avg/std)

    # Distribution plots of audio duration, grouped by gender and by age.
    output_dir_plots_subset = os.path.join(output_dir_plots, config_name)
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_gender')
    meta_distribution_violin_plot(dataset_hf_subset, output_dir_plots_subset, 'audio_duration_seconds', 'speaker_age')

# Persist accumulated statistics and content analysis as JSON.
with open(output_fn_stats, 'w') as f:
    json.dump(dataset_statistics, f)
with open(output_fn_contents, 'w') as f:
    json.dump(dataset_contents, f)