Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import os | |
| import json | |
| from utils import read_reports, dict_to_multindex_df | |
| #add_test_split_stats_from_secret_dataset, dict_to_multindex_df_all_splits | |
| from utils import extract_stats_to_agg, extract_stats_all_splits, extract_stats_for_dataset_card | |
| from constants import BIGOS_INFO, PELCRA_INFO, ABOUT_INFO | |
| from datasets import get_dataset_config_names | |
| # PL ASR BIGOS analysis | |
| # PL ASR Diagnostic analysis | |
| # PELCRA analysis | |
| # TODO - compare the datasets | |
# Use the full browser width; the tabs below render wide metric tables.
# Keep this ahead of every other Streamlit call in the script.
st.set_page_config(layout="wide")
# Metric names used to select columns from the statistics dataframes.

# Dataset size metrics, split by modality.
# NOTE(review): "samples" is listed in both the audio and the text group, so
# metrics_size contains it twice - confirm the extract_* helpers tolerate that.
metrics_size_audio = [
    "samples",
    "audio[h]",
    "speakers",
]
metrics_size_text = [
    "samples",
    "words",
    "chars",
]
metrics_size = [*metrics_size_audio, *metrics_size_text]

# Feature metrics, grouped the same way they are displayed in the tabs.
metrics_features_text_uniq = [
    "utts_unique",
    "words_unique",
    "chars_unique",
]
metrics_features_speech_rate = [
    "words_per_sec",
    "chars_per_sec",
]
metrics_features_duration = [
    "average_audio_duration[s]",
    "average_utterance_length[words]",
    "average_utterance_length[chars]",
]
metrics_features_meta = [
    "meta_cov_gender",
    "meta_cov_age",
]
metrics_features = [
    *metrics_features_text_uniq,
    *metrics_features_speech_rate,
    *metrics_features_duration,
    *metrics_features_meta,
]
# One tab per view. The handles are unpacked in the same order as the labels;
# analysis_bigos_diagnostic is created here even though its content is
# currently disabled further down in the file.
(
    about,
    analysis_bigos,
    analysis_bigos_diagnostic,
    analysis_bigos_pelcra,
) = st.tabs(
    [
        "About BIGOS datasets",
        "BIGOS V2 analysis",
        "BIGOS V2 diagnostic",
        "PELCRA for BIGOS analysis",
    ]
)

# ---------------------------------- About ----------------------------------
with about:
    st.title("About BIGOS project")
    # ABOUT_INFO is rendered with raw HTML enabled (unsafe_allow_html=True).
    st.markdown(ABOUT_INFO, unsafe_allow_html=True)
    # TODO - load and display about BIGOS benchmark
def _render_dataset_analysis(dataset_name, dataset_configs):
    """Render dataset-level metric tables and per-subset cards for one dataset.

    Shared by the BIGOS and PELCRA tabs, which previously duplicated this code
    line for line. Call it inside the Streamlit container (e.g. a ``with tab:``
    block) that should receive the output.

    Args:
        dataset_name: Dataset identifier passed to ``read_reports``.
        dataset_configs: Iterable of subset (config) names; one dataset card
            is rendered per entry, in iteration order.
    """
    # read_reports returns a (stats, contents) pair; only the stats are used.
    # NOTE: merging secret test-split statistics into the public stats
    # (add_test_split_stats_from_secret_dataset) is currently disabled.
    stats_dict_public, _contents_dict_public = read_reports(dataset_name)

    # Two views of the same stats dict, differing only in the all_splits flag.
    df_multindex_for_agg = dict_to_multindex_df(stats_dict_public, all_splits=False)
    df_multindex_all_splits = dict_to_multindex_df(stats_dict_public, all_splits=True)

    st.header("Dataset level metrics")
    df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)

    # Size metrics, shown as separate audio and text tables.
    st.subheader("Audio content size")
    st.dataframe(df_sum_stats_agg[metrics_size_audio])
    st.subheader("Text content size")
    st.dataframe(df_sum_stats_agg[metrics_size_text])

    df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)

    # Feature metrics: one sub-table per metric group, in display order.
    feature_sections = (
        ("Utterances, vocabulary and alphabet space", metrics_features_text_uniq),
        ("Speech rates", metrics_features_speech_rate),
        ("Average utterance lengths and audio duration", metrics_features_duration),
        ("Metadata coverage", metrics_features_meta),
    )
    for section_title, section_metrics in feature_sections:
        st.subheader(section_title)
        st.dataframe(df_sum_stats_all_splits[section_metrics])

    # NOTE(review): this header says "BIGOS" in the PELCRA tab as well (kept
    # unchanged from the original) - confirm whether that is intended.
    st.header("BIGOS subsets (source datasets) cards")
    for subset in dataset_configs:
        st.subheader("Dataset card for: {}".format(subset))
        df_metrics_subset_size = extract_stats_for_dataset_card(
            df_multindex_for_agg, subset, metrics_size, add_total=True
        )
        st.dataframe(df_metrics_subset_size)
        df_metrics_subset_features = extract_stats_for_dataset_card(
            df_multindex_for_agg, subset, metrics_features, add_total=False
        )
        st.dataframe(df_metrics_subset_features)


with analysis_bigos:
    dataset_name = "amu-cai/pl-asr-bigos-v2"
    dataset_short_name = "BIGOS"
    dataset_version = "V2"
    # SECURITY NOTE: trust_remote_code=True allows the dataset repository's
    # loading script to execute in this process; only use with trusted repos.
    # Drop the aggregate "all" config by name rather than by position, so a
    # real subset is never removed if the config order changes or "all" is
    # missing.
    dataset_configs = [
        config_name
        for config_name in get_dataset_config_names(dataset_name, trust_remote_code=True)
        if config_name != "all"
    ]
    print(dataset_configs)
    _render_dataset_analysis(dataset_name, dataset_configs)

#########################################PELCRA################################################
with analysis_bigos_pelcra:
    dataset_name = "pelcra/pl-asr-pelcra-for-bigos"
    dataset_short_name = "PELCRA"
    # The subset list is hard-coded for the remote deployment. With granted
    # access to the gated dataset it can instead be fetched with
    # get_dataset_config_names(dataset_name, trust_remote_code=True), minus
    # the aggregate "all" config, as done for the BIGOS tab.
    dataset_configs = [
        'ul-diabiz_poleval-22',
        'ul-spokes_mix_emo-18',
        'ul-spokes_mix_luz-18',
        'ul-spokes_mix_parl-18',
        'ul-spokes_biz_bio-23',
        'ul-spokes_biz_int-23',
        'ul-spokes_biz_luz-23',
        'ul-spokes_biz_pod-23',
        'ul-spokes_biz_pres-23',
        'ul-spokes_biz_vc-23',
        'ul-spokes_biz_vc2-23',
        'ul-spokes_biz_wyw-23',
    ]
    print(dataset_configs)
    _render_dataset_analysis(dataset_name, dataset_configs)
# NOTE: everything between the triple quotes below is a bare string literal, so none of it runs;
# the "BIGOS V2 diagnostic" tab created earlier in the file receives no content from this block.
| """ | |
| with analysis_bigos_diagnostic: | |
| dataset_name = "amu-cai/pl-asr-bigos-v2-diagnostic" | |
| dataset_short_name = "BIGOS diagnostic" | |
| dataset_version = "V2" | |
| dataset_configs = get_dataset_config_names(dataset_name,trust_remote_code=True) | |
| # remove "all" subset, which is always the last config type | |
| dataset_configs.pop() | |
| print(dataset_configs) | |
| # read the reports for public and secret datasets | |
| [stats_dict_public, contents_dict_public] = read_reports(dataset_name) | |
| # update the metrics for test split with the secret dataset statistics | |
| #stats_dict_public = add_test_split_stats_from_secret_dataset(stats_dict_public, stats_dict_secret) | |
| df_multindex_for_agg = dict_to_multindex_df(stats_dict_public, all_splits=False) | |
| df_multindex_all_splits = dict_to_multindex_df(stats_dict_public, all_splits=True) | |
| # extract metrics from dictionary and convert to various dataframes for easier analysis and visualization | |
| #st.header("Summary statistics") | |
| st.header("Dataset level metrics") | |
| df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size) | |
| # split dataframe into separate dataframes for easier analysis and visualization | |
| st.subheader("Audio content size") | |
| df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio] | |
| st.dataframe(df_sum_stats_audio) | |
| st.subheader("Text content size") | |
| df_sum_stats_text = df_sum_stats_agg[metrics_size_text] | |
| st.dataframe(df_sum_stats_text) | |
| df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features) | |
| st.subheader("Utterances, vocabulary and alphabet space") | |
| df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq] | |
| st.dataframe(df_sum_stats_feats_text) | |
| st.subheader("Speech rates") | |
| df_sum_stats_feats_speech_rate= df_sum_stats_all_splits[metrics_features_speech_rate] | |
| st.dataframe(df_sum_stats_feats_speech_rate) | |
| st.subheader("Average utterance lengths and audio duration") | |
| df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration] | |
| st.dataframe(df_sum_stats_feats_durations) | |
| st.subheader("Metadata coverage") | |
| df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta] | |
| st.dataframe(df_sum_stats_feats_meta) | |
| st.header("BIGOS subsets (source datasets) cards") | |
| for subset in dataset_configs: | |
| st.subheader("Dataset card for: {}".format(subset)) | |
| df_metrics_subset_size = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_size, add_total=True) | |
| st.dataframe(df_metrics_subset_size) | |
| df_metrics_subset_features = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_features, add_total=False) | |
| st.dataframe(df_metrics_subset_features) | |
| """ | |