Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
·
ad8c37c
1
Parent(s):
4eee292
Fixed filtering of freely and commercially available datasets
Browse files
- __pycache__/contants.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +39 -11
- contants.py +2 -2
__pycache__/contants.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
|
|
|
__pycache__/utils.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import streamlit as st
|
|
|
|
|
|
|
| 3 |
|
| 4 |
from app_utils import filter_dataframe, calculate_height_to_display
|
| 5 |
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
|
|
@@ -8,10 +10,6 @@ from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, lo
|
|
| 8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
| 9 |
from utils import left_align, right_align
|
| 10 |
|
| 11 |
-
import matplotlib.pyplot as plt
|
| 12 |
-
import seaborn as sns
|
| 13 |
-
|
| 14 |
-
|
| 15 |
st.set_page_config(layout="wide")
|
| 16 |
|
| 17 |
|
|
@@ -23,10 +21,10 @@ df_data_tax = load_data_taxonomy()
|
|
| 23 |
# Filter out non available datasets
|
| 24 |
df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
|
| 25 |
# Available and free
|
| 26 |
-
df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == '
|
| 27 |
|
| 28 |
# Available and paid
|
| 29 |
-
df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != '
|
| 30 |
|
| 31 |
|
| 32 |
# Load PL ASR benchmarks survey data
|
|
@@ -86,11 +84,17 @@ with data_survey:
|
|
| 86 |
|
| 87 |
st.dataframe(df_datasets_per_year, use_container_width=False)
|
| 88 |
|
| 89 |
-
st.header("Institutions contributing Polish ASR speech
|
| 90 |
col_groupby = ['Publisher']
|
| 91 |
df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 92 |
st.dataframe(df_datasets_per_publisher, use_container_width=False)
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
st.header("Repositories hosting Polish ASR speech datasets")
|
| 95 |
col_groupby = ['Repository']
|
| 96 |
df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
|
@@ -114,19 +118,43 @@ with data_survey:
|
|
| 114 |
st.header("Datasets per speech type")
|
| 115 |
col_groupby = ['Speech type']
|
| 116 |
df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
|
|
|
|
|
|
| 117 |
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# Display distribution of datasets for various speech types
|
| 120 |
st.header("Distribution of available speech data per audio device - Public domain datasets")
|
| 121 |
col_groupby = ['Audio device']
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
| 124 |
|
| 125 |
# Display distribution of datasets for various speech types
|
| 126 |
st.header("Distribution of available speech data per audio device - Commercial datasets")
|
| 127 |
col_groupby = ['Audio device']
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
with data_taxonomy:
|
| 132 |
st.title("Polish ASR Speech Data Taxonomy")
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import streamlit as st
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
|
| 6 |
from app_utils import filter_dataframe, calculate_height_to_display
|
| 7 |
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
|
|
|
|
| 10 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
| 11 |
from utils import left_align, right_align
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
st.set_page_config(layout="wide")
|
| 14 |
|
| 15 |
|
|
|
|
| 21 |
# Filter out non available datasets
|
| 22 |
df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
|
| 23 |
# Available and free
|
| 24 |
+
df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == '0')]
|
| 25 |
|
| 26 |
# Available and paid
|
| 27 |
+
df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != '0')]
|
| 28 |
|
| 29 |
|
| 30 |
# Load PL ASR benchmarks survey data
|
|
|
|
| 84 |
|
| 85 |
st.dataframe(df_datasets_per_year, use_container_width=False)
|
| 86 |
|
| 87 |
+
st.header("Institutions contributing Polish ASR speech datasets")
|
| 88 |
col_groupby = ['Publisher']
|
| 89 |
df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 90 |
st.dataframe(df_datasets_per_publisher, use_container_width=False)
|
| 91 |
|
| 92 |
+
st.header("Institutions contributing freely available Polish ASR speech datasets")
|
| 93 |
+
col_groupby = ['Publisher']
|
| 94 |
+
df_datasets_per_publisher_free = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 95 |
+
st.dataframe(df_datasets_per_publisher_free, use_container_width=False)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
st.header("Repositories hosting Polish ASR speech datasets")
|
| 99 |
col_groupby = ['Repository']
|
| 100 |
df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
|
|
|
| 118 |
st.header("Datasets per speech type")
|
| 119 |
col_groupby = ['Speech type']
|
| 120 |
df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 121 |
+
# sort by the size of audio transcribed
|
| 122 |
+
df_datasets_per_speech_type = df_datasets_per_speech_type.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
| 123 |
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
| 124 |
|
| 125 |
+
|
| 126 |
+
# Display distribution of speech data per audio device
|
| 127 |
+
st.header("Distribution of available speech data per audio device - All available datasets")
|
| 128 |
+
col_groupby = ['Audio device']
|
| 129 |
+
df_datasets_per_device_all = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 130 |
+
# sort by the size of audio transcribed
|
| 131 |
+
df_datasets_per_device_all = df_datasets_per_device_all.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
| 132 |
+
st.dataframe(df_datasets_per_device_all, use_container_width=False)
|
| 133 |
+
|
| 134 |
# Display distribution of datasets for various speech types
|
| 135 |
st.header("Distribution of available speech data per audio device - Public domain datasets")
|
| 136 |
col_groupby = ['Audio device']
|
| 137 |
+
df_datasets_per_device_free = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 138 |
+
# sort by the size of audio transcribed
|
| 139 |
+
df_datasets_per_device_free = df_datasets_per_device_free.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
| 140 |
+
st.dataframe(df_datasets_per_device_free, use_container_width=False)
|
| 141 |
|
| 142 |
# Display distribution of datasets for various speech types
|
| 143 |
st.header("Distribution of available speech data per audio device - Commercial datasets")
|
| 144 |
col_groupby = ['Audio device']
|
| 145 |
+
df_datasets_per_device_paid = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 146 |
+
# sort by the size of audio transcribed
|
| 147 |
+
df_datasets_per_device_paid = df_datasets_per_device_paid.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
| 148 |
+
st.dataframe(df_datasets_per_device_paid, use_container_width=False)
|
| 149 |
+
|
| 150 |
+
# Display distribution of datasets per sampling rate
|
| 151 |
+
st.header("Datasets per sampling rate")
|
| 152 |
+
col_groupby = ['Sampling rate [Hz]']
|
| 153 |
+
df_datasets_per_sr = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
| 154 |
+
# sort by the size of audio transcribed
|
| 155 |
+
df_datasets_per_sr = df_datasets_per_sr.sort_values(by='Size audio transcribed [hours]', ascending=False)
|
| 156 |
+
st.dataframe(df_datasets_per_sr, use_container_width=False)
|
| 157 |
+
|
| 158 |
|
| 159 |
with data_taxonomy:
|
| 160 |
st.title("Polish ASR Speech Data Taxonomy")
|
contants.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
|
| 2 |
-
You can use
|
| 3 |
The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit?usp=sharing). <br><br> \
|
| 4 |
The dashboard is divided into the following tabs: <br> \
|
| 5 |
* **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
|
|
@@ -11,7 +11,7 @@ The dashboard is divided into the following tabs: <br> \
|
|
| 11 |
* **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
|
| 12 |
Please visit respective tab to learn how to use it and provide feedback. <br><br> \
|
| 13 |
If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
|
| 14 |
-
If you are looking for the latest ASR benchmarks for Polish, please visit the [
|
| 15 |
You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
|
| 16 |
|
| 17 |
CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey <br> \
|
|
|
|
| 1 |
INFO_MAIN= " Welcome to the Polish ASR Survey dashboard! <br> \
|
| 2 |
+
You can use it to learn about the state of Polish ASR speech data and benchmarks. <br> \
|
| 3 |
The dashboard is built upon the [*Polish ASR Speech Datasets Catalog*](https://github.com/goodmike31/pl-asr-speech-data-survey) and [*Polish ASR Benchmarks Catalog*](https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit?usp=sharing). <br><br> \
|
| 4 |
The dashboard is divided into the following tabs: <br> \
|
| 5 |
* **About Polish ASR Survey** - general information about the survey, references, and contact points <br> \
|
|
|
|
| 11 |
* **ASR Benchmarks Taxonomy** - explanation of the columns in the *Polish ASR Benchmarks Catalog* <br> \
|
| 12 |
Please visit respective tab to learn how to use it and provide feedback. <br><br> \
|
| 13 |
If you want to share your feedback regarding the Speech Data catalog, please use this [FORM](https://forms.gle/EWJ6YfbJJTyEzQs66). <br><br> \
|
| 14 |
+
If you are looking for the latest ASR benchmarks for Polish, please visit the [AMU ASR leaderboard](https://huggingface.co/spaces/amu-cai/pl-asr-leaderboard). <br><br> \
|
| 15 |
You can also contact the author via [email](mailto:michal.junczyk@amu.edu.pl) or [LinkedIn](https://www.linkedin.com/in/michaljunczyk/).<br>"
|
| 16 |
|
| 17 |
CITATION_MAIN = "@misc{junczyk-2024-pl-asr-survey <br> \
|