Spaces:
Sleeping
Sleeping
mj-new
committed on
Commit
·
d5cbb7a
1
Parent(s):
2901944
Alpha version of the dataset catalog
Browse files- README.md +4 -4
- __pycache__/app_utils.cpython-310.pyc +0 -0
- __pycache__/contants.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- analysis-playground.ipynb +0 -0
- app.py +101 -0
- app_utils.py +94 -0
- contants.py +5 -0
- requirements.txt +3 -0
- utils.py +276 -0
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title: Pl Asr Survey
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: red
|
| 6 |
sdk: streamlit
|
| 7 |
-
sdk_version: 1.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: cc-by-sa-4.0
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Pl Asr Speech Data Survey
|
| 3 |
+
emoji: 🏃
|
| 4 |
+
colorFrom: pink
|
| 5 |
colorTo: red
|
| 6 |
sdk: streamlit
|
| 7 |
+
sdk_version: 1.31.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: cc-by-sa-4.0
|
__pycache__/app_utils.cpython-310.pyc
ADDED
|
Binary file (2.28 kB). View file
|
|
|
__pycache__/contants.cpython-310.pyc
ADDED
|
Binary file (482 Bytes). View file
|
|
|
__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (7.26 kB). View file
|
|
|
analysis-playground.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import streamlit as st

from app_utils import filter_dataframe, calculate_height_to_display
from contants import WELCOME_TEXT, CITATION_TEXT
from utils import BASE_SUMMARY_METRICS
from utils import load_catalog, load_taxonomy
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics

import matplotlib.pyplot as plt
import seaborn as sns

# Streamlit dashboard for the Polish ASR speech-data catalog: an
# interactive, filterable catalog table followed by summary tables.
st.set_page_config(layout="wide")

st.title("Polish Speech Datasets Catalog and Survey analysis")

st.write(WELCOME_TEXT)

st.write(CITATION_TEXT)

# load_catalog/load_taxonomy are wrapped with st.cache_data, so the
# spreadsheets are downloaded only once per session.
df_cat = load_catalog()
df_tax = load_taxonomy()

# Catalog subsets reused by the sections below.
# Datasets that can be obtained online at all.
df_cat_available = df_cat[df_cat['Available online'] == 'yes']
# Available free of charge for non-commercial usage.
df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')]
# Available, but only for a fee.
df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')]

# Display catalog contents with interactive per-column filters.
st.dataframe(filter_dataframe(df_cat), hide_index=True, use_container_width=True)

# Display taxonomy contents — TODO, df_tax is loaded but not yet shown.

# Display summary statistics; slices of BASE_SUMMARY_METRICS select which
# metric rows each section shows (see utils.catalog_summary_statistics).
st.header("Polish ASR speech datasets summary statistics")
df_summary_metrics = catalog_summary_statistics(df_cat)

df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
st.dataframe(df_basic_stats, use_container_width=False)

st.header("Speech data available across Polish ASR speech datasets")
df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
st.dataframe(df_stats_audio_available, use_container_width=False)

st.header("Transcribed data available across Polish ASR speech datasets")
df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
st.dataframe(df_stats_transcribed_available, use_container_width=False)


# Display distribution of datasets created per year.
# Renamed from df_datasets_per_speech_type, which shadowed the
# speech-type table defined further below.
st.header("Polish ASR speech datasets created in 1997-2023")
col_groupby = ['Creation year']
df_datasets_per_year = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])

st.dataframe(df_datasets_per_year, use_container_width=False)

st.header("Institutions contributing Polish ASR speech dataset")
col_groupby = ['Publisher']
df_datasets_per_publisher = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
st.dataframe(df_datasets_per_publisher, use_container_width=False)

st.header("Repositories hosting Polish ASR speech datasets")
col_groupby = ['Repository']
df_datasets_per_repo = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
st.dataframe(df_datasets_per_repo, use_container_width=False)

st.header("Public domain Polish ASR speech datasets")
col_groupby = ['License', "Dataset ID"]
df_datasets_public = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
st.dataframe(df_datasets_public, use_container_width=False)

# Typo fix: "Commercialy" -> "Commercially".
st.header("Commercially available Polish ASR speech datasets")
col_groupby = ['License', "Dataset ID"]
df_datasets_paid = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
st.dataframe(df_datasets_paid, use_container_width=False)

st.header("Coverage of metadata across Polish ASR speech datasets")
df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
st.dataframe(df_meta_all_pivot, use_container_width=False)

# Display distribution of datasets for various speech types
st.header("Datasets per speech type")
col_groupby = ['Speech type']
df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
st.dataframe(df_datasets_per_speech_type, use_container_width=False)

# Display distribution of available speech data per recording device.
st.header("Distribution of available speech data per audio device - Public domain datasets")
col_groupby = ['Audio device']
df_datasets_per_device_free = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
st.dataframe(df_datasets_per_device_free, use_container_width=False)

st.header("Distribution of available speech data per audio device - Commercial datasets")
col_groupby = ['Audio device']
df_datasets_per_device_paid = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
st.dataframe(df_datasets_per_device_paid, use_container_width=False)
app_utils.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import streamlit as st
|
| 3 |
+
|
| 4 |
+
from pandas.api.types import (
|
| 5 |
+
is_categorical_dtype,
|
| 6 |
+
is_datetime64_any_dtype,
|
| 7 |
+
is_numeric_dtype,
|
| 8 |
+
is_object_dtype,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
def calculate_height_to_display(df):
    """Estimate the pixel height needed to render *df* as a table.

    The estimate is rows * per-row height plus fixed allowances for the
    header and padding; adjust the constants to match the active theme.
    """
    ROW_PX = 25      # estimated height of one data row, in pixels
    HEADER_PX = 50   # estimated height of the header row
    PADDING_PX = 20  # extra breathing room
    return df.shape[0] * ROW_PX + HEADER_PX + PADDING_PX
| 20 |
+
|
| 21 |
+
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    # Without the opt-in checkbox the catalog is shown unfiltered.
    if not st.checkbox("Use filters on speech data catalog"):
        return df

    df = df.copy()

    # Normalize datetimes: parse object columns that look like dates and
    # drop timezone info so the range comparisons below are consistent.
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                pass
        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    container = st.container()
    with container:
        selected_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in selected_columns:
            left, right = st.columns((1, 20))
            series = df[column]
            if is_categorical_dtype(series) or series.nunique() < 10:
                # Treat columns with < 10 unique values as categorical.
                choices = right.multiselect(
                    f"Values for {column}",
                    series.unique(),
                    default=list(series.unique()),
                )
                df = df[df[column].isin(choices)]
            elif is_numeric_dtype(series):
                lo = float(series.min())
                hi = float(series.max())
                bounds = right.slider(
                    f"Values for {column}",
                    min_value=lo,
                    max_value=hi,
                    value=(lo, hi),
                    step=(hi - lo) / 100,
                )
                df = df[df[column].between(*bounds)]
            elif is_datetime64_any_dtype(series):
                date_range = right.date_input(
                    f"Values for {column}",
                    value=(series.min(), series.max()),
                )
                # A partially-filled widget yields fewer than 2 dates;
                # only filter once both ends are set.
                if len(date_range) == 2:
                    start_date, end_date = map(pd.to_datetime, date_range)
                    df = df.loc[df[column].between(start_date, end_date)]
            else:
                pattern = right.text_input(
                    f"Substring or regex in {column}",
                )
                if pattern:
                    df = df[df[column].astype(str).str.contains(pattern)]

    return df
contants.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Static texts displayed by the Streamlit app (see app.py).

# Intro blurb shown at the top of the dashboard; uses markdown link syntax.
WELCOME_TEXT = "This dashboard complements [Polish Speech Datasets Catalog](https://github.com/goodmike31/pl-asr-speech-data-survey) with:\n \
a. Dynamic filtering of catalog content\n \
b. Summary statistics about Polish ASR speech datasets\n"

# Citation request rendered below the intro.  TODO: fill in once published.
CITATION_TEXT="Please cite this work as: TODO\n"
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
seaborn
|
| 2 |
+
matplotlib
|
| 3 |
+
pandas
|
utils.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
# Date stamp shown in the summary table; refreshed on every module import.
# TODO - extract from the catalog name
catalog_last_update_date = pd.Timestamp.today().strftime('%Y-%m-%d')

# Row labels (and order) of the metrics produced by
# catalog_summary_statistics(); app.py slices this list to pick which
# metrics each dashboard section displays.
BASE_SUMMARY_METRICS = [
    # [0:5] basic counts
    "Catalog last update date",
    "Unique Polish speech datasets producers",
    "Identified datasets reported in the public domain",
    "Datasets available to the public (free and paid)",
    "Fraction of reported datasets available to the public [%]",
    # [5:10] audio volume
    "Speech data reported in the public domain [hours]",
    "Speech data available total [hours]",
    "Speech data available free of charge [hours]",
    "Speech data available commercially [hours]",
    "Reported vs available speech data ratio [%]",
    # [10:15] transcribed audio volume
    "Transcribed speech data reported in the public domain [hours]",
    "Transcribed speech data available total [hours]",
    "Transcribed speech data available free of charge [hours]",
    "Transcribed speech data available commercially [hours]",
    "Reported vs available transcribed speech data ratio [%]",
]
| 26 |
+
|
| 27 |
+
def download_tsv_from_google_sheet(sheet_url):
    """Download a Google Sheet as TSV and parse it into a DataFrame.

    Args:
        sheet_url (str): Regular "edit" URL of the sheet, e.g.
            https://docs.google.com/spreadsheets/d/<id>/edit#gid=<gid>

    Returns:
        pd.DataFrame or None: Parsed sheet content, or None when the
        download fails (non-200 response).
    """
    from io import StringIO

    # Modify the Google Sheet URL to export it as TSV.
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')

    # Bound the request so a stalled connection cannot hang the app
    # indefinitely (requests has no default timeout).
    response = requests.get(tsv_url, timeout=30)

    if response.status_code == 200:
        # Read the TSV content into a pandas DataFrame.
        tsv_content = StringIO(response.text)
        df = pd.read_csv(tsv_content, sep='\t')
        return df
    else:
        print("Failed to download the TSV file.")
        return None
| 44 |
+
|
| 45 |
+
@st.cache_data
def load_catalog():
    """Fetch the speech-data catalog sheet (result cached by Streamlit)."""
    print("Reading speech data catalog")
    catalog_url = (
        "https://docs.google.com/spreadsheets/d/"
        "181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    )
    return download_tsv_from_google_sheet(catalog_url)
| 51 |
+
|
| 52 |
+
@st.cache_data
def load_taxonomy():
    """Fetch the survey taxonomy sheet (result cached by Streamlit)."""
    print("Reading speech data survey taxonomy")
    taxonomy_url = (
        "https://docs.google.com/spreadsheets/d/"
        "181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    )
    return download_tsv_from_google_sheet(taxonomy_url)
| 58 |
+
|
| 59 |
+
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
    """
    Function to generate a summary view of datasets grouped by the given column(s).

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
    - col_groupby (str or list): The column(s) to group the datasets by.
    - col_sort (str or list): Column (or index level) to sort by; defaults to col_groupby.
    - col_percent (list): Column(s) whose share of the grand total is reported
      in a 'Percent of total' column; None disables it.
    - col_sum (str or list): The column(s) to sum.
    - col_count (str or list): The column(s) to count.

    Returns:
    - pd.DataFrame: one row per group, with count / sum / percent columns;
      zero sums are shown as the string 'no-info'.
    """
    # Normalize arguments to lists.  Copy them so the mutable defaults are
    # never modified and caller-owned lists stay untouched.
    col_sum = [col_sum] if not isinstance(col_sum, list) else list(col_sum)
    col_count = [col_count] if not isinstance(col_count, list) else list(col_count)
    col_groupby = [col_groupby] if not isinstance(col_groupby, list) else list(col_groupby)

    # Work on a copy: the numeric coercion below must not mutate the
    # caller's (possibly cached) dataframe.
    df_cat = df_cat.copy()
    for col in col_sum:
        df_cat[col] = pd.to_numeric(df_cat[col], errors='coerce').fillna(0)

    # Aggregate: sum the size-like columns, count the id-like ones.
    summary = df_cat.groupby(col_groupby).agg({
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count}
    }).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Share of each group in the grand total of the percent column(s).
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = round(total / total.sum() * 100, 2)

    # Pre-sort by the first summed column, then index by the group key(s).
    summary.sort_values(by=col_sum[0], ascending=False, inplace=True)
    summary.reset_index(drop=True, inplace=True)
    summary.set_index(col_groupby, inplace=True)

    # Give the count column a descriptive name.
    # Initialized to None so the column-selection below also works when
    # col_count is empty or has several entries (was a NameError before).
    col_name_count = None
    if len(col_count) == 1:
        col_name_count = 'Count ' + col_count[0]
        summary.rename(columns={col_count[0]: col_name_count}, inplace=True)
        summary[col_name_count] = summary[col_name_count].astype(int)
    # TODO - add support for renaming multiple count columns

    # Order columns as: count, sums, percent of total.
    ordered_cols = [col_name_count] if col_name_count in summary.columns else []
    ordered_cols += col_sum
    if col_percent is not None:
        ordered_cols += [col_name_percent]
    summary = summary[ordered_cols]

    # Final sort by the requested column (or the groupby index level).
    col_sort = col_groupby if col_sort is None else col_sort
    summary.sort_values(by=col_sort, ascending=False, inplace=True)

    # Show 'no-info' instead of 0 where the catalog had no data.
    for col in col_sum:
        summary[col] = summary[col].replace(0, 'no-info')

    return summary
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def datasets_count_and_size_standard(df_cat, col_groupby):
    """Summarize *df_cat* by *col_groupby* with the standard column set.

    Convenience wrapper around datasets_count_and_size(): sorts by the
    group key, sums hours/recordings/speakers, counts dataset IDs, and
    reports the transcribed-hours share as percent of total.
    """
    standard_kwargs = {
        'col_sort': col_groupby,
        'col_percent': ['Size audio transcribed [hours]'],
        'col_sum': ['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'],
        'col_count': ['Dataset ID'],
    }
    return datasets_count_and_size(df_cat, col_groupby, **standard_kwargs)
| 141 |
+
|
| 142 |
+
def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    """Measure how much of the catalog carries each metadata annotation.

    For every metadata column, takes the 'yes' row of the standard
    per-value summary for three catalog views (all / available-free /
    available-paid); missing 'yes' rows are filled with zeros.

    Args:
    - df_cat (pd.DataFrame): full catalog.
    - df_cat_available_free (pd.DataFrame): available, free-of-charge subset.
    - df_cat_available_paid (pd.DataFrame): available, paid subset.

    Returns:
    - (df_meta_all_flat, df_meta_all_pivot): flat frame with a 'Type'
      column (Free/Paid) and a pivot comparing Free vs Paid per metric.
    """
    #TODO - add number of speakers and recordings

    # 'Speaker id info', 'Part of speech annotation', 'Named entity annotation', 'Emotion annotation'
    meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']
    meta_coverage_all_sets = {}
    meta_coverage_free_sets = {}
    meta_coverage_paid_sets = {}

    # Column names as produced by datasets_count_and_size_standard().
    col_name_sum_size = 'Size audio transcribed [hours]'
    col_name_count = 'Count Dataset ID'
    col_name_percent = 'Percent of total'

    #, 'Named entity annotation', 'Emotion annotation']
    for meta_data_col in meta_data_cols:
        # Paid view: keep the 'yes' row, or zeros when no dataset has it.
        df_datasets_per_meta_paid = datasets_count_and_size_standard(df_cat_available_paid, meta_data_col)
        #print(df_datasets_per_meta_paid)
        if 'yes' in df_datasets_per_meta_paid.index:
            meta_coverage_paid_sets[meta_data_col] = df_datasets_per_meta_paid.loc['yes']
        else:
            meta_coverage_paid_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0}

        # Whole-catalog view.
        # NOTE(review): meta_coverage_all_sets is populated here but not
        # included in the returned frames — presumably intended for a
        # future 'All' column; confirm before removing.
        df_datasets_per_meta_all = datasets_count_and_size_standard(df_cat, meta_data_col)
        #print(df_datasets_per_meta_all)
        # select row where index has value "yes" and column name is "Percent of total"
        if 'yes' in df_datasets_per_meta_all.index:
            meta_coverage_all_sets[meta_data_col] = df_datasets_per_meta_all.loc['yes']
        else:
            meta_coverage_all_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0}

        # Free view.
        df_datasets_per_meta_free = datasets_count_and_size_standard(df_cat_available_free, meta_data_col)
        #print(df_datasets_per_meta_free)
        # check if index has value "yes", if not assign 0
        if 'yes' in df_datasets_per_meta_free.index:
            meta_coverage_free_sets[meta_data_col] = df_datasets_per_meta_free.loc['yes']
        else:
            meta_coverage_free_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0}

    #merge all free and paid dataframes
    df_meta_free = pd.DataFrame.from_dict(meta_coverage_free_sets, orient='index')
    df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int)

    df_meta_paid = pd.DataFrame.from_dict(meta_coverage_paid_sets, orient='index')
    df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int)

    df_meta_free['Type'] = 'Free'
    df_meta_paid['Type'] = 'Paid'
    df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid])

    #transform to compare free and paid column by column
    df_meta_all_pivot = df_meta_all_flat.reset_index()
    df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index':'Metadata'})
    df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
    df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)

    return(df_meta_all_flat, df_meta_all_pivot)
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def catalog_summary_statistics(df_cat):
    """
    Function to generate summary statistics for the speech data catalog.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.

    Returns:
    - pd.DataFrame: single 'Value' column indexed by BASE_SUMMARY_METRICS,
      one row per metric (counts, hours, and percentage ratios).
    """
    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'

    # Work on a copy so the numeric coercion below does not mutate the
    # caller's (possibly st.cache_data-cached) dataframe.
    df_cat = df_cat.copy()
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    # Catalog views used by the metrics below.
    df_cat_available = df_cat[df_cat['Available online'] == 'yes']
    # Available and free.
    df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')]
    # Available and paid.
    df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')]

    # Basic counts.
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = round((accessible_datasets_count / identified_datasets_count) * 100, 2)

    # Hours of audio: reported vs actually accessible (free / paid).
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)

    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # Available vs reported speech material ratios [%].
    accessible_vs_reported_audio_ratio = round((audio_accessible / audio_reported) * 100, 2)
    accessible_vs_reported_transcribed_ratio = round((transcribed_audio_accessible / transcribed_audio_reported) * 100, 2)

    # Values listed in the same order as BASE_SUMMARY_METRICS labels.
    metrics_dict = {
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ]
    }

    # Convert the dictionary into a DataFrame indexed by metric name.
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df.set_index("Metric", inplace=True)
    return metrics_df
|