Improve error handling, logging, and user feedback; add sidebar options for NPMI and top words
#8
by
Snaseem2026
- opened
app.py
CHANGED
|
@@ -71,9 +71,9 @@ OUR_LABEL_FIELD = dataset_utils.OUR_LABEL_FIELD
|
|
| 71 |
TOKENIZED_FIELD = dataset_utils.TOKENIZED_FIELD
|
| 72 |
EMBEDDING_FIELD = dataset_utils.EMBEDDING_FIELD
|
| 73 |
LENGTH_FIELD = dataset_utils.LENGTH_FIELD
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
|
| 77 |
|
| 78 |
|
| 79 |
@st.cache_resource(
|
|
@@ -100,28 +100,32 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
|
| 100 |
if use_cache:
|
| 101 |
logs.warning("Using cache")
|
| 102 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
| 103 |
-
logs.warning("Loading dataset")
|
| 104 |
-
dstats.load_or_prepare_dataset()
|
| 105 |
-
logs.warning("Loading labels")
|
| 106 |
-
dstats.load_or_prepare_labels()
|
| 107 |
-
logs.warning("Loading text lengths")
|
| 108 |
-
dstats.load_or_prepare_text_lengths()
|
| 109 |
-
logs.warning("Loading duplicates")
|
| 110 |
-
dstats.load_or_prepare_text_duplicates()
|
| 111 |
-
logs.warning("Loading vocabulary")
|
| 112 |
-
dstats.load_or_prepare_vocab()
|
| 113 |
-
logs.warning("Loading general statistics...")
|
| 114 |
-
dstats.load_or_prepare_general_stats()
|
| 115 |
-
if show_embeddings:
|
| 116 |
-
logs.warning("Loading Embeddings")
|
| 117 |
-
dstats.load_or_prepare_embeddings()
|
| 118 |
-
logs.warning("Loading nPMI")
|
| 119 |
try:
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
logs.
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
return dstats
|
| 126 |
|
| 127 |
@st.cache_resource(
|
|
@@ -136,9 +140,7 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
| 136 |
ds_args:
|
| 137 |
show_embeddings:
|
| 138 |
use_cache:
|
| 139 |
-
|
| 140 |
Returns:
|
| 141 |
-
|
| 142 |
"""
|
| 143 |
|
| 144 |
if use_cache:
|
|
@@ -154,58 +156,56 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
| 154 |
#except:
|
| 155 |
# logs.warning("We're screwed")
|
| 156 |
if cache_dir_exists:
|
|
|
|
| 157 |
try:
|
| 158 |
-
# We need to have the text_dset loaded for further load_or_prepare
|
| 159 |
dstats.load_or_prepare_dataset()
|
| 160 |
-
except:
|
| 161 |
-
logs.warning("Missing a cache for load or prepare dataset")
|
|
|
|
| 162 |
try:
|
| 163 |
-
# Header widget
|
| 164 |
dstats.load_or_prepare_dset_peek()
|
| 165 |
-
except:
|
| 166 |
-
logs.warning("Missing a cache for dset peek")
|
| 167 |
try:
|
| 168 |
-
# General stats widget
|
| 169 |
dstats.load_or_prepare_general_stats()
|
| 170 |
-
except:
|
| 171 |
-
logs.warning("Missing a cache for general stats")
|
| 172 |
try:
|
| 173 |
-
# Labels widget
|
| 174 |
dstats.load_or_prepare_labels()
|
| 175 |
-
except:
|
| 176 |
-
logs.warning("Missing a cache for prepare labels")
|
| 177 |
try:
|
| 178 |
-
# Text lengths widget
|
| 179 |
dstats.load_or_prepare_text_lengths()
|
| 180 |
-
except:
|
| 181 |
-
logs.warning("Missing a cache for text lengths")
|
| 182 |
if show_embeddings:
|
| 183 |
try:
|
| 184 |
-
# Embeddings widget
|
| 185 |
dstats.load_or_prepare_embeddings()
|
| 186 |
-
except:
|
| 187 |
-
logs.warning("Missing a cache for embeddings")
|
| 188 |
try:
|
| 189 |
dstats.load_or_prepare_text_duplicates()
|
| 190 |
-
except:
|
| 191 |
-
logs.warning("Missing a cache for text duplicates")
|
| 192 |
try:
|
| 193 |
dstats.load_or_prepare_npmi()
|
| 194 |
-
except:
|
| 195 |
-
logs.warning("Missing a cache for npmi")
|
| 196 |
try:
|
| 197 |
dstats.load_or_prepare_zipf()
|
| 198 |
-
except:
|
| 199 |
-
logs.warning("Missing a cache for zipf")
|
| 200 |
return dstats, cache_dir_exists
|
| 201 |
|
| 202 |
-
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
| 203 |
"""
|
| 204 |
Function for displaying the elements in the right column of the streamlit app.
|
| 205 |
Args:
|
| 206 |
ds_name_to_dict (dict): the dataset name and options in dictionary form
|
| 207 |
show_embeddings (Bool): whether embeddings should be loaded and displayed for this dataset
|
| 208 |
column_id (str): what column of the dataset the analysis is done on
|
|
|
|
|
|
|
| 209 |
Returns:
|
| 210 |
The function displays the information using the functions defined in the st_utils class.
|
| 211 |
"""
|
|
@@ -222,7 +222,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
|
| 222 |
st_utils.expander_text_duplicates(dstats, column_id)
|
| 223 |
# Uses an interaction; handled a bit differently than other widgets.
|
| 224 |
logs.info("showing npmi widget")
|
| 225 |
-
st_utils.npmi_widget(dstats.npmi_stats,
|
| 226 |
logs.info("showing zipf")
|
| 227 |
st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
|
| 228 |
if show_embeddings:
|
|
@@ -248,6 +248,23 @@ def main():
|
|
| 248 |
# When not doing new development, use the cache.
|
| 249 |
use_cache = True
|
| 250 |
show_embeddings = st.sidebar.checkbox("Show text clusters")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
# List of datasets for which embeddings are hard to compute:
|
| 252 |
|
| 253 |
if compare_mode:
|
|
@@ -260,7 +277,7 @@ def main():
|
|
| 260 |
)
|
| 261 |
with left_col:
|
| 262 |
if cache_exists_left:
|
| 263 |
-
show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
|
| 264 |
else:
|
| 265 |
st.markdown("### Missing pre-computed data measures!")
|
| 266 |
st.write(dataset_args_left)
|
|
@@ -269,7 +286,7 @@ def main():
|
|
| 269 |
)
|
| 270 |
with right_col:
|
| 271 |
if cache_exists_right:
|
| 272 |
-
show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
|
| 273 |
else:
|
| 274 |
st.markdown("### Missing pre-computed data measures!")
|
| 275 |
st.write(dataset_args_right)
|
|
@@ -278,7 +295,7 @@ def main():
|
|
| 278 |
dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
|
| 279 |
dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
|
| 280 |
if cache_exists:
|
| 281 |
-
show_column(dstats, ds_name_to_dict, show_embeddings, "")
|
| 282 |
else:
|
| 283 |
st.markdown("### Missing pre-computed data measures!")
|
| 284 |
st.write(dataset_args)
|
|
|
|
| 71 |
TOKENIZED_FIELD = dataset_utils.TOKENIZED_FIELD
|
| 72 |
EMBEDDING_FIELD = dataset_utils.EMBEDDING_FIELD
|
| 73 |
LENGTH_FIELD = dataset_utils.LENGTH_FIELD
|
| 74 |
+
# Allow users to specify these in the sidebar
|
| 75 |
+
_MIN_VOCAB_COUNT_DEFAULT = 10
|
| 76 |
+
_SHOW_TOP_N_WORDS_DEFAULT = 10
|
| 77 |
|
| 78 |
|
| 79 |
@st.cache_resource(
|
|
|
|
| 100 |
if use_cache:
|
| 101 |
logs.warning("Using cache")
|
| 102 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
try:
|
| 104 |
+
logs.info("Loading dataset")
|
| 105 |
+
dstats.load_or_prepare_dataset()
|
| 106 |
+
logs.info("Loading labels")
|
| 107 |
+
dstats.load_or_prepare_labels()
|
| 108 |
+
logs.info("Loading text lengths")
|
| 109 |
+
dstats.load_or_prepare_text_lengths()
|
| 110 |
+
logs.info("Loading duplicates")
|
| 111 |
+
dstats.load_or_prepare_text_duplicates()
|
| 112 |
+
logs.info("Loading vocabulary")
|
| 113 |
+
dstats.load_or_prepare_vocab()
|
| 114 |
+
logs.info("Loading general statistics...")
|
| 115 |
+
dstats.load_or_prepare_general_stats()
|
| 116 |
+
if show_embeddings:
|
| 117 |
+
logs.info("Loading Embeddings")
|
| 118 |
+
dstats.load_or_prepare_embeddings()
|
| 119 |
+
logs.info("Loading nPMI")
|
| 120 |
+
try:
|
| 121 |
+
dstats.load_or_prepare_npmi()
|
| 122 |
+
except Exception as e:
|
| 123 |
+
logs.warning(f"Missing a cache for npmi: {e}")
|
| 124 |
+
logs.info("Loading Zipf")
|
| 125 |
+
dstats.load_or_prepare_zipf()
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logs.error(f"Error during dataset preparation: {e}")
|
| 128 |
+
st.error(f"An error occurred during dataset preparation: {e}")
|
| 129 |
return dstats
|
| 130 |
|
| 131 |
@st.cache_resource(
|
|
|
|
| 140 |
ds_args:
|
| 141 |
show_embeddings:
|
| 142 |
use_cache:
|
|
|
|
| 143 |
Returns:
|
|
|
|
| 144 |
"""
|
| 145 |
|
| 146 |
if use_cache:
|
|
|
|
| 156 |
#except:
|
| 157 |
# logs.warning("We're screwed")
|
| 158 |
if cache_dir_exists:
|
| 159 |
+
# Catch and log any exception from each load step; only a dataset-load failure is also shown to the user
|
| 160 |
try:
|
|
|
|
| 161 |
dstats.load_or_prepare_dataset()
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logs.warning(f"Missing a cache for load or prepare dataset: {e}")
|
| 164 |
+
st.warning(f"Could not load or prepare dataset: {e}")
|
| 165 |
try:
|
|
|
|
| 166 |
dstats.load_or_prepare_dset_peek()
|
| 167 |
+
except Exception as e:
|
| 168 |
+
logs.warning(f"Missing a cache for dset peek: {e}")
|
| 169 |
try:
|
|
|
|
| 170 |
dstats.load_or_prepare_general_stats()
|
| 171 |
+
except Exception as e:
|
| 172 |
+
logs.warning(f"Missing a cache for general stats: {e}")
|
| 173 |
try:
|
|
|
|
| 174 |
dstats.load_or_prepare_labels()
|
| 175 |
+
except Exception as e:
|
| 176 |
+
logs.warning(f"Missing a cache for prepare labels: {e}")
|
| 177 |
try:
|
|
|
|
| 178 |
dstats.load_or_prepare_text_lengths()
|
| 179 |
+
except Exception as e:
|
| 180 |
+
logs.warning(f"Missing a cache for text lengths: {e}")
|
| 181 |
if show_embeddings:
|
| 182 |
try:
|
|
|
|
| 183 |
dstats.load_or_prepare_embeddings()
|
| 184 |
+
except Exception as e:
|
| 185 |
+
logs.warning(f"Missing a cache for embeddings: {e}")
|
| 186 |
try:
|
| 187 |
dstats.load_or_prepare_text_duplicates()
|
| 188 |
+
except Exception as e:
|
| 189 |
+
logs.warning(f"Missing a cache for text duplicates: {e}")
|
| 190 |
try:
|
| 191 |
dstats.load_or_prepare_npmi()
|
| 192 |
+
except Exception as e:
|
| 193 |
+
logs.warning(f"Missing a cache for npmi: {e}")
|
| 194 |
try:
|
| 195 |
dstats.load_or_prepare_zipf()
|
| 196 |
+
except Exception as e:
|
| 197 |
+
logs.warning(f"Missing a cache for zipf: {e}")
|
| 198 |
return dstats, cache_dir_exists
|
| 199 |
|
| 200 |
+
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, min_vocab_count, show_top_n_words):
|
| 201 |
"""
|
| 202 |
Function for displaying the elements in the right column of the streamlit app.
|
| 203 |
Args:
|
| 204 |
ds_name_to_dict (dict): the dataset name and options in dictionary form
|
| 205 |
show_embeddings (Bool): whether embeddings should be loaded and displayed for this dataset
|
| 206 |
column_id (str): what column of the dataset the analysis is done on
|
| 207 |
+
min_vocab_count (int): minimum vocabulary count for NPMI widget
|
| 208 |
+
show_top_n_words (int): number of top words to show
|
| 209 |
Returns:
|
| 210 |
The function displays the information using the functions defined in the st_utils class.
|
| 211 |
"""
|
|
|
|
| 222 |
st_utils.expander_text_duplicates(dstats, column_id)
|
| 223 |
# Uses an interaction; handled a bit differently than other widgets.
|
| 224 |
logs.info("showing npmi widget")
|
| 225 |
+
st_utils.npmi_widget(dstats.npmi_stats, min_vocab_count, column_id)
|
| 226 |
logs.info("showing zipf")
|
| 227 |
st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
|
| 228 |
if show_embeddings:
|
|
|
|
| 248 |
# When not doing new development, use the cache.
|
| 249 |
use_cache = True
|
| 250 |
show_embeddings = st.sidebar.checkbox("Show text clusters")
|
| 251 |
+
# User-configurable options
|
| 252 |
+
min_vocab_count = st.sidebar.number_input(
|
| 253 |
+
"Minimum vocabulary count for NPMI widget",
|
| 254 |
+
min_value=1,
|
| 255 |
+
max_value=1000,
|
| 256 |
+
value=_MIN_VOCAB_COUNT_DEFAULT,
|
| 257 |
+
step=1,
|
| 258 |
+
help="Minimum number of occurrences for a word to be included in NPMI stats."
|
| 259 |
+
)
|
| 260 |
+
show_top_n_words = st.sidebar.number_input(
|
| 261 |
+
"Number of top words to show",
|
| 262 |
+
min_value=1,
|
| 263 |
+
max_value=100,
|
| 264 |
+
value=_SHOW_TOP_N_WORDS_DEFAULT,
|
| 265 |
+
step=1,
|
| 266 |
+
help="Number of top words to display in relevant widgets."
|
| 267 |
+
)
|
| 268 |
# List of datasets for which embeddings are hard to compute:
|
| 269 |
|
| 270 |
if compare_mode:
|
|
|
|
| 277 |
)
|
| 278 |
with left_col:
|
| 279 |
if cache_exists_left:
|
| 280 |
+
show_column(dstats_left, ds_name_to_dict, show_embeddings, " A", min_vocab_count, show_top_n_words)
|
| 281 |
else:
|
| 282 |
st.markdown("### Missing pre-computed data measures!")
|
| 283 |
st.write(dataset_args_left)
|
|
|
|
| 286 |
)
|
| 287 |
with right_col:
|
| 288 |
if cache_exists_right:
|
| 289 |
+
show_column(dstats_right, ds_name_to_dict, show_embeddings, " B", min_vocab_count, show_top_n_words)
|
| 290 |
else:
|
| 291 |
st.markdown("### Missing pre-computed data measures!")
|
| 292 |
st.write(dataset_args_right)
|
|
|
|
| 295 |
dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
|
| 296 |
dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
|
| 297 |
if cache_exists:
|
| 298 |
+
show_column(dstats, ds_name_to_dict, show_embeddings, "", min_vocab_count, show_top_n_words)
|
| 299 |
else:
|
| 300 |
st.markdown("### Missing pre-computed data measures!")
|
| 301 |
st.write(dataset_args)
|