Improve error handling, logging, and user feedback; add sidebar options for NPMI and top words

#8
Files changed (1) hide show
  1. app.py +72 -55
app.py CHANGED
@@ -71,9 +71,9 @@ OUR_LABEL_FIELD = dataset_utils.OUR_LABEL_FIELD
71
  TOKENIZED_FIELD = dataset_utils.TOKENIZED_FIELD
72
  EMBEDDING_FIELD = dataset_utils.EMBEDDING_FIELD
73
  LENGTH_FIELD = dataset_utils.LENGTH_FIELD
74
- # TODO: Allow users to specify this.
75
- _MIN_VOCAB_COUNT = 10
76
- _SHOW_TOP_N_WORDS = 10
77
 
78
 
79
  @st.cache_resource(
@@ -100,28 +100,32 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
100
  if use_cache:
101
  logs.warning("Using cache")
102
  dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
103
- logs.warning("Loading dataset")
104
- dstats.load_or_prepare_dataset()
105
- logs.warning("Loading labels")
106
- dstats.load_or_prepare_labels()
107
- logs.warning("Loading text lengths")
108
- dstats.load_or_prepare_text_lengths()
109
- logs.warning("Loading duplicates")
110
- dstats.load_or_prepare_text_duplicates()
111
- logs.warning("Loading vocabulary")
112
- dstats.load_or_prepare_vocab()
113
- logs.warning("Loading general statistics...")
114
- dstats.load_or_prepare_general_stats()
115
- if show_embeddings:
116
- logs.warning("Loading Embeddings")
117
- dstats.load_or_prepare_embeddings()
118
- logs.warning("Loading nPMI")
119
  try:
120
- dstats.load_or_prepare_npmi()
121
- except:
122
- logs.warning("Missing a cache for npmi")
123
- logs.warning("Loading Zipf")
124
- dstats.load_or_prepare_zipf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  return dstats
126
 
127
  @st.cache_resource(
@@ -136,9 +140,7 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
136
  ds_args:
137
  show_embeddings:
138
  use_cache:
139
-
140
  Returns:
141
-
142
  """
143
 
144
  if use_cache:
@@ -154,58 +156,56 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
154
  #except:
155
  # logs.warning("We're screwed")
156
  if cache_dir_exists:
 
157
  try:
158
- # We need to have the text_dset loaded for further load_or_prepare
159
  dstats.load_or_prepare_dataset()
160
- except:
161
- logs.warning("Missing a cache for load or prepare dataset")
 
162
  try:
163
- # Header widget
164
  dstats.load_or_prepare_dset_peek()
165
- except:
166
- logs.warning("Missing a cache for dset peek")
167
  try:
168
- # General stats widget
169
  dstats.load_or_prepare_general_stats()
170
- except:
171
- logs.warning("Missing a cache for general stats")
172
  try:
173
- # Labels widget
174
  dstats.load_or_prepare_labels()
175
- except:
176
- logs.warning("Missing a cache for prepare labels")
177
  try:
178
- # Text lengths widget
179
  dstats.load_or_prepare_text_lengths()
180
- except:
181
- logs.warning("Missing a cache for text lengths")
182
  if show_embeddings:
183
  try:
184
- # Embeddings widget
185
  dstats.load_or_prepare_embeddings()
186
- except:
187
- logs.warning("Missing a cache for embeddings")
188
  try:
189
  dstats.load_or_prepare_text_duplicates()
190
- except:
191
- logs.warning("Missing a cache for text duplicates")
192
  try:
193
  dstats.load_or_prepare_npmi()
194
- except:
195
- logs.warning("Missing a cache for npmi")
196
  try:
197
  dstats.load_or_prepare_zipf()
198
- except:
199
- logs.warning("Missing a cache for zipf")
200
  return dstats, cache_dir_exists
201
 
202
- def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
203
  """
204
  Function for displaying the elements in the right column of the streamlit app.
205
  Args:
206
  ds_name_to_dict (dict): the dataset name and options in dictionary form
207
  show_embeddings (Bool): whether embeddings should be loaded and displayed for this dataset
208
  column_id (str): what column of the dataset the analysis is done on
 
 
209
  Returns:
210
  The function displays the information using the functions defined in the st_utils class.
211
  """
@@ -222,7 +222,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
222
  st_utils.expander_text_duplicates(dstats, column_id)
223
  # Uses an interaction; handled a bit differently than other widgets.
224
  logs.info("showing npmi widget")
225
- st_utils.npmi_widget(dstats.npmi_stats, _MIN_VOCAB_COUNT, column_id)
226
  logs.info("showing zipf")
227
  st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
228
  if show_embeddings:
@@ -248,6 +248,23 @@ def main():
248
  # When not doing new development, use the cache.
249
  use_cache = True
250
  show_embeddings = st.sidebar.checkbox("Show text clusters")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  # List of datasets for which embeddings are hard to compute:
252
 
253
  if compare_mode:
@@ -260,7 +277,7 @@ def main():
260
  )
261
  with left_col:
262
  if cache_exists_left:
263
- show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
264
  else:
265
  st.markdown("### Missing pre-computed data measures!")
266
  st.write(dataset_args_left)
@@ -269,7 +286,7 @@ def main():
269
  )
270
  with right_col:
271
  if cache_exists_right:
272
- show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
273
  else:
274
  st.markdown("### Missing pre-computed data measures!")
275
  st.write(dataset_args_right)
@@ -278,7 +295,7 @@ def main():
278
  dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
279
  dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
280
  if cache_exists:
281
- show_column(dstats, ds_name_to_dict, show_embeddings, "")
282
  else:
283
  st.markdown("### Missing pre-computed data measures!")
284
  st.write(dataset_args)
 
71
  TOKENIZED_FIELD = dataset_utils.TOKENIZED_FIELD
72
  EMBEDDING_FIELD = dataset_utils.EMBEDDING_FIELD
73
  LENGTH_FIELD = dataset_utils.LENGTH_FIELD
74
+ # Allow users to specify these in the sidebar
75
+ _MIN_VOCAB_COUNT_DEFAULT = 10
76
+ _SHOW_TOP_N_WORDS_DEFAULT = 10
77
 
78
 
79
  @st.cache_resource(
 
100
  if use_cache:
101
  logs.warning("Using cache")
102
  dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  try:
104
+ logs.info("Loading dataset")
105
+ dstats.load_or_prepare_dataset()
106
+ logs.info("Loading labels")
107
+ dstats.load_or_prepare_labels()
108
+ logs.info("Loading text lengths")
109
+ dstats.load_or_prepare_text_lengths()
110
+ logs.info("Loading duplicates")
111
+ dstats.load_or_prepare_text_duplicates()
112
+ logs.info("Loading vocabulary")
113
+ dstats.load_or_prepare_vocab()
114
+ logs.info("Loading general statistics...")
115
+ dstats.load_or_prepare_general_stats()
116
+ if show_embeddings:
117
+ logs.info("Loading Embeddings")
118
+ dstats.load_or_prepare_embeddings()
119
+ logs.info("Loading nPMI")
120
+ try:
121
+ dstats.load_or_prepare_npmi()
122
+ except Exception as e:
123
+ logs.warning(f"Missing a cache for npmi: {e}")
124
+ logs.info("Loading Zipf")
125
+ dstats.load_or_prepare_zipf()
126
+ except Exception as e:
127
+ logs.error(f"Error during dataset preparation: {e}")
128
+ st.error(f"An error occurred during dataset preparation: {e}")
129
  return dstats
130
 
131
  @st.cache_resource(
 
140
  ds_args:
141
  show_embeddings:
142
  use_cache:
 
143
  Returns:
 
144
  """
145
 
146
  if use_cache:
 
156
  #except:
157
  # logs.warning("We're screwed")
158
  if cache_dir_exists:
159
+ # Improved: catch and log specific exceptions, show user feedback
160
  try:
 
161
  dstats.load_or_prepare_dataset()
162
+ except Exception as e:
163
+ logs.warning(f"Missing a cache for load or prepare dataset: {e}")
164
+ st.warning(f"Could not load or prepare dataset: {e}")
165
  try:
 
166
  dstats.load_or_prepare_dset_peek()
167
+ except Exception as e:
168
+ logs.warning(f"Missing a cache for dset peek: {e}")
169
  try:
 
170
  dstats.load_or_prepare_general_stats()
171
+ except Exception as e:
172
+ logs.warning(f"Missing a cache for general stats: {e}")
173
  try:
 
174
  dstats.load_or_prepare_labels()
175
+ except Exception as e:
176
+ logs.warning(f"Missing a cache for prepare labels: {e}")
177
  try:
 
178
  dstats.load_or_prepare_text_lengths()
179
+ except Exception as e:
180
+ logs.warning(f"Missing a cache for text lengths: {e}")
181
  if show_embeddings:
182
  try:
 
183
  dstats.load_or_prepare_embeddings()
184
+ except Exception as e:
185
+ logs.warning(f"Missing a cache for embeddings: {e}")
186
  try:
187
  dstats.load_or_prepare_text_duplicates()
188
+ except Exception as e:
189
+ logs.warning(f"Missing a cache for text duplicates: {e}")
190
  try:
191
  dstats.load_or_prepare_npmi()
192
+ except Exception as e:
193
+ logs.warning(f"Missing a cache for npmi: {e}")
194
  try:
195
  dstats.load_or_prepare_zipf()
196
+ except Exception as e:
197
+ logs.warning(f"Missing a cache for zipf: {e}")
198
  return dstats, cache_dir_exists
199
 
200
+ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, min_vocab_count, show_top_n_words):
201
  """
202
  Function for displaying the elements in the right column of the streamlit app.
203
  Args:
204
  ds_name_to_dict (dict): the dataset name and options in dictionary form
205
  show_embeddings (Bool): whether embeddings should be loaded and displayed for this dataset
206
  column_id (str): what column of the dataset the analysis is done on
207
+ min_vocab_count (int): minimum vocabulary count for NPMI widget
208
+ show_top_n_words (int): number of top words to show
209
  Returns:
210
  The function displays the information using the functions defined in the st_utils class.
211
  """
 
222
  st_utils.expander_text_duplicates(dstats, column_id)
223
  # Uses an interaction; handled a bit differently than other widgets.
224
  logs.info("showing npmi widget")
225
+ st_utils.npmi_widget(dstats.npmi_stats, min_vocab_count, column_id)
226
  logs.info("showing zipf")
227
  st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
228
  if show_embeddings:
 
248
  # When not doing new development, use the cache.
249
  use_cache = True
250
  show_embeddings = st.sidebar.checkbox("Show text clusters")
251
+ # User-configurable options
252
+ min_vocab_count = st.sidebar.number_input(
253
+ "Minimum vocabulary count for NPMI widget",
254
+ min_value=1,
255
+ max_value=1000,
256
+ value=_MIN_VOCAB_COUNT_DEFAULT,
257
+ step=1,
258
+ help="Minimum number of occurrences for a word to be included in NPMI stats."
259
+ )
260
+ show_top_n_words = st.sidebar.number_input(
261
+ "Number of top words to show",
262
+ min_value=1,
263
+ max_value=100,
264
+ value=_SHOW_TOP_N_WORDS_DEFAULT,
265
+ step=1,
266
+ help="Number of top words to display in relevant widgets."
267
+ )
268
  # List of datasets for which embeddings are hard to compute:
269
 
270
  if compare_mode:
 
277
  )
278
  with left_col:
279
  if cache_exists_left:
280
+ show_column(dstats_left, ds_name_to_dict, show_embeddings, " A", min_vocab_count, show_top_n_words)
281
  else:
282
  st.markdown("### Missing pre-computed data measures!")
283
  st.write(dataset_args_left)
 
286
  )
287
  with right_col:
288
  if cache_exists_right:
289
+ show_column(dstats_right, ds_name_to_dict, show_embeddings, " B", min_vocab_count, show_top_n_words)
290
  else:
291
  st.markdown("### Missing pre-computed data measures!")
292
  st.write(dataset_args_right)
 
295
  dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
296
  dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
297
  if cache_exists:
298
+ show_column(dstats, ds_name_to_dict, show_embeddings, "", min_vocab_count, show_top_n_words)
299
  else:
300
  st.markdown("### Missing pre-computed data measures!")
301
  st.write(dataset_args)