lynn-twinkl committed on
Commit
c2e9454
·
1 Parent(s): cbff612

UI fixes; light refactoring for Filter summary

Browse files
Files changed (1) hide show
  1. app.py +85 -72
app.py CHANGED
@@ -83,7 +83,7 @@ def load_and_process(raw_csv: bytes) -> Tuple[pd.DataFrame, str]:
83
 
84
  # Usage Extraction
85
  docs = df_orig[freeform_col].to_list()
86
- scored['Usage'] = extract_usage(docs)
87
 
88
  return scored, freeform_col, id_col
89
 
@@ -109,9 +109,8 @@ def run_topic_modeling():
109
  ################################
110
 
111
  st.title("🪷 Community Collections Helper")
112
- st.badge("Version 1.0.0", icon=':material/category:',color='violet')
113
 
114
- uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv')
115
 
116
  if uploaded_file is not None:
117
  # Read file from raw bytes for caching and repeated use --> this ensures all the processing isn't repeated when a user changes the filters
@@ -152,7 +151,7 @@ if uploaded_file is not None:
152
  ## --- Necessity Index Filtering ---
153
  min_idx = float(df['necessity_index'].min())
154
  max_idx = float(df['necessity_index'].max())
155
- filter_range = st.sidebar.slider(
156
  "Necessity Index Range", min_value=min_idx, max_value=max_idx, value=(min_idx, max_idx)
157
  )
158
 
@@ -176,6 +175,30 @@ if uploaded_file is not None:
176
  st.markdown(f"**Total Applications:** {len(df)}")
177
  st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  ## ====== CREATE TAB SECTIONS =======
181
  tab1, tab2 = st.tabs(["Shortlist Manager","Insights"])
@@ -228,13 +251,14 @@ if uploaded_file is not None:
228
  shorltist_cols_to_show = [
229
  id_col,
230
  freeform_col,
231
- 'Usage',
 
232
  'necessity_index',
233
  'urgency_score',
234
  'severity_score',
235
  'vulnerability_score',
236
  'shortlist_score',
237
- 'book_candidates',
238
  ]
239
 
240
  st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
@@ -258,7 +282,7 @@ if uploaded_file is not None:
258
  col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
259
 
260
  # HTML for clean usage items
261
- usage_items = [item for item in row['Usage'] if item and item.lower() != 'none']
262
  st.markdown("##### Excerpt")
263
  st.write(row[freeform_col])
264
  if usage_items:
@@ -288,23 +312,6 @@ if uploaded_file is not None:
288
  unsafe_allow_html=True,
289
  )
290
 
291
- # ======== SHORTLIST SUMMARY AND DOWNLOAD (MANUAL) ======
292
- shortlisted = [
293
- i for i in filtered_df.index
294
- if st.session_state.get(f"shortlist_{i}", False)
295
- ]
296
- st.sidebar.markdown(f"**Manually Shortlisted:** {len(shortlisted)}")
297
- if shortlisted:
298
- csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
299
- st.sidebar.download_button(
300
- "Download Manual Shortlist", csv, "shortlist.csv", "text/csv"
301
- )
302
-
303
-
304
- add_vertical_space(5)
305
- st.divider()
306
- st.markdown(":grey[Made with 🩷 by the AI Innovation team   |   Contact: lynn.perez@twinkl.com]")
307
-
308
 
309
  #########################################
310
  # INSIGHTS TAB #
@@ -324,80 +331,86 @@ if uploaded_file is not None:
324
  col3.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
325
 
326
  ## --- NI Distribution Plot ---
327
- ni_distribution_plt = plot_histogram(df, col_to_plot='necessity_index', bins=50)
328
  st.plotly_chart(ni_distribution_plt)
329
 
330
 
 
331
 
 
332
 
333
-
334
-
335
-
336
-
337
 
338
- # =========== TOPIC MODELING ============
 
339
 
340
- st.header("Topic Modeling")
341
- add_vertical_space(1)
342
 
343
- ## ------- 1. Tokenize texts into sentences -------
344
- nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
 
 
345
 
346
- sentences = []
347
- mappings = []
348
 
349
- for idx, application_text in df[freeform_col].dropna().items():
350
- for sentence in topic_modeling_pipeline.spacy_sent_tokenize(application_text):
351
- sentences.append(sentence)
352
- mappings.append(idx)
353
 
 
 
354
 
355
- ## -------- 2. Generate embeddings -------
356
 
357
- embeddings_model = load_embeddings_model()
358
- embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
359
 
360
- ## -------- 3. Topic Modeling --------
 
361
 
362
- umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
363
- hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
364
 
365
- topic_model, topics, probs = run_topic_modeling()
366
 
367
- topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
368
 
369
- ## ------- 4. Display Topics Dataframe ------
 
 
 
 
 
370
 
371
- topics_df = topic_model.get_topic_info()
372
- topics_df = topics_df[topics_df['Topic'] > -1]
373
- topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
374
- cols_to_move = ['Topic','CustomName']
375
- topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
376
- topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
377
 
378
- with st.expander("How are topic extracted?", icon="🌱", expanded=False):
 
379
 
380
- st.write("""
381
- **About Topic Modeling**
382
 
383
- We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
384
 
385
- BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
386
 
387
- The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
 
 
 
 
 
 
388
 
389
- **Table Info**
390
- - **Topic Nr.:** The 'id' of the topic.
391
- - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
392
- - **Representation:** Top 10 keywords that best represent a topic
393
- - **Representative Docs**: Sample sentences contributing to the topic
394
- """)
395
- st.dataframe(topics_df, hide_index=True)
396
 
397
- ## -------- 5. Plot Topics Chart ----------
 
398
 
399
- topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500)
400
 
401
- st.plotly_chart(topic_count_plot, use_container_width=True)
 
 
 
 
 
402
 
403
-
 
 
83
 
84
  # Usage Extraction
85
  docs = df_orig[freeform_col].to_list()
86
+ scored['usage'] = extract_usage(docs)
87
 
88
  return scored, freeform_col, id_col
89
 
 
109
  ################################
110
 
111
  st.title("🪷 Community Collections Helper")
 
112
 
113
+ uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='hidden')
114
 
115
  if uploaded_file is not None:
116
  # Read file from raw bytes for caching and repeated use --> this ensures all the processing isn't repeated when a user changes the filters
 
151
  ## --- Necessity Index Filtering ---
152
  min_idx = float(df['necessity_index'].min())
153
  max_idx = float(df['necessity_index'].max())
154
+ filter_range = st.slider(
155
  "Necessity Index Range", min_value=min_idx, max_value=max_idx, value=(min_idx, max_idx)
156
  )
157
 
 
175
  st.markdown(f"**Total Applications:** {len(df)}")
176
  st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
177
 
178
+ manual_keys = [k for k in st.session_state.keys() if k.startswith("shortlist_")]
179
+ manually_shortlisted = [int(k.split("_")[1]) for k in manual_keys if st.session_state[k]]
180
+
181
+ st.markdown(f"**Manually Shortlisted:** {len(manually_shortlisted)}")
182
+ if manually_shortlisted:
183
+ csv = df.loc[manually_shortlisted].to_csv(index=False).encode("utf-8")
184
+ st.download_button(
185
+ "Download Manual Shortlist",
186
+ data=csv,
187
+ file_name="manual_shortlist.csv",
188
+ mime="text/csv",
189
+ icon="⬇️",
190
+ )
191
+
192
+
193
+ add_vertical_space(4)
194
+ st.divider()
195
+ st.badge("Version 1.0.0", icon=':material/category:',color='violet')
196
+ st.caption("""
197
+ Made with 🩷 by the AI Innovation Team
198
+ Contact: lynn.perez@twinkl.com
199
+ """)
200
+
201
+
202
 
203
  ## ====== CREATE TAB SECTIONS =======
204
  tab1, tab2 = st.tabs(["Shortlist Manager","Insights"])
 
251
  shorltist_cols_to_show = [
252
  id_col,
253
  freeform_col,
254
+ 'book_candidates',
255
+ 'usage',
256
  'necessity_index',
257
  'urgency_score',
258
  'severity_score',
259
  'vulnerability_score',
260
  'shortlist_score',
261
+ 'is_heartfelt',
262
  ]
263
 
264
  st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
 
282
  col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
283
 
284
  # HTML for clean usage items
285
+ usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
286
  st.markdown("##### Excerpt")
287
  st.write(row[freeform_col])
288
  if usage_items:
 
312
  unsafe_allow_html=True,
313
  )
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
  #########################################
317
  # INSIGHTS TAB #
 
331
  col3.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
332
 
333
  ## --- NI Distribution Plot ---
334
+ ni_distribution_plt = plot_histogram(df, col_to_plot='necessity_index', bins=50, title='Necessity Index Histogram')
335
  st.plotly_chart(ni_distribution_plt)
336
 
337
 
338
+ # =========== TOPIC MODELING ============
339
 
340
+ try:
341
 
342
+ st.header("Topic Modeling")
343
+ add_vertical_space(1)
 
 
344
 
345
+ ## ------- 1. Tokenize texts into sentences -------
346
+ nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
347
 
348
+ sentences = []
349
+ mappings = []
350
 
351
+ for idx, application_text in df[freeform_col].dropna().items():
352
+ for sentence in topic_modeling_pipeline.spacy_sent_tokenize(application_text):
353
+ sentences.append(sentence)
354
+ mappings.append(idx)
355
 
 
 
356
 
357
+ ## -------- 2. Generate embeddings -------
 
 
 
358
 
359
+ embeddings_model = load_embeddings_model()
360
+ embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
361
 
362
+ ## -------- 3. Topic Modeling --------
363
 
364
+ umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
365
+ hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
366
 
367
+ # Run topic modeling from cached resource
368
+ topic_model, topics, probs = run_topic_modeling()
369
 
370
+ topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
 
371
 
 
372
 
373
+ ## ------- 4. Display Topics Dataframe ------
374
 
375
+ topics_df = topic_model.get_topic_info()
376
+ topics_df = topics_df[topics_df['Topic'] > -1]
377
+ topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
378
+ cols_to_move = ['Topic','CustomName']
379
+ topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
380
+ topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
381
 
382
+ with st.popover("How are topic extracted?", icon="🌱"):
 
 
 
 
 
383
 
384
+ st.write("""
385
+ **About Topic Modeling**
386
 
387
+ We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
 
388
 
389
+ BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
390
 
391
+ The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
392
 
393
+ **Table Info**
394
+ - **Topic Nr.:** The 'id' of the topic.
395
+ - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
396
+ - **Representation:** Top 10 keywords that best represent a topic
397
+ - **Representative Docs**: Sample sentences contributing to the topic
398
+ """)
399
+ st.dataframe(topics_df, hide_index=True)
400
 
401
+ ## -------- 5. Plot Topics Chart ----------
 
 
 
 
 
 
402
 
403
+ topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
404
+ st.plotly_chart(topic_count_plot, use_container_width=True)
405
 
406
+ ## --------- 6. User Updates -----------
407
 
408
+ st.toast(
409
+ """
410
+ **Topic modeling is ready!** View the results on the _Insights_ tab
411
+ """,
412
+ icon='🎉'
413
+ )
414
 
415
+ except Exception as e:
416
+ st.error(f"Topic modeling failed: {str(e)}")