lynn-twinkl committed on
Commit
c41f427
·
1 Parent(s): 54ec9cd

added: topic-based filtering

Browse files
Files changed (1) hide show
  1. app.py +65 -26
app.py CHANGED
@@ -101,12 +101,10 @@ def compute_shortlist(df: pd.DataFrame) -> pd.DataFrame:
101
 
102
  @st.cache_resource(show_spinner=True)
103
  def run_topic_modeling():
104
- try:
105
 
106
- st.header("Topic Modeling")
107
- add_vertical_space(1)
108
 
109
- ## ------- 1. Tokenize texts into sentences -------
110
  nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
111
 
112
  sentences = []
@@ -118,27 +116,28 @@ def run_topic_modeling():
118
  mappings.append(idx)
119
 
120
 
121
- ## -------- 2. Generate embeddings -------
122
 
123
  embeddings_model = load_embeddings_model()
124
  embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
125
 
126
- ## -------- 3. Topic Modeling --------
127
 
128
- umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
129
  hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
130
 
131
- ## --------- 4. Perform Topic Modeling ---------
 
132
  topic_model, topics, probs = topic_modeling_pipeline.bertopic_model(sentences, embeddings, embeddings_model, umap_model, hdbscan_model)
133
 
134
  topic_modeling_pipeline.ai_labels_to_custom_name(topic_model)
135
 
136
- return topic_model, topics, probs
137
 
138
  except Exception as e:
139
  st.error(f"Topic modeling failed: {e}")
140
  st.code(traceback.format_exc()) # Shows the full error in a nice code box
141
- return None, None, None
142
 
143
 
144
 
@@ -148,7 +147,7 @@ def run_topic_modeling():
148
 
149
  st.title("🪷 Community Collections Helper")
150
 
151
- uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='hidden')
152
 
153
 
154
  # ========== FINGERPRINTING CURRENT FILE ==========
@@ -169,7 +168,25 @@ if raw is None:
169
  ## ====== DATA PROCESSING ======
170
 
171
  df, freeform_col, id_col = load_and_process(raw) # from cached function
172
- topic_model, topics, probs = run_topic_modeling() # from cached function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
 
175
  book_candidates_df = df[df['book_candidates'] == True]
@@ -223,6 +240,16 @@ with st.sidebar:
223
 
224
  filtered_df = filter_map[selected_view](df, auto_short_df, filter_range)
225
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  st.markdown(f"**Total Applications:** {len(df)}")
228
  st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
@@ -269,12 +296,14 @@ with tab1:
269
  csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
270
  all_processed_data = df.to_csv(index=False).encode("utf-8")
271
  book_candidates = book_candidates_df.to_csv(index=False).encode("utf-8")
 
272
 
273
 
274
  csv_options = {
275
  "Shortlist": (csv_auto, "shortlist.csv"),
276
  "All Processed Data": (all_processed_data, "all_processed.csv"),
277
  "Book Candidates": (book_candidates, "book_candidates.csv"),
 
278
  }
279
 
280
  choice = st.selectbox("Select a file for download", list(csv_options.keys()))
@@ -311,6 +340,7 @@ with tab1:
311
  'vulnerability_score',
312
  'shortlist_score',
313
  'is_heartfelt',
 
314
  ]
315
 
316
  st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
@@ -333,20 +363,32 @@ with tab1:
333
  col3.metric("Severity", f"{int(row['severity_score'])}")
334
  col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
335
 
336
- # HTML for clean usage items
337
- usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
338
  st.markdown("##### Excerpt")
339
  st.write(row[freeform_col])
 
 
 
340
  if usage_items:
341
  st.markdown("##### Usage")
342
  pills_html = "".join(
343
  f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
344
  for item in usage_items
345
  )
346
- st.markdown(pills_html, unsafe_allow_html=True)
347
  else:
348
  st.caption("*No usage found*")
349
- st.write("")
 
 
 
 
 
 
 
 
 
 
 
350
 
351
  st.checkbox(
352
  "Add to shortlist",
@@ -387,15 +429,12 @@ with tab2:
387
  st.plotly_chart(ni_distribution_plt)
388
 
389
 
 
390
 
391
- ## ------- 4. Display Topics Dataframe ------
392
-
393
- topics_df = topic_model.get_topic_info()
394
- topics_df = topics_df[topics_df['Topic'] > -1]
395
- topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
396
- cols_to_move = ['Topic','CustomName']
397
- topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
398
- topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
399
 
400
  with st.popover("How are topic extracted?", icon="🌱"):
401
 
@@ -416,12 +455,12 @@ with tab2:
416
  """)
417
  st.dataframe(topics_df, hide_index=True)
418
 
419
- ## -------- 5. Plot Topics Chart ----------
420
 
421
  topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
422
  st.plotly_chart(topic_count_plot, use_container_width=True)
423
 
424
- ## --------- 6. User Updates -----------
425
 
426
  if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
427
  st.toast(
 
101
 
102
  @st.cache_resource(show_spinner=True)
103
  def run_topic_modeling():
 
104
 
105
+ try:
 
106
 
107
+ # ------- 1. Tokenize texts into sentences -------
108
  nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
109
 
110
  sentences = []
 
116
  mappings.append(idx)
117
 
118
 
119
+ # -------- 2. Generate embeddings -------
120
 
121
  embeddings_model = load_embeddings_model()
122
  embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
123
 
124
+ # -------- 3. Topic Modeling --------
125
 
126
+ umap_model = UMAP(n_neighbors=7, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
127
  hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
128
 
129
+ # --------- 4. Perform Topic Modeling ---------
130
+
131
  topic_model, topics, probs = topic_modeling_pipeline.bertopic_model(sentences, embeddings, embeddings_model, umap_model, hdbscan_model)
132
 
133
  topic_modeling_pipeline.ai_labels_to_custom_name(topic_model)
134
 
135
+ return topic_model, topics, probs, mappings
136
 
137
  except Exception as e:
138
  st.error(f"Topic modeling failed: {e}")
139
  st.code(traceback.format_exc()) # Shows the full error in a nice code box
140
+ return None, None, None, None
141
 
142
 
143
 
 
147
 
148
  st.title("🪷 Community Collections Helper")
149
 
150
+ uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='collapsed')
151
 
152
 
153
  # ========== FINGERPRINTING CURRENT FILE ==========
 
168
  ## ====== DATA PROCESSING ======
169
 
170
  df, freeform_col, id_col = load_and_process(raw) # from cached function
171
+ topic_model, topics, probs, mappings = run_topic_modeling() # from cached function
172
+
173
+ if topic_model is not None:
174
+ label_map = (topic_model
175
+ .get_topic_info()
176
+ .set_index("Topic")["CustomName"]
177
+ .to_dict())
178
+ df = topic_modeling_pipeline.attach_topics(df, mappings, topics, label_map, col="topics")
179
+ else:
180
+ st.warning("Topics could not be generated; continuing without them.")
181
+
182
+
183
+ topics_df = topic_model.get_topic_info()
184
+ topics_df = topics_df[topics_df['Topic'] > -1]
185
+ topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
186
+ cols_to_move = ['Topic','CustomName']
187
+ topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
188
+ topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
189
+
190
 
191
 
192
  book_candidates_df = df[df['book_candidates'] == True]
 
240
 
241
  filtered_df = filter_map[selected_view](df, auto_short_df, filter_range)
242
 
243
+ ## -------- Topic Filtering -------
244
+
245
+ topic_options = sorted(topics_df['Topic Name'].unique())
246
+ selected_topics = st.multiselect("Filter by Topic(s)", options=topic_options, default=[])
247
+
248
+ if selected_topics:
249
+ selected_set = set(selected_topics)
250
+ filtered_df = filtered_df[
251
+ filtered_df['topics'].apply(lambda topic_list: selected_set.issubset(set(topic_list)))
252
+ ]
253
 
254
  st.markdown(f"**Total Applications:** {len(df)}")
255
  st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
 
296
  csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
297
  all_processed_data = df.to_csv(index=False).encode("utf-8")
298
  book_candidates = book_candidates_df.to_csv(index=False).encode("utf-8")
299
+ topic_descriptions_csv = topics_df.to_csv(index=False).encode("utf-8")
300
 
301
 
302
  csv_options = {
303
  "Shortlist": (csv_auto, "shortlist.csv"),
304
  "All Processed Data": (all_processed_data, "all_processed.csv"),
305
  "Book Candidates": (book_candidates, "book_candidates.csv"),
306
+ "Topic Descriptions": (topic_descriptions_csv, "topic_descriptions.csv"),
307
  }
308
 
309
  choice = st.selectbox("Select a file for download", list(csv_options.keys()))
 
340
  'vulnerability_score',
341
  'shortlist_score',
342
  'is_heartfelt',
343
+ 'topics',
344
  ]
345
 
346
  st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
 
363
  col3.metric("Severity", f"{int(row['severity_score'])}")
364
  col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
365
 
 
 
366
  st.markdown("##### Excerpt")
367
  st.write(row[freeform_col])
368
+
369
+ # HTML for clean usage items
370
+ usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
371
  if usage_items:
372
  st.markdown("##### Usage")
373
  pills_html = "".join(
374
  f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
375
  for item in usage_items
376
  )
377
+ st.html(pills_html)
378
  else:
379
  st.caption("*No usage found*")
380
+
381
+ topic_items = [item for item in row['topics'] if item and item.lower() != 'none']
382
+ if topic_items:
383
+ st.markdown("##### Topics")
384
+ topic_boxes_html= "".join(
385
+ f"<span style='display:inline-block;background-color:#ECE0FC;color:#6741B9;border-radius:5px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
386
+ for item in topic_items
387
+ )
388
+ st.html(topic_boxes_html)
389
+ else:
390
+ st.caption("_No topics assigned for this application_")
391
+
392
 
393
  st.checkbox(
394
  "Add to shortlist",
 
429
  st.plotly_chart(ni_distribution_plt)
430
 
431
 
432
+ ## ============= TOPIC MODELING ============
433
 
434
+ st.header("Topic Modeling")
435
+ add_vertical_space(1)
436
+
437
+ ## ------- Display Topics Dataframe ------
 
 
 
 
438
 
439
  with st.popover("How are topic extracted?", icon="🌱"):
440
 
 
455
  """)
456
  st.dataframe(topics_df, hide_index=True)
457
 
458
+ ## -------- Plot Topics Chart ----------
459
 
460
  topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
461
  st.plotly_chart(topic_count_plot, use_container_width=True)
462
 
463
+ ## --------- User Updates -----------
464
 
465
  if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
466
  st.toast(