lynn-twinkl committed on
Commit
729ef7b
·
1 Parent(s): c2e9454

fix(ui): show 'Topic modeling is ready' toast only once per upload

Browse files

* Read the uploaded CSV bytes a single time and fingerprint them with MD5.
* Store `current_file_hash` in st.session_state; skip the toast if it matches `topic_toast_shown_for`.
* Write `topic_toast_shown_for` flag after first display to prevent repeats.
* Remove the duplicate `uploaded_file.read()` call that emptied the buffer.

Files changed (1) hide show
  1. app.py +240 -227
app.py CHANGED
@@ -4,7 +4,7 @@
4
 
5
  import streamlit as st
6
  import pandas as pd
7
- import altair as alt
8
  import joblib
9
  from io import BytesIO
10
  from umap import UMAP
@@ -112,299 +112,310 @@ st.title("🪷 Community Collections Helper")
112
 
113
  uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='hidden')
114
 
 
 
 
115
  if uploaded_file is not None:
116
- # Read file from raw bytes for caching and repeated use --> this ensures all the processing isn't repeated when a user changes the filters
117
- raw = uploaded_file.read()
 
 
 
 
118
 
119
- ## ====== PROCESSED DATA (CACHED) ======
 
120
 
121
- df, freeform_col, id_col = load_and_process(raw)
122
 
123
- book_candidates_df = df[df['book_candidates'] == True]
124
 
125
- ###############################
126
- # SIDE PANEL #
127
- ###############################
128
 
129
- with st.sidebar:
130
- st.title("Shortlist Mode")
 
131
 
 
 
132
 
133
- quantile_map = {"strict": 0.75, "generous": 0.5}
134
- mode = st.segmented_control(
135
- "Select one option",
136
- options=["strict", "generous"],
137
- default="strict",
138
- )
139
-
140
- scored_full = compute_shortlist(df)
141
- threshold_score = scored_full["shortlist_score"].quantile(quantile_map[mode])
142
- auto_short_df = scored_full[scored_full["shortlist_score"] >= threshold_score]
143
-
144
- st.title("Filters")
145
-
146
- ## --- Dataframe To Filter ---
147
- options = ['All applications', 'Not shortlisted']
148
- selected_view = st.pills('Choose data to filter', options, default='Not shortlisted')
149
- st.write("")
150
-
151
- ## --- Necessity Index Filtering ---
152
- min_idx = float(df['necessity_index'].min())
153
- max_idx = float(df['necessity_index'].max())
154
- filter_range = st.slider(
155
- "Necessity Index Range", min_value=min_idx, max_value=max_idx, value=(min_idx, max_idx)
156
- )
157
-
158
- def filter_all_applications(df, auto_short_df, filter_range):
159
- return df[df['necessity_index'].between(filter_range[0], filter_range[1])]
160
-
161
- def filter_not_shortlisted(df, auto_short_df, filter_range):
162
- return df[
163
- (~df.index.isin(auto_short_df.index)) &
164
- (df['necessity_index'].between(filter_range[0], filter_range[1]))
165
- ]
166
 
167
- filter_map = {
168
- 'All applications': filter_all_applications,
169
- 'Not shortlisted': filter_not_shortlisted,
170
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- filtered_df = filter_map[selected_view](df, auto_short_df, filter_range)
 
 
 
 
173
 
 
 
 
 
174
 
175
- st.markdown(f"**Total Applications:** {len(df)}")
176
- st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
177
 
178
- manual_keys = [k for k in st.session_state.keys() if k.startswith("shortlist_")]
179
- manually_shortlisted = [int(k.split("_")[1]) for k in manual_keys if st.session_state[k]]
180
 
181
- st.markdown(f"**Manually Shortlisted:** {len(manually_shortlisted)}")
182
- if manually_shortlisted:
183
- csv = df.loc[manually_shortlisted].to_csv(index=False).encode("utf-8")
184
- st.download_button(
185
- "Download Manual Shortlist",
186
- data=csv,
187
- file_name="manual_shortlist.csv",
188
- mime="text/csv",
189
- icon="⬇️",
190
- )
191
 
 
 
192
 
193
- add_vertical_space(4)
194
- st.divider()
195
- st.badge("Version 1.0.0", icon=':material/category:',color='violet')
196
- st.caption("""
197
- Made with 🩷 by the AI Innovation Team
198
- Contact: lynn.perez@twinkl.com
199
- """)
 
 
 
200
 
201
 
 
 
 
 
 
 
 
202
 
203
- ## ====== CREATE TAB SECTIONS =======
204
- tab1, tab2 = st.tabs(["Shortlist Manager","Insights"])
205
 
206
 
207
- ##################################################
208
- # SHORTLIST MANAGER TAB #
209
- ##################################################
210
 
211
- with tab1:
212
-
213
- ## =========== AUTOMATIC SHORTLIST =========
214
 
215
- st.header("Automatic Shortlist")
 
 
216
 
217
- csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
218
- all_processed_data = df.to_csv(index=False).encode("utf-8")
219
- book_candidates = book_candidates_df.to_csv(index=False).encode("utf-8")
220
 
 
221
 
222
- csv_options = {
223
- "Shortlist": (csv_auto, "shortlist.csv"),
224
- "All Processed Data": (all_processed_data, "all_processed.csv"),
225
- "Book Candidates": (book_candidates, "book_candidates.csv"),
226
- }
227
 
228
- choice = st.selectbox("Select a file for download", list(csv_options.keys()))
229
 
230
- csv_data, file_name = csv_options[choice]
 
 
 
 
231
 
 
232
 
233
- st.download_button(
234
- label=f"Download {choice}",
235
- data=csv_data,
236
- file_name=file_name,
237
- mime="text/csv",
238
- help="This button will download the selected file from above",
239
- icon="⬇️"
240
 
241
- )
242
 
243
-
244
- st.write("")
245
- total_col, shortlistCounter_col, mode_col = st.columns(3)
246
-
247
- total_col.metric("Applications Submitted", len(df))
248
- shortlistCounter_col.metric("Shorlist Length", len(auto_short_df))
249
- mode_col.metric("Mode", mode)
250
-
251
- shorltist_cols_to_show = [
252
- id_col,
253
- freeform_col,
254
- 'book_candidates',
255
- 'usage',
256
- 'necessity_index',
257
- 'urgency_score',
258
- 'severity_score',
259
- 'vulnerability_score',
260
- 'shortlist_score',
261
- 'is_heartfelt',
262
- ]
263
-
264
- st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
265
-
266
- ## ====== APPLICATIONS REVIEW =======
267
-
268
- add_vertical_space(2)
269
- st.header("Manual Filtering")
270
- st.info("Use the **side panel** filters to more easily sort through applications that you'd like to review.", icon=':material/info:')
271
-
272
- st.write("")
273
- if len(filtered_df) > 0:
274
- st.markdown("#### Filtered Applications")
275
- for idx, row in filtered_df.iterrows():
276
- with st.expander(f"Application {row[id_col]}"):
277
- st.write("")
278
- col1, col2, col3, col4 = st.columns(4)
279
- col1.metric("Necessity", f"{row['necessity_index']:.1f}")
280
- col2.metric("Urgency", f"{int(row['urgency_score'])}")
281
- col3.metric("Severity", f"{int(row['severity_score'])}")
282
- col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
283
-
284
- # HTML for clean usage items
285
- usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
286
- st.markdown("##### Excerpt")
287
- st.write(row[freeform_col])
288
- if usage_items:
289
- st.markdown("##### Usage")
290
- pills_html = "".join(
291
- f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
292
- for item in usage_items
293
- )
294
- st.markdown(pills_html, unsafe_allow_html=True)
295
- else:
296
- st.caption("*No usage found*")
297
- st.write("")
298
-
299
- st.checkbox(
300
- "Add to shortlist",
301
- key=f"shortlist_{idx}"
 
302
  )
 
 
 
 
 
 
 
 
 
303
 
304
- else:
305
- st.markdown(
306
- """
307
- <br>
308
- <div style="text-align: center; font-size: 1.2em">
309
- 🍂 <span style="color: grey;">No applications matched these filters...</span>
310
- </div>
311
- """,
312
- unsafe_allow_html=True,
313
- )
314
 
315
 
316
- #########################################
317
- # INSIGHTS TAB #
318
- #########################################
319
 
320
- with tab2:
321
 
322
 
323
- ## =========== DATA OVERVIEW ==========
324
 
325
- st.header("General Insights")
326
- add_vertical_space(1)
327
 
328
- col1, col2, col3 = st.columns(3)
329
- col1.metric("Applications Submitted", len(df))
330
- col2.metric("Median N.I", df['necessity_index'].median().round(2))
331
- col3.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
332
 
333
- ## --- NI Distribution Plot ---
334
- ni_distribution_plt = plot_histogram(df, col_to_plot='necessity_index', bins=50, title='Necessity Index Histogram')
335
- st.plotly_chart(ni_distribution_plt)
336
 
337
 
338
- # =========== TOPIC MODELING ============
339
 
340
- try:
341
 
342
- st.header("Topic Modeling")
343
- add_vertical_space(1)
344
 
345
- ## ------- 1. Tokenize texts into sentences -------
346
- nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
347
 
348
- sentences = []
349
- mappings = []
350
 
351
- for idx, application_text in df[freeform_col].dropna().items():
352
- for sentence in topic_modeling_pipeline.spacy_sent_tokenize(application_text):
353
- sentences.append(sentence)
354
- mappings.append(idx)
355
 
356
 
357
- ## -------- 2. Generate embeddings -------
358
 
359
- embeddings_model = load_embeddings_model()
360
- embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
361
 
362
- ## -------- 3. Topic Modeling --------
363
 
364
- umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
365
- hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
366
 
367
- # Run topic modeling from cached resource
368
- topic_model, topics, probs = run_topic_modeling()
369
 
370
- topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
371
 
372
 
373
- ## ------- 4. Display Topics Dataframe ------
374
 
375
- topics_df = topic_model.get_topic_info()
376
- topics_df = topics_df[topics_df['Topic'] > -1]
377
- topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
378
- cols_to_move = ['Topic','CustomName']
379
- topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
380
- topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
381
 
382
- with st.popover("How are topic extracted?", icon="🌱"):
383
 
384
- st.write("""
385
- **About Topic Modeling**
386
 
387
- We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
388
 
389
- BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
390
 
391
- The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
392
 
393
- **Table Info**
394
- - **Topic Nr.:** The 'id' of the topic.
395
- - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
396
- - **Representation:** Top 10 keywords that best represent a topic
397
- - **Representative Docs**: Sample sentences contributing to the topic
398
- """)
399
- st.dataframe(topics_df, hide_index=True)
400
 
401
- ## -------- 5. Plot Topics Chart ----------
402
 
403
- topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
404
- st.plotly_chart(topic_count_plot, use_container_width=True)
405
 
406
- ## --------- 6. User Updates -----------
407
 
 
408
  st.toast(
409
  """
410
  **Topic modeling is ready!** View the results on the _Insights_ tab
@@ -412,5 +423,7 @@ if uploaded_file is not None:
412
  icon='🎉'
413
  )
414
 
415
- except Exception as e:
416
- st.error(f"Topic modeling failed: {str(e)}")
 
 
 
4
 
5
  import streamlit as st
6
  import pandas as pd
7
+ import hashlib
8
  import joblib
9
  from io import BytesIO
10
  from umap import UMAP
 
112
 
113
  uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='hidden')
114
 
115
+
116
+ # ====== Fingerprinting current file to avoid unnecessary reruns =====
117
+
118
  if uploaded_file is not None:
119
+ raw = uploaded_file.read() # single read
120
+ file_hash = hashlib.md5(raw).hexdigest()
121
+ st.session_state["current_file_hash"] = file_hash
122
+ else:
123
+ raw = None
124
+ st.session_state.pop("current_file_hash", None)
125
 
126
+ if raw is None:
127
+ st.stop()
128
 
129
+ ## ====== PROCESSED DATA (CACHED) ======
130
 
131
+ df, freeform_col, id_col = load_and_process(raw)
132
 
133
+ book_candidates_df = df[df['book_candidates'] == True]
 
 
134
 
135
+ ###############################
136
+ # SIDE PANEL #
137
+ ###############################
138
 
139
+ with st.sidebar:
140
+ st.title("Shortlist Mode")
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
+ quantile_map = {"strict": 0.75, "generous": 0.5}
144
+ mode = st.segmented_control(
145
+ "Select one option",
146
+ options=["strict", "generous"],
147
+ default="strict",
148
+ )
149
+
150
+ scored_full = compute_shortlist(df)
151
+ threshold_score = scored_full["shortlist_score"].quantile(quantile_map[mode])
152
+ auto_short_df = scored_full[scored_full["shortlist_score"] >= threshold_score]
153
+
154
+ st.title("Filters")
155
+
156
+ ## --- Dataframe To Filter ---
157
+ options = ['All applications', 'Not shortlisted']
158
+ selected_view = st.pills('Choose data to filter', options, default='Not shortlisted')
159
+ st.write("")
160
+
161
+ ## --- Necessity Index Filtering ---
162
+ min_idx = float(df['necessity_index'].min())
163
+ max_idx = float(df['necessity_index'].max())
164
+ filter_range = st.slider(
165
+ "Necessity Index Range", min_value=min_idx, max_value=max_idx, value=(min_idx, max_idx)
166
+ )
167
+
168
+ def filter_all_applications(df, auto_short_df, filter_range):
169
+ return df[df['necessity_index'].between(filter_range[0], filter_range[1])]
170
 
171
+ def filter_not_shortlisted(df, auto_short_df, filter_range):
172
+ return df[
173
+ (~df.index.isin(auto_short_df.index)) &
174
+ (df['necessity_index'].between(filter_range[0], filter_range[1]))
175
+ ]
176
 
177
+ filter_map = {
178
+ 'All applications': filter_all_applications,
179
+ 'Not shortlisted': filter_not_shortlisted,
180
+ }
181
 
182
+ filtered_df = filter_map[selected_view](df, auto_short_df, filter_range)
 
183
 
 
 
184
 
185
+ st.markdown(f"**Total Applications:** {len(df)}")
186
+ st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
 
 
 
 
 
 
 
 
187
 
188
+ manual_keys = [k for k in st.session_state.keys() if k.startswith("shortlist_")]
189
+ manually_shortlisted = [int(k.split("_")[1]) for k in manual_keys if st.session_state[k]]
190
 
191
+ st.markdown(f"**Manually Shortlisted:** {len(manually_shortlisted)}")
192
+ if manually_shortlisted:
193
+ csv = df.loc[manually_shortlisted].to_csv(index=False).encode("utf-8")
194
+ st.download_button(
195
+ "Download Manual Shortlist",
196
+ data=csv,
197
+ file_name="manual_shortlist.csv",
198
+ mime="text/csv",
199
+ icon="⬇️",
200
+ )
201
 
202
 
203
+ add_vertical_space(4)
204
+ st.divider()
205
+ st.badge("Version 1.0.0", icon=':material/category:',color='violet')
206
+ st.markdown("""
207
+ :grey[Made with 🩷 by the AI Innovation Team
208
+ Contact: lynn.perez@twinkl.com]
209
+ """)
210
 
 
 
211
 
212
 
213
+ ## ====== CREATE TAB SECTIONS =======
214
+ tab1, tab2 = st.tabs(["Shortlist Manager","Insights"])
 
215
 
 
 
 
216
 
217
+ ##################################################
218
+ # SHORTLIST MANAGER TAB #
219
+ ##################################################
220
 
221
+ with tab1:
222
+
223
+ ## =========== AUTOMATIC SHORTLIST =========
224
 
225
+ st.header("Automatic Shortlist")
226
 
227
+ csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
228
+ all_processed_data = df.to_csv(index=False).encode("utf-8")
229
+ book_candidates = book_candidates_df.to_csv(index=False).encode("utf-8")
 
 
230
 
 
231
 
232
+ csv_options = {
233
+ "Shortlist": (csv_auto, "shortlist.csv"),
234
+ "All Processed Data": (all_processed_data, "all_processed.csv"),
235
+ "Book Candidates": (book_candidates, "book_candidates.csv"),
236
+ }
237
 
238
+ choice = st.selectbox("Select a file for download", list(csv_options.keys()))
239
 
240
+ csv_data, file_name = csv_options[choice]
 
 
 
 
 
 
241
 
 
242
 
243
+ st.download_button(
244
+ label=f"Download {choice}",
245
+ data=csv_data,
246
+ file_name=file_name,
247
+ mime="text/csv",
248
+ help="This button will download the selected file from above",
249
+ icon="⬇️"
250
+
251
+ )
252
+
253
+
254
+ st.write("")
255
+ total_col, shortlistCounter_col, mode_col = st.columns(3)
256
+
257
+ total_col.metric("Applications Submitted", len(df))
258
+ shortlistCounter_col.metric("Shorlist Length", len(auto_short_df))
259
+ mode_col.metric("Mode", mode)
260
+
261
+ shorltist_cols_to_show = [
262
+ id_col,
263
+ freeform_col,
264
+ 'book_candidates',
265
+ 'usage',
266
+ 'necessity_index',
267
+ 'urgency_score',
268
+ 'severity_score',
269
+ 'vulnerability_score',
270
+ 'shortlist_score',
271
+ 'is_heartfelt',
272
+ ]
273
+
274
+ st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
275
+
276
+ ## ====== APPLICATIONS REVIEW =======
277
+
278
+ add_vertical_space(2)
279
+ st.header("Manual Filtering")
280
+ st.info("Use the **side panel** filters to more easily sort through applications that you'd like to review.", icon=':material/info:')
281
+
282
+ st.write("")
283
+ if len(filtered_df) > 0:
284
+ st.markdown("#### Filtered Applications")
285
+ for idx, row in filtered_df.iterrows():
286
+ with st.expander(f"Application {row[id_col]}"):
287
+ st.write("")
288
+ col1, col2, col3, col4 = st.columns(4)
289
+ col1.metric("Necessity", f"{row['necessity_index']:.1f}")
290
+ col2.metric("Urgency", f"{int(row['urgency_score'])}")
291
+ col3.metric("Severity", f"{int(row['severity_score'])}")
292
+ col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
293
+
294
+ # HTML for clean usage items
295
+ usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
296
+ st.markdown("##### Excerpt")
297
+ st.write(row[freeform_col])
298
+ if usage_items:
299
+ st.markdown("##### Usage")
300
+ pills_html = "".join(
301
+ f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
302
+ for item in usage_items
303
  )
304
+ st.markdown(pills_html, unsafe_allow_html=True)
305
+ else:
306
+ st.caption("*No usage found*")
307
+ st.write("")
308
+
309
+ st.checkbox(
310
+ "Add to shortlist",
311
+ key=f"shortlist_{idx}"
312
+ )
313
 
314
+ else:
315
+ st.markdown(
316
+ """
317
+ <br>
318
+ <div style="text-align: center; font-size: 1.2em">
319
+ 🍂 <span style="color: grey;">No applications matched these filters...</span>
320
+ </div>
321
+ """,
322
+ unsafe_allow_html=True,
323
+ )
324
 
325
 
326
+ #########################################
327
+ # INSIGHTS TAB #
328
+ #########################################
329
 
330
+ with tab2:
331
 
332
 
333
+ ## =========== DATA OVERVIEW ==========
334
 
335
+ st.header("General Insights")
336
+ add_vertical_space(1)
337
 
338
+ col1, col2, col3 = st.columns(3)
339
+ col1.metric("Applications Submitted", len(df))
340
+ col2.metric("Median N.I", df['necessity_index'].median().round(2))
341
+ col3.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
342
 
343
+ ## --- NI Distribution Plot ---
344
+ ni_distribution_plt = plot_histogram(df, col_to_plot='necessity_index', bins=50, title='Necessity Index Histogram')
345
+ st.plotly_chart(ni_distribution_plt)
346
 
347
 
348
+ # =========== TOPIC MODELING ============
349
 
350
+ try:
351
 
352
+ st.header("Topic Modeling")
353
+ add_vertical_space(1)
354
 
355
+ ## ------- 1. Tokenize texts into sentences -------
356
+ nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
357
 
358
+ sentences = []
359
+ mappings = []
360
 
361
+ for idx, application_text in df[freeform_col].dropna().items():
362
+ for sentence in topic_modeling_pipeline.spacy_sent_tokenize(application_text):
363
+ sentences.append(sentence)
364
+ mappings.append(idx)
365
 
366
 
367
+ ## -------- 2. Generate embeddings -------
368
 
369
+ embeddings_model = load_embeddings_model()
370
+ embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
371
 
372
+ ## -------- 3. Topic Modeling --------
373
 
374
+ umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
375
+ hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
376
 
377
+ # Run topic modeling from cached resource
378
+ topic_model, topics, probs = run_topic_modeling()
379
 
380
+ topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
381
 
382
 
383
+ ## ------- 4. Display Topics Dataframe ------
384
 
385
+ topics_df = topic_model.get_topic_info()
386
+ topics_df = topics_df[topics_df['Topic'] > -1]
387
+ topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
388
+ cols_to_move = ['Topic','CustomName']
389
+ topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
390
+ topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
391
 
392
+ with st.popover("How are topic extracted?", icon="🌱"):
393
 
394
+ st.write("""
395
+ **About Topic Modeling**
396
 
397
+ We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
398
 
399
+ BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
400
 
401
+ The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
402
 
403
+ **Table Info**
404
+ - **Topic Nr.:** The 'id' of the topic.
405
+ - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
406
+ - **Representation:** Top 10 keywords that best represent a topic
407
+ - **Representative Docs**: Sample sentences contributing to the topic
408
+ """)
409
+ st.dataframe(topics_df, hide_index=True)
410
 
411
+ ## -------- 5. Plot Topics Chart ----------
412
 
413
+ topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
414
+ st.plotly_chart(topic_count_plot, use_container_width=True)
415
 
416
+ ## --------- 6. User Updates -----------
417
 
418
+ if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
419
  st.toast(
420
  """
421
  **Topic modeling is ready!** View the results on the _Insights_ tab
 
423
  icon='🎉'
424
  )
425
 
426
+ st.session_state["topic_toast_shown_for"] = st.session_state["current_file_hash"]
427
+
428
+ except Exception as e:
429
+ st.error(f"Topic modeling failed: {str(e)}")