lynn-twinkl committed on
Commit
54ec9cd
·
1 Parent(s): 729ef7b

refac: moved topic modelling to top of script in order to implement it on filters and download options

Browse files
Files changed (1) hide show
  1. app.py +80 -75
app.py CHANGED
@@ -30,7 +30,7 @@ from typing import Tuple
30
  style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
31
 
32
  ##################################
33
- # CACHED PROCESSING FUNCTION
34
  ##################################
35
 
36
  # -----------------------------------------------------------------------------
@@ -101,8 +101,46 @@ def compute_shortlist(df: pd.DataFrame) -> pd.DataFrame:
101
 
102
  @st.cache_resource(show_spinner=True)
103
  def run_topic_modeling():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- return topic_modeling_pipeline.bertopic_model(sentences, embeddings, embeddings_model, umap_model, hdbscan_model)
106
 
107
  ################################
108
  # MAIN APP SCRIPT
@@ -113,10 +151,12 @@ st.title("🪷 Community Collections Helper")
113
  uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='hidden')
114
 
115
 
116
- # ====== Fingerprinting current file to avoid unncesssary reruns =====
 
 
117
 
118
  if uploaded_file is not None:
119
- raw = uploaded_file.read() # ← single read
120
  file_hash = hashlib.md5(raw).hexdigest()
121
  st.session_state["current_file_hash"] = file_hash
122
  else:
@@ -126,9 +166,11 @@ else:
126
  if raw is None:
127
  st.stop()
128
 
129
- ## ====== PROCESSED DATA (CACHED) ======
 
 
 
130
 
131
- df, freeform_col, id_col = load_and_process(raw)
132
 
133
  book_candidates_df = df[df['book_candidates'] == True]
134
 
@@ -345,85 +387,48 @@ with tab2:
345
  st.plotly_chart(ni_distribution_plt)
346
 
347
 
348
- # =========== TOPIC MODELING ============
349
 
350
- try:
351
-
352
- st.header("Topic Modeling")
353
- add_vertical_space(1)
354
-
355
- ## ------- 1. Tokenize texts into sentences -------
356
- nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
357
-
358
- sentences = []
359
- mappings = []
360
-
361
- for idx, application_text in df[freeform_col].dropna().items():
362
- for sentence in topic_modeling_pipeline.spacy_sent_tokenize(application_text):
363
- sentences.append(sentence)
364
- mappings.append(idx)
365
-
366
-
367
- ## -------- 2. Generate embeddings -------
368
-
369
- embeddings_model = load_embeddings_model()
370
- embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
371
-
372
- ## -------- 3. Topic Modeling --------
373
-
374
- umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
375
- hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
376
-
377
- # Run topic modeling from cached resource
378
- topic_model, topics, probs = run_topic_modeling()
379
-
380
- topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representatino to actual topic labels
381
 
 
 
 
 
 
 
382
 
383
- ## ------- 4. Display Topics Dataframe ------
384
 
385
- topics_df = topic_model.get_topic_info()
386
- topics_df = topics_df[topics_df['Topic'] > -1]
387
- topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
388
- cols_to_move = ['Topic','CustomName']
389
- topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
390
- topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
391
 
392
- with st.popover("How are topic extracted?", icon="🌱"):
393
 
394
- st.write("""
395
- **About Topic Modeling**
396
 
397
- We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
398
 
399
- BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
 
 
 
 
 
 
400
 
401
- The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
402
 
403
- **Table Info**
404
- - **Topic Nr.:** The 'id' of the topic.
405
- - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
406
- - **Representation:** Top 10 keywords that best represent a topic
407
- - **Representative Docs**: Sample sentences contributing to the topic
408
- """)
409
- st.dataframe(topics_df, hide_index=True)
410
 
411
- ## -------- 5. Plot Topics Chart ----------
412
 
413
- topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
414
- st.plotly_chart(topic_count_plot, use_container_width=True)
415
-
416
- ## --------- 6. User Updates -----------
417
-
418
- if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
419
- st.toast(
420
- """
421
- **Topic modeling is ready!** View the results on the _Insights_ tab
422
- """,
423
- icon='🎉'
424
- )
425
-
426
- st.session_state["topic_toast_shown_for"] = st.session_state["current_file_hash"]
427
 
428
- except Exception as e:
429
- st.error(f"Topic modeling failed: {str(e)}")
 
30
  style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
31
 
32
  ##################################
33
+ # CACHED PROCESSING FUNCTIONS
34
  ##################################
35
 
36
  # -----------------------------------------------------------------------------
 
101
 
102
  @st.cache_resource(show_spinner=True)
103
  def run_topic_modeling():
104
+ try:
105
+
106
+ st.header("Topic Modeling")
107
+ add_vertical_space(1)
108
+
109
+ ## ------- 1. Tokenize texts into sentences -------
110
+ nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
111
+
112
+ sentences = []
113
+ mappings = []
114
+
115
+ for idx, application_text in df[freeform_col].dropna().items():
116
+ for sentence in topic_modeling_pipeline.spacy_sent_tokenize(application_text):
117
+ sentences.append(sentence)
118
+ mappings.append(idx)
119
+
120
+
121
+ ## -------- 2. Generate embeddings -------
122
+
123
+ embeddings_model = load_embeddings_model()
124
+ embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
125
+
126
+ ## -------- 3. Topic Modeling --------
127
+
128
+ umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
129
+ hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
130
+
131
+ ## --------- 4. Perform Topic Modeling ---------
132
+ topic_model, topics, probs = topic_modeling_pipeline.bertopic_model(sentences, embeddings, embeddings_model, umap_model, hdbscan_model)
133
+
134
+ topic_modeling_pipeline.ai_labels_to_custom_name(topic_model)
135
+
136
+ return topic_model, topics, probs
137
+
138
+ except Exception as e:
139
+ st.error(f"Topic modeling failed: {e}")
140
+ st.code(traceback.format_exc()) # Shows the full error in a nice code box
141
+ return None, None, None
142
+
143
 
 
144
 
145
  ################################
146
  # MAIN APP SCRIPT
 
151
  uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='hidden')
152
 
153
 
154
+ # ========== FINGERPRINTING CURRENT FILE ==========
155
+ # This helps avoid reruns of certain functions as
156
+ # long as the file stays the same
157
 
158
  if uploaded_file is not None:
159
+ raw = uploaded_file.read()
160
  file_hash = hashlib.md5(raw).hexdigest()
161
  st.session_state["current_file_hash"] = file_hash
162
  else:
 
166
  if raw is None:
167
  st.stop()
168
 
169
+ ## ====== DATA PROCESSING ======
170
+
171
+ df, freeform_col, id_col = load_and_process(raw) # from cached function
172
+ topic_model, topics, probs = run_topic_modeling() # from cached function
173
 
 
174
 
175
  book_candidates_df = df[df['book_candidates'] == True]
176
 
 
387
  st.plotly_chart(ni_distribution_plt)
388
 
389
 
 
390
 
391
+ ## ------- 4. Display Topics Dataframe ------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
+ topics_df = topic_model.get_topic_info()
394
+ topics_df = topics_df[topics_df['Topic'] > -1]
395
+ topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
396
+ cols_to_move = ['Topic','CustomName']
397
+ topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
398
+ topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
399
 
400
+ with st.popover("How are topic extracted?", icon="🌱"):
401
 
402
+ st.write("""
403
+ **About Topic Modeling**
 
 
 
 
404
 
405
+ We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
406
 
407
+ BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
 
408
 
409
+ The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
410
 
411
+ **Table Info**
412
+ - **Topic Nr.:** The 'id' of the topic.
413
+ - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
414
+ - **Representation:** Top 10 keywords that best represent a topic
415
+ - **Representative Docs**: Sample sentences contributing to the topic
416
+ """)
417
+ st.dataframe(topics_df, hide_index=True)
418
 
419
+ ## -------- 5. Plot Topics Chart ----------
420
 
421
+ topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
422
+ st.plotly_chart(topic_count_plot, use_container_width=True)
 
 
 
 
 
423
 
424
+ ## --------- 6. User Updates -----------
425
 
426
+ if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
427
+ st.toast(
428
+ """
429
+ **Topic modeling is ready!** View the results on the _Insights_ tab
430
+ """,
431
+ icon='🎉'
432
+ )
 
 
 
 
 
 
 
433
 
434
+ st.session_state["topic_toast_shown_for"] = st.session_state["current_file_hash"]