lynn-twinkl commited on
Commit
11d9a88
·
1 Parent(s): 6925f1d

added: topic modeling and ai custom labels

Browse files
Files changed (1) hide show
  1. app.py +50 -1
app.py CHANGED
@@ -7,7 +7,11 @@ import pandas as pd
7
  import altair as alt
8
  import joblib
9
  from io import BytesIO
 
 
 
10
  import os
 
11
  from streamlit_extras.metric_cards import style_metric_cards
12
 
13
  # ---- FUNCTIONS ----
@@ -18,6 +22,7 @@ from src.column_detection import detect_freeform_col
18
  from src.shortlist import shortlist_applications
19
  from src.twinkl_originals import find_book_candidates
20
  from src.preprocess_text import normalise_text
 
21
  from typing import Tuple
22
 
23
  style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
@@ -36,6 +41,10 @@ def load_heartfelt_predictor():
36
  model_path = os.path.join("src", "models", "heartfelt_pipeline.joblib")
37
  return joblib.load(model_path)
38
 
 
 
 
 
39
  @st.cache_data(show_spinner=True)
40
  def load_and_process(raw_csv: bytes) -> Tuple[pd.DataFrame, str]:
41
  """
@@ -86,6 +95,11 @@ def compute_shortlist(df: pd.DataFrame) -> pd.DataFrame:
86
  """Pre‑compute shortlist_score for all rows (used for both modes)."""
87
  return shortlist_applications(df, k=len(df))
88
 
 
 
 
 
 
89
  ################################
90
  # MAIN APP SCRIPT
91
  ################################
@@ -252,9 +266,44 @@ if uploaded_file is not None:
252
  )
253
 
254
 
255
- ## ------------ INSIGHTS TAB -----------
 
 
256
 
257
  with tab2:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  st.write("")
259
 
260
  col1, col2, col3 = st.columns(3)
 
7
  import altair as alt
8
  import joblib
9
  from io import BytesIO
10
+ from umap import UMAP
11
+ from hdbscan import HDBSCAN
12
+
13
  import os
14
+
15
  from streamlit_extras.metric_cards import style_metric_cards
16
 
17
  # ---- FUNCTIONS ----
 
22
  from src.shortlist import shortlist_applications
23
  from src.twinkl_originals import find_book_candidates
24
  from src.preprocess_text import normalise_text
25
+ import src.models.topic_modeling_pipeline as topic_modeling_pipeline
26
  from typing import Tuple
27
 
28
  style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
 
41
  model_path = os.path.join("src", "models", "heartfelt_pipeline.joblib")
42
  return joblib.load(model_path)
43
 
44
@st.cache_resource
def load_embeddings_model():
    """Load and cache the sentence-embedding model used for topic modeling.

    Wrapped in ``st.cache_resource`` so the (expensive) model load happens
    once per server process instead of on every Streamlit rerun.
    """
    # NOTE(review): model name is hard-coded; presumably a speed/quality
    # trade-off — confirm with the topic_modeling_pipeline author.
    embedding_model = topic_modeling_pipeline.load_embedding_model('all-MiniLM-L12-v2')
    return embedding_model
47
+
48
  @st.cache_data(show_spinner=True)
49
  def load_and_process(raw_csv: bytes) -> Tuple[pd.DataFrame, str]:
50
  """
 
95
  """Pre‑compute shortlist_score for all rows (used for both modes)."""
96
  return shortlist_applications(df, k=len(df))
97
 
98
@st.cache_resource(show_spinner=True)
def run_topic_modeling(sents=None, embs=None, emb_model=None, umap_reducer=None, clusterer=None):
    """Fit the BERTopic model over the tokenized sentences and return
    ``(topic_model, topics, probs)`` from ``topic_modeling_pipeline.bertopic_model``.

    Bug fixed: the original took no arguments and read the module-level
    variables (``sentences``, ``embeddings``, ``embeddings_model``,
    ``umap_model``, ``hdbscan_model``) directly. Because ``st.cache_resource``
    keys only on the function's arguments, the cached result could never be
    invalidated when a newly uploaded CSV changed those globals — stale
    topics would be served. The inputs are now explicit parameters so the
    cache key reflects the data; each defaults to ``None`` and falls back to
    the module-level global of the same role, keeping the existing
    ``run_topic_modeling()`` call site backward-compatible (though that call
    form still inherits the original caching caveat).
    """
    # Fall back to the script-level globals for backward compatibility.
    if sents is None:
        sents = sentences
    if embs is None:
        embs = embeddings
    if emb_model is None:
        emb_model = embeddings_model
    if umap_reducer is None:
        umap_reducer = umap_model
    if clusterer is None:
        clusterer = hdbscan_model

    return topic_modeling_pipeline.bertopic_model(sents, embs, emb_model, umap_reducer, clusterer)
102
+
103
  ################################
104
  # MAIN APP SCRIPT
105
  ################################
 
266
  )
267
 
268
 
269
#########################################
# INSIGHTS TAB                          #
#########################################

with tab2:

    # =========== TOPIC MODELING ============

    # -- 1. Sentence tokenization ------------------------------------------
    # Split every non-null free-form application text into sentences and
    # remember which dataframe row each sentence came from.
    nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')

    sentences = []
    mappings = []
    for row_idx, application_text in df[freeform_col].dropna().items():
        sentence_batch = list(topic_modeling_pipeline.spacy_sent_tokenize(application_text))
        sentences.extend(sentence_batch)
        mappings.extend([row_idx] * len(sentence_batch))

    # -- 2. Sentence embeddings --------------------------------------------
    embeddings_model = load_embeddings_model()
    embeddings = embeddings_model.encode(sentences, show_progress_bar=True)

    # -- 3. Topic modeling -------------------------------------------------
    umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    topic_model, topics, probs = run_topic_modeling()

    # Convert the OpenAI representation into human-readable topic labels.
    topic_modeling_pipeline.ai_labels_to_custom_name(topic_model)

    st.dataframe(topic_model.get_topic_info())
307
  st.write("")
308
 
309
  col1, col2, col3 = st.columns(3)