lynn-twinkl committed on
Commit ·
c41f427
1
Parent(s): 54ec9cd
added: topic-based filtering
Browse files
app.py
CHANGED
|
@@ -101,12 +101,10 @@ def compute_shortlist(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 101 |
|
| 102 |
@st.cache_resource(show_spinner=True)
|
| 103 |
def run_topic_modeling():
|
| 104 |
-
try:
|
| 105 |
|
| 106 |
-
|
| 107 |
-
add_vertical_space(1)
|
| 108 |
|
| 109 |
-
|
| 110 |
nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
|
| 111 |
|
| 112 |
sentences = []
|
|
@@ -118,27 +116,28 @@ def run_topic_modeling():
|
|
| 118 |
mappings.append(idx)
|
| 119 |
|
| 120 |
|
| 121 |
-
|
| 122 |
|
| 123 |
embeddings_model = load_embeddings_model()
|
| 124 |
embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
|
| 125 |
|
| 126 |
-
|
| 127 |
|
| 128 |
-
umap_model = UMAP(n_neighbors=
|
| 129 |
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
|
| 130 |
|
| 131 |
-
|
|
|
|
| 132 |
topic_model, topics, probs = topic_modeling_pipeline.bertopic_model(sentences, embeddings, embeddings_model, umap_model, hdbscan_model)
|
| 133 |
|
| 134 |
topic_modeling_pipeline.ai_labels_to_custom_name(topic_model)
|
| 135 |
|
| 136 |
-
return topic_model, topics, probs
|
| 137 |
|
| 138 |
except Exception as e:
|
| 139 |
st.error(f"Topic modeling failed: {e}")
|
| 140 |
st.code(traceback.format_exc()) # Shows the full error in a nice code box
|
| 141 |
-
return None, None, None
|
| 142 |
|
| 143 |
|
| 144 |
|
|
@@ -148,7 +147,7 @@ def run_topic_modeling():
|
|
| 148 |
|
| 149 |
st.title("🪷 Community Collections Helper")
|
| 150 |
|
| 151 |
-
uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='
|
| 152 |
|
| 153 |
|
| 154 |
# ========== FINGERPRINTING CURRENT FILE ==========
|
|
@@ -169,7 +168,25 @@ if raw is None:
|
|
| 169 |
## ====== DATA PROCESSING ======
|
| 170 |
|
| 171 |
df, freeform_col, id_col = load_and_process(raw) # from cached function
|
| 172 |
-
topic_model, topics, probs = run_topic_modeling() # from cached function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
book_candidates_df = df[df['book_candidates'] == True]
|
|
@@ -223,6 +240,16 @@ with st.sidebar:
|
|
| 223 |
|
| 224 |
filtered_df = filter_map[selected_view](df, auto_short_df, filter_range)
|
| 225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
st.markdown(f"**Total Applications:** {len(df)}")
|
| 228 |
st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
|
|
@@ -269,12 +296,14 @@ with tab1:
|
|
| 269 |
csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
|
| 270 |
all_processed_data = df.to_csv(index=False).encode("utf-8")
|
| 271 |
book_candidates = book_candidates_df.to_csv(index=False).encode("utf-8")
|
|
|
|
| 272 |
|
| 273 |
|
| 274 |
csv_options = {
|
| 275 |
"Shortlist": (csv_auto, "shortlist.csv"),
|
| 276 |
"All Processed Data": (all_processed_data, "all_processed.csv"),
|
| 277 |
"Book Candidates": (book_candidates, "book_candidates.csv"),
|
|
|
|
| 278 |
}
|
| 279 |
|
| 280 |
choice = st.selectbox("Select a file for download", list(csv_options.keys()))
|
|
@@ -311,6 +340,7 @@ with tab1:
|
|
| 311 |
'vulnerability_score',
|
| 312 |
'shortlist_score',
|
| 313 |
'is_heartfelt',
|
|
|
|
| 314 |
]
|
| 315 |
|
| 316 |
st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
|
|
@@ -333,20 +363,32 @@ with tab1:
|
|
| 333 |
col3.metric("Severity", f"{int(row['severity_score'])}")
|
| 334 |
col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
|
| 335 |
|
| 336 |
-
# HTML for clean usage items
|
| 337 |
-
usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
|
| 338 |
st.markdown("##### Excerpt")
|
| 339 |
st.write(row[freeform_col])
|
|
|
|
|
|
|
|
|
|
| 340 |
if usage_items:
|
| 341 |
st.markdown("##### Usage")
|
| 342 |
pills_html = "".join(
|
| 343 |
f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
|
| 344 |
for item in usage_items
|
| 345 |
)
|
| 346 |
-
st.
|
| 347 |
else:
|
| 348 |
st.caption("*No usage found*")
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
st.checkbox(
|
| 352 |
"Add to shortlist",
|
|
@@ -387,15 +429,12 @@ with tab2:
|
|
| 387 |
st.plotly_chart(ni_distribution_plt)
|
| 388 |
|
| 389 |
|
|
|
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
|
| 396 |
-
cols_to_move = ['Topic','CustomName']
|
| 397 |
-
topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
|
| 398 |
-
topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
|
| 399 |
|
| 400 |
with st.popover("How are topic extracted?", icon="🌱"):
|
| 401 |
|
|
@@ -416,12 +455,12 @@ with tab2:
|
|
| 416 |
""")
|
| 417 |
st.dataframe(topics_df, hide_index=True)
|
| 418 |
|
| 419 |
-
## --------
|
| 420 |
|
| 421 |
topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
|
| 422 |
st.plotly_chart(topic_count_plot, use_container_width=True)
|
| 423 |
|
| 424 |
-
## ---------
|
| 425 |
|
| 426 |
if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
|
| 427 |
st.toast(
|
|
|
|
| 101 |
|
| 102 |
@st.cache_resource(show_spinner=True)
|
| 103 |
def run_topic_modeling():
|
|
|
|
| 104 |
|
| 105 |
+
try:
|
|
|
|
| 106 |
|
| 107 |
+
# ------- 1. Tokenize texts into sentences -------
|
| 108 |
nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
|
| 109 |
|
| 110 |
sentences = []
|
|
|
|
| 116 |
mappings.append(idx)
|
| 117 |
|
| 118 |
|
| 119 |
+
# -------- 2. Generate embeddings -------
|
| 120 |
|
| 121 |
embeddings_model = load_embeddings_model()
|
| 122 |
embeddings = embeddings_model.encode(sentences, show_progress_bar=True)
|
| 123 |
|
| 124 |
+
# -------- 3. Topic Modeling --------
|
| 125 |
|
| 126 |
+
umap_model = UMAP(n_neighbors=7, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
|
| 127 |
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
|
| 128 |
|
| 129 |
+
# --------- 4. Perform Topic Modeling ---------
|
| 130 |
+
|
| 131 |
topic_model, topics, probs = topic_modeling_pipeline.bertopic_model(sentences, embeddings, embeddings_model, umap_model, hdbscan_model)
|
| 132 |
|
| 133 |
topic_modeling_pipeline.ai_labels_to_custom_name(topic_model)
|
| 134 |
|
| 135 |
+
return topic_model, topics, probs, mappings
|
| 136 |
|
| 137 |
except Exception as e:
|
| 138 |
st.error(f"Topic modeling failed: {e}")
|
| 139 |
st.code(traceback.format_exc()) # Shows the full error in a nice code box
|
| 140 |
+
return None, None, None, None
|
| 141 |
|
| 142 |
|
| 143 |
|
|
|
|
| 147 |
|
| 148 |
st.title("🪷 Community Collections Helper")
|
| 149 |
|
| 150 |
+
uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv', label_visibility='collapsed')
|
| 151 |
|
| 152 |
|
| 153 |
# ========== FINGERPRINTING CURRENT FILE ==========
|
|
|
|
| 168 |
## ====== DATA PROCESSING ======
|
| 169 |
|
| 170 |
df, freeform_col, id_col = load_and_process(raw) # from cached function
|
| 171 |
+
topic_model, topics, probs, mappings = run_topic_modeling() # from cached function
|
| 172 |
+
|
| 173 |
+
if topic_model is not None:
|
| 174 |
+
label_map = (topic_model
|
| 175 |
+
.get_topic_info()
|
| 176 |
+
.set_index("Topic")["CustomName"]
|
| 177 |
+
.to_dict())
|
| 178 |
+
df = topic_modeling_pipeline.attach_topics(df, mappings, topics, label_map, col="topics")
|
| 179 |
+
else:
|
| 180 |
+
st.warning("Topics could not be generated; continuing without them.")
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
topics_df = topic_model.get_topic_info()
|
| 184 |
+
topics_df = topics_df[topics_df['Topic'] > -1]
|
| 185 |
+
topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
|
| 186 |
+
cols_to_move = ['Topic','CustomName']
|
| 187 |
+
topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
|
| 188 |
+
topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
|
| 189 |
+
|
| 190 |
|
| 191 |
|
| 192 |
book_candidates_df = df[df['book_candidates'] == True]
|
|
|
|
| 240 |
|
| 241 |
filtered_df = filter_map[selected_view](df, auto_short_df, filter_range)
|
| 242 |
|
| 243 |
+
## -------- Topic Filtering -------
|
| 244 |
+
|
| 245 |
+
topic_options = sorted(topics_df['Topic Name'].unique())
|
| 246 |
+
selected_topics = st.multiselect("Filter by Topic(s)", options=topic_options, default=[])
|
| 247 |
+
|
| 248 |
+
if selected_topics:
|
| 249 |
+
selected_set = set(selected_topics)
|
| 250 |
+
filtered_df = filtered_df[
|
| 251 |
+
filtered_df['topics'].apply(lambda topic_list: selected_set.issubset(set(topic_list)))
|
| 252 |
+
]
|
| 253 |
|
| 254 |
st.markdown(f"**Total Applications:** {len(df)}")
|
| 255 |
st.markdown(f"**Filtered Applications:** {len(filtered_df)}")
|
|
|
|
| 296 |
csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
|
| 297 |
all_processed_data = df.to_csv(index=False).encode("utf-8")
|
| 298 |
book_candidates = book_candidates_df.to_csv(index=False).encode("utf-8")
|
| 299 |
+
topic_descriptions_csv = topics_df.to_csv(index=False).encode("utf-8")
|
| 300 |
|
| 301 |
|
| 302 |
csv_options = {
|
| 303 |
"Shortlist": (csv_auto, "shortlist.csv"),
|
| 304 |
"All Processed Data": (all_processed_data, "all_processed.csv"),
|
| 305 |
"Book Candidates": (book_candidates, "book_candidates.csv"),
|
| 306 |
+
"Topic Descriptions": (topic_descriptions_csv, "topic_descriptions.csv"),
|
| 307 |
}
|
| 308 |
|
| 309 |
choice = st.selectbox("Select a file for download", list(csv_options.keys()))
|
|
|
|
| 340 |
'vulnerability_score',
|
| 341 |
'shortlist_score',
|
| 342 |
'is_heartfelt',
|
| 343 |
+
'topics',
|
| 344 |
]
|
| 345 |
|
| 346 |
st.dataframe(auto_short_df.loc[:, shorltist_cols_to_show], hide_index=True)
|
|
|
|
| 363 |
col3.metric("Severity", f"{int(row['severity_score'])}")
|
| 364 |
col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")
|
| 365 |
|
|
|
|
|
|
|
| 366 |
st.markdown("##### Excerpt")
|
| 367 |
st.write(row[freeform_col])
|
| 368 |
+
|
| 369 |
+
# HTML for clean usage items
|
| 370 |
+
usage_items = [item for item in row['usage'] if item and item.lower() != 'none']
|
| 371 |
if usage_items:
|
| 372 |
st.markdown("##### Usage")
|
| 373 |
pills_html = "".join(
|
| 374 |
f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
|
| 375 |
for item in usage_items
|
| 376 |
)
|
| 377 |
+
st.html(pills_html)
|
| 378 |
else:
|
| 379 |
st.caption("*No usage found*")
|
| 380 |
+
|
| 381 |
+
topic_items = [item for item in row['topics'] if item and item.lower() != 'none']
|
| 382 |
+
if topic_items:
|
| 383 |
+
st.markdown("##### Topics")
|
| 384 |
+
topic_boxes_html= "".join(
|
| 385 |
+
f"<span style='display:inline-block;background-color:#ECE0FC;color:#6741B9;border-radius:5px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{item}</span>"
|
| 386 |
+
for item in topic_items
|
| 387 |
+
)
|
| 388 |
+
st.html(topic_boxes_html)
|
| 389 |
+
else:
|
| 390 |
+
st.caption("_No topics assigned for this application_")
|
| 391 |
+
|
| 392 |
|
| 393 |
st.checkbox(
|
| 394 |
"Add to shortlist",
|
|
|
|
| 429 |
st.plotly_chart(ni_distribution_plt)
|
| 430 |
|
| 431 |
|
| 432 |
+
## ============= TOPIC MODELING ============
|
| 433 |
|
| 434 |
+
st.header("Topic Modeling")
|
| 435 |
+
add_vertical_space(1)
|
| 436 |
+
|
| 437 |
+
## ------- Display Topics Dataframe ------
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
with st.popover("How are topic extracted?", icon="🌱"):
|
| 440 |
|
|
|
|
| 455 |
""")
|
| 456 |
st.dataframe(topics_df, hide_index=True)
|
| 457 |
|
| 458 |
+
## -------- Plot Topics Chart ----------
|
| 459 |
|
| 460 |
topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500, title='Topic Frequency Chart')
|
| 461 |
st.plotly_chart(topic_count_plot, use_container_width=True)
|
| 462 |
|
| 463 |
+
## --------- User Updates -----------
|
| 464 |
|
| 465 |
if st.session_state.get("topic_toast_shown_for") != st.session_state["current_file_hash"]:
|
| 466 |
st.toast(
|