lynn-twinkl
committed on
Commit
·
893d9c9
1
Parent(s):
d371343
added: bertopic info expander
Browse files
app.py
CHANGED
|
@@ -23,7 +23,7 @@ from src.shortlist import shortlist_applications
|
|
| 23 |
from src.twinkl_originals import find_book_candidates
|
| 24 |
from src.preprocess_text import normalise_text
|
| 25 |
import src.models.topic_modeling_pipeline as topic_modeling_pipeline
|
| 26 |
-
from src.
|
| 27 |
from typing import Tuple
|
| 28 |
|
| 29 |
style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
|
|
@@ -308,18 +308,16 @@ if uploaded_file is not None:
|
|
| 308 |
col3.metric("Total Applications", len(df))
|
| 309 |
st.html("<br>")
|
| 310 |
|
|
|
|
| 311 |
st.subheader("Necessity Index (NI) Distribution")
|
| 312 |
-
|
| 313 |
-
st.write("")
|
| 314 |
-
# Histogram of necessity index colored by priority labels
|
| 315 |
-
ni_distribution_plt = plot_hist(df, col_to_plot='necessity_index', bins=20)
|
| 316 |
-
|
| 317 |
st.plotly_chart(ni_distribution_plt)
|
| 318 |
|
| 319 |
-
st.dataframe(df, hide_index=True)
|
| 320 |
|
| 321 |
# =========== TOPIC MODELING ============
|
| 322 |
|
|
|
|
|
|
|
| 323 |
## ------- 1. Tokenize texts into sentences -------
|
| 324 |
nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
|
| 325 |
|
|
@@ -346,8 +344,45 @@ if uploaded_file is not None:
|
|
| 346 |
|
| 347 |
topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
|
| 348 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
-
st.dataframe(topic_model.get_topic_info())
|
| 351 |
|
| 352 |
|
| 353 |
|
|
|
|
| 23 |
from src.twinkl_originals import find_book_candidates
|
| 24 |
from src.preprocess_text import normalise_text
|
| 25 |
import src.models.topic_modeling_pipeline as topic_modeling_pipeline
|
| 26 |
+
from src.px_charts import plot_histogram, plot_topic_countplot
|
| 27 |
from typing import Tuple
|
| 28 |
|
| 29 |
style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
|
|
|
|
| 308 |
col3.metric("Total Applications", len(df))
|
| 309 |
st.html("<br>")
|
| 310 |
|
| 311 |
+
## --- NI Distribution Plot ---
|
| 312 |
st.subheader("Necessity Index (NI) Distribution")
|
| 313 |
+
ni_distribution_plt = plot_histogram(df, col_to_plot='necessity_index', bins=50)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
st.plotly_chart(ni_distribution_plt)
|
| 315 |
|
|
|
|
| 316 |
|
| 317 |
# =========== TOPIC MODELING ============
|
| 318 |
|
| 319 |
+
st.subheader("Topic Modeling")
|
| 320 |
+
|
| 321 |
## ------- 1. Tokenize texts into sentences -------
|
| 322 |
nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
|
| 323 |
|
|
|
|
| 344 |
|
| 345 |
topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
|
| 346 |
|
| 347 |
+
## ------- 4. Display Topics Dataframe ------
|
| 348 |
+
|
| 349 |
+
topics_df = topic_model.get_topic_info()
|
| 350 |
+
topics_df = topics_df[topics_df['Topic'] > -1]
|
| 351 |
+
topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
|
| 352 |
+
cols_to_move = ['Topic','CustomName']
|
| 353 |
+
topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
|
| 354 |
+
topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
|
| 355 |
+
|
| 356 |
+
st.markdown("""
|
| 357 |
+
### Extracted Topics Table
|
| 358 |
+
This table shows you the topics that have been extracted from the applications.
|
| 359 |
+
""")
|
| 360 |
+
|
| 361 |
+
with st.expander("How are topic extracted?", icon="❓", expanded=False):
|
| 362 |
+
|
| 363 |
+
st.write("""
|
| 364 |
+
**About Topic Modeling**
|
| 365 |
+
|
| 366 |
+
We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
|
| 367 |
+
|
| 368 |
+
BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
|
| 369 |
+
|
| 370 |
+
The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
|
| 371 |
+
|
| 372 |
+
**Table Info**
|
| 373 |
+
- **Topic Nr.:** The 'id' of the topic.
|
| 374 |
+
- **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
|
| 375 |
+
- **Representation:** Top 10 keywords that best represent a topic
|
| 376 |
+
- **Representative Docs**: Sample sentences contributing to the topic
|
| 377 |
+
""")
|
| 378 |
+
st.dataframe(topics_df, hide_index=True)
|
| 379 |
+
|
| 380 |
+
## -------- 5. Plot Topics Chart ----------
|
| 381 |
+
|
| 382 |
+
topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500)
|
| 383 |
+
|
| 384 |
+
st.plotly_chart(topic_count_plot, use_container_width=True)
|
| 385 |
|
|
|
|
| 386 |
|
| 387 |
|
| 388 |
|