lynn-twinkl committed on
Commit
893d9c9
·
1 Parent(s): d371343

added: bertopic info expander

Browse files
Files changed (1) hide show
  1. app.py +43 -8
app.py CHANGED
@@ -23,7 +23,7 @@ from src.shortlist import shortlist_applications
23
  from src.twinkl_originals import find_book_candidates
24
  from src.preprocess_text import normalise_text
25
  import src.models.topic_modeling_pipeline as topic_modeling_pipeline
26
- from src.plot_histogram import plot_hist
27
  from typing import Tuple
28
 
29
  style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
@@ -308,18 +308,16 @@ if uploaded_file is not None:
308
  col3.metric("Total Applications", len(df))
309
  st.html("<br>")
310
 
 
311
  st.subheader("Necessity Index (NI) Distribution")
312
- st.write("")
313
- st.write("")
314
- # Histogram of necessity index colored by priority labels
315
- ni_distribution_plt = plot_hist(df, col_to_plot='necessity_index', bins=20)
316
-
317
  st.plotly_chart(ni_distribution_plt)
318
 
319
- st.dataframe(df, hide_index=True)
320
 
321
  # =========== TOPIC MODELING ============
322
 
 
 
323
  ## ------- 1. Tokenize texts into sentences -------
324
  nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
325
 
@@ -346,8 +344,45 @@ if uploaded_file is not None:
346
 
347
  topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- st.dataframe(topic_model.get_topic_info())
351
 
352
 
353
 
 
23
  from src.twinkl_originals import find_book_candidates
24
  from src.preprocess_text import normalise_text
25
  import src.models.topic_modeling_pipeline as topic_modeling_pipeline
26
+ from src.px_charts import plot_histogram, plot_topic_countplot
27
  from typing import Tuple
28
 
29
  style_metric_cards(box_shadow=False, border_left_color='#E7F4FF',background_color='#E7F4FF', border_size_px=0, border_radius_px=6)
 
308
  col3.metric("Total Applications", len(df))
309
  st.html("<br>")
310
 
311
+ ## --- NI Distribution Plot ---
312
  st.subheader("Necessity Index (NI) Distribution")
313
+ ni_distribution_plt = plot_histogram(df, col_to_plot='necessity_index', bins=50)
 
 
 
 
314
  st.plotly_chart(ni_distribution_plt)
315
 
 
316
 
317
  # =========== TOPIC MODELING ============
318
 
319
+ st.subheader("Topic Modeling")
320
+
321
  ## ------- 1. Tokenize texts into sentences -------
322
  nlp = topic_modeling_pipeline.load_spacy_model(model_name='en_core_web_sm')
323
 
 
344
 
345
  topic_modeling_pipeline.ai_labels_to_custom_name(topic_model) # converts OpenAI representation to actual topic labels
346
 
347
+ ## ------- 4. Display Topics Dataframe ------
348
+
349
+ topics_df = topic_model.get_topic_info()
350
+ topics_df = topics_df[topics_df['Topic'] > -1]
351
+ topics_df.drop(columns=['Name', 'OpenAI'], inplace=True)
352
+ cols_to_move = ['Topic','CustomName']
353
+ topics_df = topics_df[cols_to_move + [col for col in topics_df.columns if col not in cols_to_move]]
354
+ topics_df.rename(columns={'CustomName':'Topic Name', 'Topic':'Topic Nr.'}, inplace=True)
355
+
356
+ st.markdown("""
357
+ ### Extracted Topics Table
358
+ This table shows you the topics that have been extracted from the applications.
359
+ """)
360
+
361
+ with st.expander("How are topic extracted?", icon="❓", expanded=False):
362
+
363
+ st.write("""
364
+ **About Topic Modeling**
365
+
366
+ We use BERTopic to :primary[**dynamically**] extract the most common topics from the natural language data.
367
+
368
+ BERTopic is a machine learning technique that allows us to group documents (in this case, sentences within application letters) based on their semantic similarity and other patterns such as word frequency and placement.
369
+
370
+ The table you see below shows you the extracted topics, alongside their top 10 extracted keywords and a small sample of real texts from the applications that demonstrate where the topics came from.
371
+
372
+ **Table Info**
373
+ - **Topic Nr.:** The 'id' of the topic.
374
+ - **Topic Name:** This is an AI-generated label based on a few samples of application responses alongside their corresponding keywords.
375
+ - **Representation:** Top 10 keywords that best represent a topic
376
+ - **Representative Docs**: Sample sentences contributing to the topic
377
+ """)
378
+ st.dataframe(topics_df, hide_index=True)
379
+
380
+ ## -------- 5. Plot Topics Chart ----------
381
+
382
+ topic_count_plot = plot_topic_countplot(topics_df, topic_id_col='Topic Nr.', topic_name_col='Topic Name', representation_col='Representation', height=500)
383
+
384
+ st.plotly_chart(topic_count_plot, use_container_width=True)
385
 
 
386
 
387
 
388