# Source: lynn-twinkl, commit 89917b7 ("Added nltk import")
###############################
# IMPORTS & CONFIG
###############################
import streamlit as st
import pandas as pd
import time
from datetime import datetime
import nltk
from nltk.tokenize import sent_tokenize
from hdbscan import HDBSCAN
from umap import UMAP
from openai import OpenAI
from tenacity import retry, wait_exponential, stop_after_attempt
from functions.auto_column_detection import auto_detect_columns
from functions.preprocessing_functions import remove_numeric_or_special_responses, robust_convert_date
from functions.language_labeling_translation import detect_language, translate_text
from functions.sentiment_analysis import analyze_sentiment, label_sentiment
from functions.create_cancellation_reasons_table import generate_cancellation_reasons_overview
from functions.topicModeling_contentRequests import (
load_embedding_model,
bertopic_model,
merge_specific_topics,
update_df_with_topics
)
from plots.overview_charts import (
create_word_count_histogram,
create_sentiment_pie,
create_cancellation_reasons_plot,
create_grouped_chart
)
from plots.topicModeling_charts import (
create_topics_overtime_chart,
create_stacked_topics_per_class
)
############################
# STREAMLIT APP CONFIGURATION
############################
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(
    layout='wide',
    page_title="Exit Survey Processing App",
    initial_sidebar_state="expanded",
)
# Global settings
# API key comes from Streamlit's secrets store (.streamlit/secrets.toml).
# One shared OpenAI client is reused for all API calls in this module.
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
client = OpenAI(api_key=OPENAI_API_KEY)
###############################
# HELPER CLASSES & FUNCTIONS
###############################
class OpenAIWrapper:
    """
    Thin wrapper around the shared OpenAI client's chat-completions API.

    Each instance carries a model name and a system prompt; calls are
    retried with exponential backoff via tenacity.
    """

    def __init__(self, model, prompt=""):
        self.model = model
        self.prompt = prompt

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def run(self, user_text):
        """Send `user_text` under this instance's system prompt; return the reply text."""
        conversation = [
            {"role": "system", "content": self.prompt},
            {"role": "user", "content": user_text},
        ]
        try:
            completion = client.chat.completions.create(
                model=self.model,
                messages=conversation,
            )
            return completion.choices[0].message.content
        except Exception as exc:
            # Surface the failure in the UI, then re-raise so tenacity retries.
            st.error(f"Error during OpenAI API call: {exc}")
            raise
@st.cache_data(show_spinner=False)
def cached_translate(text):
    """Cached translation function to reduce repeated OpenAI calls.

    Memoized by Streamlit on `text`. NOTE(review): relies on the
    module-level `openai_model` wrapper assigned in main(); calling this
    before main() runs would raise a NameError.
    """
    return translate_text(text, skip_translation=False, translator_model=openai_model)
@st.cache_resource(show_spinner=False)
def get_embedding_model():
    """Load the topic-modeling embedding model once; Streamlit caches the resource."""
    model = load_embedding_model()
    return model
def translate_non_english(df):
    """
    Translate non-English rows of 'freeform_answer' in place.

    A row is translated only when detect_language labels it 'non-en' AND its
    'word-count' exceeds 8 (very short answers are left untouched). Uses the
    globally cached `cached_translate` so duplicate texts hit the cache.
    Per-row failures are reported via st.error and the original text is kept.

    Args:
        df: DataFrame with 'freeform_answer' and 'word-count' columns.

    Returns:
        The same DataFrame, with translations applied and the temporary
        'language' helper column dropped.
    """
    df['language'] = df['freeform_answer'].apply(detect_language)
    to_translate = df[(df['language'] == 'non-en') & (df['word-count'] > 8)].copy()
    if not to_translate.empty:
        progress_text = st.empty()
        progress_bar = st.progress(0)
        total = len(to_translate)
        success_count = 0  # count only rows that actually translated
        for i, (idx, row) in enumerate(to_translate.iterrows(), 1):
            progress_text.text(f"Translating non-English responses ({i} of {total})")
            try:
                translated = cached_translate(row['freeform_answer'])
                df.at[idx, 'freeform_answer'] = translated
                success_count += 1
            except Exception as e:
                # Best-effort: report and continue with the remaining rows.
                st.error(f"Error translating response {i}: {str(e)}")
            progress_bar.progress(i / total)
        progress_text.empty()
        progress_bar.empty()
        # Bug fix: report actual successes, not the attempted total.
        st.success(
            f"Successfully translated {success_count} non-English responses",
            icon='✅'
        )
    df.drop(columns='language', inplace=True, errors='ignore')
    return df
@st.cache_data(show_spinner=False)
def run_topic_modeling(df):
    """
    Full pipeline for:
        1. Sentence tokenization (NLTK punkt)
        2. Embedding (cached sentence-embedding model)
        3. UMAP dimensionality reduction + HDBSCAN clustering
        4. BERTopic modeling
        5. Custom topic naming via OpenAI (uses the global `naming_model`)
        6. Merging small topics, final labeling

    Returns:
        (topic_model, updated_topics, mapping, chatgpt_topic_labels) where
        `mapping[i]` is the df index of the response sentence i came from.
    """
    # --- 1. Sentence tokenization ---
    try:
        nltk.data.find("tokenizers/punkt_tab/english")
    except LookupError:
        nltk.download("punkt_tab")
    sentences = []
    mapping = []  # df index per sentence, parallel to `sentences`
    for idx, response in df['freeform_answer'].dropna().items():
        for sentence in sent_tokenize(response):
            sentences.append(sentence)
            mapping.append(idx)
    # --- 2. Embedding ---
    embedding_model = get_embedding_model()
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)
    # --- 3. UMAP, HDBSCAN ---
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)
    # --- 4. BERTopic model creation ---
    _topic_model, topics, probs = bertopic_model(
        sentences, embeddings, embedding_model,
        umap_model, hdbscan_model
    )
    # Merge small or closely related topics, then re-assign sentence topics.
    _topic_model = merge_specific_topics(_topic_model, sentences)
    updated_topics, _ = _topic_model.transform(sentences)
    # --- 5. Custom topic naming via OpenAI ---
    # Loop-invariant prompt template hoisted out of the per-topic loop.
    prompt_template = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short but highly descriptive topic label
of at most 5 words. Make sure it is in the following format:
topic: <topic label>
""".strip()
    topic_info = _topic_model.get_topic_info()
    chatgpt_topic_labels = {}
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue  # -1 is the outlier cluster; it never gets a label
        rep_docs = _topic_model.get_representative_docs(topic_id)
        doc_text = " ".join(rep_docs[:10])  # Up to 10 docs for context
        topic_keywords = _topic_model.get_topic(topic_id) or []
        keywords_text = ", ".join([word for word, score in topic_keywords])
        prompt_filled = prompt_template.replace("[DOCUMENTS]", doc_text).replace("[KEYWORDS]", keywords_text)
        response = naming_model.run(prompt_filled)
        label = response.strip()
        # The model is asked to answer as "topic: <label>"; strip the prefix.
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        chatgpt_topic_labels[topic_id] = label
    # (Dead guard removed: -1 is skipped above and can never be a key.)
    _topic_model.set_topic_labels(chatgpt_topic_labels)
    return _topic_model, updated_topics, mapping, chatgpt_topic_labels
def process_file(uploaded_file):
    """
    Process the uploaded file, perform data cleaning, and return a processed DataFrame.

    Pipeline: read CSV/Excel -> auto-detect + user-confirm columns ->
    normalize column names -> add 'word-count' -> parse 'date' -> remove
    numeric/special responses -> translate non-English rows -> add sentiment.

    Args:
        uploaded_file: Streamlit UploadedFile (.csv or .xlsx).

    Returns:
        (df, row_count_delta, final_row_count, original_row_count), where
        row_count_delta = final - original (negative when rows were removed).

    Side effects:
        Renders preview/selection UI; calls st.stop() on read/validation
        errors and until the user confirms the column mapping.
    """
    # 1. Read file
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:
        st.error(f"Error reading file: {e}")
        st.stop()
    original_row_count = len(df)
    # 2. Auto-detect columns
    st.header("Data Preview")
    df_preview_col, spacer, detected_cols_col = st.columns([1, 0.05, 1])
    with df_preview_col:
        st.subheader("Raw Data Preview")
        st.dataframe(df, hide_index=True)
    with detected_cols_col:
        detected = auto_detect_columns(df)
        st.subheader("Column Detection & Selection")
        st.info(
            "We've automatically detected a few columns. Verify these are correct or select manually.",
            icon='💡'
        )
        st.json(detected)
        # Ask the user to map any required column the detector missed.
        for req in ['freeform_answer', 'date']:
            if req not in detected:
                detected[req] = st.selectbox(f"Select column for {req}", df.columns.tolist())
        # Halt this script run until the button is clicked (Streamlit reruns).
        if not st.button("Continue with these columns"):
            st.stop()
    # 3. Rename columns: source column name -> canonical name, then snake_case.
    rename_mapping = {detected[col]: col for col in detected}
    df.rename(columns=rename_mapping, inplace=True)
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    # 4. Basic cleaning steps
    if 'freeform_answer' not in df.columns:
        st.error("Column 'freeform_answer' not found.")
        st.stop()
    # Word count
    df['word-count'] = df['freeform_answer'].apply(
        lambda x: len(str(x).split()) if pd.notnull(x) else 0
    )
    # Convert date
    if 'date' in df.columns:
        df['date'] = robust_convert_date(df['date'])
    else:
        st.error("'date' column is missing.")
        st.stop()
    # Remove numeric or special responses
    df = remove_numeric_or_special_responses(df, 'freeform_answer')
    # 5. Translate non-English
    df = translate_non_english(df)
    # 6. Sentiment
    df['sentiment-score'] = df['freeform_answer'].apply(analyze_sentiment)
    df['sentiment'] = df['sentiment-score'].apply(label_sentiment)
    final_row_count = len(df)
    row_count_delta = final_row_count - original_row_count
    return df, row_count_delta, final_row_count, original_row_count
############################
# APP ENTRY POINT
############################
def main():
    """Streamlit entry point: upload, process, and visualize exit-survey data."""
    st.title("Exit Survey Processing App")
    st.markdown("Upload your Exit Survey file in CSV or Excel format; the app cleans & processes it.")
    # Global/Shared models — module-level so cached_translate and
    # run_topic_modeling can reach them.
    global openai_model, naming_model
    openai_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")
    naming_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")  # for topic naming
    # Reset button
    if st.button("Reset App"):
        st.session_state.clear()
    # File upload
    uploaded_file = st.file_uploader("Upload an exit survey file", type=["csv", "xlsx"])
    if uploaded_file:
        # Process once per session; later reruns reuse session_state.
        if 'processed_df' not in st.session_state:
            with st.spinner("Processing file..."):
                df, row_count_delta, final_row_count, original_row_count = process_file(uploaded_file)
                st.session_state['processed_df'] = df
                st.session_state['row_count_delta'] = row_count_delta
                st.session_state['final_row_count'] = final_row_count
                st.session_state['original_row_count'] = original_row_count
        else:
            df = st.session_state['processed_df']
            row_count_delta = st.session_state['row_count_delta']
            final_row_count = st.session_state['final_row_count']
            original_row_count = st.session_state['original_row_count']
        st.divider()
        ########################################
        # 1. General Overview
        ########################################
        st.header("General Overview")
        with st.container():
            metric_col1, metric_col2 = st.columns(2)
            metric_col1.metric(
                label="No. Responses After Processing",
                value=final_row_count,
                delta=row_count_delta
            )
            avg_length = int(df['word-count'].mean().round())
            metric_col2.metric(
                label="Avg. Response Length",
                value=f"{avg_length} words"
            )
        st.write("#### Data Overview")
        st.dataframe(
            df,
            hide_index=True,
            column_config={'date': st.column_config.DatetimeColumn(format="YYYY-MM-DD")}
        )
        if 'exit_reason' in df.columns:
            st.write("#### Exit Reason Distribution")
            overview = generate_cancellation_reasons_overview(df, 'exit_reason')
            reasons_bar = create_cancellation_reasons_plot(overview)
            st.plotly_chart(reasons_bar, use_container_width=True)
        ########################################
        # 2. Sentiment Analysis
        ########################################
        st.subheader("Sentiment Analysis")
        st.write("Visual representation of sentiment distribution, plus a grouped bar chart if you like.")
        # Grouping candidates: columns with >1 unique value that are not
        # derived/processing columns.
        exclude_cols_sentiment = ['freeform_answer', 'date', 'word-count', 'sentiment-score', 'sentiment']
        candidate_cols = [col for col in df.columns if col not in exclude_cols_sentiment and df[col].nunique() > 1]
        col_left, col_right = st.columns([2, 1])
        with col_left:
            if candidate_cols:
                grouping_col = st.selectbox(
                    "Select a column to group sentiment by",
                    candidate_cols,
                    index=0
                )
                grouped_data = df.groupby([grouping_col, 'sentiment']).size().reset_index(name='count')
                st.write(f"##### Sentiment Grouped by {grouping_col}")
                chart = create_grouped_chart(grouped_data, grouping_col, 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
            else:
                st.write("##### Sentiment (no grouping column available)")
                grouped_data = df.groupby(['sentiment']).size().reset_index(name='count')
                chart = create_grouped_chart(grouped_data, 'sentiment', 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
        with col_right:
            st.write("##### Overall Sentiment Distribution")
            sentiment_pie = create_sentiment_pie(df)
            st.plotly_chart(sentiment_pie, use_container_width=True)
        ########################################
        # 3. Topic Modeling
        ########################################
        st.header("Topic Modeling")
        # Only run the modeling once per data set (cached).
        _topic_model, updated_topics, mapping, chatgpt_topic_labels = run_topic_modeling(df)
        topics_df = _topic_model.get_topic_info()
        topics_df = topics_df[topics_df['Topic'] != -1].copy()
        topics_df.drop(columns=['Name'], errors='ignore', inplace=True)
        topics_df.rename(columns={
            'CustomName': 'Topic Name',
            'Topic': 'Topic Number (ID)'
        }, inplace=True)
        # Re-arrange cols for easier viewing
        cols_order = ['Topic Number (ID)', 'Topic Name', 'Count',
                      'Representation', 'Secondary Representation', 'Representative_Docs']
        topics_df = topics_df[[c for c in cols_order if c in topics_df.columns]]
        st.subheader("Topics Barchart (Stacked by Class)")
        st.markdown("""
Choose a categorical column from your data to visualize how frequently each topic appears
across different classes.
""")
        with st.expander("Explore Topic Details", expanded=False):
            st.write("""
**Table Info:**
- **Topic Name**: AI-generated label
- **Representation**: Top 10 keywords
- **Secondary Representation**: Reranked keywords for diversity
- **Representative Docs**: Sample sentences contributing to the topic
""")
            st.dataframe(topics_df, hide_index=True)
        # For stacked barchart, pick a class column
        exclude_cols = ["freeform_answer", "sat_score", "date",
                        "word-count", "sentiment-score", "sentiment"]
        available_cols = [c for c in df.columns if c not in exclude_cols]
        default_idx = available_cols.index("exit_reason") if "exit_reason" in available_cols else 0
        class_column = st.selectbox(
            "How to group topics for visualization?",
            available_cols,
            index=default_idx
        )

        @st.cache_data(show_spinner=False)
        def get_topics_per_class(class_col, mapping, df, sentences, _model):
            """Per-class topic frequencies with custom labels merged in (cached)."""
            sentence_classes = [df.loc[idx, class_col] for idx in mapping]
            tpc = _model.topics_per_class(sentences, classes=sentence_classes)
            t_labels = _model.get_topic_info()[['Topic', 'CustomName']]
            tpc = tpc.merge(t_labels, on='Topic', how='left')
            tpc = tpc[tpc['Topic'] != -1].reset_index(drop=True)
            return tpc

        # Re-tokenize responses into sentences, parallel to `mapping`
        # (same tokenization as in run_topic_modeling).
        # Fix: removed dead `sentences = [""] * len(mapping)` that was
        # immediately overwritten.
        sentences = []
        for idx, response in df['freeform_answer'].dropna().items():
            for sentence in sent_tokenize(response):
                sentences.append(sentence)
        topics_per_class = get_topics_per_class(class_column, mapping, df, sentences, _topic_model)
        stacked_chart = create_stacked_topics_per_class(topics_per_class)
        st.plotly_chart(stacked_chart, use_container_width=True)
        ########################################
        # 4. Topics Over Time
        ########################################
        st.subheader("Topics Over Time")
        valid_dates = df['date'].dropna()
        if valid_dates.nunique() < 2:
            st.warning("Not enough distinct date values to plot topics over time.")
        else:
            # Build list of dates for each sentence
            sentence_dates = [df.loc[idx, 'date'] for idx in mapping]
            topics_over_time = _topic_model.topics_over_time(sentences, sentence_dates, nr_bins=20)
            # Merge custom labels
            topic_labels = _topic_model.get_topic_info()[['Topic', 'CustomName']]
            topics_over_time = topics_over_time.merge(topic_labels, on='Topic', how='left')
            topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
            chart = create_topics_overtime_chart(topics_over_time)
            st.plotly_chart(chart, use_container_width=True)
        ########################################
        # 5. Updated DataFrame
        ########################################
        updated_df = update_df_with_topics(df, mapping, updated_topics, chatgpt_topic_labels)
        with st.expander("View Final Updated DataFrame", expanded=False):
            st.dataframe(updated_df, hide_index=True)
# Run the app only when executed directly (not on import).
if __name__ == "__main__":
    main()