# Source: lynn-twinkl, commit 89917b7 ("Added nltk import")
###############################
# IMPORTS & CONFIG
###############################
import streamlit as st
import pandas as pd
import time
from datetime import datetime
import nltk
from nltk.tokenize import sent_tokenize
from hdbscan import HDBSCAN
from umap import UMAP
from openai import OpenAI
from tenacity import retry, wait_exponential, stop_after_attempt
from functions.auto_column_detection import auto_detect_columns
from functions.preprocessing_functions import remove_numeric_or_special_responses, robust_convert_date
from functions.language_labeling_translation import detect_language, translate_text
from functions.sentiment_analysis import analyze_sentiment, label_sentiment
from functions.create_cancellation_reasons_table import generate_cancellation_reasons_overview
from functions.topicModeling_contentRequests import (
load_embedding_model,
bertopic_model,
merge_specific_topics,
update_df_with_topics
)
from plots.overview_charts import (
create_word_count_histogram,
create_sentiment_pie,
create_cancellation_reasons_plot,
create_grouped_chart
)
from plots.topicModeling_charts import (
create_topics_overtime_chart,
create_stacked_topics_per_class
)
############################
# STREAMLIT APP CONFIGURATION
############################
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(
    layout='wide',
    page_title="Exit Survey Processing App",
    initial_sidebar_state="expanded",
)
# Global settings
# API key comes from Streamlit's secrets store (.streamlit/secrets.toml).
# One shared OpenAI client is reused for all API calls in this module.
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
client = OpenAI(api_key=OPENAI_API_KEY)
###############################
# HELPER CLASSES & FUNCTIONS
###############################
class OpenAIWrapper:
    """
    Thin wrapper around the shared OpenAI client's chat-completions API.

    Each instance carries a model name and a system prompt; calls are
    retried with exponential backoff via tenacity.
    """

    def __init__(self, model, prompt=""):
        self.model = model
        self.prompt = prompt

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def run(self, user_text):
        """Send `user_text` under this instance's system prompt; return the reply text."""
        conversation = [
            {"role": "system", "content": self.prompt},
            {"role": "user", "content": user_text},
        ]
        try:
            completion = client.chat.completions.create(
                model=self.model,
                messages=conversation,
            )
            return completion.choices[0].message.content
        except Exception as exc:
            # Surface the failure in the UI, then re-raise so tenacity retries.
            st.error(f"Error during OpenAI API call: {exc}")
            raise
@st.cache_data(show_spinner=False)
def cached_translate(text):
    """Cached translation function to reduce repeated OpenAI calls.

    Memoized by Streamlit on `text`. NOTE(review): relies on the
    module-level `openai_model` wrapper assigned in main(); calling this
    before main() runs would raise a NameError.
    """
    return translate_text(text, skip_translation=False, translator_model=openai_model)
@st.cache_resource(show_spinner=False)
def get_embedding_model():
    """Load the topic-modeling embedding model once; Streamlit caches the resource."""
    model = load_embedding_model()
    return model
def translate_non_english(df):
    """
    Translate non-English rows of 'freeform_answer' in place.

    A row is translated only when detect_language labels it 'non-en' AND its
    'word-count' exceeds 8 (very short answers are left untouched). Uses the
    globally cached `cached_translate` so duplicate texts hit the cache.
    Per-row failures are reported via st.error and the original text is kept.

    Args:
        df: DataFrame with 'freeform_answer' and 'word-count' columns.

    Returns:
        The same DataFrame, with translations applied and the temporary
        'language' helper column dropped.
    """
    df['language'] = df['freeform_answer'].apply(detect_language)
    to_translate = df[(df['language'] == 'non-en') & (df['word-count'] > 8)].copy()
    if not to_translate.empty:
        progress_text = st.empty()
        progress_bar = st.progress(0)
        total = len(to_translate)
        success_count = 0  # count only rows that actually translated
        for i, (idx, row) in enumerate(to_translate.iterrows(), 1):
            progress_text.text(f"Translating non-English responses ({i} of {total})")
            try:
                translated = cached_translate(row['freeform_answer'])
                df.at[idx, 'freeform_answer'] = translated
                success_count += 1
            except Exception as e:
                # Best-effort: report and continue with the remaining rows.
                st.error(f"Error translating response {i}: {str(e)}")
            progress_bar.progress(i / total)
        progress_text.empty()
        progress_bar.empty()
        # Bug fix: report actual successes, not the attempted total.
        st.success(
            f"Successfully translated {success_count} non-English responses",
            icon='✅'
        )
    df.drop(columns='language', inplace=True, errors='ignore')
    return df
@st.cache_data(show_spinner=False)
def run_topic_modeling(df):
    """
    Full pipeline for:
        1. Sentence tokenization (NLTK punkt)
        2. Embedding (cached sentence-embedding model)
        3. UMAP dimensionality reduction + HDBSCAN clustering
        4. BERTopic modeling
        5. Custom topic naming via OpenAI (uses the global `naming_model`)
        6. Merging small topics, final labeling

    Returns:
        (topic_model, updated_topics, mapping, chatgpt_topic_labels) where
        `mapping[i]` is the df index of the response sentence i came from.
    """
    # --- 1. Sentence tokenization ---
    try:
        nltk.data.find("tokenizers/punkt_tab/english")
    except LookupError:
        nltk.download("punkt_tab")
    sentences = []
    mapping = []  # df index per sentence, parallel to `sentences`
    for idx, response in df['freeform_answer'].dropna().items():
        for sentence in sent_tokenize(response):
            sentences.append(sentence)
            mapping.append(idx)
    # --- 2. Embedding ---
    embedding_model = get_embedding_model()
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)
    # --- 3. UMAP, HDBSCAN ---
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)
    # --- 4. BERTopic model creation ---
    _topic_model, topics, probs = bertopic_model(
        sentences, embeddings, embedding_model,
        umap_model, hdbscan_model
    )
    # Merge small or closely related topics, then re-assign sentence topics.
    _topic_model = merge_specific_topics(_topic_model, sentences)
    updated_topics, _ = _topic_model.transform(sentences)
    # --- 5. Custom topic naming via OpenAI ---
    # Loop-invariant prompt template hoisted out of the per-topic loop.
    prompt_template = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short but highly descriptive topic label
of at most 5 words. Make sure it is in the following format:
topic: <topic label>
""".strip()
    topic_info = _topic_model.get_topic_info()
    chatgpt_topic_labels = {}
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue  # -1 is the outlier cluster; it never gets a label
        rep_docs = _topic_model.get_representative_docs(topic_id)
        doc_text = " ".join(rep_docs[:10])  # Up to 10 docs for context
        topic_keywords = _topic_model.get_topic(topic_id) or []
        keywords_text = ", ".join([word for word, score in topic_keywords])
        prompt_filled = prompt_template.replace("[DOCUMENTS]", doc_text).replace("[KEYWORDS]", keywords_text)
        response = naming_model.run(prompt_filled)
        label = response.strip()
        # The model is asked to answer as "topic: <label>"; strip the prefix.
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        chatgpt_topic_labels[topic_id] = label
    # (Dead guard removed: -1 is skipped above and can never be a key.)
    _topic_model.set_topic_labels(chatgpt_topic_labels)
    return _topic_model, updated_topics, mapping, chatgpt_topic_labels
def process_file(uploaded_file):
    """
    Process the uploaded file, perform data cleaning, and return a processed DataFrame.

    Pipeline: read CSV/Excel -> auto-detect + user-confirm columns ->
    normalize column names -> add 'word-count' -> parse 'date' -> remove
    numeric/special responses -> translate non-English rows -> add sentiment.

    Args:
        uploaded_file: Streamlit UploadedFile (.csv or .xlsx).

    Returns:
        (df, row_count_delta, final_row_count, original_row_count), where
        row_count_delta = final - original (negative when rows were removed).

    Side effects:
        Renders preview/selection UI; calls st.stop() on read/validation
        errors and until the user confirms the column mapping.
    """
    # 1. Read file
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:
        st.error(f"Error reading file: {e}")
        st.stop()
    original_row_count = len(df)
    # 2. Auto-detect columns
    st.header("Data Preview")
    df_preview_col, spacer, detected_cols_col = st.columns([1, 0.05, 1])
    with df_preview_col:
        st.subheader("Raw Data Preview")
        st.dataframe(df, hide_index=True)
    with detected_cols_col:
        detected = auto_detect_columns(df)
        st.subheader("Column Detection & Selection")
        st.info(
            "We've automatically detected a few columns. Verify these are correct or select manually.",
            icon='💡'
        )
        st.json(detected)
        # Ask the user to map any required column the detector missed.
        for req in ['freeform_answer', 'date']:
            if req not in detected:
                detected[req] = st.selectbox(f"Select column for {req}", df.columns.tolist())
        # Halt this script run until the button is clicked (Streamlit reruns).
        if not st.button("Continue with these columns"):
            st.stop()
    # 3. Rename columns: source column name -> canonical name, then snake_case.
    rename_mapping = {detected[col]: col for col in detected}
    df.rename(columns=rename_mapping, inplace=True)
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    # 4. Basic cleaning steps
    if 'freeform_answer' not in df.columns:
        st.error("Column 'freeform_answer' not found.")
        st.stop()
    # Word count
    df['word-count'] = df['freeform_answer'].apply(
        lambda x: len(str(x).split()) if pd.notnull(x) else 0
    )
    # Convert date
    if 'date' in df.columns:
        df['date'] = robust_convert_date(df['date'])
    else:
        st.error("'date' column is missing.")
        st.stop()
    # Remove numeric or special responses
    df = remove_numeric_or_special_responses(df, 'freeform_answer')
    # 5. Translate non-English
    df = translate_non_english(df)
    # 6. Sentiment
    df['sentiment-score'] = df['freeform_answer'].apply(analyze_sentiment)
    df['sentiment'] = df['sentiment-score'].apply(label_sentiment)
    final_row_count = len(df)
    row_count_delta = final_row_count - original_row_count
    return df, row_count_delta, final_row_count, original_row_count
############################
# APP ENTRY POINT
############################
def main():
    """Streamlit entry point: upload, process, and visualize exit-survey data."""
    st.title("Exit Survey Processing App")
    st.markdown("Upload your Exit Survey file in CSV or Excel format; the app cleans & processes it.")
    # Global/Shared models — module-level so cached_translate and
    # run_topic_modeling can reach them.
    global openai_model, naming_model
    openai_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")
    naming_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")  # for topic naming
    # Reset button
    if st.button("Reset App"):
        st.session_state.clear()
    # File upload
    uploaded_file = st.file_uploader("Upload an exit survey file", type=["csv", "xlsx"])
    if uploaded_file:
        # Process once per session; later reruns reuse session_state.
        if 'processed_df' not in st.session_state:
            with st.spinner("Processing file..."):
                df, row_count_delta, final_row_count, original_row_count = process_file(uploaded_file)
                st.session_state['processed_df'] = df
                st.session_state['row_count_delta'] = row_count_delta
                st.session_state['final_row_count'] = final_row_count
                st.session_state['original_row_count'] = original_row_count
        else:
            df = st.session_state['processed_df']
            row_count_delta = st.session_state['row_count_delta']
            final_row_count = st.session_state['final_row_count']
            original_row_count = st.session_state['original_row_count']
        st.divider()
        ########################################
        # 1. General Overview
        ########################################
        st.header("General Overview")
        with st.container():
            metric_col1, metric_col2 = st.columns(2)
            metric_col1.metric(
                label="No. Responses After Processing",
                value=final_row_count,
                delta=row_count_delta
            )
            avg_length = int(df['word-count'].mean().round())
            metric_col2.metric(
                label="Avg. Response Length",
                value=f"{avg_length} words"
            )
        st.write("#### Data Overview")
        st.dataframe(
            df,
            hide_index=True,
            column_config={'date': st.column_config.DatetimeColumn(format="YYYY-MM-DD")}
        )
        if 'exit_reason' in df.columns:
            st.write("#### Exit Reason Distribution")
            overview = generate_cancellation_reasons_overview(df, 'exit_reason')
            reasons_bar = create_cancellation_reasons_plot(overview)
            st.plotly_chart(reasons_bar, use_container_width=True)
        ########################################
        # 2. Sentiment Analysis
        ########################################
        st.subheader("Sentiment Analysis")
        st.write("Visual representation of sentiment distribution, plus a grouped bar chart if you like.")
        # Grouping candidates: columns with >1 unique value that are not
        # derived/processing columns.
        exclude_cols_sentiment = ['freeform_answer', 'date', 'word-count', 'sentiment-score', 'sentiment']
        candidate_cols = [col for col in df.columns if col not in exclude_cols_sentiment and df[col].nunique() > 1]
        col_left, col_right = st.columns([2, 1])
        with col_left:
            if candidate_cols:
                grouping_col = st.selectbox(
                    "Select a column to group sentiment by",
                    candidate_cols,
                    index=0
                )
                grouped_data = df.groupby([grouping_col, 'sentiment']).size().reset_index(name='count')
                st.write(f"##### Sentiment Grouped by {grouping_col}")
                chart = create_grouped_chart(grouped_data, grouping_col, 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
            else:
                st.write("##### Sentiment (no grouping column available)")
                grouped_data = df.groupby(['sentiment']).size().reset_index(name='count')
                chart = create_grouped_chart(grouped_data, 'sentiment', 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
        with col_right:
            st.write("##### Overall Sentiment Distribution")
            sentiment_pie = create_sentiment_pie(df)
            st.plotly_chart(sentiment_pie, use_container_width=True)
        ########################################
        # 3. Topic Modeling
        ########################################
        st.header("Topic Modeling")
        # Only run the modeling once per data set (cached).
        _topic_model, updated_topics, mapping, chatgpt_topic_labels = run_topic_modeling(df)
        topics_df = _topic_model.get_topic_info()
        topics_df = topics_df[topics_df['Topic'] != -1].copy()
        topics_df.drop(columns=['Name'], errors='ignore', inplace=True)
        topics_df.rename(columns={
            'CustomName': 'Topic Name',
            'Topic': 'Topic Number (ID)'
        }, inplace=True)
        # Re-arrange cols for easier viewing
        cols_order = ['Topic Number (ID)', 'Topic Name', 'Count',
                      'Representation', 'Secondary Representation', 'Representative_Docs']
        topics_df = topics_df[[c for c in cols_order if c in topics_df.columns]]
        st.subheader("Topics Barchart (Stacked by Class)")
        st.markdown("""
Choose a categorical column from your data to visualize how frequently each topic appears
across different classes.
""")
        with st.expander("Explore Topic Details", expanded=False):
            st.write("""
**Table Info:**
- **Topic Name**: AI-generated label
- **Representation**: Top 10 keywords
- **Secondary Representation**: Reranked keywords for diversity
- **Representative Docs**: Sample sentences contributing to the topic
""")
            st.dataframe(topics_df, hide_index=True)
        # For stacked barchart, pick a class column
        exclude_cols = ["freeform_answer", "sat_score", "date",
                        "word-count", "sentiment-score", "sentiment"]
        available_cols = [c for c in df.columns if c not in exclude_cols]
        default_idx = available_cols.index("exit_reason") if "exit_reason" in available_cols else 0
        class_column = st.selectbox(
            "How to group topics for visualization?",
            available_cols,
            index=default_idx
        )

        @st.cache_data(show_spinner=False)
        def get_topics_per_class(class_col, mapping, df, sentences, _model):
            """Per-class topic frequencies with custom labels merged in (cached)."""
            sentence_classes = [df.loc[idx, class_col] for idx in mapping]
            tpc = _model.topics_per_class(sentences, classes=sentence_classes)
            t_labels = _model.get_topic_info()[['Topic', 'CustomName']]
            tpc = tpc.merge(t_labels, on='Topic', how='left')
            tpc = tpc[tpc['Topic'] != -1].reset_index(drop=True)
            return tpc

        # Re-tokenize responses into sentences, parallel to `mapping`
        # (same tokenization as in run_topic_modeling).
        # Fix: removed dead `sentences = [""] * len(mapping)` that was
        # immediately overwritten.
        sentences = []
        for idx, response in df['freeform_answer'].dropna().items():
            for sentence in sent_tokenize(response):
                sentences.append(sentence)
        topics_per_class = get_topics_per_class(class_column, mapping, df, sentences, _topic_model)
        stacked_chart = create_stacked_topics_per_class(topics_per_class)
        st.plotly_chart(stacked_chart, use_container_width=True)
        ########################################
        # 4. Topics Over Time
        ########################################
        st.subheader("Topics Over Time")
        valid_dates = df['date'].dropna()
        if valid_dates.nunique() < 2:
            st.warning("Not enough distinct date values to plot topics over time.")
        else:
            # Build list of dates for each sentence
            sentence_dates = [df.loc[idx, 'date'] for idx in mapping]
            topics_over_time = _topic_model.topics_over_time(sentences, sentence_dates, nr_bins=20)
            # Merge custom labels
            topic_labels = _topic_model.get_topic_info()[['Topic', 'CustomName']]
            topics_over_time = topics_over_time.merge(topic_labels, on='Topic', how='left')
            topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
            chart = create_topics_overtime_chart(topics_over_time)
            st.plotly_chart(chart, use_container_width=True)
        ########################################
        # 5. Updated DataFrame
        ########################################
        updated_df = update_df_with_topics(df, mapping, updated_topics, chatgpt_topic_labels)
        with st.expander("View Final Updated DataFrame", expanded=False):
            st.dataframe(updated_df, hide_index=True)
# Run the app only when executed directly (not on import).
if __name__ == "__main__":
    main()