| |
| |
| |
| import streamlit as st |
| import pandas as pd |
| import time |
| from datetime import datetime |
| import nltk |
| from nltk.tokenize import sent_tokenize |
| from hdbscan import HDBSCAN |
| from umap import UMAP |
| from openai import OpenAI |
| from tenacity import retry, wait_exponential, stop_after_attempt |
|
|
| from functions.auto_column_detection import auto_detect_columns |
| from functions.preprocessing_functions import remove_numeric_or_special_responses, robust_convert_date |
| from functions.language_labeling_translation import detect_language, translate_text |
| from functions.sentiment_analysis import analyze_sentiment, label_sentiment |
| from functions.create_cancellation_reasons_table import generate_cancellation_reasons_overview |
|
|
| from functions.topicModeling_contentRequests import ( |
| load_embedding_model, |
| bertopic_model, |
| merge_specific_topics, |
| update_df_with_topics |
| ) |
| from plots.overview_charts import ( |
| create_word_count_histogram, |
| create_sentiment_pie, |
| create_cancellation_reasons_plot, |
| create_grouped_chart |
| ) |
| from plots.topicModeling_charts import ( |
| create_topics_overtime_chart, |
| create_stacked_topics_per_class |
| ) |
|
|
| |
| |
| |
# Streamlit page configuration: wide layout with the sidebar open by default.
st.set_page_config(
    layout='wide',
    page_title="Exit Survey Processing App",
    initial_sidebar_state="expanded",
)

# OpenAI client built from the key stored in Streamlit secrets
# (.streamlit/secrets.toml); st.secrets raises if the key is missing.
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
client = OpenAI(api_key=OPENAI_API_KEY)
|
|
| |
| |
| |
class OpenAIWrapper:
    """
    Thin convenience layer around ``client.chat.completions.create``.

    A fixed system prompt is stored on the instance; each ``run`` call sends
    that prompt together with the caller's text and returns the model reply.
    Transient failures are retried with exponential backoff via tenacity.
    """

    def __init__(self, model, prompt=""):
        # Model identifier (e.g. "gpt-4o-mini") and the system-prompt text.
        self.model = model
        self.prompt = prompt

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def run(self, user_text):
        """Send `user_text` to the chat model and return the reply content."""
        chat_messages = [
            {"role": "system", "content": self.prompt},
            {"role": "user", "content": user_text},
        ]
        try:
            completion = client.chat.completions.create(
                model=self.model,
                messages=chat_messages,
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Surface the failure in the UI, then re-raise so tenacity retries.
            st.error(f"Error during OpenAI API call: {e}")
            raise
|
|
@st.cache_data(show_spinner=False)
def cached_translate(text):
    """Translate `text` once and memoize the result to avoid repeated OpenAI calls."""
    translated = translate_text(text, skip_translation=False, translator_model=openai_model)
    return translated
|
|
@st.cache_resource(show_spinner=False)
def get_embedding_model():
    """Load the sentence-embedding model once per session and reuse it."""
    model = load_embedding_model()
    return model
|
|
def translate_non_english(df):
    """
    Translate non-English rows of 'freeform_answer' (word-count > 8) in place.

    A 'language' column is added for detection, qualifying rows are translated
    via the cached translator while a progress bar reports status, and the
    temporary column is dropped before returning the (mutated) DataFrame.
    """
    df['language'] = df['freeform_answer'].apply(detect_language)
    needs_translation = (df['language'] == 'non-en') & (df['word-count'] > 8)
    pending = df[needs_translation].copy()
    if not pending.empty:
        status = st.empty()
        bar = st.progress(0)
        total = len(pending)
        done = 0
        for idx, row in pending.iterrows():
            done += 1
            status.text(f"Translating non-English responses ({done} of {total})")
            try:
                df.at[idx, 'freeform_answer'] = cached_translate(row['freeform_answer'])
            except Exception as e:
                # Keep going on individual failures; report them in the UI.
                st.error(f"Error translating response {done}: {str(e)}")
            bar.progress(done / total)
        status.empty()
        bar.empty()
        st.success(
            f"Successfully translated {total} non-English responses",
            icon='✅'
        )
    df.drop(columns='language', inplace=True, errors='ignore')
    return df
|
|
@st.cache_data(show_spinner=False)
def run_topic_modeling(df):
    """
    Full topic-modeling pipeline over 'freeform_answer':
      1. Sentence tokenization (NLTK punkt)
      2. Sentence embedding (session-cached model)
      3. Dimensionality reduction (UMAP) + clustering (HDBSCAN)
      4. BERTopic modeling
      5. Custom topic naming via OpenAI (global `naming_model`)
      6. Merging specific topics, final labeling

    Returns:
        (topic_model, updated_topics, mapping, chatgpt_topic_labels) where
        mapping[i] is the DataFrame index of the row sentence i came from.
    """
    # Ensure the punkt sentence-tokenizer data is available.
    try:
        nltk.data.find("tokenizers/punkt_tab/english")
    except LookupError:
        nltk.download("punkt_tab")

    # Split every response into sentences; `mapping` remembers which df row
    # each sentence belongs to so results can be joined back later.
    sentences = []
    mapping = []
    for idx, response in df['freeform_answer'].dropna().items():
        for sentence in sent_tokenize(response):
            sentences.append(sentence)
            mapping.append(idx)

    embedding_model = get_embedding_model()
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)

    # Fixed random_state keeps UMAP (and thus the topics) reproducible.
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    _topic_model, topics, probs = bertopic_model(
        sentences, embeddings, embedding_model,
        umap_model, hdbscan_model
    )

    # Merge hand-picked topics, then re-assign sentences to the merged topics.
    _topic_model = merge_specific_topics(_topic_model, sentences)
    updated_topics, _ = _topic_model.transform(sentences)

    # Loop-invariant prompt template, hoisted out of the per-topic loop
    # (it was previously rebuilt on every iteration).
    prompt_template = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label
of at most 5 words. Make sure it is in the following format:

topic: <topic label>
""".strip()

    # Ask the naming model for a short human-readable label per topic.
    # Topic -1 is BERTopic's outlier bucket and is skipped here, so it can
    # never appear in chatgpt_topic_labels (the old post-loop deletion of
    # key -1 was dead code and has been removed).
    topic_info = _topic_model.get_topic_info()
    chatgpt_topic_labels = {}
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue
        rep_docs = _topic_model.get_representative_docs(topic_id)
        doc_text = " ".join(rep_docs[:10])
        topic_keywords = _topic_model.get_topic(topic_id) or []
        keywords_text = ", ".join([word for word, score in topic_keywords])

        prompt_filled = prompt_template.replace("[DOCUMENTS]", doc_text).replace("[KEYWORDS]", keywords_text)
        response = naming_model.run(prompt_filled)
        label = response.strip()
        # Strip an optional leading "topic:" prefix from the model output.
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        chatgpt_topic_labels[topic_id] = label

    _topic_model.set_topic_labels(chatgpt_topic_labels)

    return _topic_model, updated_topics, mapping, chatgpt_topic_labels
|
|
def process_file(uploaded_file):
    """
    Read the uploaded CSV/XLSX file, let the user confirm detected columns,
    then clean and enrich the data.

    Adds 'word-count', normalized 'date', translated 'freeform_answer',
    'sentiment-score' and 'sentiment' columns.

    Returns:
        (df, row_count_delta, final_row_count, original_row_count) where
        row_count_delta is final minus original (negative when cleaning
        dropped rows).

    Calls st.stop() on read errors, missing required columns, or until the
    user confirms the column selection — so control may never reach the end
    of this function on a given rerun.
    """
    # Load the file; anything that isn't .csv is assumed to be Excel.
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:
        st.error(f"Error reading file: {e}")
        st.stop()

    original_row_count = len(df)

    # Side-by-side preview: raw data on the left, detection results on the right.
    st.header("Data Preview")
    df_preview_col, spacer, detected_cols_col = st.columns([1, 0.05, 1])

    with df_preview_col:
        st.subheader("Raw Data Preview")
        st.dataframe(df, hide_index=True)

    with detected_cols_col:
        detected = auto_detect_columns(df)
        st.subheader("Column Detection & Selection")
        st.info(
            "We've automatically detected a few columns. Verify these are correct or select manually.",
            icon='💡'
        )
        st.json(detected)

        # Fall back to manual selection for required columns that were not
        # auto-detected.
        for req in ['freeform_answer', 'date']:
            if req not in detected:
                detected[req] = st.selectbox(f"Select column for {req}", df.columns.tolist())

        # Halt the script until the user confirms the column mapping.
        if not st.button("Continue with these columns"):
            st.stop()

    # Rename detected columns to their canonical names, then normalize all
    # column names to snake_case.
    rename_mapping = {detected[col]: col for col in detected}
    df.rename(columns=rename_mapping, inplace=True)
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    if 'freeform_answer' not in df.columns:
        st.error("Column 'freeform_answer' not found.")
        st.stop()

    # Word count per response; NaN responses count as 0 words.
    df['word-count'] = df['freeform_answer'].apply(
        lambda x: len(str(x).split()) if pd.notnull(x) else 0
    )

    if 'date' in df.columns:
        df['date'] = robust_convert_date(df['date'])
    else:
        st.error("'date' column is missing.")
        st.stop()

    # Drop responses that are purely numeric/special characters.
    df = remove_numeric_or_special_responses(df, 'freeform_answer')

    # Translate non-English responses in place (may call OpenAI).
    df = translate_non_english(df)

    # Sentiment score plus a categorical label derived from the score.
    df['sentiment-score'] = df['freeform_answer'].apply(analyze_sentiment)
    df['sentiment'] = df['sentiment-score'].apply(label_sentiment)

    final_row_count = len(df)
    row_count_delta = final_row_count - original_row_count

    return df, row_count_delta, final_row_count, original_row_count
|
|
| |
| |
| |
def main():
    """
    Streamlit entry point: upload → process → overview / sentiment / topic views.

    Processing results are stored in st.session_state so widget-triggered
    reruns do not re-process the uploaded file.
    """
    st.title("Exit Survey Processing App")
    st.markdown("Upload your Exit Survey file in CSV or Excel format; the app cleans & processes it.")

    # Module-level model handles consumed by cached_translate / run_topic_modeling.
    global openai_model, naming_model
    openai_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")
    naming_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")

    # Clearing session state forces a full re-process on the next run.
    if st.button("Reset App"):
        st.session_state.clear()

    uploaded_file = st.file_uploader("Upload an exit survey file", type=["csv", "xlsx"])

    if uploaded_file:
        # Process once, then serve subsequent reruns from session state.
        if 'processed_df' not in st.session_state:
            with st.spinner("Processing file..."):
                df, row_count_delta, final_row_count, original_row_count = process_file(uploaded_file)
                st.session_state['processed_df'] = df
                st.session_state['row_count_delta'] = row_count_delta
                st.session_state['final_row_count'] = final_row_count
                st.session_state['original_row_count'] = original_row_count
        else:
            df = st.session_state['processed_df']
            row_count_delta = st.session_state['row_count_delta']
            final_row_count = st.session_state['final_row_count']
            original_row_count = st.session_state['original_row_count']

        st.divider()

        # ---------------- General overview ----------------
        st.header("General Overview")
        with st.container():
            metric_col1, metric_col2 = st.columns(2)
            metric_col1.metric(
                label="No. Responses After Processing",
                value=final_row_count,
                delta=row_count_delta
            )
            avg_length = int(df['word-count'].mean().round())
            metric_col2.metric(
                label="Avg. Response Length",
                value=f"{avg_length} words"
            )

        st.write("#### Data Overview")
        st.dataframe(
            df,
            hide_index=True,
            column_config={'date': st.column_config.DatetimeColumn(format="YYYY-MM-DD")}
        )

        if 'exit_reason' in df.columns:
            st.write("#### Exit Reason Distribution")
            overview = generate_cancellation_reasons_overview(df, 'exit_reason')
            reasons_bar = create_cancellation_reasons_plot(overview)
            st.plotly_chart(reasons_bar, use_container_width=True)

        # ---------------- Sentiment ----------------
        st.subheader("Sentiment Analysis")
        st.write("Visual representation of sentiment distribution, plus a grouped bar chart if you like.")
        exclude_cols_sentiment = ['freeform_answer', 'date', 'word-count', 'sentiment-score', 'sentiment']
        candidate_cols = [col for col in df.columns if col not in exclude_cols_sentiment and df[col].nunique() > 1]

        col_left, col_right = st.columns([2,1])
        with col_left:
            if candidate_cols:
                grouping_col = st.selectbox(
                    "Select a column to group sentiment by",
                    candidate_cols,
                    index=0
                )
                grouped_data = df.groupby([grouping_col, 'sentiment']).size().reset_index(name='count')
                st.write(f"##### Sentiment Grouped by {grouping_col}")
                chart = create_grouped_chart(grouped_data, grouping_col, 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
            else:
                st.write("##### Sentiment (no grouping column available)")
                grouped_data = df.groupby(['sentiment']).size().reset_index(name='count')
                chart = create_grouped_chart(grouped_data, 'sentiment', 'sentiment')
                st.plotly_chart(chart, use_container_width=True)

        with col_right:
            st.write("##### Overall Sentiment Distribution")
            sentiment_pie = create_sentiment_pie(df)
            st.plotly_chart(sentiment_pie, use_container_width=True)

        # ---------------- Topic modeling ----------------
        st.header("Topic Modeling")

        _topic_model, updated_topics, mapping, chatgpt_topic_labels = run_topic_modeling(df)

        topics_df = _topic_model.get_topic_info()
        topics_df = topics_df[topics_df['Topic'] != -1].copy()
        topics_df.drop(columns=['Name'], errors='ignore', inplace=True)
        topics_df.rename(columns={
            'CustomName': 'Topic Name',
            'Topic': 'Topic Number (ID)'
        }, inplace=True)

        # Keep only the known columns, in a fixed display order.
        cols_order = ['Topic Number (ID)', 'Topic Name', 'Count',
                      'Representation', 'Secondary Representation', 'Representative_Docs']
        topics_df = topics_df[[c for c in cols_order if c in topics_df.columns]]

        st.subheader("Topics Barchart (Stacked by Class)")
        st.markdown("""
        Choose a categorical column from your data to visualize how frequently each topic appears
        across different classes.
        """)

        with st.expander("Explore Topic Details", expanded=False):
            st.write("""
            **Table Info:**
            - **Topic Name**: AI-generated label
            - **Representation**: Top 10 keywords
            - **Secondary Representation**: Reranked keywords for diversity
            - **Representative Docs**: Sample sentences contributing to the topic
            """)
            st.dataframe(topics_df, hide_index=True)

        exclude_cols = ["freeform_answer", "sat_score", "date",
                        "word-count", "sentiment-score", "sentiment"]
        available_cols = [c for c in df.columns if c not in exclude_cols]
        default_idx = available_cols.index("exit_reason") if "exit_reason" in available_cols else 0
        class_column = st.selectbox(
            "How to group topics for visualization?",
            available_cols,
            index=default_idx
        )

        @st.cache_data(show_spinner=False)
        def get_topics_per_class(class_col, mapping, df, sentences, _model):
            # Map each sentence back to its row's class value, then compute
            # per-class topic frequencies (dropping the -1 outlier topic).
            sentence_classes = [df.loc[idx, class_col] for idx in mapping]
            tpc = _model.topics_per_class(sentences, classes=sentence_classes)
            t_labels = _model.get_topic_info()[['Topic', 'CustomName']]
            tpc = tpc.merge(t_labels, on='Topic', how='left')
            tpc = tpc[tpc['Topic'] != -1].reset_index(drop=True)
            return tpc

        # Rebuild the sentence list in the same order run_topic_modeling used
        # so it lines up with `mapping`. (The old `[""] * len(mapping)`
        # pre-allocation was dead code — immediately overwritten — and has
        # been removed.)
        sentences = []
        for idx, response in df['freeform_answer'].dropna().items():
            for sentence in sent_tokenize(response):
                sentences.append(sentence)

        topics_per_class = get_topics_per_class(class_column, mapping, df, sentences, _topic_model)
        stacked_chart = create_stacked_topics_per_class(topics_per_class)
        st.plotly_chart(stacked_chart, use_container_width=True)

        # ---------------- Topics over time ----------------
        st.subheader("Topics Over Time")
        valid_dates = df['date'].dropna()
        if valid_dates.nunique() < 2:
            st.warning("Not enough distinct date values to plot topics over time.")
        else:
            sentence_dates = [df.loc[idx, 'date'] for idx in mapping]
            topics_over_time = _topic_model.topics_over_time(sentences, sentence_dates, nr_bins=20)

            # Attach the AI-generated labels and drop the outlier topic.
            topic_labels = _topic_model.get_topic_info()[['Topic', 'CustomName']]
            topics_over_time = topics_over_time.merge(topic_labels, on='Topic', how='left')
            topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]

            chart = create_topics_overtime_chart(topics_over_time)
            st.plotly_chart(chart, use_container_width=True)

        # ---------------- Final enriched dataframe ----------------
        updated_df = update_df_with_topics(df, mapping, updated_topics, chatgpt_topic_labels)
        with st.expander("View Final Updated DataFrame", expanded=False):
            st.dataframe(updated_df, hide_index=True)
|
|
|
|
# Script entry point when run directly (streamlit executes the module top-level).
if __name__ == "__main__":
    main()
|
|