lynn-twinkl commited on
Commit ·
2e164d2
1
Parent(s): 47fac11
first commit
Browse files- .gitignore +166 -0
- app.py +464 -0
- functions/auto_column_detection.py +310 -0
- functions/broad_category_priorities.py +8 -0
- functions/create_cancellation_reasons_table.py +28 -0
- functions/language_labeling_translation.py +80 -0
- functions/preprocessing_functions.py +91 -0
- functions/sentiment_analysis.py +13 -0
- functions/topicModeling_contentRequests.py +269 -0
- plots/overview_charts.py +111 -0
- plots/topicModeling_charts.py +141 -0
- requirements.txt +18 -0
.gitignore
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 110 |
+
.pdm.toml
|
| 111 |
+
.pdm-python
|
| 112 |
+
.pdm-build/
|
| 113 |
+
|
| 114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 115 |
+
__pypackages__/
|
| 116 |
+
|
| 117 |
+
# Celery stuff
|
| 118 |
+
celerybeat-schedule
|
| 119 |
+
celerybeat.pid
|
| 120 |
+
|
| 121 |
+
# SageMath parsed files
|
| 122 |
+
*.sage.py
|
| 123 |
+
|
| 124 |
+
# Environments
|
| 125 |
+
.env
|
| 126 |
+
.venv
|
| 127 |
+
env/
|
| 128 |
+
venv/
|
| 129 |
+
ENV/
|
| 130 |
+
env.bak/
|
| 131 |
+
venv.bak/
|
| 132 |
+
|
| 133 |
+
# Spyder project settings
|
| 134 |
+
.spyderproject
|
| 135 |
+
.spyproject
|
| 136 |
+
|
| 137 |
+
# Rope project settings
|
| 138 |
+
.ropeproject
|
| 139 |
+
|
| 140 |
+
# mkdocs documentation
|
| 141 |
+
/site
|
| 142 |
+
|
| 143 |
+
# mypy
|
| 144 |
+
.mypy_cache/
|
| 145 |
+
.dmypy.json
|
| 146 |
+
dmypy.json
|
| 147 |
+
|
| 148 |
+
# Pyre type checker
|
| 149 |
+
.pyre/
|
| 150 |
+
|
| 151 |
+
# pytype static type analyzer
|
| 152 |
+
.pytype/
|
| 153 |
+
|
| 154 |
+
# Cython debug symbols
|
| 155 |
+
cython_debug/
|
| 156 |
+
|
| 157 |
+
#Local Files
|
| 158 |
+
.DS_Store
|
| 159 |
+
secrets.toml
|
| 160 |
+
|
| 161 |
+
# PyCharm
|
| 162 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 163 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 164 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 165 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 166 |
+
#.idea/
|
app.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
###############################
|
| 2 |
+
# IMPORTS & CONFIG
|
| 3 |
+
###############################
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import time
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from nltk.tokenize import sent_tokenize
|
| 9 |
+
from hdbscan import HDBSCAN
|
| 10 |
+
from umap import UMAP
|
| 11 |
+
from openai import OpenAI
|
| 12 |
+
from tenacity import retry, wait_exponential, stop_after_attempt
|
| 13 |
+
|
| 14 |
+
from functions.auto_column_detection import auto_detect_columns
|
| 15 |
+
from functions.preprocessing_functions import remove_numeric_or_special_responses, robust_convert_date
|
| 16 |
+
from functions.language_labeling_translation import detect_language, translate_text
|
| 17 |
+
from functions.sentiment_analysis import analyze_sentiment, label_sentiment
|
| 18 |
+
from functions.create_cancellation_reasons_table import generate_cancellation_reasons_overview
|
| 19 |
+
from html_helpers.cancellation_reasons_table_html import generate_cancellation_table_html
|
| 20 |
+
|
| 21 |
+
from functions.topicModeling_contentRequests import (
|
| 22 |
+
load_embedding_model,
|
| 23 |
+
bertopic_model,
|
| 24 |
+
merge_specific_topics,
|
| 25 |
+
update_df_with_topics
|
| 26 |
+
)
|
| 27 |
+
from plots.overview_charts import (
|
| 28 |
+
create_word_count_histogram,
|
| 29 |
+
create_sentiment_pie,
|
| 30 |
+
create_cancellation_reasons_plot,
|
| 31 |
+
create_grouped_chart
|
| 32 |
+
)
|
| 33 |
+
from plots.topicModeling_charts import (
|
| 34 |
+
create_topics_overtime_chart,
|
| 35 |
+
create_stacked_topics_per_class
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
############################
|
| 39 |
+
# STREAMLIT APP CONFIGURATION
|
| 40 |
+
############################
|
| 41 |
+
st.set_page_config(
|
| 42 |
+
layout='wide',
|
| 43 |
+
page_title="Exit Survey Processing App",
|
| 44 |
+
initial_sidebar_state="expanded",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Global settings
|
| 48 |
+
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
|
| 49 |
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
| 50 |
+
|
| 51 |
+
###############################
|
| 52 |
+
# HELPER CLASSES & FUNCTIONS
|
| 53 |
+
###############################
|
| 54 |
+
class OpenAIWrapper:
    """Thin wrapper around the OpenAI chat-completions endpoint.

    Adds exponential-backoff retries (via tenacity) and carries a
    configurable system prompt, so callers only supply the user text.
    """

    def __init__(self, model, prompt=""):
        self.model = model    # e.g. "gpt-4o-mini"
        self.prompt = prompt  # system prompt prepended to every request

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def run(self, user_text):
        """Send ``user_text`` to the model and return the reply text.

        Any API failure is surfaced in the Streamlit UI, then re-raised so
        tenacity can retry (up to 5 attempts with exponential backoff).
        """
        messages = [
            {"role": "system", "content": self.prompt},
            {"role": "user", "content": user_text},
        ]
        try:
            completion = client.chat.completions.create(
                model=self.model,
                messages=messages,
            )
            return completion.choices[0].message.content
        except Exception as e:
            st.error(f"Error during OpenAI API call: {e}")
            raise
|
| 77 |
+
|
| 78 |
+
@st.cache_data(show_spinner=False)
def cached_translate(text):
    """Cached translation function to reduce repeated OpenAI calls.

    Memoised by Streamlit on ``text`` alone.

    NOTE(review): relies on the module-level ``openai_model``, which is only
    assigned inside ``main()`` (via ``global``); calling this before ``main()``
    runs would raise NameError. Also, because the cache key is the text only,
    switching the translator model would not invalidate earlier results --
    confirm this is acceptable.
    """
    return translate_text(text, skip_translation=False, translator_model=openai_model)
|
| 82 |
+
|
| 83 |
+
@st.cache_resource(show_spinner=False)
def get_embedding_model():
    """Load and cache the sentence-embedding model for topic modeling.

    Uses ``st.cache_resource`` (not ``cache_data``) so the heavyweight,
    unpicklable model object is created once per server process and shared
    across reruns and sessions.
    """
    return load_embedding_model()
|
| 87 |
+
|
| 88 |
+
def translate_non_english(df):
    """
    Translate non-English free-text answers in 'freeform_answer' in place.

    A row is translated only when `detect_language` labels it 'non-en' AND it
    has more than 8 words (short answers are left untouched). Progress is
    shown in the Streamlit UI; individual failures are reported but do not
    abort the batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'freeform_answer' and 'word-count' columns.

    Returns
    -------
    pandas.DataFrame
        The same frame with translated answers; the temporary 'language'
        column is dropped before returning.
    """
    df['language'] = df['freeform_answer'].apply(detect_language)
    to_translate = df[(df['language'] == 'non-en') & (df['word-count'] > 8)].copy()
    if not to_translate.empty:
        progress_text = st.empty()
        progress_bar = st.progress(0)
        total = len(to_translate)
        # BUGFIX: previously the success banner reported `total` even when
        # some translations raised and were skipped; count real successes.
        translated_count = 0
        for i, (idx, row) in enumerate(to_translate.iterrows(), 1):
            progress_text.text(f"Translating non-English responses ({i} of {total})")
            try:
                translated = cached_translate(row['freeform_answer'])
                df.at[idx, 'freeform_answer'] = translated
                translated_count += 1
            except Exception as e:
                st.error(f"Error translating response {i}: {str(e)}")
            progress_bar.progress(i / total)
        progress_text.empty()
        progress_bar.empty()
        st.success(
            f"Successfully translated {translated_count} non-English responses",
            icon='✅'
        )
    df.drop(columns='language', inplace=True, errors='ignore')
    return df
|
| 115 |
+
|
| 116 |
+
@st.cache_data(show_spinner=False)
def run_topic_modeling(df):
    """
    Full pipeline for:
    1. Sentence tokenization
    2. Embedding
    3. UMAP, HDBSCAN
    4. BERTopic modeling
    5. Custom topic naming via OpenAI
    6. Merging small topics, final labeling

    Returns:
        (topic_model, updated_topics, mapping, chatgpt_topic_labels)

    NOTE(review): decorated with ``st.cache_data`` although it returns a
    BERTopic model object — ``st.cache_resource`` is usually the right cache
    for unpicklable models; confirm the return value pickles cleanly.
    NOTE(review): uses the module-level ``naming_model`` assigned in
    ``main()``; this function must not run before ``main()`` sets it.
    """
    # --- 1. Sentence tokenization ---
    # `mapping[i]` is the DataFrame index of the row sentence i came from,
    # so later steps can join sentence-level topics back onto rows.
    sentences = []
    mapping = []
    for idx, response in df['freeform_answer'].dropna().items():
        for sentence in sent_tokenize(response):
            sentences.append(sentence)
            mapping.append(idx)

    # --- 2. Embedding ---
    embedding_model = get_embedding_model()
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)

    # --- 3. UMAP, HDBSCAN ---
    # random_state fixed for reproducible UMAP projections across reruns.
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # --- 4. BERTopic model creation ---
    _topic_model, topics, probs = bertopic_model(
        sentences, embeddings, embedding_model,
        umap_model, hdbscan_model
    )

    # Merge small or closely related topics, then re-assign topics.
    _topic_model = merge_specific_topics(_topic_model, sentences)
    updated_topics, _ = _topic_model.transform(sentences)

    # --- 5. Custom topic naming via OpenAI ---
    # Topic -1 is BERTopic's outlier bucket; it gets no label.
    topic_info = _topic_model.get_topic_info()
    chatgpt_topic_labels = {}
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue
        rep_docs = _topic_model.get_representative_docs(topic_id)
        doc_text = " ".join(rep_docs[:10])  # Up to 10 docs for context
        topic_keywords = _topic_model.get_topic(topic_id) or []
        keywords_text = ", ".join([word for word, score in topic_keywords])

        prompt_template = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label
of at most 5 words. Make sure it is in the following format:

topic: <topic label>
""".strip()

        prompt_filled = prompt_template.replace("[DOCUMENTS]", doc_text).replace("[KEYWORDS]", keywords_text)
        response = naming_model.run(prompt_filled)
        label = response.strip()
        # Strip the "topic:" prefix the prompt asks the model to emit.
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        chatgpt_topic_labels[topic_id] = label

    # Defensive: -1 is skipped above, but never label the outlier topic.
    if -1 in chatgpt_topic_labels:
        del chatgpt_topic_labels[-1]
    _topic_model.set_topic_labels(chatgpt_topic_labels)

    return _topic_model, updated_topics, mapping, chatgpt_topic_labels
|
| 193 |
+
|
| 194 |
+
def process_file(uploaded_file):
    """
    Process the uploaded file, perform data cleaning, and return a processed DataFrame.

    Pipeline: read CSV/Excel -> auto-detect & confirm columns (Streamlit
    widgets) -> rename/normalise columns -> word count -> date parsing ->
    drop numeric/special-only answers -> translate non-English answers ->
    sentiment scoring.

    Returns
    -------
    tuple
        (df, row_count_delta, final_row_count, original_row_count)
    """
    # 1. Read file
    # BUGFIX: compare the extension case-insensitively so e.g. 'DATA.CSV'
    # is parsed as CSV instead of falling through to the Excel reader.
    try:
        if uploaded_file.name.lower().endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:
        st.error(f"Error reading file: {e}")
        st.stop()

    original_row_count = len(df)

    # 2. Auto-detect columns
    st.header("Data Preview")
    df_preview_col, spacer, detected_cols_col = st.columns([1, 0.05, 1])

    with df_preview_col:
        st.subheader("Raw Data Preview")
        st.dataframe(df, hide_index=True)

    with detected_cols_col:
        detected = auto_detect_columns(df)
        st.subheader("Column Detection & Selection")
        st.info(
            "We've automatically detected a few columns. Verify these are correct or select manually.",
            icon='💡'
        )
        st.json(detected)

    # Fall back to a manual picker for any required column not auto-detected.
    for req in ['freeform_answer', 'date']:
        if req not in detected:
            detected[req] = st.selectbox(f"Select column for {req}", df.columns.tolist())

    # Halt the script until the user confirms the column mapping.
    if not st.button("Continue with these columns"):
        st.stop()

    # 3. Rename columns to the canonical names, then normalise all headers.
    rename_mapping = {detected[col]: col for col in detected}
    df.rename(columns=rename_mapping, inplace=True)
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # 4. Basic cleaning steps
    if 'freeform_answer' not in df.columns:
        st.error("Column 'freeform_answer' not found.")
        st.stop()

    # Word count (0 for missing answers)
    df['word-count'] = df['freeform_answer'].apply(
        lambda x: len(str(x).split()) if pd.notnull(x) else 0
    )

    # Convert date
    if 'date' in df.columns:
        df['date'] = robust_convert_date(df['date'])
    else:
        st.error("'date' column is missing.")
        st.stop()

    # Remove numeric or special responses
    df = remove_numeric_or_special_responses(df, 'freeform_answer')

    # 5. Translate non-English
    df = translate_non_english(df)

    # 6. Sentiment
    df['sentiment-score'] = df['freeform_answer'].apply(analyze_sentiment)
    df['sentiment'] = df['sentiment-score'].apply(label_sentiment)

    final_row_count = len(df)
    row_count_delta = final_row_count - original_row_count

    return df, row_count_delta, final_row_count, original_row_count
|
| 270 |
+
|
| 271 |
+
############################
|
| 272 |
+
# APP ENTRY POINT
|
| 273 |
+
############################
|
| 274 |
+
def main():
    """Streamlit entry point: upload, process and visualise an exit survey."""
    st.title("Exit Survey Processing App")
    st.markdown("Upload your Exit Survey file in CSV or Excel format; the app cleans & processes it.")

    # Global/Shared models (read by cached_translate and run_topic_modeling)
    global openai_model, naming_model
    openai_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")
    naming_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")  # for topic naming

    # Reset button
    if st.button("Reset App"):
        st.session_state.clear()

    # File upload
    uploaded_file = st.file_uploader("Upload an exit survey file", type=["csv", "xlsx"])

    if uploaded_file:
        # Process once per session; later reruns reuse session_state.
        if 'processed_df' not in st.session_state:
            with st.spinner("Processing file..."):
                df, row_count_delta, final_row_count, original_row_count = process_file(uploaded_file)
                st.session_state['processed_df'] = df
                st.session_state['row_count_delta'] = row_count_delta
                st.session_state['final_row_count'] = final_row_count
                st.session_state['original_row_count'] = original_row_count
        else:
            df = st.session_state['processed_df']
            row_count_delta = st.session_state['row_count_delta']
            final_row_count = st.session_state['final_row_count']
            original_row_count = st.session_state['original_row_count']

        st.divider()

        ########################################
        # 1. General Overview
        ########################################
        st.header("General Overview")
        with st.container():
            metric_col1, metric_col2 = st.columns(2)
            metric_col1.metric(
                label="No. Responses After Processing",
                value=final_row_count,
                delta=row_count_delta
            )
            avg_length = int(df['word-count'].mean().round())
            metric_col2.metric(
                label="Avg. Response Length",
                value=f"{avg_length} words"
            )

        st.write("#### Data Overview")
        st.dataframe(
            df,
            hide_index=True,
            column_config={'date': st.column_config.DatetimeColumn(format="YYYY-MM-DD")}
        )

        if 'exit_reason' in df.columns:
            st.write("#### Exit Reason Distribution")
            overview = generate_cancellation_reasons_overview(df, 'exit_reason')
            reasons_bar = create_cancellation_reasons_plot(overview)
            st.plotly_chart(reasons_bar, use_container_width=True)

        ########################################
        # 2. Sentiment Analysis
        ########################################
        st.subheader("Sentiment Analysis")
        st.write("Visual representation of sentiment distribution, plus a grouped bar chart if you like.")
        exclude_cols_sentiment = ['freeform_answer', 'date', 'word-count', 'sentiment-score', 'sentiment']
        candidate_cols = [col for col in df.columns if col not in exclude_cols_sentiment and df[col].nunique() > 1]

        col_left, col_right = st.columns([2,1])
        with col_left:
            if candidate_cols:
                grouping_col = st.selectbox(
                    "Select a column to group sentiment by",
                    candidate_cols,
                    index=0
                )
                grouped_data = df.groupby([grouping_col, 'sentiment']).size().reset_index(name='count')
                st.write(f"##### Sentiment Grouped by {grouping_col}")
                chart = create_grouped_chart(grouped_data, grouping_col, 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
            else:
                st.write("##### Sentiment (no grouping column available)")
                grouped_data = df.groupby(['sentiment']).size().reset_index(name='count')
                chart = create_grouped_chart(grouped_data, 'sentiment', 'sentiment')
                st.plotly_chart(chart, use_container_width=True)

        with col_right:
            st.write("##### Overall Sentiment Distribution")
            sentiment_pie = create_sentiment_pie(df)
            st.plotly_chart(sentiment_pie, use_container_width=True)

        ########################################
        # 3. Topic Modeling
        ########################################
        st.header("Topic Modeling")

        # Only run the modeling once per data set (cached).
        _topic_model, updated_topics, mapping, chatgpt_topic_labels = run_topic_modeling(df)

        topics_df = _topic_model.get_topic_info()
        topics_df = topics_df[topics_df['Topic'] != -1].copy()  # drop outlier topic
        topics_df.drop(columns=['Name'], errors='ignore', inplace=True)
        topics_df.rename(columns={
            'CustomName': 'Topic Name',
            'Topic': 'Topic Number (ID)'
        }, inplace=True)

        # Re-arrange cols for easier viewing
        cols_order = ['Topic Number (ID)', 'Topic Name', 'Count',
                      'Representation', 'Secondary Representation', 'Representative_Docs']
        topics_df = topics_df[[c for c in cols_order if c in topics_df.columns]]

        st.subheader("Topics Barchart (Stacked by Class)")
        st.markdown("""
        Choose a categorical column from your data to visualize how frequently each topic appears
        across different classes.
        """)

        with st.expander("Explore Topic Details", expanded=False):
            st.write("""
            **Table Info:**
            - **Topic Name**: AI-generated label
            - **Representation**: Top 10 keywords
            - **Secondary Representation**: Reranked keywords for diversity
            - **Representative Docs**: Sample sentences contributing to the topic
            """)
            st.dataframe(topics_df, hide_index=True)

        # For stacked barchart, pick a class column
        exclude_cols = ["freeform_answer", "sat_score", "date",
                        "word-count", "sentiment-score", "sentiment"]
        available_cols = [c for c in df.columns if c not in exclude_cols]
        default_idx = available_cols.index("exit_reason") if "exit_reason" in available_cols else 0
        class_column = st.selectbox(
            "How to group topics for visualization?",
            available_cols,
            index=default_idx
        )

        @st.cache_data(show_spinner=False)
        def get_topics_per_class(class_col, mapping, df, sentences, _model):
            # Map each tokenized sentence back to its source row's class value.
            sentence_classes = [df.loc[idx, class_col] for idx in mapping]
            tpc = _model.topics_per_class(sentences, classes=sentence_classes)
            t_labels = _model.get_topic_info()[['Topic', 'CustomName']]
            tpc = tpc.merge(t_labels, on='Topic', how='left')
            tpc = tpc[tpc['Topic'] != -1].reset_index(drop=True)
            return tpc

        # Create stacked bar chart
        # BUGFIX: removed dead `sentences = [""] * len(mapping)` -- it was
        # immediately overwritten by the rebuild below.
        sentences = []
        for idx, response in df['freeform_answer'].dropna().items():
            for sentence in sent_tokenize(response):
                sentences.append(sentence)

        topics_per_class = get_topics_per_class(class_column, mapping, df, sentences, _topic_model)
        stacked_chart = create_stacked_topics_per_class(topics_per_class)
        st.plotly_chart(stacked_chart, use_container_width=True)

        ########################################
        # 4. Topics Over Time
        ########################################
        st.subheader("Topics Over Time")
        valid_dates = df['date'].dropna()
        if valid_dates.nunique() < 2:
            st.warning("Not enough distinct date values to plot topics over time.")
        else:
            # Build list of dates for each sentence
            sentence_dates = [df.loc[idx, 'date'] for idx in mapping]
            topics_over_time = _topic_model.topics_over_time(sentences, sentence_dates, nr_bins=20)

            # Merge custom labels
            topic_labels = _topic_model.get_topic_info()[['Topic', 'CustomName']]
            topics_over_time = topics_over_time.merge(topic_labels, on='Topic', how='left')
            topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]

            chart = create_topics_overtime_chart(topics_over_time)
            st.plotly_chart(chart, use_container_width=True)

        ########################################
        # 5. Updated DataFrame
        ########################################
        updated_df = update_df_with_topics(df, mapping, updated_topics, chatgpt_topic_labels)
        with st.expander("View Final Updated DataFrame", expanded=False):
            st.dataframe(updated_df, hide_index=True)


if __name__ == "__main__":
    main()
|
functions/auto_column_detection.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import re
|
| 4 |
+
import string
|
| 5 |
+
|
| 6 |
+
# ----------------------------------------
|
| 7 |
+
# 1. HELPER FUNCTIONS
|
| 8 |
+
# ----------------------------------------
|
| 9 |
+
|
| 10 |
+
def get_keyword_fraction(series, keywords):
    """Fraction of non-null entries in `series` that contain any keyword.

    Values are lowercased and stripped before matching, so `keywords`
    should be supplied in lowercase. Matching is substring-based via a
    single vectorized regex alternation for speed.

    Parameters:
        series (pd.Series): Column of (possibly null) values to scan.
        keywords (iterable[str]): Literal keywords to look for (escaped
            before being joined into the pattern).

    Returns:
        float: Share of non-null values containing at least one keyword,
        or 0 when the series has no non-null values.
    """
    cleaned = series.dropna().astype(str).str.lower().str.strip()
    if cleaned.empty:
        return 0
    alternation = '|'.join(re.escape(keyword) for keyword in keywords)
    return cleaned.str.contains(alternation, regex=True).mean()
|
| 21 |
+
|
| 22 |
+
def detect_keyword_based_column(
    df,
    candidate_columns,
    keywords,
    bonus_pattern=None,
    threshold=0.5,
    bonus_multiplier=1.1
):
    """Pick the candidate column whose values most often contain `keywords`.

    Each candidate is scored by the fraction of its values containing any
    keyword (via `get_keyword_fraction`). If the column *name* matches
    `bonus_pattern` (case-insensitive), the score is multiplied by
    `bonus_multiplier`.

    Parameters:
        df (pd.DataFrame): Frame holding the candidate columns.
        candidate_columns (list[str]): Column names to evaluate.
        keywords (iterable[str]): Keywords to search for in the values.
        bonus_pattern (str | None): Regex applied to column names for a bonus.
        threshold (float): Minimum score the winner must reach.
        bonus_multiplier (float): Factor applied on a name match.

    Returns:
        str | None: Best-scoring column name, or None when no candidate
        reaches `threshold` (or `candidate_columns` is empty).
    """
    scores = {}
    for candidate in candidate_columns:
        score = get_keyword_fraction(df[candidate], keywords)
        # Column-name hint boosts the content-based score.
        if bonus_pattern and re.search(bonus_pattern, candidate, re.IGNORECASE):
            score *= bonus_multiplier
        scores[candidate] = score

    if not scores:
        return None

    winner = max(scores, key=scores.get)
    return winner if scores[winner] >= threshold else None
|
| 50 |
+
|
| 51 |
+
def detect_exact_match_column(
    df,
    candidate_columns,
    expected_values,
    bonus_pattern=None,
    threshold=0.5,
    bonus_multiplier=1.1
):
    """Pick the candidate column whose values best match `expected_values`.

    Each candidate is scored by the fraction of its non-null values that,
    after lowercasing and stripping, equal one of `expected_values`
    (normalized the same way). A case-insensitive `bonus_pattern` match on
    the column name multiplies the score by `bonus_multiplier`.

    Parameters:
        df (pd.DataFrame): Frame holding the candidate columns.
        candidate_columns (list[str]): Column names to evaluate.
        expected_values (iterable): Allowed values (stringified for comparison).
        bonus_pattern (str | None): Regex applied to column names for a bonus.
        threshold (float): Minimum score the winner must reach.
        bonus_multiplier (float): Factor applied on a name match.

    Returns:
        str | None: Best-scoring column name, or None when no candidate
        reaches `threshold`.
    """
    normalized_expected = {str(value).lower().strip() for value in expected_values}
    scores = {}
    for candidate in candidate_columns:
        cleaned = df[candidate].dropna().astype(str).str.lower().str.strip()
        if cleaned.empty:
            # All-null columns carry no signal; skip them entirely.
            continue
        score = cleaned.isin(normalized_expected).mean()
        if bonus_pattern and re.search(bonus_pattern, candidate, re.IGNORECASE):
            score *= bonus_multiplier
        scores[candidate] = score

    if not scores:
        return None

    winner = max(scores, key=scores.get)
    return winner if scores[winner] >= threshold else None
|
| 83 |
+
|
| 84 |
+
# ----------------------------------------
|
| 85 |
+
# 2. REFAC: DETECTION SUBROUTINES
|
| 86 |
+
# ----------------------------------------
|
| 87 |
+
|
| 88 |
+
def detect_numeric_column(df, col_name='sat_score', min_fraction=0.9):
    """Detect the single most-numeric column in `df`.

    If exactly one column already has a numeric dtype, it wins outright.
    Otherwise every column is coerced with `pd.to_numeric` and the column
    with the highest fraction of convertible values is chosen, provided
    that fraction reaches `min_fraction`.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        col_name (str): Semantic label for the target column. Currently
            unused by the detection logic; kept for interface compatibility.
        min_fraction (float): Minimum convertible fraction required.

    Returns:
        str | None: Name of the detected column, or None.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Fast path: a lone numeric-dtype column is unambiguous.
    if len(numeric_cols) == 1:
        return numeric_cols[0]

    # Score every column by how much of it survives numeric coercion.
    fractions = {
        col: pd.to_numeric(df[col], errors='coerce').notna().mean()
        for col in df.columns
    }

    if not fractions:
        return None

    winner = max(fractions, key=fractions.get)
    return winner if fractions[winner] >= min_fraction else None
|
| 113 |
+
|
| 114 |
+
def detect_freeform_answer_column(df, penalty_for_low_uniqueness=0.4):
    """Detect the free-text ('freeform_answer') column via simple heuristics.

    For every object-dtype column we measure average string length, average
    punctuation count, and the ratio of unique values; length and punctuation
    are normalized across candidates and combined into a weighted score.
    Column-name hints ('additional_comment' boosts, 'usage_reason' penalizes)
    and a low-uniqueness penalty (categorical-looking columns) adjust it.

    Fix vs. original: the per-row locals `weight_length`/`weight_punct`/
    `weight_unique`/`norm_factor` were assigned on every loop iteration but
    never used (the weights were re-typed as magic literals later). They are
    now hoisted once as named constants and actually used.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        penalty_for_low_uniqueness (float): Unique-ratio cutoff below which
            a column's score is halved.

    Returns:
        str | None: Name of the most likely free-text column, or None when
        the frame has no usable text columns.
    """
    # Composite-score weights (single source of truth).
    WEIGHT_LENGTH = 0.4
    WEIGHT_PUNCT = 0.3
    WEIGHT_UNIQUE = 0.3
    NORM_FLOOR = 1e-9  # avoid dividing by zero when all stats are 0

    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not text_cols:
        return None

    stats = {}
    for col in text_cols:
        series = df[col].dropna().astype(str)
        if series.empty:
            continue
        punct_counts = series.apply(
            lambda x: sum(1 for char in x if char in string.punctuation)
        )
        total = len(series)
        stats[col] = {
            'avg_len': series.apply(len).mean(),
            'avg_punct': punct_counts.mean(),
            'unique_ratio': series.nunique() / total if total else 0,
        }

    if not stats:
        return None

    # Normalize length/punctuation relative to the strongest candidate.
    max_len = max(s['avg_len'] for s in stats.values()) or NORM_FLOOR
    max_punct = max(s['avg_punct'] for s in stats.values()) or NORM_FLOOR

    composite = {}
    for col, s in stats.items():
        comp_score = (
            WEIGHT_LENGTH * (s['avg_len'] / max_len)
            + WEIGHT_PUNCT * (s['avg_punct'] / max_punct)
            + WEIGHT_UNIQUE * s['unique_ratio']
        )

        # Name-based bonus/penalty hints.
        if "additional_comment" in col.lower():
            comp_score *= 3.1
        if "usage_reason" in col.lower():
            comp_score *= 0.5

        # Penalize columns that look categorical (few distinct values).
        if s['unique_ratio'] < penalty_for_low_uniqueness:
            comp_score *= 0.5

        composite[col] = comp_score

    return max(composite, key=composite.get)
|
| 171 |
+
|
| 172 |
+
def detect_date_column(df, detected_cols):
    """Detect a date/time column among columns not already assigned a role.

    Each remaining column is parsed with `pd.to_datetime(errors='coerce')`
    and scored as 0.6 * fraction_parsable + 0.4 * uniqueness_ratio, with a
    1.2x bonus when the column name contains 'date' or 'time'.

    Fix vs. original: the `infer_datetime_format=True` keyword was
    deprecated in pandas 1.x and removed in pandas 2.x (where passing it
    raises TypeError); format inference is now to_datetime's default.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        detected_cols (dict): Semantic-name -> column-name mapping of columns
            already claimed by other detectors; those columns are skipped.

    Returns:
        str | None: Best date-like column, or None if none scores >= 0.5.
    """
    # Skip columns already claimed by another semantic role.
    remaining = [col for col in df.columns if col not in detected_cols.values()]

    scores = {}
    for col in remaining:
        parsed = pd.to_datetime(df[col], errors='coerce')
        fraction_valid = parsed.notna().mean()
        total = len(parsed)
        uniqueness_ratio = parsed.nunique() / total if total > 0 else 0
        # Weighted composite of parsability and distinctness.
        score = 0.6 * fraction_valid + 0.4 * uniqueness_ratio

        # Name-based bonus.
        if re.search(r'date|time', col, re.IGNORECASE):
            score *= 1.2
        scores[col] = score

    if not scores:
        return None

    best_col = max(scores, key=scores.get)
    # The original checked >= 0.6 and then immediately fell back to >= 0.5,
    # which is equivalent to a single 0.5 cutoff.
    if scores[best_col] >= 0.5:
        return best_col
    return None
|
| 207 |
+
|
| 208 |
+
# ----------------------------------------
|
| 209 |
+
# 3. MAIN AUTO-DETECT FUNCTION
|
| 210 |
+
# ----------------------------------------
|
| 211 |
+
|
| 212 |
+
def auto_detect_columns(df):
    """Infer semantic roles for DataFrame columns using heuristics.

    Detection runs in a fixed order — numeric score, free-text answer,
    career, country, exit reason, secondary reason, date — and each step
    only considers text columns not already claimed by an earlier step.

    Parameters:
        df (pd.DataFrame): Survey export to label.

    Returns:
        dict: Mapping of semantic names (e.g. 'sat_score', 'date') to the
        matching column names; roles that could not be detected are absent.
    """
    detected = {}

    def unclaimed_text_columns():
        # Object-dtype columns that no earlier step has claimed.
        return [
            col for col in df.select_dtypes(include=['object']).columns
            if col not in detected.values()
        ]

    # Numeric satisfaction score.
    numeric_candidate = detect_numeric_column(df, col_name='sat_score', min_fraction=0.9)
    if numeric_candidate:
        detected['sat_score'] = numeric_candidate

    # Natural-language response.
    freeform_candidate = detect_freeform_answer_column(df)
    if freeform_candidate:
        detected['freeform_answer'] = freeform_candidate

    # Career / role column (keyword based).
    career_candidate = detect_keyword_based_column(
        df,
        unclaimed_text_columns(),
        ["ks3", "parent", "sen", "tutor", "grade", "esl"],
        bonus_pattern="career",
        threshold=0.5
    )
    if career_candidate:
        detected['career'] = career_candidate

    # Country column (keyword based).
    country_candidate = detect_keyword_based_column(
        df,
        unclaimed_text_columns(),
        [
            'poland','england','united states','romania','jordan','kazakhstan','thailand',
            'italy','philippines','australia','india','south africa','south korea','vietnam',
            'norway','moldova','malaysia','austria','chile','cameroon'
        ],
        bonus_pattern="country",
        threshold=0.5
    )
    if country_candidate:
        detected['country'] = country_candidate

    # Primary exit reason (exact matches against the survey's options).
    exit_reason_candidate = detect_exact_match_column(
        df,
        unclaimed_text_columns(),
        [
            "I can't afford it right now",
            "I'm not using the membership enough",
            "Other",
            "I am on family leave",
            "I can't find the resources I need",
            "I've changed careers",
            "I'm using an alternative resource provider",
            "My school has subscribed",
            "I'm unwell and not working at the moment",
            "I'm retiring"
        ],
        bonus_pattern=r'exit|reason',
        threshold=0.5
    )
    if exit_reason_candidate:
        detected['exit_reason'] = exit_reason_candidate

    # Secondary cancellation reason (exact matches).
    secondary_reason_candidate = detect_exact_match_column(
        df,
        unclaimed_text_columns(),
        [
            'Customer Service','Resource Quality','Variety of Materials',
            'Price','Ease of Website','other'
        ],
        bonus_pattern=r'secondary|reason',
        threshold=0.5
    )
    if secondary_reason_candidate:
        detected['secondary_reason'] = secondary_reason_candidate

    # Date / timestamp column, chosen from whatever is left.
    date_candidate = detect_date_column(df, detected)
    if date_candidate:
        detected['date'] = date_candidate

    print("Auto-detected columns:", detected)
    print("All columns:", df.columns.tolist())

    return detected
|
functions/broad_category_priorities.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def assign_priority(count, low_threshold=None, high_threshold=None):
    """Map a count to a 'High'/'Medium'/'Low' priority label.

    Fix vs. original: `high_threshold` and `low_threshold` were referenced
    as free variables that are never defined in this module, so every call
    raised NameError. They are now explicit parameters.

    Parameters:
        count: Value to classify (anything comparable to the thresholds).
        low_threshold: Minimum count for 'Medium' priority.
        high_threshold: Minimum count for 'High' priority.

    Returns:
        str: 'High' if count >= high_threshold, 'Medium' if
        count >= low_threshold, otherwise 'Low'.

    Raises:
        TypeError: If either threshold is omitted.
    """
    if low_threshold is None or high_threshold is None:
        raise TypeError("low_threshold and high_threshold must be provided")
    if count >= high_threshold:
        return 'High'
    if count >= low_threshold:
        return 'Medium'
    return 'Low'
|
functions/create_cancellation_reasons_table.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
def generate_cancellation_reasons_overview(df, source_col):
    """Summarize a categorical column as counts, percentages and priorities.

    Percentages are taken against the full frame length (including rows
    where `source_col` is null). Priority cut points are the 33rd and 67th
    percentiles of the per-category counts, so priority reflects how common
    a category is relative to the others.

    Parameters:
        df (pd.DataFrame): Source data.
        source_col (str): Categorical column to summarize.

    Returns:
        pd.DataFrame: Columns 'Category', 'Count', 'Percentage', 'Priority',
        ordered from most to least frequent category.
    """
    counts = df[source_col].value_counts()
    shares = counts / len(df) * 100

    # Tercile-based priority cut points over the category counts.
    low_cut = counts.quantile(0.33)
    high_cut = counts.quantile(0.67)

    def priority_for(count):
        # Priority grows with how common the category is.
        if count >= high_cut:
            return 'High'
        if count >= low_cut:
            return 'Medium'
        return 'Low'

    return pd.DataFrame({
        'Category': counts.index,
        'Count': counts.values,
        'Percentage': shares.round(1).values,
        'Priority': counts.apply(priority_for).values,
    }).reset_index(drop=True)
|
functions/language_labeling_translation.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import langid
|
| 2 |
+
import openai
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import streamlit as st
|
| 5 |
+
|
| 6 |
+
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
|
| 7 |
+
|
| 8 |
+
## -- DETECT LANGUAGE
|
| 9 |
+
|
| 10 |
+
def detect_language(text):
    """Classify `text` as English ('en') or not ('non-en') using langid.

    Parameters:
        text (str): Text to classify.

    Returns:
        str: 'en', 'non-en', or 'unknown' when classification fails
        (e.g. non-string input).
    """
    try:
        lang, _ = langid.classify(text)
        return 'en' if lang == 'en' else 'non-en'
    except Exception:
        # Fix vs. original: a bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; Exception is the widest net actually wanted.
        return "unknown"
|
| 16 |
+
|
| 17 |
+
## -- TRANSLATE TEXT
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Example: Reuse your existing OpenAIWrapper for robust retry logic.
|
| 21 |
+
# from my_wrappers import OpenAIWrapper # Hypothetical import if your wrapper is in a separate module.
|
| 22 |
+
|
| 23 |
+
def translate_text(
    text: str,
    skip_translation: bool = False,
    translator_model: Optional["OpenAIWrapper"] = None
) -> str:
    """
    Translate the provided text into English using the specified translator model.
    If 'skip_translation' is True, it returns the original text without translation.

    If the text is already in English or gibberish,
    the output should mirror the original text as per the system prompt instructions.

    Parameters:
        text (str): The text to translate.
        skip_translation (bool): Whether to skip translation entirely. Defaults to False.
        translator_model (OpenAIWrapper, optional): An instance of your OpenAIWrapper class
            for robust, retriable OpenAI calls. If None, no translation is performed.

    Returns:
        str: The translated text (or original text if skip_translation is True).
    """
    # If skip translation is set or there's no translator provided, just return the original text.
    # NOTE(review): returning the untouched text here means callers cannot
    # distinguish "translated" from "passed through".
    if skip_translation or translator_model is None:
        return text

    # Prepare a system prompt and user prompt.
    # For instance, you could store this in translator_model or pass it here.
    system_prompt = (
        "You are an expert multilingual translator working at a subscription-based EDU publishing company."
    )
    user_prompt_template = """
    Below you will find a survey response from our Exit Survey that is not in English.
    Your goal is to read it carefully to identify the original language,
    and then translate it into English being as true to the original intent as possible.

    ## RULES:
    1. Your output should ONLY contain the translated text.
    Do NOT include any additional text, information, or explanations.
    2. Do NOT wrap your answer in quotation marks.
    3. If the text seems to be in English or you can't identify the language, or the text appears
    to be gibberish, simply return the same exact text you received.

    ## TEXT FOR TRANSLATION:
    {text}
    """

    # Substitute the survey response into the prompt template.
    user_prompt = user_prompt_template.format(text=text)

    # translator_model might already have a "system" prompt built in,
    # or we can combine them here. For example:
    full_prompt = f"{system_prompt}\n\n{user_prompt}"

    # Use the run() method with robust retry logic.
    # (Adjust depending on how your wrapper is structured)
    # NOTE(review): assumes translator_model.run(prompt) returns a plain
    # string — confirm against the OpenAIWrapper implementation.
    translated_text = translator_model.run(full_prompt)

    return translated_text
|
| 80 |
+
|
functions/preprocessing_functions.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
# Precompiled pattern: one or more characters, none of them alphanumeric.
SPECIAL_ONLY_REGEX = re.compile(r'^[^A-Za-z0-9]+$')

def is_numeric_or_special(s: Any) -> bool:
    """Return True when `s` is numeric or consists only of special characters.

    Null values (None/NaN) never match. The value is stringified and
    stripped first, so padded numerics like '  12 ' count as numeric.

    Parameters:
        s (Any): The value to check.

    Returns:
        bool: True for numeric or special-only strings, False otherwise.
    """
    if pd.isnull(s):
        return False
    text = str(s).strip()

    # Numeric check: anything float() accepts counts as numeric.
    try:
        float(text)
    except ValueError:
        pass
    else:
        return True

    # Special-only check: non-empty and entirely non-alphanumeric.
    return bool(SPECIAL_ONLY_REGEX.match(text))
|
| 35 |
+
|
| 36 |
+
def remove_numeric_or_special_responses(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Drop rows whose `target_col` value is numeric or special-chars-only.

    Rows are judged by `is_numeric_or_special`; null values are kept
    (they do not match that predicate).

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        target_col (str): The name of the column to filter on.

    Returns:
        pd.DataFrame: New frame with the unwanted rows removed and the
        index reset to 0..n-1.
    """
    keep_mask = ~df[target_col].map(is_numeric_or_special)
    return df[keep_mask].reset_index(drop=True)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
#####################
|
| 53 |
+
# DATE CONVERT
|
| 54 |
+
#####################
|
| 55 |
+
|
| 56 |
+
import pandas as pd
|
| 57 |
+
import datetime
|
| 58 |
+
from dateutil import parser
|
| 59 |
+
|
| 60 |
+
def robust_convert_date(date_series):
|
| 61 |
+
"""
|
| 62 |
+
Convert a pandas Series containing dates in various formats to datetime objects.
|
| 63 |
+
|
| 64 |
+
This function tries:
|
| 65 |
+
1. The built-in pd.to_datetime() with infer_datetime_format and dayfirst options.
|
| 66 |
+
2. Falls back to dateutil.parser.parse for any values that remain unparsed.
|
| 67 |
+
|
| 68 |
+
Parameters:
|
| 69 |
+
date_series (pd.Series): A pandas Series with date values (as strings, numbers, etc.)
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
pd.Series: A Series of datetime objects (or pd.NaT if conversion fails)
|
| 73 |
+
"""
|
| 74 |
+
def convert_single(x):
|
| 75 |
+
# If the value is already a datetime, just return it.
|
| 76 |
+
if pd.isnull(x):
|
| 77 |
+
return pd.NaT
|
| 78 |
+
if isinstance(x, (pd.Timestamp, datetime.datetime)):
|
| 79 |
+
return x
|
| 80 |
+
# First, try using pd.to_datetime with coercion.
|
| 81 |
+
dt = pd.to_datetime(x, errors='coerce', infer_datetime_format=True, dayfirst=True)
|
| 82 |
+
if pd.notnull(dt):
|
| 83 |
+
return dt
|
| 84 |
+
# Fallback: use dateutil.parser to attempt parsing.
|
| 85 |
+
try:
|
| 86 |
+
return parser.parse(str(x), dayfirst=True)
|
| 87 |
+
except Exception:
|
| 88 |
+
return pd.NaT
|
| 89 |
+
|
| 90 |
+
return date_series.apply(convert_single)
|
| 91 |
+
|
functions/sentiment_analysis.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from textblob import TextBlob
|
| 2 |
+
|
| 3 |
+
def analyze_sentiment(text):
    """Return the TextBlob polarity of `text`, a float in [-1.0, 1.0]."""
    return TextBlob(text).sentiment.polarity
|
| 6 |
+
|
| 7 |
+
def label_sentiment(score, threshold=0.2, negative_threshold=0.0):
    """Map a polarity score to a 'Positive'/'Negative'/'Neutral' label.

    Generalization vs. original: the negative cutoff was hard-coded to 0
    while the positive cutoff was configurable; it is now a parameter with
    a default preserving the previous behavior.

    Parameters:
        score (float): Sentiment polarity (e.g. from analyze_sentiment).
        threshold (float): Scores strictly above this are 'Positive'.
        negative_threshold (float): Scores strictly below this are
            'Negative'. Defaults to 0.0 (the old hard-coded cutoff).

    Returns:
        str: 'Positive', 'Negative', or 'Neutral'.
    """
    if score > threshold:
        return 'Positive'
    if score < negative_threshold:
        return 'Negative'
    return 'Neutral'
|
functions/topicModeling_contentRequests.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import re
|
| 3 |
+
import string
|
| 4 |
+
import torch
|
| 5 |
+
import spacy
|
| 6 |
+
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
import nltk
|
| 9 |
+
from nltk.corpus import stopwords
|
| 10 |
+
import contractions
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 14 |
+
from bertopic import BERTopic
|
| 15 |
+
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
|
| 16 |
+
import openai
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
"""
|
| 24 |
+
-----------------------------------
|
| 25 |
+
Lemmatization & Stopword Removal
|
| 26 |
+
-----------------------------------
|
| 27 |
+
|
| 28 |
+
"""
|
| 29 |
+
def topicModeling_preprocessing(df, spacy_model="en_core_web_lg"):
    """Clean and lemmatize `df['preprocessedBasic']` for topic modeling.

    Pipeline per row: expand contractions, collapse whitespace, strip NLTK
    English stopwords plus domain-specific ones, lemmatize with spaCy.
    The result is written to a new 'processedForModeling' column and rows
    that end up empty are dropped.

    Fixes vs. original:
      * The stopword regex was case-sensitive while every stopword is
        lowercase, so capitalized stopwords ("The", "I") survived removal;
        matching is now case-insensitive.
      * `stopwords.words(...)` read the module-level name `stopwords`,
        which a later statement in this module rebinds to a plain list —
        calling this function then raised AttributeError. The NLTK corpus
        is now reached through `nltk.corpus.stopwords` (the module already
        does `import nltk`), which the shadowing cannot break.

    Parameters:
        df (pd.DataFrame): Must contain a 'preprocessedBasic' text column.
        spacy_model (str): Name of the spaCy model to load.

    Returns:
        pd.DataFrame: `df` with 'processedForModeling' populated and
        empty/non-string rows removed.
    """
    base_stopwords = set(nltk.corpus.stopwords.words('english'))

    custom_stopwords = {
        'material', 'materials', 'resources', 'resource', 'activity',
        'activities', 'sheet', 'sheets', 'worksheet', 'worksheets',
        'teacher', 'teachers', 'teach', 'high school', 'highschool',
        'middle school', 'grade', 'grades', 'hs', 'level', 'age', 'ages',
        'older', 'older kid', 'kid', 'student', "1st", "2nd", "3rd", "4th", '5th', '6th',
        '7th', '8th', '9th'
    }

    stopword_set = base_stopwords.union(custom_stopwords)

    stopword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stopword_set) + r')\b'

    nlp = spacy.load(spacy_model)

    def clean_lemmatize_text(text):
        if not isinstance(text, str):
            return None

        text = contractions.fix(text)
        text = re.sub(r'\s+', ' ', text).strip()
        # Case-insensitive so capitalized stopwords are removed too.
        text = re.sub(stopword_pattern, '', text, flags=re.IGNORECASE)

        doc = nlp(text)
        tokens = [token.lemma_ for token in doc]

        clean_text = " ".join(tokens).strip()
        clean_text = re.sub(r'\s+', ' ', clean_text)

        return clean_text if clean_text else None

    df['processedForModeling'] = df['preprocessedBasic'].apply(clean_lemmatize_text)

    # Drop rows where the cleaned text came out empty or None.
    df = df.dropna(subset=['processedForModeling'])

    return df
|
| 71 |
+
|
| 72 |
+
"""
|
| 73 |
+
--------------------------
|
| 74 |
+
Load Transformer Model
|
| 75 |
+
--------------------------
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
@st.cache_resource
def load_embedding_model():
    """Load the sentence-transformer on the best available device.

    Device preference: CUDA GPU, then Apple MPS, then CPU. Cached by
    Streamlit so the model is only loaded once per session.

    Returns:
        SentenceTransformer: The 'paraphrase-mpnet-base-v2' model.
    """
    if torch.cuda.is_available():
        chosen_device = "cuda"
    else:
        chosen_device = "mps" if torch.backends.mps.is_available() else "cpu"

    st.write(f"Using device: {chosen_device}")
    return SentenceTransformer("paraphrase-mpnet-base-v2", device=chosen_device)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
"""
|
| 92 |
+
-------------------------
|
| 93 |
+
Batch Embedding Creation
|
| 94 |
+
-------------------------
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
def encode_content_documents(embedding_model, content_documents, batch_size=20):
    """Embed documents in fixed-size batches and stack into one matrix.

    Parameters:
        embedding_model: Object exposing
            .encode(docs, convert_to_numpy=..., show_progress_bar=...).
        content_documents (sequence[str]): Texts to embed.
        batch_size (int): Number of documents per encode() call.

    Returns:
        np.ndarray: Array of shape (len(content_documents), embedding_dim).
    """
    batches = [
        embedding_model.encode(
            content_documents[start:start + batch_size],
            convert_to_numpy=True,
            show_progress_bar=True,
        )
        for start in range(0, len(content_documents), batch_size)
    ]
    return np.vstack(batches)
|
| 106 |
+
|
| 107 |
+
"""
|
| 108 |
+
-----------------------------
|
| 109 |
+
Topic Modeling with BERTopic
|
| 110 |
+
-----------------------------
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
# Stopword list for BERTopic's CountVectorizer: NLTK English stopwords plus
# domain terms that dominate this survey text without carrying topic signal.
# NOTE(review): this rebinds the module-level name `stopwords` from the
# imported nltk corpus object to a plain list, shadowing the import — any
# later call to `stopwords.words(...)` in this module will fail; verify
# nothing else here still expects the corpus object.
stopwords = list(stopwords.words('english')) + [
    'activities',
    'activity',
    'class',
    'classroom',
    'material',
    'materials',
    'membership',
    'memberships',
    'pupil',
    'pupils',
    'resource',
    'resources',
    'sheet',
    'sheets',
    'student',
    'students',
    'subscription',
    'subscriptions',
    'subscribe',
    'subscribed',
    'recommend',
    'recommendation',
    'teach',
    'teacher',
    'teachers',
    'tutor',
    'tutors',
    'twinkl',
    'twinkls',
    'twinkle',
    'worksheet',
    'worksheets',
]
|
| 147 |
+
|
| 148 |
+
######### --------------- BERTOPIC ----------------- #############
|
| 149 |
+
@st.cache_resource
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
    """Fit a BERTopic model over pre-computed document embeddings.

    Underscore-prefixed parameters are excluded from Streamlit's cache key
    (they are unhashable model objects).

    Fix vs. original: an OpenAI representation model (client, prompt and
    wrapper) and a `seed_topic_list` were constructed here but never passed
    to BERTopic; that dead setup — including an unused OpenAI client — has
    been removed. Re-add an "OpenAI" entry to `representation_model` (and/or
    `seed_topic_list=...` on the constructor) if those features are wanted.

    Parameters:
        docs (list[str]): Documents to model.
        embeddings (np.ndarray): Pre-computed embeddings aligned with docs.
        _embedding_model: Sentence-transformer used by BERTopic internals.
        _umap_model: Dimensionality-reduction model.
        _hdbscan_model: Clustering model.

    Returns:
        tuple: (fitted BERTopic model, topic assignments, probabilities).
    """
    representation_model = {
        "Main": KeyBERTInspired(),
        "Secondary Representation": MaximalMarginalRelevance(diversity=.3),
    }

    # `stopwords` here is the module-level custom list defined above.
    vectorizer_model = CountVectorizer(min_df=2, max_df=0.60, stop_words=stopwords)

    topic_model = BERTopic(
        verbose=True,
        embedding_model=_embedding_model,
        umap_model=_umap_model,
        hdbscan_model=_hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model, topics, probs
|
| 192 |
+
|
| 193 |
+
##################################
|
| 194 |
+
# TOPIC MERGING
|
| 195 |
+
##################################
|
| 196 |
+
|
| 197 |
+
def merge_specific_topics(topic_model, sentences,
                          cancellation_keywords=["cancel", "cancellation", "canceled"],
                          thanks_keywords=["thank", "thanks", "thank you", "thankyou", "ty", "thx"],
                          expensive_keywords=["can't afford", "price", "expensive", "cost"]):
    """Merge groups of near-duplicate topics into one topic each.

    Topics whose names match any cancellation / thank-you / price keyword are
    collected into per-theme groups; each group with two or more members is
    merged in place via ``topic_model.merge_topics``.

    Parameters
    ----------
    topic_model : BERTopic
        A fitted model exposing ``get_topic_info()`` and ``merge_topics()``.
    sentences : list[str]
        The documents the model was fitted on (required by ``merge_topics``).
    cancellation_keywords, thanks_keywords, expensive_keywords : list[str]
        Substrings matched case-insensitively against topic names.
        (fix: removed a duplicate "cancel" from the default list.)

    Returns
    -------
    BERTopic
        The same ``topic_model`` instance, possibly with topics merged.
    """
    topic_info = topic_model.get_topic_info()

    def _matching_topics(keywords):
        # Alternation regex over the keywords; they are plain words/phrases,
        # so no regex escaping is needed here.
        pattern = '|'.join(keywords)
        matches = topic_info[
            topic_info['Name'].str.contains(pattern, case=False, na=False)
        ]['Topic'].tolist()
        # Never merge the outlier topic (-1).
        return [t for t in matches if t != -1]

    # Build one merge group per theme; a group needs >= 2 topics to merge.
    topics_to_merge = []
    for label, keywords in (("cancellation", cancellation_keywords),
                            ("thank-you", thanks_keywords),
                            ("expensive", expensive_keywords)):
        group = _matching_topics(keywords)
        if len(group) > 1:
            print(f"Merging {label} topics: {group}")
            topics_to_merge.append(group)

    if topics_to_merge:
        topic_model.merge_topics(sentences, topics_to_merge)

    return topic_model
+
##################################
|
| 251 |
+
# Topic to Dataframe Mapping
|
| 252 |
+
#################################
|
| 253 |
+
|
| 254 |
+
def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
    """Attach a comma-separated 'Topics' column to a copy of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    mapping : sequence[int]
        ``mapping[i]`` is the row index of *df* that sentence ``i`` came from.
    sentence_topics : sequence[int]
        Topic id assigned to each sentence (-1 marks the outlier topic).
    topic_label_map : dict[int, str]
        Topic id -> human-readable label; unmapped ids fall back to ``str(id)``.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with a 'Topics' column of sorted, comma-joined labels.
    """
    # Group topic ids by the originating row; sets drop duplicates.
    row_to_topics = {}
    for topic, row_idx in zip(sentence_topics, mapping):
        row_to_topics.setdefault(row_idx, set()).add(topic)

    result = df.copy()

    def _labels_for(row_idx):
        assigned = row_to_topics.get(row_idx, set())
        # Outlier topic (-1) is never surfaced to the user.
        names = sorted(topic_label_map.get(t, str(t)) for t in assigned if t != -1)
        return ", ".join(names)

    result['Topics'] = result.index.map(_labels_for)
    return result
|
plots/overview_charts.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.express as px
|
| 2 |
+
|
| 3 |
+
# Shared typography settings used by the chart builders in this module.
legend_font_size=14
xaxis_font_size=16
ticks_size=14
|
| 6 |
+
|
| 7 |
+
## -- WORD COUNT PLOT
|
| 8 |
+
|
| 9 |
+
def create_word_count_histogram(df, nbins=40, height=550):
    """Histogram of the 'word-count' column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'word-count' column.
    nbins : int
        Number of histogram bins.
    height : int
        Figure height in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    histogram = px.histogram(
        df,
        x='word-count',
        nbins=nbins,
        title=None,
        color_discrete_sequence=['#646DEF'],
    )

    # Trim the top margin since there is no title to make room for.
    histogram.update_layout(height=height, margin=dict(t=30))

    return histogram
| 26 |
+
## -- SENTIMENT PLOT
|
| 27 |
+
|
| 28 |
+
def create_sentiment_pie(df, height=450):
    """Donut chart of sentiment counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sentiment' column with 'Positive' / 'Neutral' /
        'Negative' values.
    height : int
        Figure height in pixels. (fix: this argument was previously
        accepted but never applied to the figure.)

    Returns
    -------
    plotly.graph_objects.Figure
    """
    sentiment_pie = px.pie(
        df,
        names='sentiment',
        color='sentiment',
        color_discrete_map={ 'Positive':'darkturquoise', 'Neutral':'#646DEF', 'Negative':'red'},
        hole=0.45,
        title=None
    )

    # Hover shows only the sentiment label — no count/percent clutter.
    sentiment_pie.update_traces(hovertemplate='%{label}<extra></extra>')

    sentiment_pie.update_layout(
        height=height,
        showlegend=False,
        margin=dict(r=50),
        # Legend configuration is kept so re-enabling showlegend "just works":
        # horizontal, centered under the chart.
        legend=dict(
            font=dict(size=legend_font_size),
            orientation="h",
            x=0.5,
            xanchor="center",
        )
    )
    return sentiment_pie
|
| 53 |
+
## -- CANCELLATION REASONS
|
| 54 |
+
|
| 55 |
+
def create_cancellation_reasons_plot(cancellation_overview):
    """Bar chart of cancellation-reason counts with percentage on hover.

    Parameters
    ----------
    cancellation_overview : pandas.DataFrame
        Must contain 'Category', 'Count' and 'Percentage' columns.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # fix: dropped the `color_discrete_map` argument — without a `color=`
    # column it was silently ignored by Plotly Express.
    reasons_bar = px.bar(
        cancellation_overview,
        x='Category',
        y='Count',
        color_discrete_sequence=['#646DEF'],
    )

    # Attach the percentage column so hover shows both count and share.
    reasons_bar.update_traces(
        customdata=cancellation_overview['Percentage'],
        hovertemplate='Count = %{y}<br>Percentage = %{customdata}%'
    )

    # Axis titles are blanked; category/tick fonts use the module-wide sizes.
    reasons_bar.update_layout(
        height=600,
        xaxis_title="",
        yaxis_title="",
        xaxis=dict(title_font=dict(size=xaxis_font_size), tickfont=dict(size=ticks_size)),
    )

    return reasons_bar
|
| 88 |
+
############# Grouped By Career ############
|
| 89 |
+
|
| 90 |
+
def create_grouped_chart(grouped_df, group_name_col, color_col):
    """Stacked bar chart of counts per group, colored by sentiment.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Must contain a 'count' column plus the two columns named below.
    group_name_col : str
        Column providing the x-axis categories.
    color_col : str
        Column providing the stacked-segment labels
        ('Positive' / 'Neutral' / 'Negative').

    Returns
    -------
    plotly.graph_objects.Figure
    """
    stacked_bar = px.bar(
        grouped_df,
        x=group_name_col,
        y='count',
        color=color_col,
        color_discrete_map={'Positive':'darkturquoise', 'Neutral':'#646DEF', 'Negative':'red'},
        title=None,
        barmode="stack",
    )

    # Horizontal legend anchored above the top-left corner of the plot area.
    stacked_bar.update_layout(
        legend=dict(
            orientation='h',
            x=-0.05,
            xanchor="left",
            y=1.2,
            yanchor="top",
        )
    )
    return stacked_bar
|
plots/topicModeling_charts.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bertopic import BERTopic # Ensure you have BERTopic installed
|
| 2 |
+
import plotly.graph_objects as go # BERTopic visualization uses Plotly
|
| 3 |
+
import plotly.colors as pc
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
|
| 6 |
+
# Shared font sizing for the topic-modeling charts in this module.
xaxis_font_size=14
ticks_size=14
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def topicDistribution(topic_model, top_n_topics=6, n_words=5):
    """Per-topic keyword bar chart, recolored with Plotly's qualitative palette.

    Parameters
    ----------
    topic_model : BERTopic
        Fitted model exposing ``visualize_barchart``.
    top_n_topics : int
        Number of topics to show.
    n_words : int
        Keywords per topic.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    barchart = topic_model.visualize_barchart(top_n_topics=top_n_topics, n_words=n_words)

    # Give each topic subplot its own color, cycling when topics outnumber colors.
    palette = pc.qualitative.Plotly
    for idx, trace in enumerate(barchart.data):
        trace.marker.color = palette[idx % len(palette)]

    barchart.update_layout(title_text="")  # drop BERTopic's default title

    return barchart
|
| 20 |
+
|
| 21 |
+
####################
|
| 22 |
+
# TOPIC FREQUENCY
|
| 23 |
+
###################
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def create_topicFreq_chart(topics_df):
    """Bar chart of topic frequency with each topic's top 5 words on hover.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        BERTopic topic-info frame with 'Topic Name' and 'Count' columns;
        the fourth column is assumed to hold the topic's representation
        word list — TODO confirm against topic_model.get_topic_info().

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # fix: work on a copy — the previous version added 'top_5_words' to the
    # caller's DataFrame in place.
    plot_df = topics_df.copy()

    # Join the first five representation words into a hover-friendly string;
    # pass non-list cells through unchanged.
    plot_df['top_5_words'] = plot_df.iloc[:, 3].apply(
        lambda words: ', '.join(words[:5]) if isinstance(words, list) else words
    )

    # fix: removed the misspelled, ineffective '"Topic": "CutomName"' entry
    # from `labels` (the x column is 'Topic Name', so it never applied).
    topicFreq_barchart = px.bar(
        plot_df,
        x="Topic Name",
        y="Count",
        custom_data=["top_5_words"],
        title=None,
        labels={"Count": "Frequency"},
    )

    topicFreq_barchart.update_traces(
        marker_color='#646DEF',
        textposition='outside',
        hovertemplate=(
            'Frequency: %{y}<br>'
            'Top 5 words: %{customdata[0]}<extra></extra>'
        )
    )

    topicFreq_barchart.update_layout(
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title="Topic Name",
        yaxis_title="Frequency",
        height=650,
        xaxis=dict(title_font=dict(size=xaxis_font_size), tickfont=dict(size=ticks_size)),
    )

    return topicFreq_barchart
| 64 |
+
###############################
|
| 65 |
+
# Stacked Topic Freq Per Class
|
| 66 |
+
###############################
|
| 67 |
+
|
| 68 |
+
def create_stacked_topics_per_class(df):
    """Stacked bar chart of topic frequency broken down by class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CustomName' (topic name), 'Frequency' and 'Class'
        columns.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    topics_per_class_chart = px.bar(
        df,
        x="CustomName",
        y="Frequency",
        color="Class",
        title=None,
        barmode="stack",
        labels={"Count": "Frequency", "Topics": "CustomName"},
    )

    topics_per_class_chart.update_layout(
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title="Topic Name",
        yaxis_title="Frequency",
        height=650,
    )

    return topics_per_class_chart
|
| 90 |
+
#######################
|
| 91 |
+
# Intertopic Distance
|
| 92 |
+
#######################
|
| 93 |
+
|
| 94 |
+
def intertopicDistanceMap(topic_model, color="orangered"):
    """Intertopic distance map with a uniform bubble color and no title.

    Parameters
    ----------
    topic_model : BERTopic
        Fitted model exposing ``visualize_topics``.
    color : str
        Marker color applied to every topic bubble.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    distance_map = topic_model.visualize_topics(title="")

    # Recolor every topic bubble and drop the marker outline.
    for trace in distance_map.data:
        trace.marker.color = color
        trace.marker.line.width = 0

    distance_map.update_layout(margin=dict(r=50))

    return distance_map
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
##########################
|
| 113 |
+
# Topics Over Time
|
| 114 |
+
#########################
|
| 115 |
+
|
| 116 |
+
def create_topics_overtime_chart(topics_overtime_df):
    """Line chart of topic frequency over time, one line per topic.

    Parameters
    ----------
    topics_overtime_df : pandas.DataFrame
        Must contain 'Timestamp', 'Frequency' and 'CustomName' columns.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    overtime_chart = px.line(
        topics_overtime_df,
        x="Timestamp",
        y="Frequency",
        color="CustomName",
        markers=True,
        title=None,
        labels={"Timestamp": "Time", "Frequency": "Topic Frequency", "Name": "CustomName"},
    )

    # Horizontal legend centered well below the plot so many topic names fit.
    overtime_chart.update_layout(
        xaxis_title="Time",
        yaxis_title="Frequency",
        legend_title="Topics",
        height=700,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.5,
            xanchor="center",
            x=0.5,
        ),
    )

    return overtime_chart
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
bertopic==0.16.4
|
| 2 |
+
contractions==0.1.73
|
| 3 |
+
hdbscan==0.8.40
|
| 4 |
+
langid==1.1.6
|
| 5 |
+
nltk==3.9.1
|
| 6 |
+
numpy==2.2.3
|
| 7 |
+
openai==1.65.2
|
| 8 |
+
pandas==2.2.3
|
| 9 |
+
plotly==5.24.1
|
| 10 |
+
python_dateutil==2.9.0.post0
|
| 11 |
+
scikit_learn==1.6.1
|
| 12 |
+
sentence_transformers==3.3.1
|
| 13 |
+
spacy==3.8.2
|
| 14 |
+
streamlit==1.42.2
|
| 15 |
+
tenacity==9.0.0
|
| 16 |
+
textblob==0.19.0
|
| 17 |
+
torch==2.5.1
|
| 18 |
+
umap_learn==0.5.7
|