lynn-twinkl commited on
Commit ·
2e164d2
1
Parent(s): 47fac11
first commit
Browse files- .gitignore +166 -0
- app.py +464 -0
- functions/auto_column_detection.py +310 -0
- functions/broad_category_priorities.py +8 -0
- functions/create_cancellation_reasons_table.py +28 -0
- functions/language_labeling_translation.py +80 -0
- functions/preprocessing_functions.py +91 -0
- functions/sentiment_analysis.py +13 -0
- functions/topicModeling_contentRequests.py +269 -0
- plots/overview_charts.py +111 -0
- plots/topicModeling_charts.py +141 -0
- requirements.txt +18 -0
.gitignore
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 110 |
+
.pdm.toml
|
| 111 |
+
.pdm-python
|
| 112 |
+
.pdm-build/
|
| 113 |
+
|
| 114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 115 |
+
__pypackages__/
|
| 116 |
+
|
| 117 |
+
# Celery stuff
|
| 118 |
+
celerybeat-schedule
|
| 119 |
+
celerybeat.pid
|
| 120 |
+
|
| 121 |
+
# SageMath parsed files
|
| 122 |
+
*.sage.py
|
| 123 |
+
|
| 124 |
+
# Environments
|
| 125 |
+
.env
|
| 126 |
+
.venv
|
| 127 |
+
env/
|
| 128 |
+
venv/
|
| 129 |
+
ENV/
|
| 130 |
+
env.bak/
|
| 131 |
+
venv.bak/
|
| 132 |
+
|
| 133 |
+
# Spyder project settings
|
| 134 |
+
.spyderproject
|
| 135 |
+
.spyproject
|
| 136 |
+
|
| 137 |
+
# Rope project settings
|
| 138 |
+
.ropeproject
|
| 139 |
+
|
| 140 |
+
# mkdocs documentation
|
| 141 |
+
/site
|
| 142 |
+
|
| 143 |
+
# mypy
|
| 144 |
+
.mypy_cache/
|
| 145 |
+
.dmypy.json
|
| 146 |
+
dmypy.json
|
| 147 |
+
|
| 148 |
+
# Pyre type checker
|
| 149 |
+
.pyre/
|
| 150 |
+
|
| 151 |
+
# pytype static type analyzer
|
| 152 |
+
.pytype/
|
| 153 |
+
|
| 154 |
+
# Cython debug symbols
|
| 155 |
+
cython_debug/
|
| 156 |
+
|
| 157 |
+
#Local Files
|
| 158 |
+
.DS_Store
|
| 159 |
+
secrets.toml
|
| 160 |
+
|
| 161 |
+
# PyCharm
|
| 162 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 163 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 164 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 165 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 166 |
+
#.idea/
|
app.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
###############################
|
| 2 |
+
# IMPORTS & CONFIG
|
| 3 |
+
###############################
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import time
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from nltk.tokenize import sent_tokenize
|
| 9 |
+
from hdbscan import HDBSCAN
|
| 10 |
+
from umap import UMAP
|
| 11 |
+
from openai import OpenAI
|
| 12 |
+
from tenacity import retry, wait_exponential, stop_after_attempt
|
| 13 |
+
|
| 14 |
+
from functions.auto_column_detection import auto_detect_columns
|
| 15 |
+
from functions.preprocessing_functions import remove_numeric_or_special_responses, robust_convert_date
|
| 16 |
+
from functions.language_labeling_translation import detect_language, translate_text
|
| 17 |
+
from functions.sentiment_analysis import analyze_sentiment, label_sentiment
|
| 18 |
+
from functions.create_cancellation_reasons_table import generate_cancellation_reasons_overview
|
| 19 |
+
from html_helpers.cancellation_reasons_table_html import generate_cancellation_table_html
|
| 20 |
+
|
| 21 |
+
from functions.topicModeling_contentRequests import (
|
| 22 |
+
load_embedding_model,
|
| 23 |
+
bertopic_model,
|
| 24 |
+
merge_specific_topics,
|
| 25 |
+
update_df_with_topics
|
| 26 |
+
)
|
| 27 |
+
from plots.overview_charts import (
|
| 28 |
+
create_word_count_histogram,
|
| 29 |
+
create_sentiment_pie,
|
| 30 |
+
create_cancellation_reasons_plot,
|
| 31 |
+
create_grouped_chart
|
| 32 |
+
)
|
| 33 |
+
from plots.topicModeling_charts import (
|
| 34 |
+
create_topics_overtime_chart,
|
| 35 |
+
create_stacked_topics_per_class
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
############################
|
| 39 |
+
# STREAMLIT APP CONFIGURATION
|
| 40 |
+
############################
|
| 41 |
+
st.set_page_config(
|
| 42 |
+
layout='wide',
|
| 43 |
+
page_title="Exit Survey Processing App",
|
| 44 |
+
initial_sidebar_state="expanded",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Global settings
|
| 48 |
+
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
|
| 49 |
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
| 50 |
+
|
| 51 |
+
###############################
|
| 52 |
+
# HELPER CLASSES & FUNCTIONS
|
| 53 |
+
###############################
|
| 54 |
+
class OpenAIWrapper:
    """Thin wrapper around the OpenAI chat-completions endpoint.

    Adds exponential-backoff retries (via tenacity) and carries a
    configurable system prompt, so callers only supply the user text.
    """

    def __init__(self, model, prompt=""):
        self.model = model    # e.g. "gpt-4o-mini"
        self.prompt = prompt  # system prompt prepended to every request

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def run(self, user_text):
        """Send ``user_text`` to the model and return the reply text.

        Any API failure is surfaced in the Streamlit UI, then re-raised so
        tenacity can retry (up to 5 attempts with exponential backoff).
        """
        messages = [
            {"role": "system", "content": self.prompt},
            {"role": "user", "content": user_text},
        ]
        try:
            completion = client.chat.completions.create(
                model=self.model,
                messages=messages,
            )
            return completion.choices[0].message.content
        except Exception as e:
            st.error(f"Error during OpenAI API call: {e}")
            raise
|
| 77 |
+
|
| 78 |
+
@st.cache_data(show_spinner=False)
def cached_translate(text):
    """Cached translation function to reduce repeated OpenAI calls.

    Memoised by Streamlit on ``text`` alone.

    NOTE(review): relies on the module-level ``openai_model``, which is only
    assigned inside ``main()`` (via ``global``); calling this before ``main()``
    runs would raise NameError. Also, because the cache key is the text only,
    switching the translator model would not invalidate earlier results --
    confirm this is acceptable.
    """
    return translate_text(text, skip_translation=False, translator_model=openai_model)
|
| 82 |
+
|
| 83 |
+
@st.cache_resource(show_spinner=False)
def get_embedding_model():
    """Load and cache the sentence-embedding model for topic modeling.

    Uses ``st.cache_resource`` (not ``cache_data``) so the heavyweight,
    unpicklable model object is created once per server process and shared
    across reruns and sessions.
    """
    return load_embedding_model()
|
| 87 |
+
|
| 88 |
+
def translate_non_english(df):
    """
    Translate non-English free-text answers in 'freeform_answer' in place.

    A row is translated only when `detect_language` labels it 'non-en' AND it
    has more than 8 words (short answers are left untouched). Progress is
    shown in the Streamlit UI; individual failures are reported but do not
    abort the batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'freeform_answer' and 'word-count' columns.

    Returns
    -------
    pandas.DataFrame
        The same frame with translated answers; the temporary 'language'
        column is dropped before returning.
    """
    df['language'] = df['freeform_answer'].apply(detect_language)
    to_translate = df[(df['language'] == 'non-en') & (df['word-count'] > 8)].copy()
    if not to_translate.empty:
        progress_text = st.empty()
        progress_bar = st.progress(0)
        total = len(to_translate)
        # BUGFIX: previously the success banner reported `total` even when
        # some translations raised and were skipped; count real successes.
        translated_count = 0
        for i, (idx, row) in enumerate(to_translate.iterrows(), 1):
            progress_text.text(f"Translating non-English responses ({i} of {total})")
            try:
                translated = cached_translate(row['freeform_answer'])
                df.at[idx, 'freeform_answer'] = translated
                translated_count += 1
            except Exception as e:
                st.error(f"Error translating response {i}: {str(e)}")
            progress_bar.progress(i / total)
        progress_text.empty()
        progress_bar.empty()
        st.success(
            f"Successfully translated {translated_count} non-English responses",
            icon='✅'
        )
    df.drop(columns='language', inplace=True, errors='ignore')
    return df
|
| 115 |
+
|
| 116 |
+
@st.cache_data(show_spinner=False)
def run_topic_modeling(df):
    """
    Full pipeline for:
    1. Sentence tokenization
    2. Embedding
    3. UMAP, HDBSCAN
    4. BERTopic modeling
    5. Custom topic naming via OpenAI
    6. Merging small topics, final labeling

    Returns:
        (topic_model, updated_topics, mapping, chatgpt_topic_labels)

    NOTE(review): decorated with ``st.cache_data`` although it returns a
    BERTopic model object — ``st.cache_resource`` is usually the right cache
    for unpicklable models; confirm the return value pickles cleanly.
    NOTE(review): uses the module-level ``naming_model`` assigned in
    ``main()``; this function must not run before ``main()`` sets it.
    """
    # --- 1. Sentence tokenization ---
    # `mapping[i]` is the DataFrame index of the row sentence i came from,
    # so later steps can join sentence-level topics back onto rows.
    sentences = []
    mapping = []
    for idx, response in df['freeform_answer'].dropna().items():
        for sentence in sent_tokenize(response):
            sentences.append(sentence)
            mapping.append(idx)

    # --- 2. Embedding ---
    embedding_model = get_embedding_model()
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)

    # --- 3. UMAP, HDBSCAN ---
    # random_state fixed for reproducible UMAP projections across reruns.
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # --- 4. BERTopic model creation ---
    _topic_model, topics, probs = bertopic_model(
        sentences, embeddings, embedding_model,
        umap_model, hdbscan_model
    )

    # Merge small or closely related topics, then re-assign topics.
    _topic_model = merge_specific_topics(_topic_model, sentences)
    updated_topics, _ = _topic_model.transform(sentences)

    # --- 5. Custom topic naming via OpenAI ---
    # Topic -1 is BERTopic's outlier bucket; it gets no label.
    topic_info = _topic_model.get_topic_info()
    chatgpt_topic_labels = {}
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue
        rep_docs = _topic_model.get_representative_docs(topic_id)
        doc_text = " ".join(rep_docs[:10])  # Up to 10 docs for context
        topic_keywords = _topic_model.get_topic(topic_id) or []
        keywords_text = ", ".join([word for word, score in topic_keywords])

        prompt_template = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label
of at most 5 words. Make sure it is in the following format:

topic: <topic label>
""".strip()

        prompt_filled = prompt_template.replace("[DOCUMENTS]", doc_text).replace("[KEYWORDS]", keywords_text)
        response = naming_model.run(prompt_filled)
        label = response.strip()
        # Strip the "topic:" prefix the prompt asks the model to emit.
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        chatgpt_topic_labels[topic_id] = label

    # Defensive: -1 is skipped above, but never label the outlier topic.
    if -1 in chatgpt_topic_labels:
        del chatgpt_topic_labels[-1]
    _topic_model.set_topic_labels(chatgpt_topic_labels)

    return _topic_model, updated_topics, mapping, chatgpt_topic_labels
|
| 193 |
+
|
| 194 |
+
def process_file(uploaded_file):
    """
    Process the uploaded file, perform data cleaning, and return a processed DataFrame.

    Pipeline: read CSV/Excel -> auto-detect & confirm columns (Streamlit
    widgets) -> rename/normalise columns -> word count -> date parsing ->
    drop numeric/special-only answers -> translate non-English answers ->
    sentiment scoring.

    Returns
    -------
    tuple
        (df, row_count_delta, final_row_count, original_row_count)
    """
    # 1. Read file
    # BUGFIX: compare the extension case-insensitively so e.g. 'DATA.CSV'
    # is parsed as CSV instead of falling through to the Excel reader.
    try:
        if uploaded_file.name.lower().endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:
        st.error(f"Error reading file: {e}")
        st.stop()

    original_row_count = len(df)

    # 2. Auto-detect columns
    st.header("Data Preview")
    df_preview_col, spacer, detected_cols_col = st.columns([1, 0.05, 1])

    with df_preview_col:
        st.subheader("Raw Data Preview")
        st.dataframe(df, hide_index=True)

    with detected_cols_col:
        detected = auto_detect_columns(df)
        st.subheader("Column Detection & Selection")
        st.info(
            "We've automatically detected a few columns. Verify these are correct or select manually.",
            icon='💡'
        )
        st.json(detected)

    # Fall back to a manual picker for any required column not auto-detected.
    for req in ['freeform_answer', 'date']:
        if req not in detected:
            detected[req] = st.selectbox(f"Select column for {req}", df.columns.tolist())

    # Halt the script until the user confirms the column mapping.
    if not st.button("Continue with these columns"):
        st.stop()

    # 3. Rename columns to the canonical names, then normalise all headers.
    rename_mapping = {detected[col]: col for col in detected}
    df.rename(columns=rename_mapping, inplace=True)
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # 4. Basic cleaning steps
    if 'freeform_answer' not in df.columns:
        st.error("Column 'freeform_answer' not found.")
        st.stop()

    # Word count (0 for missing answers)
    df['word-count'] = df['freeform_answer'].apply(
        lambda x: len(str(x).split()) if pd.notnull(x) else 0
    )

    # Convert date
    if 'date' in df.columns:
        df['date'] = robust_convert_date(df['date'])
    else:
        st.error("'date' column is missing.")
        st.stop()

    # Remove numeric or special responses
    df = remove_numeric_or_special_responses(df, 'freeform_answer')

    # 5. Translate non-English
    df = translate_non_english(df)

    # 6. Sentiment
    df['sentiment-score'] = df['freeform_answer'].apply(analyze_sentiment)
    df['sentiment'] = df['sentiment-score'].apply(label_sentiment)

    final_row_count = len(df)
    row_count_delta = final_row_count - original_row_count

    return df, row_count_delta, final_row_count, original_row_count
|
| 270 |
+
|
| 271 |
+
############################
|
| 272 |
+
# APP ENTRY POINT
|
| 273 |
+
############################
|
| 274 |
+
def main():
    """Streamlit entry point: upload, process and visualise an exit survey."""
    st.title("Exit Survey Processing App")
    st.markdown("Upload your Exit Survey file in CSV or Excel format; the app cleans & processes it.")

    # Global/Shared models (read by cached_translate and run_topic_modeling)
    global openai_model, naming_model
    openai_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")
    naming_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")  # for topic naming

    # Reset button
    if st.button("Reset App"):
        st.session_state.clear()

    # File upload
    uploaded_file = st.file_uploader("Upload an exit survey file", type=["csv", "xlsx"])

    if uploaded_file:
        # Process once per session; later reruns reuse session_state.
        if 'processed_df' not in st.session_state:
            with st.spinner("Processing file..."):
                df, row_count_delta, final_row_count, original_row_count = process_file(uploaded_file)
                st.session_state['processed_df'] = df
                st.session_state['row_count_delta'] = row_count_delta
                st.session_state['final_row_count'] = final_row_count
                st.session_state['original_row_count'] = original_row_count
        else:
            df = st.session_state['processed_df']
            row_count_delta = st.session_state['row_count_delta']
            final_row_count = st.session_state['final_row_count']
            original_row_count = st.session_state['original_row_count']

        st.divider()

        ########################################
        # 1. General Overview
        ########################################
        st.header("General Overview")
        with st.container():
            metric_col1, metric_col2 = st.columns(2)
            metric_col1.metric(
                label="No. Responses After Processing",
                value=final_row_count,
                delta=row_count_delta
            )
            avg_length = int(df['word-count'].mean().round())
            metric_col2.metric(
                label="Avg. Response Length",
                value=f"{avg_length} words"
            )

        st.write("#### Data Overview")
        st.dataframe(
            df,
            hide_index=True,
            column_config={'date': st.column_config.DatetimeColumn(format="YYYY-MM-DD")}
        )

        if 'exit_reason' in df.columns:
            st.write("#### Exit Reason Distribution")
            overview = generate_cancellation_reasons_overview(df, 'exit_reason')
            reasons_bar = create_cancellation_reasons_plot(overview)
            st.plotly_chart(reasons_bar, use_container_width=True)

        ########################################
        # 2. Sentiment Analysis
        ########################################
        st.subheader("Sentiment Analysis")
        st.write("Visual representation of sentiment distribution, plus a grouped bar chart if you like.")
        exclude_cols_sentiment = ['freeform_answer', 'date', 'word-count', 'sentiment-score', 'sentiment']
        candidate_cols = [col for col in df.columns if col not in exclude_cols_sentiment and df[col].nunique() > 1]

        col_left, col_right = st.columns([2,1])
        with col_left:
            if candidate_cols:
                grouping_col = st.selectbox(
                    "Select a column to group sentiment by",
                    candidate_cols,
                    index=0
                )
                grouped_data = df.groupby([grouping_col, 'sentiment']).size().reset_index(name='count')
                st.write(f"##### Sentiment Grouped by {grouping_col}")
                chart = create_grouped_chart(grouped_data, grouping_col, 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
            else:
                st.write("##### Sentiment (no grouping column available)")
                grouped_data = df.groupby(['sentiment']).size().reset_index(name='count')
                chart = create_grouped_chart(grouped_data, 'sentiment', 'sentiment')
                st.plotly_chart(chart, use_container_width=True)

        with col_right:
            st.write("##### Overall Sentiment Distribution")
            sentiment_pie = create_sentiment_pie(df)
            st.plotly_chart(sentiment_pie, use_container_width=True)

        ########################################
        # 3. Topic Modeling
        ########################################
        st.header("Topic Modeling")

        # Only run the modeling once per data set (cached).
        _topic_model, updated_topics, mapping, chatgpt_topic_labels = run_topic_modeling(df)

        topics_df = _topic_model.get_topic_info()
        topics_df = topics_df[topics_df['Topic'] != -1].copy()  # drop outlier topic
        topics_df.drop(columns=['Name'], errors='ignore', inplace=True)
        topics_df.rename(columns={
            'CustomName': 'Topic Name',
            'Topic': 'Topic Number (ID)'
        }, inplace=True)

        # Re-arrange cols for easier viewing
        cols_order = ['Topic Number (ID)', 'Topic Name', 'Count',
                      'Representation', 'Secondary Representation', 'Representative_Docs']
        topics_df = topics_df[[c for c in cols_order if c in topics_df.columns]]

        st.subheader("Topics Barchart (Stacked by Class)")
        st.markdown("""
        Choose a categorical column from your data to visualize how frequently each topic appears
        across different classes.
        """)

        with st.expander("Explore Topic Details", expanded=False):
            st.write("""
            **Table Info:**
            - **Topic Name**: AI-generated label
            - **Representation**: Top 10 keywords
            - **Secondary Representation**: Reranked keywords for diversity
            - **Representative Docs**: Sample sentences contributing to the topic
            """)
            st.dataframe(topics_df, hide_index=True)

        # For stacked barchart, pick a class column
        exclude_cols = ["freeform_answer", "sat_score", "date",
                        "word-count", "sentiment-score", "sentiment"]
        available_cols = [c for c in df.columns if c not in exclude_cols]
        default_idx = available_cols.index("exit_reason") if "exit_reason" in available_cols else 0
        class_column = st.selectbox(
            "How to group topics for visualization?",
            available_cols,
            index=default_idx
        )

        @st.cache_data(show_spinner=False)
        def get_topics_per_class(class_col, mapping, df, sentences, _model):
            # Map each tokenized sentence back to its source row's class value.
            sentence_classes = [df.loc[idx, class_col] for idx in mapping]
            tpc = _model.topics_per_class(sentences, classes=sentence_classes)
            t_labels = _model.get_topic_info()[['Topic', 'CustomName']]
            tpc = tpc.merge(t_labels, on='Topic', how='left')
            tpc = tpc[tpc['Topic'] != -1].reset_index(drop=True)
            return tpc

        # Create stacked bar chart
        # BUGFIX: removed dead `sentences = [""] * len(mapping)` -- it was
        # immediately overwritten by the rebuild below.
        sentences = []
        for idx, response in df['freeform_answer'].dropna().items():
            for sentence in sent_tokenize(response):
                sentences.append(sentence)

        topics_per_class = get_topics_per_class(class_column, mapping, df, sentences, _topic_model)
        stacked_chart = create_stacked_topics_per_class(topics_per_class)
        st.plotly_chart(stacked_chart, use_container_width=True)

        ########################################
        # 4. Topics Over Time
        ########################################
        st.subheader("Topics Over Time")
        valid_dates = df['date'].dropna()
        if valid_dates.nunique() < 2:
            st.warning("Not enough distinct date values to plot topics over time.")
        else:
            # Build list of dates for each sentence
            sentence_dates = [df.loc[idx, 'date'] for idx in mapping]
            topics_over_time = _topic_model.topics_over_time(sentences, sentence_dates, nr_bins=20)

            # Merge custom labels
            topic_labels = _topic_model.get_topic_info()[['Topic', 'CustomName']]
            topics_over_time = topics_over_time.merge(topic_labels, on='Topic', how='left')
            topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]

            chart = create_topics_overtime_chart(topics_over_time)
            st.plotly_chart(chart, use_container_width=True)

        ########################################
        # 5. Updated DataFrame
        ########################################
        updated_df = update_df_with_topics(df, mapping, updated_topics, chatgpt_topic_labels)
        with st.expander("View Final Updated DataFrame", expanded=False):
            st.dataframe(updated_df, hide_index=True)


if __name__ == "__main__":
    main()
|
functions/auto_column_detection.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import re
|
| 4 |
+
import string
|
| 5 |
+
|
| 6 |
+
# ----------------------------------------
|
| 7 |
+
# 1. HELPER FUNCTIONS
|
| 8 |
+
# ----------------------------------------
|
| 9 |
+
|
| 10 |
+
def get_keyword_fraction(series, keywords):
    """Fraction of non-null entries in `series` that contain any keyword.

    Values are lowercased and stripped before matching, so `keywords`
    should be supplied in lowercase. Matching is substring-based via a
    single vectorized regex alternation for speed.

    Parameters:
        series (pd.Series): Column of (possibly null) values to scan.
        keywords (iterable[str]): Literal keywords to look for (escaped
            before being joined into the pattern).

    Returns:
        float: Share of non-null values containing at least one keyword,
        or 0 when the series has no non-null values.
    """
    cleaned = series.dropna().astype(str).str.lower().str.strip()
    if cleaned.empty:
        return 0
    alternation = '|'.join(re.escape(keyword) for keyword in keywords)
    return cleaned.str.contains(alternation, regex=True).mean()
|
| 21 |
+
|
| 22 |
+
def detect_keyword_based_column(
    df,
    candidate_columns,
    keywords,
    bonus_pattern=None,
    threshold=0.5,
    bonus_multiplier=1.1
):
    """Pick the candidate column whose values most often contain `keywords`.

    Each candidate is scored by the fraction of its values containing any
    keyword (via `get_keyword_fraction`). If the column *name* matches
    `bonus_pattern` (case-insensitive), the score is multiplied by
    `bonus_multiplier`.

    Parameters:
        df (pd.DataFrame): Frame holding the candidate columns.
        candidate_columns (list[str]): Column names to evaluate.
        keywords (iterable[str]): Keywords to search for in the values.
        bonus_pattern (str | None): Regex applied to column names for a bonus.
        threshold (float): Minimum score the winner must reach.
        bonus_multiplier (float): Factor applied on a name match.

    Returns:
        str | None: Best-scoring column name, or None when no candidate
        reaches `threshold` (or `candidate_columns` is empty).
    """
    scores = {}
    for candidate in candidate_columns:
        score = get_keyword_fraction(df[candidate], keywords)
        # Column-name hint boosts the content-based score.
        if bonus_pattern and re.search(bonus_pattern, candidate, re.IGNORECASE):
            score *= bonus_multiplier
        scores[candidate] = score

    if not scores:
        return None

    winner = max(scores, key=scores.get)
    return winner if scores[winner] >= threshold else None
|
| 50 |
+
|
| 51 |
+
def detect_exact_match_column(
    df,
    candidate_columns,
    expected_values,
    bonus_pattern=None,
    threshold=0.5,
    bonus_multiplier=1.1
):
    """Pick the candidate column whose values best match `expected_values`.

    Each candidate is scored by the fraction of its non-null values that,
    after lowercasing and stripping, equal one of `expected_values`
    (normalized the same way). A case-insensitive `bonus_pattern` match on
    the column name multiplies the score by `bonus_multiplier`.

    Parameters:
        df (pd.DataFrame): Frame holding the candidate columns.
        candidate_columns (list[str]): Column names to evaluate.
        expected_values (iterable): Allowed values (stringified for comparison).
        bonus_pattern (str | None): Regex applied to column names for a bonus.
        threshold (float): Minimum score the winner must reach.
        bonus_multiplier (float): Factor applied on a name match.

    Returns:
        str | None: Best-scoring column name, or None when no candidate
        reaches `threshold`.
    """
    normalized_expected = {str(value).lower().strip() for value in expected_values}
    scores = {}
    for candidate in candidate_columns:
        cleaned = df[candidate].dropna().astype(str).str.lower().str.strip()
        if cleaned.empty:
            # All-null columns carry no signal; skip them entirely.
            continue
        score = cleaned.isin(normalized_expected).mean()
        if bonus_pattern and re.search(bonus_pattern, candidate, re.IGNORECASE):
            score *= bonus_multiplier
        scores[candidate] = score

    if not scores:
        return None

    winner = max(scores, key=scores.get)
    return winner if scores[winner] >= threshold else None
|
| 83 |
+
|
| 84 |
+
# ----------------------------------------
|
| 85 |
+
# 2. REFAC: DETECTION SUBROUTINES
|
| 86 |
+
# ----------------------------------------
|
| 87 |
+
|
| 88 |
+
def detect_numeric_column(df, col_name='sat_score', min_fraction=0.9):
    """Detect the single most-numeric column in `df`.

    If exactly one column already has a numeric dtype, it wins outright.
    Otherwise every column is coerced with `pd.to_numeric` and the column
    with the highest fraction of convertible values is chosen, provided
    that fraction reaches `min_fraction`.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        col_name (str): Semantic label for the target column. Currently
            unused by the detection logic; kept for interface compatibility.
        min_fraction (float): Minimum convertible fraction required.

    Returns:
        str | None: Name of the detected column, or None.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Fast path: a lone numeric-dtype column is unambiguous.
    if len(numeric_cols) == 1:
        return numeric_cols[0]

    # Score every column by how much of it survives numeric coercion.
    fractions = {
        col: pd.to_numeric(df[col], errors='coerce').notna().mean()
        for col in df.columns
    }

    if not fractions:
        return None

    winner = max(fractions, key=fractions.get)
    return winner if fractions[winner] >= min_fraction else None
|
| 113 |
+
|
| 114 |
+
def detect_freeform_answer_column(df, penalty_for_low_uniqueness=0.4):
    """Detect the free-text ('freeform_answer') column via simple heuristics.

    For every object-dtype column we measure average string length, average
    punctuation count, and the ratio of unique values; length and punctuation
    are normalized across candidates and combined into a weighted score.
    Column-name hints ('additional_comment' boosts, 'usage_reason' penalizes)
    and a low-uniqueness penalty (categorical-looking columns) adjust it.

    Fix vs. original: the per-row locals `weight_length`/`weight_punct`/
    `weight_unique`/`norm_factor` were assigned on every loop iteration but
    never used (the weights were re-typed as magic literals later). They are
    now hoisted once as named constants and actually used.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        penalty_for_low_uniqueness (float): Unique-ratio cutoff below which
            a column's score is halved.

    Returns:
        str | None: Name of the most likely free-text column, or None when
        the frame has no usable text columns.
    """
    # Composite-score weights (single source of truth).
    WEIGHT_LENGTH = 0.4
    WEIGHT_PUNCT = 0.3
    WEIGHT_UNIQUE = 0.3
    NORM_FLOOR = 1e-9  # avoid dividing by zero when all stats are 0

    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not text_cols:
        return None

    stats = {}
    for col in text_cols:
        series = df[col].dropna().astype(str)
        if series.empty:
            continue
        punct_counts = series.apply(
            lambda x: sum(1 for char in x if char in string.punctuation)
        )
        total = len(series)
        stats[col] = {
            'avg_len': series.apply(len).mean(),
            'avg_punct': punct_counts.mean(),
            'unique_ratio': series.nunique() / total if total else 0,
        }

    if not stats:
        return None

    # Normalize length/punctuation relative to the strongest candidate.
    max_len = max(s['avg_len'] for s in stats.values()) or NORM_FLOOR
    max_punct = max(s['avg_punct'] for s in stats.values()) or NORM_FLOOR

    composite = {}
    for col, s in stats.items():
        comp_score = (
            WEIGHT_LENGTH * (s['avg_len'] / max_len)
            + WEIGHT_PUNCT * (s['avg_punct'] / max_punct)
            + WEIGHT_UNIQUE * s['unique_ratio']
        )

        # Name-based bonus/penalty hints.
        if "additional_comment" in col.lower():
            comp_score *= 3.1
        if "usage_reason" in col.lower():
            comp_score *= 0.5

        # Penalize columns that look categorical (few distinct values).
        if s['unique_ratio'] < penalty_for_low_uniqueness:
            comp_score *= 0.5

        composite[col] = comp_score

    return max(composite, key=composite.get)
|
| 171 |
+
|
| 172 |
+
def detect_date_column(df, detected_cols):
    """Detect a date/time column among columns not already assigned a role.

    Each remaining column is parsed with `pd.to_datetime(errors='coerce')`
    and scored as 0.6 * fraction_parsable + 0.4 * uniqueness_ratio, with a
    1.2x bonus when the column name contains 'date' or 'time'.

    Fix vs. original: the `infer_datetime_format=True` keyword was
    deprecated in pandas 1.x and removed in pandas 2.x (where passing it
    raises TypeError); format inference is now to_datetime's default.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        detected_cols (dict): Semantic-name -> column-name mapping of columns
            already claimed by other detectors; those columns are skipped.

    Returns:
        str | None: Best date-like column, or None if none scores >= 0.5.
    """
    # Skip columns already claimed by another semantic role.
    remaining = [col for col in df.columns if col not in detected_cols.values()]

    scores = {}
    for col in remaining:
        parsed = pd.to_datetime(df[col], errors='coerce')
        fraction_valid = parsed.notna().mean()
        total = len(parsed)
        uniqueness_ratio = parsed.nunique() / total if total > 0 else 0
        # Weighted composite of parsability and distinctness.
        score = 0.6 * fraction_valid + 0.4 * uniqueness_ratio

        # Name-based bonus.
        if re.search(r'date|time', col, re.IGNORECASE):
            score *= 1.2
        scores[col] = score

    if not scores:
        return None

    best_col = max(scores, key=scores.get)
    # The original checked >= 0.6 and then immediately fell back to >= 0.5,
    # which is equivalent to a single 0.5 cutoff.
    if scores[best_col] >= 0.5:
        return best_col
    return None
|
| 207 |
+
|
| 208 |
+
# ----------------------------------------
|
| 209 |
+
# 3. MAIN AUTO-DETECT FUNCTION
|
| 210 |
+
# ----------------------------------------
|
| 211 |
+
|
| 212 |
+
def auto_detect_columns(df):
    """Infer semantic roles for DataFrame columns using heuristics.

    Detection runs in a fixed order — numeric score, free-text answer,
    career, country, exit reason, secondary reason, date — and each step
    only considers text columns not already claimed by an earlier step.

    Parameters:
        df (pd.DataFrame): Survey export to label.

    Returns:
        dict: Mapping of semantic names (e.g. 'sat_score', 'date') to the
        matching column names; roles that could not be detected are absent.
    """
    detected = {}

    def unclaimed_text_columns():
        # Object-dtype columns that no earlier step has claimed.
        return [
            col for col in df.select_dtypes(include=['object']).columns
            if col not in detected.values()
        ]

    # Numeric satisfaction score.
    numeric_candidate = detect_numeric_column(df, col_name='sat_score', min_fraction=0.9)
    if numeric_candidate:
        detected['sat_score'] = numeric_candidate

    # Natural-language response.
    freeform_candidate = detect_freeform_answer_column(df)
    if freeform_candidate:
        detected['freeform_answer'] = freeform_candidate

    # Career / role column (keyword based).
    career_candidate = detect_keyword_based_column(
        df,
        unclaimed_text_columns(),
        ["ks3", "parent", "sen", "tutor", "grade", "esl"],
        bonus_pattern="career",
        threshold=0.5
    )
    if career_candidate:
        detected['career'] = career_candidate

    # Country column (keyword based).
    country_candidate = detect_keyword_based_column(
        df,
        unclaimed_text_columns(),
        [
            'poland','england','united states','romania','jordan','kazakhstan','thailand',
            'italy','philippines','australia','india','south africa','south korea','vietnam',
            'norway','moldova','malaysia','austria','chile','cameroon'
        ],
        bonus_pattern="country",
        threshold=0.5
    )
    if country_candidate:
        detected['country'] = country_candidate

    # Primary exit reason (exact matches against the survey's options).
    exit_reason_candidate = detect_exact_match_column(
        df,
        unclaimed_text_columns(),
        [
            "I can't afford it right now",
            "I'm not using the membership enough",
            "Other",
            "I am on family leave",
            "I can't find the resources I need",
            "I've changed careers",
            "I'm using an alternative resource provider",
            "My school has subscribed",
            "I'm unwell and not working at the moment",
            "I'm retiring"
        ],
        bonus_pattern=r'exit|reason',
        threshold=0.5
    )
    if exit_reason_candidate:
        detected['exit_reason'] = exit_reason_candidate

    # Secondary cancellation reason (exact matches).
    secondary_reason_candidate = detect_exact_match_column(
        df,
        unclaimed_text_columns(),
        [
            'Customer Service','Resource Quality','Variety of Materials',
            'Price','Ease of Website','other'
        ],
        bonus_pattern=r'secondary|reason',
        threshold=0.5
    )
    if secondary_reason_candidate:
        detected['secondary_reason'] = secondary_reason_candidate

    # Date / timestamp column, chosen from whatever is left.
    date_candidate = detect_date_column(df, detected)
    if date_candidate:
        detected['date'] = date_candidate

    print("Auto-detected columns:", detected)
    print("All columns:", df.columns.tolist())

    return detected
|
functions/broad_category_priorities.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def assign_priority(count, low_threshold=None, high_threshold=None):
    """Map a count to a 'High'/'Medium'/'Low' priority label.

    Fix vs. original: `high_threshold` and `low_threshold` were referenced
    as free variables that are never defined in this module, so every call
    raised NameError. They are now explicit parameters.

    Parameters:
        count: Value to classify (anything comparable to the thresholds).
        low_threshold: Minimum count for 'Medium' priority.
        high_threshold: Minimum count for 'High' priority.

    Returns:
        str: 'High' if count >= high_threshold, 'Medium' if
        count >= low_threshold, otherwise 'Low'.

    Raises:
        TypeError: If either threshold is omitted.
    """
    if low_threshold is None or high_threshold is None:
        raise TypeError("low_threshold and high_threshold must be provided")
    if count >= high_threshold:
        return 'High'
    if count >= low_threshold:
        return 'Medium'
    return 'Low'
|
functions/create_cancellation_reasons_table.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
def generate_cancellation_reasons_overview(df, source_col):
    """Summarize a categorical column as counts, percentages and priorities.

    Percentages are taken against the full frame length (including rows
    where `source_col` is null). Priority cut points are the 33rd and 67th
    percentiles of the per-category counts, so priority reflects how common
    a category is relative to the others.

    Parameters:
        df (pd.DataFrame): Source data.
        source_col (str): Categorical column to summarize.

    Returns:
        pd.DataFrame: Columns 'Category', 'Count', 'Percentage', 'Priority',
        ordered from most to least frequent category.
    """
    counts = df[source_col].value_counts()
    shares = counts / len(df) * 100

    # Tercile-based priority cut points over the category counts.
    low_cut = counts.quantile(0.33)
    high_cut = counts.quantile(0.67)

    def priority_for(count):
        # Priority grows with how common the category is.
        if count >= high_cut:
            return 'High'
        if count >= low_cut:
            return 'Medium'
        return 'Low'

    return pd.DataFrame({
        'Category': counts.index,
        'Count': counts.values,
        'Percentage': shares.round(1).values,
        'Priority': counts.apply(priority_for).values,
    }).reset_index(drop=True)
|
functions/language_labeling_translation.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import langid
|
| 2 |
+
import openai
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import streamlit as st
|
| 5 |
+
|
| 6 |
+
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
|
| 7 |
+
|
| 8 |
+
## -- DETECT LANGUAGE
|
| 9 |
+
|
| 10 |
+
def detect_language(text):
    """Classify `text` as English ('en') or not ('non-en') using langid.

    Parameters:
        text (str): Text to classify.

    Returns:
        str: 'en', 'non-en', or 'unknown' when classification fails
        (e.g. non-string input).
    """
    try:
        lang, _ = langid.classify(text)
        return 'en' if lang == 'en' else 'non-en'
    except Exception:
        # Fix vs. original: a bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; Exception is the widest net actually wanted.
        return "unknown"
|
| 16 |
+
|
| 17 |
+
## -- TRANSLATE TEXT
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Example: Reuse your existing OpenAIWrapper for robust retry logic.
|
| 21 |
+
# from my_wrappers import OpenAIWrapper # Hypothetical import if your wrapper is in a separate module.
|
| 22 |
+
|
| 23 |
+
def translate_text(
    text: str,
    skip_translation: bool = False,
    translator_model: Optional["OpenAIWrapper"] = None
) -> str:
    """
    Translate the provided text into English using the specified translator model.
    If 'skip_translation' is True, it returns the original text without translation.

    If the text is already in English or gibberish,
    the output should mirror the original text as per the system prompt instructions.

    Parameters:
        text (str): The text to translate.
        skip_translation (bool): Whether to skip translation entirely. Defaults to False.
        translator_model (OpenAIWrapper, optional): An instance of your OpenAIWrapper class
            for robust, retriable OpenAI calls. If None, no translation is performed.

    Returns:
        str: The translated text (or original text if skip_translation is True).
    """
    # If skip translation is set or there's no translator provided, just return the original text.
    # NOTE(review): returning the untouched text here means callers cannot
    # distinguish "translated" from "passed through".
    if skip_translation or translator_model is None:
        return text

    # Prepare a system prompt and user prompt.
    # For instance, you could store this in translator_model or pass it here.
    system_prompt = (
        "You are an expert multilingual translator working at a subscription-based EDU publishing company."
    )
    user_prompt_template = """
    Below you will find a survey response from our Exit Survey that is not in English.
    Your goal is to read it carefully to identify the original language,
    and then translate it into English being as true to the original intent as possible.

    ## RULES:
    1. Your output should ONLY contain the translated text.
    Do NOT include any additional text, information, or explanations.
    2. Do NOT wrap your answer in quotation marks.
    3. If the text seems to be in English or you can't identify the language, or the text appears
    to be gibberish, simply return the same exact text you received.

    ## TEXT FOR TRANSLATION:
    {text}
    """

    # Substitute the survey response into the prompt template.
    user_prompt = user_prompt_template.format(text=text)

    # translator_model might already have a "system" prompt built in,
    # or we can combine them here. For example:
    full_prompt = f"{system_prompt}\n\n{user_prompt}"

    # Use the run() method with robust retry logic.
    # (Adjust depending on how your wrapper is structured)
    # NOTE(review): assumes translator_model.run(prompt) returns a plain
    # string — confirm against the OpenAIWrapper implementation.
    translated_text = translator_model.run(full_prompt)

    return translated_text
|
| 80 |
+
|
functions/preprocessing_functions.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
# Precompiled pattern: one or more characters, none of them alphanumeric.
SPECIAL_ONLY_REGEX = re.compile(r'^[^A-Za-z0-9]+$')

def is_numeric_or_special(s: Any) -> bool:
    """Return True when `s` is numeric or consists only of special characters.

    Null values (None/NaN) never match. The value is stringified and
    stripped first, so padded numerics like '  12 ' count as numeric.

    Parameters:
        s (Any): The value to check.

    Returns:
        bool: True for numeric or special-only strings, False otherwise.
    """
    if pd.isnull(s):
        return False
    text = str(s).strip()

    # Numeric check: anything float() accepts counts as numeric.
    try:
        float(text)
    except ValueError:
        pass
    else:
        return True

    # Special-only check: non-empty and entirely non-alphanumeric.
    return bool(SPECIAL_ONLY_REGEX.match(text))
|
| 35 |
+
|
| 36 |
+
def remove_numeric_or_special_responses(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Drop rows whose `target_col` value is numeric or special-chars-only.

    Rows are judged by `is_numeric_or_special`; null values are kept
    (they do not match that predicate).

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        target_col (str): The name of the column to filter on.

    Returns:
        pd.DataFrame: New frame with the unwanted rows removed and the
        index reset to 0..n-1.
    """
    keep_mask = ~df[target_col].map(is_numeric_or_special)
    return df[keep_mask].reset_index(drop=True)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
#####################
|
| 53 |
+
# DATE CONVERT
|
| 54 |
+
#####################
|
| 55 |
+
|
| 56 |
+
import pandas as pd
|
| 57 |
+
import datetime
|
| 58 |
+
from dateutil import parser
|
| 59 |
+
|
| 60 |
+
def robust_convert_date(date_series):
|
| 61 |
+
"""
|
| 62 |
+
Convert a pandas Series containing dates in various formats to datetime objects.
|
| 63 |
+
|
| 64 |
+
This function tries:
|
| 65 |
+
1. The built-in pd.to_datetime() with infer_datetime_format and dayfirst options.
|
| 66 |
+
2. Falls back to dateutil.parser.parse for any values that remain unparsed.
|
| 67 |
+
|
| 68 |
+
Parameters:
|
| 69 |
+
date_series (pd.Series): A pandas Series with date values (as strings, numbers, etc.)
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
pd.Series: A Series of datetime objects (or pd.NaT if conversion fails)
|
| 73 |
+
"""
|
| 74 |
+
def convert_single(x):
|
| 75 |
+
# If the value is already a datetime, just return it.
|
| 76 |
+
if pd.isnull(x):
|
| 77 |
+
return pd.NaT
|
| 78 |
+
if isinstance(x, (pd.Timestamp, datetime.datetime)):
|
| 79 |
+
return x
|
| 80 |
+
# First, try using pd.to_datetime with coercion.
|
| 81 |
+
dt = pd.to_datetime(x, errors='coerce', infer_datetime_format=True, dayfirst=True)
|
| 82 |
+
if pd.notnull(dt):
|
| 83 |
+
return dt
|
| 84 |
+
# Fallback: use dateutil.parser to attempt parsing.
|
| 85 |
+
try:
|
| 86 |
+
return parser.parse(str(x), dayfirst=True)
|
| 87 |
+
except Exception:
|
| 88 |
+
return pd.NaT
|
| 89 |
+
|
| 90 |
+
return date_series.apply(convert_single)
|
| 91 |
+
|
functions/sentiment_analysis.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from textblob import TextBlob
|
| 2 |
+
|
| 3 |
+
def analyze_sentiment(text):
    """Return the TextBlob polarity of `text`, a float in [-1.0, 1.0]."""
    return TextBlob(text).sentiment.polarity
|
| 6 |
+
|
| 7 |
+
def label_sentiment(score, threshold=0.2, negative_threshold=0.0):
    """Map a polarity score to a 'Positive'/'Negative'/'Neutral' label.

    Generalization vs. original: the negative cutoff was hard-coded to 0
    while the positive cutoff was configurable; it is now a parameter with
    a default preserving the previous behavior.

    Parameters:
        score (float): Sentiment polarity (e.g. from analyze_sentiment).
        threshold (float): Scores strictly above this are 'Positive'.
        negative_threshold (float): Scores strictly below this are
            'Negative'. Defaults to 0.0 (the old hard-coded cutoff).

    Returns:
        str: 'Positive', 'Negative', or 'Neutral'.
    """
    if score > threshold:
        return 'Positive'
    if score < negative_threshold:
        return 'Negative'
    return 'Neutral'
|
functions/topicModeling_contentRequests.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import re
|
| 3 |
+
import string
|
| 4 |
+
import torch
|
| 5 |
+
import spacy
|
| 6 |
+
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
import nltk
|
| 9 |
+
from nltk.corpus import stopwords
|
| 10 |
+
import contractions
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 14 |
+
from bertopic import BERTopic
|
| 15 |
+
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
|
| 16 |
+
import openai
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
"""
|
| 24 |
+
-----------------------------------
|
| 25 |
+
Lemmatization & Stopword Removal
|
| 26 |
+
-----------------------------------
|
| 27 |
+
|
| 28 |
+
"""
|
| 29 |
+
def topicModeling_preprocessing(df, spacy_model="en_core_web_lg"):
    """Clean and lemmatize `df['preprocessedBasic']` for topic modeling.

    Pipeline per row: expand contractions, collapse whitespace, strip NLTK
    English stopwords plus domain-specific ones, lemmatize with spaCy.
    The result is written to a new 'processedForModeling' column and rows
    that end up empty are dropped.

    Fixes vs. original:
      * The stopword regex was case-sensitive while every stopword is
        lowercase, so capitalized stopwords ("The", "I") survived removal;
        matching is now case-insensitive.
      * `stopwords.words(...)` read the module-level name `stopwords`,
        which a later statement in this module rebinds to a plain list —
        calling this function then raised AttributeError. The NLTK corpus
        is now reached through `nltk.corpus.stopwords` (the module already
        does `import nltk`), which the shadowing cannot break.

    Parameters:
        df (pd.DataFrame): Must contain a 'preprocessedBasic' text column.
        spacy_model (str): Name of the spaCy model to load.

    Returns:
        pd.DataFrame: `df` with 'processedForModeling' populated and
        empty/non-string rows removed.
    """
    base_stopwords = set(nltk.corpus.stopwords.words('english'))

    custom_stopwords = {
        'material', 'materials', 'resources', 'resource', 'activity',
        'activities', 'sheet', 'sheets', 'worksheet', 'worksheets',
        'teacher', 'teachers', 'teach', 'high school', 'highschool',
        'middle school', 'grade', 'grades', 'hs', 'level', 'age', 'ages',
        'older', 'older kid', 'kid', 'student', "1st", "2nd", "3rd", "4th", '5th', '6th',
        '7th', '8th', '9th'
    }

    stopword_set = base_stopwords.union(custom_stopwords)

    stopword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stopword_set) + r')\b'

    nlp = spacy.load(spacy_model)

    def clean_lemmatize_text(text):
        if not isinstance(text, str):
            return None

        text = contractions.fix(text)
        text = re.sub(r'\s+', ' ', text).strip()
        # Case-insensitive so capitalized stopwords are removed too.
        text = re.sub(stopword_pattern, '', text, flags=re.IGNORECASE)

        doc = nlp(text)
        tokens = [token.lemma_ for token in doc]

        clean_text = " ".join(tokens).strip()
        clean_text = re.sub(r'\s+', ' ', clean_text)

        return clean_text if clean_text else None

    df['processedForModeling'] = df['preprocessedBasic'].apply(clean_lemmatize_text)

    # Drop rows where the cleaned text came out empty or None.
    df = df.dropna(subset=['processedForModeling'])

    return df
|
| 71 |
+
|
| 72 |
+
"""
|
| 73 |
+
--------------------------
|
| 74 |
+
Load Transformer Model
|
| 75 |
+
--------------------------
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
@st.cache_resource
def load_embedding_model():
    """Load the sentence-transformer on the best available device.

    Device preference: CUDA GPU, then Apple MPS, then CPU. Cached by
    Streamlit so the model is only loaded once per session.

    Returns:
        SentenceTransformer: The 'paraphrase-mpnet-base-v2' model.
    """
    if torch.cuda.is_available():
        chosen_device = "cuda"
    else:
        chosen_device = "mps" if torch.backends.mps.is_available() else "cpu"

    st.write(f"Using device: {chosen_device}")
    return SentenceTransformer("paraphrase-mpnet-base-v2", device=chosen_device)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
"""
|
| 92 |
+
-------------------------
|
| 93 |
+
Batch Embedding Creation
|
| 94 |
+
-------------------------
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
def encode_content_documents(embedding_model, content_documents, batch_size=20):
    """Embed documents in fixed-size batches and stack into one matrix.

    Parameters:
        embedding_model: Object exposing
            .encode(docs, convert_to_numpy=..., show_progress_bar=...).
        content_documents (sequence[str]): Texts to embed.
        batch_size (int): Number of documents per encode() call.

    Returns:
        np.ndarray: Array of shape (len(content_documents), embedding_dim).
    """
    batches = [
        embedding_model.encode(
            content_documents[start:start + batch_size],
            convert_to_numpy=True,
            show_progress_bar=True,
        )
        for start in range(0, len(content_documents), batch_size)
    ]
    return np.vstack(batches)
|
| 106 |
+
|
| 107 |
+
"""
|
| 108 |
+
-----------------------------
|
| 109 |
+
Topic Modeling with BERTopic
|
| 110 |
+
-----------------------------
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
# Stopword list for BERTopic's CountVectorizer: NLTK English stopwords plus
# domain terms that dominate this survey text without carrying topic signal.
# NOTE(review): this rebinds the module-level name `stopwords` from the
# imported nltk corpus object to a plain list, shadowing the import — any
# later call to `stopwords.words(...)` in this module will fail; verify
# nothing else here still expects the corpus object.
stopwords = list(stopwords.words('english')) + [
    'activities',
    'activity',
    'class',
    'classroom',
    'material',
    'materials',
    'membership',
    'memberships',
    'pupil',
    'pupils',
    'resource',
    'resources',
    'sheet',
    'sheets',
    'student',
    'students',
    'subscription',
    'subscriptions',
    'subscribe',
    'subscribed',
    'recommend',
    'recommendation',
    'teach',
    'teacher',
    'teachers',
    'tutor',
    'tutors',
    'twinkl',
    'twinkls',
    'twinkle',
    'worksheet',
    'worksheets',
]
|
| 147 |
+
|
| 148 |
+
######### --------------- BERTOPIC ----------------- #############
|
| 149 |
+
@st.cache_resource
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
    """Fit a BERTopic model over pre-computed document embeddings.

    Underscore-prefixed parameters are excluded from Streamlit's cache key
    (they are unhashable model objects).

    Fix vs. original: an OpenAI representation model (client, prompt and
    wrapper) and a `seed_topic_list` were constructed here but never passed
    to BERTopic; that dead setup — including an unused OpenAI client — has
    been removed. Re-add an "OpenAI" entry to `representation_model` (and/or
    `seed_topic_list=...` on the constructor) if those features are wanted.

    Parameters:
        docs (list[str]): Documents to model.
        embeddings (np.ndarray): Pre-computed embeddings aligned with docs.
        _embedding_model: Sentence-transformer used by BERTopic internals.
        _umap_model: Dimensionality-reduction model.
        _hdbscan_model: Clustering model.

    Returns:
        tuple: (fitted BERTopic model, topic assignments, probabilities).
    """
    representation_model = {
        "Main": KeyBERTInspired(),
        "Secondary Representation": MaximalMarginalRelevance(diversity=.3),
    }

    # `stopwords` here is the module-level custom list defined above.
    vectorizer_model = CountVectorizer(min_df=2, max_df=0.60, stop_words=stopwords)

    topic_model = BERTopic(
        verbose=True,
        embedding_model=_embedding_model,
        umap_model=_umap_model,
        hdbscan_model=_hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model, topics, probs
|
| 192 |
+
|
| 193 |
+
##################################
|
| 194 |
+
# TOPIC MERGING
|
| 195 |
+
##################################
|
| 196 |
+
|
| 197 |
+
def merge_specific_topics(topic_model, sentences,
                          cancellation_keywords=["cancel", "cancellation", "canceled"],
                          thanks_keywords=["thank", "thanks", "thank you", "thankyou", "ty", "thx"],
                          expensive_keywords=["can't afford", "price", "expensive", "cost"]):
    """Merge groups of near-duplicate topics into one topic each.

    Topics whose names match any cancellation / thank-you / price keyword are
    collected into per-theme groups; each group with two or more members is
    merged in place via ``topic_model.merge_topics``.

    Parameters
    ----------
    topic_model : BERTopic
        A fitted model exposing ``get_topic_info()`` and ``merge_topics()``.
    sentences : list[str]
        The documents the model was fitted on (required by ``merge_topics``).
    cancellation_keywords, thanks_keywords, expensive_keywords : list[str]
        Substrings matched case-insensitively against topic names.
        (fix: removed a duplicate "cancel" from the default list.)

    Returns
    -------
    BERTopic
        The same ``topic_model`` instance, possibly with topics merged.
    """
    topic_info = topic_model.get_topic_info()

    def _matching_topics(keywords):
        # Alternation regex over the keywords; they are plain words/phrases,
        # so no regex escaping is needed here.
        pattern = '|'.join(keywords)
        matches = topic_info[
            topic_info['Name'].str.contains(pattern, case=False, na=False)
        ]['Topic'].tolist()
        # Never merge the outlier topic (-1).
        return [t for t in matches if t != -1]

    # Build one merge group per theme; a group needs >= 2 topics to merge.
    topics_to_merge = []
    for label, keywords in (("cancellation", cancellation_keywords),
                            ("thank-you", thanks_keywords),
                            ("expensive", expensive_keywords)):
        group = _matching_topics(keywords)
        if len(group) > 1:
            print(f"Merging {label} topics: {group}")
            topics_to_merge.append(group)

    if topics_to_merge:
        topic_model.merge_topics(sentences, topics_to_merge)

    return topic_model
+
##################################
|
| 251 |
+
# Topic to Dataframe Mapping
|
| 252 |
+
#################################
|
| 253 |
+
|
| 254 |
+
def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
    """Attach a comma-separated 'Topics' column to a copy of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    mapping : sequence[int]
        ``mapping[i]`` is the row index of *df* that sentence ``i`` came from.
    sentence_topics : sequence[int]
        Topic id assigned to each sentence (-1 marks the outlier topic).
    topic_label_map : dict[int, str]
        Topic id -> human-readable label; unmapped ids fall back to ``str(id)``.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with a 'Topics' column of sorted, comma-joined labels.
    """
    # Group topic ids by the originating row; sets drop duplicates.
    row_to_topics = {}
    for topic, row_idx in zip(sentence_topics, mapping):
        row_to_topics.setdefault(row_idx, set()).add(topic)

    result = df.copy()

    def _labels_for(row_idx):
        assigned = row_to_topics.get(row_idx, set())
        # Outlier topic (-1) is never surfaced to the user.
        names = sorted(topic_label_map.get(t, str(t)) for t in assigned if t != -1)
        return ", ".join(names)

    result['Topics'] = result.index.map(_labels_for)
    return result
|
plots/overview_charts.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.express as px
|
| 2 |
+
|
| 3 |
+
# Shared typography settings used by the chart builders in this module.
legend_font_size=14
xaxis_font_size=16
ticks_size=14
|
| 6 |
+
|
| 7 |
+
## -- WORD COUNT PLOT
|
| 8 |
+
|
| 9 |
+
def create_word_count_histogram(df, nbins=40, height=550):
    """Histogram of the 'word-count' column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'word-count' column.
    nbins : int
        Number of histogram bins.
    height : int
        Figure height in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    histogram = px.histogram(
        df,
        x='word-count',
        nbins=nbins,
        title=None,
        color_discrete_sequence=['#646DEF'],
    )

    # Trim the top margin since there is no title to make room for.
    histogram.update_layout(height=height, margin=dict(t=30))

    return histogram
| 26 |
+
## -- SENTIMENT PLOT
|
| 27 |
+
|
| 28 |
+
def create_sentiment_pie(df, height=450):
    """Donut chart of sentiment counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sentiment' column with 'Positive' / 'Neutral' /
        'Negative' values.
    height : int
        Figure height in pixels. (fix: this argument was previously
        accepted but never applied to the figure.)

    Returns
    -------
    plotly.graph_objects.Figure
    """
    sentiment_pie = px.pie(
        df,
        names='sentiment',
        color='sentiment',
        color_discrete_map={ 'Positive':'darkturquoise', 'Neutral':'#646DEF', 'Negative':'red'},
        hole=0.45,
        title=None
    )

    # Hover shows only the sentiment label — no count/percent clutter.
    sentiment_pie.update_traces(hovertemplate='%{label}<extra></extra>')

    sentiment_pie.update_layout(
        height=height,
        showlegend=False,
        margin=dict(r=50),
        # Legend configuration is kept so re-enabling showlegend "just works":
        # horizontal, centered under the chart.
        legend=dict(
            font=dict(size=legend_font_size),
            orientation="h",
            x=0.5,
            xanchor="center",
        )
    )
    return sentiment_pie
|
| 53 |
+
## -- CANCELLATION REASONS
|
| 54 |
+
|
| 55 |
+
def create_cancellation_reasons_plot(cancellation_overview):
    """Bar chart of cancellation-reason counts with percentage on hover.

    Parameters
    ----------
    cancellation_overview : pandas.DataFrame
        Must contain 'Category', 'Count' and 'Percentage' columns.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # fix: dropped the `color_discrete_map` argument — without a `color=`
    # column it was silently ignored by Plotly Express.
    reasons_bar = px.bar(
        cancellation_overview,
        x='Category',
        y='Count',
        color_discrete_sequence=['#646DEF'],
    )

    # Attach the percentage column so hover shows both count and share.
    reasons_bar.update_traces(
        customdata=cancellation_overview['Percentage'],
        hovertemplate='Count = %{y}<br>Percentage = %{customdata}%'
    )

    # Axis titles are blanked; category/tick fonts use the module-wide sizes.
    reasons_bar.update_layout(
        height=600,
        xaxis_title="",
        yaxis_title="",
        xaxis=dict(title_font=dict(size=xaxis_font_size), tickfont=dict(size=ticks_size)),
    )

    return reasons_bar
|
| 88 |
+
############# Grouped By Career ############
|
| 89 |
+
|
| 90 |
+
def create_grouped_chart(grouped_df, group_name_col, color_col):
    """Stacked bar chart of counts per group, colored by sentiment.

    Parameters
    ----------
    grouped_df : pandas.DataFrame
        Must contain a 'count' column plus the two columns named below.
    group_name_col : str
        Column providing the x-axis categories.
    color_col : str
        Column providing the stacked-segment labels
        ('Positive' / 'Neutral' / 'Negative').

    Returns
    -------
    plotly.graph_objects.Figure
    """
    stacked_bar = px.bar(
        grouped_df,
        x=group_name_col,
        y='count',
        color=color_col,
        color_discrete_map={'Positive':'darkturquoise', 'Neutral':'#646DEF', 'Negative':'red'},
        title=None,
        barmode="stack",
    )

    # Horizontal legend anchored above the top-left corner of the plot area.
    stacked_bar.update_layout(
        legend=dict(
            orientation='h',
            x=-0.05,
            xanchor="left",
            y=1.2,
            yanchor="top",
        )
    )
    return stacked_bar
|
plots/topicModeling_charts.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bertopic import BERTopic # Ensure you have BERTopic installed
|
| 2 |
+
import plotly.graph_objects as go # BERTopic visualization uses Plotly
|
| 3 |
+
import plotly.colors as pc
|
| 4 |
+
import plotly.express as px
|
| 5 |
+
|
| 6 |
+
# Shared font sizing for the topic-modeling charts in this module.
xaxis_font_size=14
ticks_size=14
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def topicDistribution(topic_model, top_n_topics=6, n_words=5):
    """Per-topic keyword bar chart, recolored with Plotly's qualitative palette.

    Parameters
    ----------
    topic_model : BERTopic
        Fitted model exposing ``visualize_barchart``.
    top_n_topics : int
        Number of topics to show.
    n_words : int
        Keywords per topic.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    barchart = topic_model.visualize_barchart(top_n_topics=top_n_topics, n_words=n_words)

    # Give each topic subplot its own color, cycling when topics outnumber colors.
    palette = pc.qualitative.Plotly
    for idx, trace in enumerate(barchart.data):
        trace.marker.color = palette[idx % len(palette)]

    barchart.update_layout(title_text="")  # drop BERTopic's default title

    return barchart
|
| 20 |
+
|
| 21 |
+
####################
|
| 22 |
+
# TOPIC FREQUENCY
|
| 23 |
+
###################
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def create_topicFreq_chart(topics_df):
    """Bar chart of topic frequency with each topic's top 5 words on hover.

    Parameters
    ----------
    topics_df : pandas.DataFrame
        BERTopic topic-info frame with 'Topic Name' and 'Count' columns;
        the fourth column is assumed to hold the topic's representation
        word list — TODO confirm against topic_model.get_topic_info().

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # fix: work on a copy — the previous version added 'top_5_words' to the
    # caller's DataFrame in place.
    plot_df = topics_df.copy()

    # Join the first five representation words into a hover-friendly string;
    # pass non-list cells through unchanged.
    plot_df['top_5_words'] = plot_df.iloc[:, 3].apply(
        lambda words: ', '.join(words[:5]) if isinstance(words, list) else words
    )

    # fix: removed the misspelled, ineffective '"Topic": "CutomName"' entry
    # from `labels` (the x column is 'Topic Name', so it never applied).
    topicFreq_barchart = px.bar(
        plot_df,
        x="Topic Name",
        y="Count",
        custom_data=["top_5_words"],
        title=None,
        labels={"Count": "Frequency"},
    )

    topicFreq_barchart.update_traces(
        marker_color='#646DEF',
        textposition='outside',
        hovertemplate=(
            'Frequency: %{y}<br>'
            'Top 5 words: %{customdata[0]}<extra></extra>'
        )
    )

    topicFreq_barchart.update_layout(
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title="Topic Name",
        yaxis_title="Frequency",
        height=650,
        xaxis=dict(title_font=dict(size=xaxis_font_size), tickfont=dict(size=ticks_size)),
    )

    return topicFreq_barchart
| 64 |
+
###############################
|
| 65 |
+
# Stacked Topic Freq Per Class
|
| 66 |
+
###############################
|
| 67 |
+
|
| 68 |
+
def create_stacked_topics_per_class(df):
    """Stacked bar chart of topic frequency broken down by class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CustomName' (topic name), 'Frequency' and 'Class'
        columns.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    topics_per_class_chart = px.bar(
        df,
        x="CustomName",
        y="Frequency",
        color="Class",
        title=None,
        barmode="stack",
        labels={"Count": "Frequency", "Topics": "CustomName"},
    )

    topics_per_class_chart.update_layout(
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title="Topic Name",
        yaxis_title="Frequency",
        height=650,
    )

    return topics_per_class_chart
|
| 90 |
+
#######################
|
| 91 |
+
# Intertopic Distance
|
| 92 |
+
#######################
|
| 93 |
+
|
| 94 |
+
def intertopicDistanceMap(topic_model, color="orangered"):
    """Intertopic distance map with a uniform bubble color and no title.

    Parameters
    ----------
    topic_model : BERTopic
        Fitted model exposing ``visualize_topics``.
    color : str
        Marker color applied to every topic bubble.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    distance_map = topic_model.visualize_topics(title="")

    # Recolor every topic bubble and drop the marker outline.
    for trace in distance_map.data:
        trace.marker.color = color
        trace.marker.line.width = 0

    distance_map.update_layout(margin=dict(r=50))

    return distance_map
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
##########################
|
| 113 |
+
# Topics Over Time
|
| 114 |
+
#########################
|
| 115 |
+
|
| 116 |
+
def create_topics_overtime_chart(topics_overtime_df):
    """Line chart of topic frequency over time, one line per topic.

    Parameters
    ----------
    topics_overtime_df : pandas.DataFrame
        Must contain 'Timestamp', 'Frequency' and 'CustomName' columns.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    overtime_chart = px.line(
        topics_overtime_df,
        x="Timestamp",
        y="Frequency",
        color="CustomName",
        markers=True,
        title=None,
        labels={"Timestamp": "Time", "Frequency": "Topic Frequency", "Name": "CustomName"},
    )

    # Horizontal legend centered well below the plot so many topic names fit.
    overtime_chart.update_layout(
        xaxis_title="Time",
        yaxis_title="Frequency",
        legend_title="Topics",
        height=700,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.5,
            xanchor="center",
            x=0.5,
        ),
    )

    return overtime_chart
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
bertopic==0.16.4
|
| 2 |
+
contractions==0.1.73
|
| 3 |
+
hdbscan==0.8.40
|
| 4 |
+
langid==1.1.6
|
| 5 |
+
nltk==3.9.1
|
| 6 |
+
numpy==2.2.3
|
| 7 |
+
openai==1.65.2
|
| 8 |
+
pandas==2.2.3
|
| 9 |
+
plotly==5.24.1
|
| 10 |
+
python_dateutil==2.9.0.post0
|
| 11 |
+
scikit_learn==1.6.1
|
| 12 |
+
sentence_transformers==3.3.1
|
| 13 |
+
spacy==3.8.2
|
| 14 |
+
streamlit==1.42.2
|
| 15 |
+
tenacity==9.0.0
|
| 16 |
+
textblob==0.19.0
|
| 17 |
+
torch==2.5.1
|
| 18 |
+
umap_learn==0.5.7
|