lynn-twinkl commited on
Commit
2e164d2
·
1 Parent(s): 47fac11

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ #Local Files
158
+ .DS_Store
159
+ secrets.toml
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ #.idea/
app.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###############################
2
+ # IMPORTS & CONFIG
3
+ ###############################
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import time
7
+ from datetime import datetime
8
+ from nltk.tokenize import sent_tokenize
9
+ from hdbscan import HDBSCAN
10
+ from umap import UMAP
11
+ from openai import OpenAI
12
+ from tenacity import retry, wait_exponential, stop_after_attempt
13
+
14
+ from functions.auto_column_detection import auto_detect_columns
15
+ from functions.preprocessing_functions import remove_numeric_or_special_responses, robust_convert_date
16
+ from functions.language_labeling_translation import detect_language, translate_text
17
+ from functions.sentiment_analysis import analyze_sentiment, label_sentiment
18
+ from functions.create_cancellation_reasons_table import generate_cancellation_reasons_overview
19
+ from html_helpers.cancellation_reasons_table_html import generate_cancellation_table_html
20
+
21
+ from functions.topicModeling_contentRequests import (
22
+ load_embedding_model,
23
+ bertopic_model,
24
+ merge_specific_topics,
25
+ update_df_with_topics
26
+ )
27
+ from plots.overview_charts import (
28
+ create_word_count_histogram,
29
+ create_sentiment_pie,
30
+ create_cancellation_reasons_plot,
31
+ create_grouped_chart
32
+ )
33
+ from plots.topicModeling_charts import (
34
+ create_topics_overtime_chart,
35
+ create_stacked_topics_per_class
36
+ )
37
+
38
+ ############################
39
+ # STREAMLIT APP CONFIGURATION
40
+ ############################
41
+ st.set_page_config(
42
+ layout='wide',
43
+ page_title="Exit Survey Processing App",
44
+ initial_sidebar_state="expanded",
45
+ )
46
+
47
+ # Global settings
48
+ OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
49
+ client = OpenAI(api_key=OPENAI_API_KEY)
50
+
51
+ ###############################
52
+ # HELPER CLASSES & FUNCTIONS
53
+ ###############################
54
class OpenAIWrapper:
    """Thin wrapper around the OpenAI chat-completions endpoint.

    Adds exponential-backoff retries (via tenacity) and carries a
    configurable system prompt used for every request.
    """

    def __init__(self, model, prompt=""):
        self.model = model
        self.prompt = prompt

    @retry(wait=wait_exponential(multiplier=1, min=2, max=10), stop=stop_after_attempt(5))
    def run(self, user_text):
        """Send `user_text` to the model and return the reply's text content."""
        try:
            completion = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.prompt},
                    {"role": "user", "content": user_text},
                ],
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Surface the failure in the UI, then re-raise so @retry can back off.
            st.error(f"Error during OpenAI API call: {e}")
            raise
77
+
78
@st.cache_data(show_spinner=False)
def cached_translate(text):
    """Translate `text` via OpenAI; memoised by Streamlit to avoid repeat API calls."""
    return translate_text(text, skip_translation=False, translator_model=openai_model)
82
+
83
@st.cache_resource(show_spinner=False)
def get_embedding_model():
    """Load the sentence-embedding model once and reuse it across Streamlit reruns."""
    return load_embedding_model()
87
+
88
def translate_non_english(df):
    """Translate non-English 'freeform_answer' rows with word-count > 8, in place.

    Language detection uses `detect_language`; translation goes through the
    cached `cached_translate` helper. A Streamlit progress bar reports
    per-row progress. Returns the (mutated) DataFrame.
    """
    df['language'] = df['freeform_answer'].apply(detect_language)
    needs_translation = (df['language'] == 'non-en') & (df['word-count'] > 8)
    pending = df[needs_translation].copy()

    if not pending.empty:
        status_text = st.empty()
        bar = st.progress(0)
        total = len(pending)
        for position, (row_idx, row) in enumerate(pending.iterrows(), 1):
            status_text.text(f"Translating non-English responses ({position} of {total})")
            try:
                df.at[row_idx, 'freeform_answer'] = cached_translate(row['freeform_answer'])
            except Exception as e:
                st.error(f"Error translating response {position}: {str(e)}")
            bar.progress(position / total)
        status_text.empty()
        bar.empty()
        st.success(
            f"Successfully translated {total} non-English responses",
            icon='✅'
        )

    # Scratch column; not needed downstream.
    df.drop(columns='language', inplace=True, errors='ignore')
    return df
115
+
116
@st.cache_data(show_spinner=False)
def run_topic_modeling(df):
    """
    Full topic-modeling pipeline over df['freeform_answer'].

    Steps:
        1. Sentence tokenization
        2. Embedding
        3. UMAP + HDBSCAN model setup
        4. BERTopic modeling and merging of specific topics
        5. Custom topic naming via OpenAI

    Returns:
        (topic_model, updated_topics, mapping, chatgpt_topic_labels) where
        mapping[i] is the df index of the response that sentence i came from.
    """
    # --- 1. Sentence tokenization ---
    sentences = []
    mapping = []
    for idx, response in df['freeform_answer'].dropna().items():
        for sentence in sent_tokenize(response):
            sentences.append(sentence)
            mapping.append(idx)

    # --- 2. Embedding ---
    embedding_model = get_embedding_model()
    embeddings = embedding_model.encode(sentences, show_progress_bar=True)

    # --- 3. UMAP, HDBSCAN ---
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # --- 4. BERTopic model creation ---
    _topic_model, topics, probs = bertopic_model(
        sentences, embeddings, embedding_model,
        umap_model, hdbscan_model
    )

    # Merge small or closely related topics, then re-assign sentences.
    _topic_model = merge_specific_topics(_topic_model, sentences)
    updated_topics, _ = _topic_model.transform(sentences)

    # --- 5. Custom topic naming via OpenAI ---
    # The template is constant, so build it once (fix: previously recreated
    # on every loop iteration).
    prompt_template = """
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label
of at most 5 words. Make sure it is in the following format:

topic: <topic label>
""".strip()

    topic_info = _topic_model.get_topic_info()
    chatgpt_topic_labels = {}
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue  # -1 is the outlier bucket; never label it
        rep_docs = _topic_model.get_representative_docs(topic_id)
        doc_text = " ".join(rep_docs[:10])  # Up to 10 docs for context
        topic_keywords = _topic_model.get_topic(topic_id) or []
        keywords_text = ", ".join([word for word, score in topic_keywords])

        prompt_filled = prompt_template.replace("[DOCUMENTS]", doc_text).replace("[KEYWORDS]", keywords_text)
        response = naming_model.run(prompt_filled)
        label = response.strip()
        if label.lower().startswith("topic:"):
            label = label[len("topic:"):].strip()
        chatgpt_topic_labels[topic_id] = label

    # (fix) removed dead `if -1 in chatgpt_topic_labels: del ...` guard:
    # the loop above skips -1, so the key can never be present.
    _topic_model.set_topic_labels(chatgpt_topic_labels)

    return _topic_model, updated_topics, mapping, chatgpt_topic_labels
193
+
194
def process_file(uploaded_file):
    """
    Read, clean, and enrich an uploaded exit-survey file.

    Pipeline: load CSV/Excel -> column auto-detection UI -> rename/normalise
    columns -> word counts -> date conversion -> drop numeric/special-only
    responses -> translate non-English rows -> sentiment scoring.

    Returns:
        (df, row_count_delta, final_row_count, original_row_count)
    """
    # 1. Load the file (CSV vs Excel chosen by extension)
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as e:
        st.error(f"Error reading file: {e}")
        st.stop()

    original_row_count = len(df)

    # 2. Column auto-detection / manual selection UI
    st.header("Data Preview")
    df_preview_col, _spacer, detected_cols_col = st.columns([1, 0.05, 1])

    with df_preview_col:
        st.subheader("Raw Data Preview")
        st.dataframe(df, hide_index=True)

    with detected_cols_col:
        detected = auto_detect_columns(df)
        st.subheader("Column Detection & Selection")
        st.info(
            "We've automatically detected a few columns. Verify these are correct or select manually.",
            icon='💡'
        )
        st.json(detected)

        # Both of these are mandatory downstream; fall back to manual pick.
        for required in ['freeform_answer', 'date']:
            if required not in detected:
                detected[required] = st.selectbox(f"Select column for {required}", df.columns.tolist())

    if not st.button("Continue with these columns"):
        st.stop()

    # 3. Rename detected source columns to their semantic names
    df.rename(columns={source: semantic for semantic, source in detected.items()}, inplace=True)
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # 4. Basic cleaning steps
    if 'freeform_answer' not in df.columns:
        st.error("Column 'freeform_answer' not found.")
        st.stop()

    # Word count per response (0 for missing answers)
    df['word-count'] = df['freeform_answer'].apply(
        lambda x: len(str(x).split()) if pd.notnull(x) else 0
    )

    # Date conversion
    if 'date' in df.columns:
        df['date'] = robust_convert_date(df['date'])
    else:
        st.error("'date' column is missing.")
        st.stop()

    # Drop rows whose answer is only numbers / special characters
    df = remove_numeric_or_special_responses(df, 'freeform_answer')

    # 5. Translate non-English answers
    df = translate_non_english(df)

    # 6. Sentiment scoring and labeling
    df['sentiment-score'] = df['freeform_answer'].apply(analyze_sentiment)
    df['sentiment'] = df['sentiment-score'].apply(label_sentiment)

    final_row_count = len(df)
    row_count_delta = final_row_count - original_row_count

    return df, row_count_delta, final_row_count, original_row_count
270
+
271
+ ############################
272
+ # APP ENTRY POINT
273
+ ############################
274
def main():
    """Streamlit entry point: upload -> clean -> overview -> sentiment -> topics."""
    st.title("Exit Survey Processing App")
    st.markdown("Upload your Exit Survey file in CSV or Excel format; the app cleans & processes it.")

    # Shared OpenAI wrappers are read by module-level helpers
    # (cached_translate, run_topic_modeling), hence the globals.
    global openai_model, naming_model
    openai_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")
    naming_model = OpenAIWrapper(model="gpt-4o-mini", prompt="")  # for topic naming

    # Reset button
    if st.button("Reset App"):
        st.session_state.clear()

    # File upload
    uploaded_file = st.file_uploader("Upload an exit survey file", type=["csv", "xlsx"])

    if uploaded_file:
        # Process once per session; later reruns reuse session_state.
        if 'processed_df' not in st.session_state:
            with st.spinner("Processing file..."):
                df, row_count_delta, final_row_count, original_row_count = process_file(uploaded_file)
                st.session_state['processed_df'] = df
                st.session_state['row_count_delta'] = row_count_delta
                st.session_state['final_row_count'] = final_row_count
                st.session_state['original_row_count'] = original_row_count
        else:
            df = st.session_state['processed_df']
            row_count_delta = st.session_state['row_count_delta']
            final_row_count = st.session_state['final_row_count']
            original_row_count = st.session_state['original_row_count']

        st.divider()

        ########################################
        # 1. General Overview
        ########################################
        st.header("General Overview")
        with st.container():
            metric_col1, metric_col2 = st.columns(2)
            metric_col1.metric(
                label="No. Responses After Processing",
                value=final_row_count,
                delta=row_count_delta
            )
            avg_length = int(df['word-count'].mean().round())
            metric_col2.metric(
                label="Avg. Response Length",
                value=f"{avg_length} words"
            )

        st.write("#### Data Overview")
        st.dataframe(
            df,
            hide_index=True,
            column_config={'date': st.column_config.DatetimeColumn(format="YYYY-MM-DD")}
        )

        if 'exit_reason' in df.columns:
            st.write("#### Exit Reason Distribution")
            overview = generate_cancellation_reasons_overview(df, 'exit_reason')
            reasons_bar = create_cancellation_reasons_plot(overview)
            st.plotly_chart(reasons_bar, use_container_width=True)

        ########################################
        # 2. Sentiment Analysis
        ########################################
        st.subheader("Sentiment Analysis")
        st.write("Visual representation of sentiment distribution, plus a grouped bar chart if you like.")
        exclude_cols_sentiment = ['freeform_answer', 'date', 'word-count', 'sentiment-score', 'sentiment']
        candidate_cols = [col for col in df.columns if col not in exclude_cols_sentiment and df[col].nunique() > 1]

        col_left, col_right = st.columns([2, 1])
        with col_left:
            if candidate_cols:
                grouping_col = st.selectbox(
                    "Select a column to group sentiment by",
                    candidate_cols,
                    index=0
                )
                grouped_data = df.groupby([grouping_col, 'sentiment']).size().reset_index(name='count')
                st.write(f"##### Sentiment Grouped by {grouping_col}")
                chart = create_grouped_chart(grouped_data, grouping_col, 'sentiment')
                st.plotly_chart(chart, use_container_width=True)
            else:
                st.write("##### Sentiment (no grouping column available)")
                grouped_data = df.groupby(['sentiment']).size().reset_index(name='count')
                chart = create_grouped_chart(grouped_data, 'sentiment', 'sentiment')
                st.plotly_chart(chart, use_container_width=True)

        with col_right:
            st.write("##### Overall Sentiment Distribution")
            sentiment_pie = create_sentiment_pie(df)
            st.plotly_chart(sentiment_pie, use_container_width=True)

        ########################################
        # 3. Topic Modeling
        ########################################
        st.header("Topic Modeling")

        # Only run the modeling once per data set (cached).
        _topic_model, updated_topics, mapping, chatgpt_topic_labels = run_topic_modeling(df)

        topics_df = _topic_model.get_topic_info()
        topics_df = topics_df[topics_df['Topic'] != -1].copy()
        topics_df.drop(columns=['Name'], errors='ignore', inplace=True)
        topics_df.rename(columns={
            'CustomName': 'Topic Name',
            'Topic': 'Topic Number (ID)'
        }, inplace=True)

        # Re-arrange cols for easier viewing
        cols_order = ['Topic Number (ID)', 'Topic Name', 'Count',
                      'Representation', 'Secondary Representation', 'Representative_Docs']
        topics_df = topics_df[[c for c in cols_order if c in topics_df.columns]]

        st.subheader("Topics Barchart (Stacked by Class)")
        st.markdown("""
        Choose a categorical column from your data to visualize how frequently each topic appears
        across different classes.
        """)

        with st.expander("Explore Topic Details", expanded=False):
            st.write("""
            **Table Info:**
            - **Topic Name**: AI-generated label
            - **Representation**: Top 10 keywords
            - **Secondary Representation**: Reranked keywords for diversity
            - **Representative Docs**: Sample sentences contributing to the topic
            """)
            st.dataframe(topics_df, hide_index=True)

        # For stacked barchart, pick a class column
        exclude_cols = ["freeform_answer", "sat_score", "date",
                        "word-count", "sentiment-score", "sentiment"]
        available_cols = [c for c in df.columns if c not in exclude_cols]
        default_idx = available_cols.index("exit_reason") if "exit_reason" in available_cols else 0
        class_column = st.selectbox(
            "How to group topics for visualization?",
            available_cols,
            index=default_idx
        )

        @st.cache_data(show_spinner=False)
        def get_topics_per_class(class_col, mapping, df, sentences, _model):
            # Map each sentence back to its source response's class value.
            sentence_classes = [df.loc[idx, class_col] for idx in mapping]
            tpc = _model.topics_per_class(sentences, classes=sentence_classes)
            t_labels = _model.get_topic_info()[['Topic', 'CustomName']]
            tpc = tpc.merge(t_labels, on='Topic', how='left')
            tpc = tpc[tpc['Topic'] != -1].reset_index(drop=True)
            return tpc

        # Re-tokenize responses so `sentences` lines up index-for-index with
        # `mapping` from run_topic_modeling.
        # (fix) removed dead `sentences = [""] * len(mapping)` pre-allocation
        # that was immediately overwritten by the rebinding below.
        sentences = []
        for idx, response in df['freeform_answer'].dropna().items():
            for sentence in sent_tokenize(response):
                sentences.append(sentence)

        topics_per_class = get_topics_per_class(class_column, mapping, df, sentences, _topic_model)
        stacked_chart = create_stacked_topics_per_class(topics_per_class)
        st.plotly_chart(stacked_chart, use_container_width=True)

        ########################################
        # 4. Topics Over Time
        ########################################
        st.subheader("Topics Over Time")
        valid_dates = df['date'].dropna()
        if valid_dates.nunique() < 2:
            st.warning("Not enough distinct date values to plot topics over time.")
        else:
            # Build list of dates for each sentence
            sentence_dates = [df.loc[idx, 'date'] for idx in mapping]
            topics_over_time = _topic_model.topics_over_time(sentences, sentence_dates, nr_bins=20)

            # Merge custom labels
            topic_labels = _topic_model.get_topic_info()[['Topic', 'CustomName']]
            topics_over_time = topics_over_time.merge(topic_labels, on='Topic', how='left')
            topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]

            chart = create_topics_overtime_chart(topics_over_time)
            st.plotly_chart(chart, use_container_width=True)

        ########################################
        # 5. Updated DataFrame
        ########################################
        updated_df = update_df_with_topics(df, mapping, updated_topics, chatgpt_topic_labels)
        with st.expander("View Final Updated DataFrame", expanded=False):
            st.dataframe(updated_df, hide_index=True)


if __name__ == "__main__":
    main()
functions/auto_column_detection.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import re
4
+ import string
5
+
6
+ # ----------------------------------------
7
+ # 1. HELPER FUNCTIONS
8
+ # ----------------------------------------
9
+
10
def get_keyword_fraction(series, keywords):
    """
    Return the fraction of non-null values in `series` that contain any of
    `keywords` (case-insensitive substring match, vectorized via regex).
    """
    cleaned = series.dropna().astype(str).str.lower().str.strip()
    if cleaned.empty:
        return 0
    # Alternation of escaped keywords -> single vectorized pass.
    pattern = '|'.join(re.escape(keyword) for keyword in keywords)
    return cleaned.str.contains(pattern, regex=True).mean()
21
+
22
def detect_keyword_based_column(
    df,
    candidate_columns,
    keywords,
    bonus_pattern=None,
    threshold=0.5,
    bonus_multiplier=1.1
):
    """
    Score each candidate column by the fraction of its values containing any
    of `keywords`, optionally boosting columns whose name matches
    `bonus_pattern`. Return the top-scoring column if it reaches `threshold`,
    else None.
    """
    scores = {}
    for column in candidate_columns:
        score = get_keyword_fraction(df[column], keywords)
        # Column-name bonus nudges ties toward conventionally named columns.
        if bonus_pattern and re.search(bonus_pattern, column, re.IGNORECASE):
            score *= bonus_multiplier
        scores[column] = score

    if not scores:
        return None

    winner = max(scores, key=scores.get)
    return winner if scores[winner] >= threshold else None
50
+
51
def detect_exact_match_column(
    df,
    candidate_columns,
    expected_values,
    bonus_pattern=None,
    threshold=0.5,
    bonus_multiplier=1.1
):
    """
    Score each candidate column by the fraction of its values that exactly
    match one of `expected_values` (case-insensitive, whitespace-stripped),
    optionally boosting columns whose name matches `bonus_pattern`.
    Return the top-scoring column if it reaches `threshold`, else None.
    """
    normalized_expected = {str(v).lower().strip() for v in expected_values}

    scores = {}
    for column in candidate_columns:
        cleaned = df[column].dropna().astype(str).str.lower().str.strip()
        if cleaned.empty:
            continue
        score = cleaned.isin(normalized_expected).mean()
        # Column-name bonus nudges ties toward conventionally named columns.
        if bonus_pattern and re.search(bonus_pattern, column, re.IGNORECASE):
            score *= bonus_multiplier
        scores[column] = score

    if not scores:
        return None

    winner = max(scores, key=scores.get)
    return winner if scores[winner] >= threshold else None
83
+
84
+ # ----------------------------------------
85
+ # 2. REFAC: DETECTION SUBROUTINES
86
+ # ----------------------------------------
87
+
88
def detect_numeric_column(df, col_name='sat_score', min_fraction=0.9):
    """
    Pick the single most-numeric column (intended for 'sat_score').

    If exactly one dtype-numeric column exists it wins outright; otherwise
    every column is coerced with pd.to_numeric and the column with the
    highest fraction of convertible values is returned, provided that
    fraction reaches `min_fraction`.

    NOTE(review): `col_name` is currently unused — presumably reserved for
    name-based matching; confirm with callers before removing.
    """
    dtype_numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    # Shortcut: a lone numeric column is unambiguous.
    if len(dtype_numeric) == 1:
        return dtype_numeric[0]

    fractions = {
        col: pd.to_numeric(df[col], errors='coerce').notna().mean()
        for col in df.columns
    }
    if not fractions:
        return None

    best = max(fractions, key=fractions.get)
    return best if fractions[best] >= min_fraction else None
113
+
114
def detect_freeform_answer_column(df, penalty_for_low_uniqueness=0.4):
    """
    Detect the most likely free-text ('freeform_answer') column.

    Heuristics: average string length, punctuation density, and uniqueness
    ratio are normalized across candidate text columns and combined into a
    weighted composite (0.4 / 0.3 / 0.3), with name-based bonuses/penalties
    applied afterwards.

    Returns the best column name, or None if there are no usable text columns.
    """
    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not text_cols:
        return None

    # Composite weights. (fix: the original computed unused per-column locals
    # `weight_length/weight_punct/weight_unique/norm_factor` while the
    # composite hard-coded the same numbers; the dead locals are removed and
    # the weights named once.)
    weight_length = 0.4
    weight_punct = 0.3
    weight_unique = 0.3

    scores = {}
    for col in text_cols:
        series = df[col].dropna().astype(str)
        if series.empty:
            continue
        avg_len = series.apply(len).mean()
        avg_punct = series.apply(
            lambda x: sum(1 for char in x if char in string.punctuation)
        ).mean()
        total = len(series)
        unique_ratio = series.nunique() / total if total else 0
        scores[col] = {
            'avg_len': avg_len,
            'avg_punct': avg_punct,
            'unique_ratio': unique_ratio,
        }

    if not scores:
        return None

    # Normalize length/punctuation against the best column; guard against 0.
    max_len = max(s['avg_len'] for s in scores.values()) or 1e-9
    max_punct = max(s['avg_punct'] for s in scores.values()) or 1e-9

    composite = {}
    for col, s in scores.items():
        comp_score = (
            weight_length * (s['avg_len'] / max_len)
            + weight_punct * (s['avg_punct'] / max_punct)
            + weight_unique * s['unique_ratio']
        )

        # Name-based bonus/penalty tuned to this survey's known column names.
        if "additional_comment" in col.lower():
            comp_score *= 3.1
        if "usage_reason" in col.lower():
            comp_score *= 0.5

        # Penalize columns dominated by repeated values (likely categorical).
        if s['unique_ratio'] < penalty_for_low_uniqueness:
            comp_score *= 0.5

        composite[col] = comp_score

    return max(composite, key=composite.get)
171
+
172
def detect_date_column(df, detected_cols):
    """
    Detect the most date-like column among those not already claimed.

    Each remaining column is parsed with `pd.to_datetime(errors='coerce')`;
    its score is 0.6 * fraction-parseable + 0.4 * uniqueness ratio, with a
    1.2x bonus for names containing 'date' or 'time'.

    Returns the best column if it scores >= 0.5, else None.
    """
    # Exclude columns already assigned to another semantic role.
    remaining = [col for col in df.columns if col not in detected_cols.values()]

    possible_dates = {}
    for col in remaining:
        # (fix) dropped the deprecated `infer_datetime_format=True` kwarg;
        # format inference is the default behavior in modern pandas.
        dt_series = pd.to_datetime(df[col], errors='coerce')
        fraction_valid = dt_series.notna().mean()
        total = len(dt_series)
        uniqueness_ratio = dt_series.nunique() / total if total > 0 else 0
        # Weighted composite of parseability and distinctness.
        score = 0.6 * fraction_valid + 0.4 * uniqueness_ratio

        # Name-based bonus
        if re.search(r'date|time', col, re.IGNORECASE):
            score *= 1.2
        possible_dates[col] = score

    if not possible_dates:
        return None

    best_col = max(possible_dates, key=possible_dates.get)
    # (fix) the original's >= 0.6 check followed by a >= 0.5 "fallback"
    # collapses to a single >= 0.5 threshold.
    if possible_dates[best_col] >= 0.5:
        return best_col
    return None
207
+
208
+ # ----------------------------------------
209
+ # 3. MAIN AUTO-DETECT FUNCTION
210
+ # ----------------------------------------
211
+
212
def auto_detect_columns(df):
    """
    Infer semantic roles for DataFrame columns via heuristics.

    Returns a dict mapping semantic names ('sat_score', 'freeform_answer',
    'career', 'country', 'exit_reason', 'secondary_reason', 'date') to the
    matching column names; a key is omitted when nothing qualifies.
    """
    detected = {}

    def unclaimed_text_cols():
        # Text columns not yet assigned a semantic role.
        return [
            col for col in df.select_dtypes(include=['object']).columns
            if col not in detected.values()
        ]

    # 1. Numeric satisfaction score
    sat_score_col = detect_numeric_column(df, col_name='sat_score', min_fraction=0.9)
    if sat_score_col:
        detected['sat_score'] = sat_score_col

    # 2. Free-text response
    freeform_col = detect_freeform_answer_column(df)
    if freeform_col:
        detected['freeform_answer'] = freeform_col

    # 3. Career column (keyword match)
    career_candidate = detect_keyword_based_column(
        df,
        unclaimed_text_cols(),
        ["ks3", "parent", "sen", "tutor", "grade", "esl"],
        bonus_pattern="career",
        threshold=0.5
    )
    if career_candidate:
        detected['career'] = career_candidate

    # 4. Country column (keyword match)
    country_candidate = detect_keyword_based_column(
        df,
        unclaimed_text_cols(),
        [
            'poland', 'england', 'united states', 'romania', 'jordan', 'kazakhstan', 'thailand',
            'italy', 'philippines', 'australia', 'india', 'south africa', 'south korea', 'vietnam',
            'norway', 'moldova', 'malaysia', 'austria', 'chile', 'cameroon'
        ],
        bonus_pattern="country",
        threshold=0.5
    )
    if country_candidate:
        detected['country'] = country_candidate

    # 5. Exit reason (exact match against the survey's fixed options)
    exit_reason_candidate = detect_exact_match_column(
        df,
        unclaimed_text_cols(),
        [
            "I can't afford it right now",
            "I'm not using the membership enough",
            "Other",
            "I am on family leave",
            "I can't find the resources I need",
            "I've changed careers",
            "I'm using an alternative resource provider",
            "My school has subscribed",
            "I'm unwell and not working at the moment",
            "I'm retiring"
        ],
        bonus_pattern=r'exit|reason',
        threshold=0.5
    )
    if exit_reason_candidate:
        detected['exit_reason'] = exit_reason_candidate

    # 6. Secondary reason (exact match)
    secondary_reason_candidate = detect_exact_match_column(
        df,
        unclaimed_text_cols(),
        [
            'Customer Service', 'Resource Quality', 'Variety of Materials',
            'Price', 'Ease of Website', 'other'
        ],
        bonus_pattern=r'secondary|reason',
        threshold=0.5
    )
    if secondary_reason_candidate:
        detected['secondary_reason'] = secondary_reason_candidate

    # 7. Date column (runs last so it only sees unclaimed columns)
    date_col = detect_date_column(df, detected)
    if date_col:
        detected['date'] = date_col

    print("Auto-detected columns:", detected)
    print("All columns:", df.columns.tolist())

    return detected
functions/broad_category_priorities.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def assign_priority(count, low_threshold=None, high_threshold=None):
    """Map a category count to a 'High'/'Medium'/'Low' priority label.

    Parameters:
        count: Numeric count for one category.
        low_threshold: Counts at or above this value (but below
            high_threshold) are labeled 'Medium'.
        high_threshold: Counts at or above this value are labeled 'High'.

    Returns:
        str: 'High', 'Medium', or 'Low'.
    """
    # BUG FIX: the original read free variables `high_threshold` and
    # `low_threshold` that are never defined in this module, so every call
    # raised NameError. They are now explicit parameters; the module-global
    # fallback keeps any caller that injected module globals working.
    if low_threshold is None:
        low_threshold = globals()["low_threshold"]
    if high_threshold is None:
        high_threshold = globals()["high_threshold"]

    if count >= high_threshold:
        return 'High'
    if count >= low_threshold:
        return 'Medium'
    return 'Low'
functions/create_cancellation_reasons_table.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def generate_cancellation_reasons_overview(df, source_col):
    """Summarise cancellation reasons into a Category/Count/Percentage/Priority table.

    Priorities are relative: counts at or above the 0.67 quantile of all
    category counts are 'High', at or above the 0.33 quantile 'Medium',
    otherwise 'Low'.

    Parameters:
        df (pd.DataFrame): survey responses.
        source_col (str): column holding the cancellation reason per row.

    Returns:
        pd.DataFrame: one row per reason, sorted by frequency (descending).
    """
    counts = df[source_col].value_counts()
    pct = counts.div(len(df)).mul(100)

    # Relative priority cut-offs derived from the count distribution.
    lo_cut = counts.quantile(0.33)
    hi_cut = counts.quantile(0.67)

    def _priority(n):
        if n >= hi_cut:
            return 'High'
        return 'Medium' if n >= lo_cut else 'Low'

    return pd.DataFrame({
        'Category': counts.index,
        'Count': counts.values,
        'Percentage': pct.round(1).values,
        'Priority': counts.map(_priority).values,
    }).reset_index(drop=True)
functions/language_labeling_translation.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import langid
2
+ import openai
3
+ from typing import Optional
4
+ import streamlit as st
5
+
6
# API key comes from Streamlit's secrets store (never hard-coded in source).
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
7
+
8
+ ## -- DETECT LANGUAGE
9
+
10
def detect_language(text):
    """Classify *text* as English ('en'), non-English ('non-en'), or 'unknown'.

    Uses langid for detection; any classification failure (e.g. non-string
    input) yields 'unknown' instead of propagating the error.
    """
    try:
        lang, _ = langid.classify(text)
        return 'en' if lang == 'en' else 'non-en'
    # BUG FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    except Exception:
        return "unknown"
16
+
17
+ ## -- TRANSLATE TEXT
18
+
19
+
20
+ # Example: Reuse your existing OpenAIWrapper for robust retry logic.
21
+ # from my_wrappers import OpenAIWrapper # Hypothetical import if your wrapper is in a separate module.
22
+
23
def translate_text(
    text: str,
    skip_translation: bool = False,
    translator_model: Optional["OpenAIWrapper"] = None
) -> str:
    """Translate *text* into English through the supplied translator wrapper.

    When *skip_translation* is True, or no *translator_model* is provided,
    the original text is returned untouched. Per the prompt's rules, text
    that is already English (or gibberish) is echoed back by the model.

    Parameters:
        text: The survey response to translate.
        skip_translation: Bypass translation entirely. Defaults to False.
        translator_model: Object exposing a retriable `run(prompt)` method
            (e.g. an OpenAIWrapper instance). None disables translation.

    Returns:
        str: The translated text, or the original when translation is skipped.
    """
    # No-op path: caller opted out, or there is nothing to translate with.
    if skip_translation or translator_model is None:
        return text

    system_prompt = (
        "You are an expert multilingual translator working at a subscription-based EDU publishing company."
    )
    user_prompt_template = """
    Below you will find a survey response from our Exit Survey that is not in English.
    Your goal is to read it carefully to identify the original language,
    and then translate it into English being as true to the original intent as possible.

    ## RULES:
    1. Your output should ONLY contain the translated text.
    Do NOT include any additional text, information, or explanations.
    2. Do NOT wrap your answer in quotation marks.
    3. If the text seems to be in English or you can't identify the language, or the text appears
    to be gibberish, simply return the same exact text you received.

    ## TEXT FOR TRANSLATION:
    {text}
    """

    # Fold system + user messages into the single prompt string the wrapper
    # expects; the wrapper's run() carries the retry/backoff behaviour.
    full_prompt = "\n\n".join((system_prompt, user_prompt_template.format(text=text)))
    return translator_model.run(full_prompt)
80
+
functions/preprocessing_functions.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from typing import Any
4
+
5
# Precompiled pattern: strings made up entirely of non-alphanumeric characters.
SPECIAL_ONLY_REGEX = re.compile(r'^[^A-Za-z0-9]+$')

def is_numeric_or_special(s: Any) -> bool:
    """Return True when *s* is numeric or made up only of special characters.

    Null values (None/NaN) count as neither, i.e. return False.

    Parameters:
        s (Any): The input value to check (coerced to str).

    Returns:
        bool: True for numeric or special-only values, False otherwise.
    """
    if pd.isnull(s):
        return False

    text = str(s).strip()

    # Numeric test: anything float() accepts ("42", "-3.5", "1e6", ...).
    try:
        float(text)
    except ValueError:
        # Not numeric -- fall back to the special-characters-only check.
        return bool(SPECIAL_ONLY_REGEX.match(text))
    return True
35
+
36
def remove_numeric_or_special_responses(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Drop rows whose *target_col* value is numeric or special-characters-only.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        target_col (str): Column screened with :func:`is_numeric_or_special`.

    Returns:
        pd.DataFrame: A new DataFrame with offending rows removed and the
        index reset.
    """
    keep_mask = ~df[target_col].map(is_numeric_or_special)
    return df[keep_mask].reset_index(drop=True)
50
+
51
+
52
+ #####################
53
+ # DATE CONVERT
54
+ #####################
55
+
56
+ import pandas as pd
57
+ import datetime
58
+ from dateutil import parser
59
+
60
+ def robust_convert_date(date_series):
61
+ """
62
+ Convert a pandas Series containing dates in various formats to datetime objects.
63
+
64
+ This function tries:
65
+ 1. The built-in pd.to_datetime() with infer_datetime_format and dayfirst options.
66
+ 2. Falls back to dateutil.parser.parse for any values that remain unparsed.
67
+
68
+ Parameters:
69
+ date_series (pd.Series): A pandas Series with date values (as strings, numbers, etc.)
70
+
71
+ Returns:
72
+ pd.Series: A Series of datetime objects (or pd.NaT if conversion fails)
73
+ """
74
+ def convert_single(x):
75
+ # If the value is already a datetime, just return it.
76
+ if pd.isnull(x):
77
+ return pd.NaT
78
+ if isinstance(x, (pd.Timestamp, datetime.datetime)):
79
+ return x
80
+ # First, try using pd.to_datetime with coercion.
81
+ dt = pd.to_datetime(x, errors='coerce', infer_datetime_format=True, dayfirst=True)
82
+ if pd.notnull(dt):
83
+ return dt
84
+ # Fallback: use dateutil.parser to attempt parsing.
85
+ try:
86
+ return parser.parse(str(x), dayfirst=True)
87
+ except Exception:
88
+ return pd.NaT
89
+
90
+ return date_series.apply(convert_single)
91
+
functions/sentiment_analysis.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textblob import TextBlob
2
+
3
def analyze_sentiment(text):
    """Return the TextBlob polarity score for *text* (range -1.0 .. 1.0)."""
    return TextBlob(text).sentiment.polarity
6
+
7
def label_sentiment(score, threshold=0.2):
    """Bucket a polarity score into 'Positive'/'Negative'/'Neutral'.

    Scores strictly above *threshold* are Positive; of the rest, scores
    below zero are Negative and everything else is Neutral. (The positive
    and negative cut-offs are intentionally asymmetric.)
    """
    if score > threshold:
        return 'Positive'
    return 'Negative' if score < 0 else 'Neutral'
functions/topicModeling_contentRequests.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import re
3
+ import string
4
+ import torch
5
+ import spacy
6
+
7
+ from sentence_transformers import SentenceTransformer
8
+ import nltk
9
+ from nltk.corpus import stopwords
10
+ import contractions
11
+
12
+
13
+ from sklearn.feature_extraction.text import CountVectorizer
14
+ from bertopic import BERTopic
15
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
16
+ import openai
17
+ import numpy as np
18
+
19
# API key comes from Streamlit's secrets store (never hard-coded in source).
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
20
+
21
+
22
+
23
+ """
24
+ -----------------------------------
25
+ Lemmatization & Stopword Removal
26
+ -----------------------------------
27
+
28
+ """
29
def topicModeling_preprocessing(df, spacy_model="en_core_web_lg"):
    """Clean and lemmatize `df['preprocessedBasic']` for topic modeling.

    Adds a 'processedForModeling' column (contractions expanded, stopwords
    removed, lemmatized) and drops rows whose cleaned text ends up empty.

    Parameters:
        df (pd.DataFrame): must contain a 'preprocessedBasic' text column.
        spacy_model (str): name of the spaCy model used for lemmatization.

    Returns:
        pd.DataFrame: df with the new column; empty results dropped.
    """
    # BUG FIX: import locally under an alias. Module-level code later in this
    # file rebinds the name `stopwords` (the nltk.corpus module) to a plain
    # list for the BERTopic vectorizer, which made `stopwords.words(...)`
    # raise AttributeError whenever this function ran after module import.
    from nltk.corpus import stopwords as nltk_stopwords

    base_stopwords = set(nltk_stopwords.words('english'))

    # Domain terms that dominate this corpus without carrying topical signal.
    custom_stopwords = {
        'material', 'materials', 'resources', 'resource', 'activity',
        'activities', 'sheet', 'sheets', 'worksheet', 'worksheets',
        'teacher', 'teachers', 'teach', 'high school', 'highschool',
        'middle school', 'grade', 'grades', 'hs', 'level', 'age', 'ages',
        'older', 'older kid', 'kid', 'student', "1st", "2nd", "3rd", "4th", '5th', '6th',
        '7th', '8th', '9th'
    }

    stopword_set = base_stopwords.union(custom_stopwords)

    # Single alternation matching any stopword on word boundaries; escaping
    # keeps multi-word/punctuated entries from being read as regex syntax.
    stopword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stopword_set) + r')\b'

    nlp = spacy.load(spacy_model)

    def clean_lemmatize_text(text):
        # Non-strings (NaN etc.) become None and are dropped below.
        if not isinstance(text, str):
            return None

        text = contractions.fix(text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(stopword_pattern, '', text)

        doc = nlp(text)
        tokens = [token.lemma_ for token in doc]

        clean_text = " ".join(tokens).strip()
        clean_text = re.sub(r'\s+', ' ', clean_text)

        return clean_text if clean_text else None

    df['processedForModeling'] = df['preprocessedBasic'].apply(clean_lemmatize_text)

    # Drop rows where the cleaned text is empty or None.
    df = df.dropna(subset=['processedForModeling'])

    return df
71
+
72
+ """
73
+ --------------------------
74
+ Load Transformer Model
75
+ --------------------------
76
+ """
77
+
78
@st.cache_resource
def load_embedding_model():
    """Load the sentence-transformer once per session on the best device.

    Device preference: CUDA GPU, then Apple-silicon MPS, then CPU. Cached
    by Streamlit so the model is only loaded once.
    """
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"

    st.write(f"Using device: {device}")
    return SentenceTransformer("paraphrase-mpnet-base-v2", device=device)
89
+
90
+
91
+ """
92
+ -------------------------
93
+ Batch Embedding Creation
94
+ -------------------------
95
+ """
96
+
97
def encode_content_documents(embedding_model, content_documents, batch_size=20):
    """Embed documents in batches and return one stacked (n_docs, dim) array.

    Parameters:
        embedding_model: object exposing `.encode(docs, convert_to_numpy=...,
            show_progress_bar=...)`, e.g. a SentenceTransformer.
        content_documents: sequence of texts to embed.
        batch_size (int): number of documents per encode() call.

    Returns:
        np.ndarray: stacked embeddings, one row per document.
    """
    batches = [
        embedding_model.encode(
            content_documents[start:start + batch_size],
            convert_to_numpy=True,
            show_progress_bar=True,
        )
        for start in range(0, len(content_documents), batch_size)
    ]
    return np.vstack(batches)
106
+
107
+ """
108
+ -----------------------------
109
+ Topic Modeling with BERTopic
110
+ -----------------------------
111
+ """
112
+
113
# Stopword list for the BERTopic CountVectorizer: NLTK English stopwords plus
# domain vocabulary (product/teaching terms) that dominates this corpus
# without carrying topical signal.
# NOTE(review): this rebinds the module-level name `stopwords` from the
# nltk.corpus module to a plain list, so any later call to
# `stopwords.words(...)` in this module would fail -- confirm call order.
stopwords = list(stopwords.words('english')) + [
    'activities',
    'activity',
    'class',
    'classroom',
    'material',
    'materials',
    'membership',
    'memberships',
    'pupil',
    'pupils',
    'resource',
    'resources',
    'sheet',
    'sheets',
    'student',
    'students',
    'subscription',
    'subscriptions',
    'subscribe',
    'subscribed',
    'recommend',
    'recommendation',
    'teach',
    'teacher',
    'teachers',
    'tutor',
    'tutors',
    'twinkl',
    'twinkls',
    'twinkle',
    'worksheet',
    'worksheets',
]
147
+
148
+ ######### --------------- BERTOPIC ----------------- #############
149
@st.cache_resource
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
    """Fit a BERTopic model over pre-computed document embeddings.

    Cached by Streamlit; the underscore-prefixed parameters are excluded
    from the cache-key hashing by convention.

    Parameters:
        docs: documents to model.
        embeddings: pre-computed embeddings aligned with `docs`.
        _embedding_model / _umap_model / _hdbscan_model: pre-built
            components injected into BERTopic.

    Returns:
        tuple: (fitted topic_model, topics, probs) from fit_transform.
    """
    # Keyword-based topic representations: KeyBERT-style main labels plus a
    # diversity-reweighted secondary aspect.
    main_representation_model = KeyBERTInspired()
    aspect_representation_model1 = MaximalMarginalRelevance(diversity=.3)

    # OpenAI Representation Model
    # NOTE(review): `openai_model` is constructed but never added to
    # `representation_model` below, so it is currently unused -- confirm
    # whether it should be wired in as an aspect.
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    prompt = """
    I have a topic that contains the following documents:
    [DOCUMENTS]

    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:

    topic: <topic label>
    """
    openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)

    representation_model = {
        "Main": main_representation_model,
        "Secondary Representation": aspect_representation_model1,
    }

    # Ignore very rare (min_df) and very common (max_df) tokens, plus the
    # module-level domain stopword list.
    vectorizer_model = CountVectorizer(min_df=2, max_df=0.60, stop_words=stopwords)

    # Seed topics for SEN-related vocabulary -- currently disabled below.
    seed_topic_list = [
        ["autism", "special needs", "special education needs", "special education", "adhd", "autistic", "dyslexia", "dyslexic", "sen"],
    ]

    topic_model = BERTopic(
        verbose=True,
        embedding_model=_embedding_model,
        umap_model=_umap_model,
        hdbscan_model = _hdbscan_model,
        vectorizer_model=vectorizer_model,
        #seed_topic_list = seed_topic_list,
        representation_model=representation_model,
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model, topics, probs
192
+
193
+ ##################################
194
+ # TOPIC MERGING
195
+ ##################################
196
+
197
def merge_specific_topics(topic_model, sentences,
                          cancellation_keywords=("cancel", "cancellation", "canceled"),
                          thanks_keywords=("thank", "thanks", "thank you", "thankyou", "ty", "thx"),
                          expensive_keywords=("can't afford", "price", "expensive", "cost")):
    """Merge near-duplicate topics about cancellation, thanks, and price.

    For each keyword group, finds every topic whose name contains any of the
    group's keywords (case-insensitive) and merges the group when it matches
    more than one topic. The outlier topic (-1) is never merged.

    Fixes vs. the original: the duplicate "cancel" entry in the default
    cancellation list is removed (semantically identical regex), keywords
    are regex-escaped before being joined into an alternation, and the
    copy-pasted per-group logic is factored into one helper.

    Parameters:
        topic_model: fitted BERTopic model (mutated in place by merge_topics).
        sentences: the documents the model was fitted on.
        cancellation_keywords / thanks_keywords / expensive_keywords:
            keyword groups identifying mergeable topics.

    Returns:
        The same topic_model, with matching topic groups merged.
    """
    topic_info = topic_model.get_topic_info()

    def _matching_topics(keywords, label):
        # Escape keywords so regex metacharacters cannot alter the match.
        pattern = '|'.join(re.escape(k) for k in keywords)
        hits = topic_info[
            topic_info['Name'].str.contains(pattern, case=False, na=False)
        ]['Topic'].tolist()
        # Exclude the outlier topic (-1) if it appears.
        hits = [t for t in hits if t != -1]
        if len(hits) > 1:
            print(f"Merging {label} topics: {hits}")
        return hits

    topics_to_merge = []
    for keywords, label in (
        (cancellation_keywords, "cancellation"),
        (thanks_keywords, "thank-you"),
        (expensive_keywords, "expensive"),
    ):
        hits = _matching_topics(keywords, label)
        if len(hits) > 1:
            topics_to_merge.append(hits)

    if topics_to_merge:
        topic_model.merge_topics(sentences, topics_to_merge)

    return topic_model
248
+
249
+
250
+ ##################################
251
+ # Topic to Dataframe Mapping
252
+ #################################
253
+
254
def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
    """Attach a 'Topics' column listing each row's detected topic labels.

    Parameters:
        df: source DataFrame (not modified; a copy is returned).
        mapping: for each sentence i, the df row index it came from.
        sentence_topics: topic id assigned to each sentence.
        topic_label_map: topic id -> human-readable label (falls back to
            str(id) for unmapped ids).

    Returns:
        Copy of df with a 'Topics' column of comma-joined, sorted labels;
        the outlier topic (-1) is excluded.
    """
    # Collect the distinct topic ids observed for each source row.
    row_topics = {}
    for sent_idx, row_idx in enumerate(mapping):
        row_topics.setdefault(row_idx, set()).add(sentence_topics[sent_idx])

    def _labels_for(row_idx):
        ids = row_topics.get(row_idx, set())
        names = sorted(topic_label_map.get(t, str(t)) for t in ids if t != -1)
        return ", ".join(names)

    result = df.copy()
    result['Topics'] = result.index.map(_labels_for)
    return result
269
+
plots/overview_charts.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.express as px
2
+
3
# Shared styling defaults for the overview charts below.
legend_font_size=14
xaxis_font_size=16
ticks_size=14
6
+
7
+ ## -- WORD COUNT PLOT
8
+
9
def create_word_count_histogram(df, nbins=40, height=550):
    """Histogram of the 'word-count' column in the brand purple.

    Parameters:
        df: DataFrame with a 'word-count' column.
        nbins (int): number of histogram bins.
        height (int): figure height in pixels.

    Returns:
        plotly Figure.
    """
    fig = px.histogram(
        df,
        x='word-count',
        nbins=nbins,
        title=None,
        color_discrete_sequence=['#646DEF'],
    )
    fig.update_layout(height=height, margin=dict(t=30))
    return fig
25
+
26
+ ## -- SENTIMENT PLOT
27
+
28
def create_sentiment_pie(df, height=450):
    """Donut chart of sentiment label shares (Positive/Neutral/Negative).

    NOTE(review): `height` is accepted but never applied to the layout, and
    the legend styling below is configured even though showlegend=False
    hides it -- confirm both are intentional.
    """
    fig = px.pie(
        df,
        names='sentiment',
        color='sentiment',
        color_discrete_map={ 'Positive':'darkturquoise', 'Neutral':'#646DEF', 'Negative':'red'},
        hole=0.45,
        title=None,
    )

    # Hover shows only the sentiment label.
    fig.update_traces(hovertemplate='%{label}<extra></extra>')

    fig.update_layout(
        showlegend=False,
        margin=dict(r=50),
        legend=dict(
            font=dict(size=legend_font_size),
            orientation="h",
            x=0.5,
            xanchor="center",
        ),
    )
    return fig
52
+
53
+ ## -- CANCELLATION REASONS
54
+
55
def create_cancellation_reasons_plot(cancellation_overview):
    """Bar chart of cancellation reason counts with percentage hover text.

    Parameters:
        cancellation_overview (pd.DataFrame): needs 'Category', 'Count' and
            'Percentage' columns (see generate_cancellation_reasons_overview).

    Returns:
        plotly Figure.
    """
    # FIX: the previous `color_discrete_map` had no effect because no
    # `color=` column was passed to px.bar, so it (and the surrounding
    # commented-out styling experiments) was removed.
    reasons_bar = px.bar(
        cancellation_overview,
        x='Category',
        y='Count',
        color_discrete_sequence=['#646DEF'],
    )

    # Attach percentages so hover shows both absolute and relative share.
    reasons_bar.update_traces(
        customdata=cancellation_overview['Percentage'],
        hovertemplate='Count = %{y}<br>Percentage = %{customdata}%'
    )

    reasons_bar.update_layout(
        height=600,
        xaxis_title="",
        yaxis_title="",
        xaxis=dict(title_font=dict(size=xaxis_font_size), tickfont=dict(size=ticks_size)),
    )

    return reasons_bar
87
+
88
+ ############# Grouped By Career ############
89
+
90
def create_grouped_chart(grouped_df, group_name_col, color_col):
    """Stacked bar chart of counts per group, colored by sentiment label.

    Parameters:
        grouped_df: aggregated DataFrame with a 'count' column.
        group_name_col (str): column to place on the x-axis.
        color_col (str): column used to split/stack the bars.

    Returns:
        plotly Figure.
    """
    fig = px.bar(
        grouped_df,
        x=group_name_col,
        y='count',
        color=color_col,
        color_discrete_map={'Positive':'darkturquoise', 'Neutral':'#646DEF', 'Negative':'red'},
        title=None,
        barmode="stack",
    )

    # Horizontal legend pinned above the top-left of the plot.
    fig.update_layout(
        legend=dict(
            x=-0.05,
            xanchor="left",
            y=1.2,
            yanchor="top",
            orientation='h',
        )
    )
    return fig
plots/topicModeling_charts.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bertopic import BERTopic # Ensure you have BERTopic installed
2
+ import plotly.graph_objects as go # BERTopic visualization uses Plotly
3
+ import plotly.colors as pc
4
+ import plotly.express as px
5
+
6
# Shared axis styling defaults for the topic-modeling charts below.
xaxis_font_size=14
ticks_size=14
8
+
9
+
10
def topicDistribution(topic_model, top_n_topics=6, n_words=5):
    """Per-topic keyword bar charts, recolored with the Plotly palette.

    visualize_barchart colors every subplot the same, so the qualitative
    palette is cycled to give each topic its own color.
    """
    fig = topic_model.visualize_barchart(top_n_topics=top_n_topics, n_words=n_words)
    palette = pc.qualitative.Plotly
    for idx, trace in enumerate(fig.data):
        trace.marker.color = palette[idx % len(palette)]
    fig.update_layout(title_text="")  # drop the default title
    return fig
19
+
20
+
21
+ ####################
22
+ # TOPIC FREQUENCY
23
+ ###################
24
+
25
+
26
def create_topicFreq_chart(topics_df):
    """Bar chart of topic frequencies with the top-5 topic words on hover.

    Parameters:
        topics_df (pd.DataFrame): topic-info table with 'Topic Name' and
            'Count' columns; column position 3 is expected to hold each
            topic's representative words. NOTE(review): positional
            iloc[:, 3] is fragile -- confirm the column order of the
            topic-info frame it is fed.

    Returns:
        plotly Figure.
    """
    # FIX: operate on a copy so the caller's DataFrame is not mutated by
    # adding the derived 'top_5_words' column.
    plot_df = topics_df.copy()
    plot_df['top_5_words'] = plot_df.iloc[:, 3].apply(
        lambda words: ', '.join(words[:5]) if isinstance(words, list) else words
    )

    # "top_5_words" rides along as custom data for the hover template.
    topicFreq_barchart = px.bar(
        plot_df,
        x="Topic Name",
        y="Count",
        custom_data=["top_5_words"],
        title=None,
        labels={"Count": "Frequency", "Topic": "CutomName"},
    )

    # Hover shows the frequency plus the top 5 words of the topic.
    topicFreq_barchart.update_traces(
        marker_color='#646DEF',
        textposition='outside',
        hovertemplate=(
            'Frequency: %{y}<br>'
            'Top 5 words: %{customdata[0]}<extra></extra>'
        )
    )

    topicFreq_barchart.update_layout(
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title="Topic Name",
        yaxis_title="Frequency",
        height=650,
        xaxis=dict(title_font=dict(size=xaxis_font_size), tickfont=dict(size=ticks_size)),
    )

    return topicFreq_barchart
63
+
64
+ ###############################
65
+ # Stacked Topic Freq Per Class
66
+ ###############################
67
+
68
def create_stacked_topics_per_class(df):
    """Stacked bar chart of topic frequency broken down by class.

    Parameters:
        df: DataFrame with 'CustomName', 'Frequency' and 'Class' columns.

    Returns:
        plotly Figure.
    """
    fig = px.bar(
        df,
        x="CustomName",
        y="Frequency",
        color="Class",
        title=None,
        barmode="stack",
        labels={"Count": "Frequency", "Topics":"CustomName"},
    )

    fig.update_layout(
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        xaxis_title="Topic Name",
        yaxis_title="Frequency",
        height=650,
    )
    return fig
89
+
90
+ #######################
91
+ # Intertopic Distance
92
+ #######################
93
+
94
def intertopicDistanceMap(topic_model, color="orangered"):
    """Intertopic distance map with all topic markers in a single color.

    Parameters:
        topic_model: fitted BERTopic model providing visualize_topics().
        color (str): marker color applied to every trace.

    Returns:
        plotly Figure.
    """
    fig = topic_model.visualize_topics(title="")

    # Recolor every marker and remove marker outlines.
    for trace in fig.data:
        trace.marker.color = color
        trace.marker.line.width = 0

    fig.update_layout(margin=dict(r=50))
    return fig
109
+
110
+
111
+
112
+ ##########################
113
+ # Topics Over Time
114
+ #########################
115
+
116
def create_topics_overtime_chart(topics_overtime_df):
    """Line chart of topic frequency over time, one line per topic.

    Parameters:
        topics_overtime_df: DataFrame with 'Timestamp', 'Frequency' and
            'CustomName' columns.

    Returns:
        plotly Figure.
    """
    fig = px.line(
        topics_overtime_df,
        x="Timestamp",
        y="Frequency",
        color="CustomName",
        markers=True,
        title=None,
        labels={"Timestamp": "Time", "Frequency": "Topic Frequency", "Name": "CustomName"},
    )

    fig.update_layout(
        xaxis_title="Time",
        yaxis_title="Frequency",
        legend_title="Topics",
        height=700,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.5,  # push the legend below the plot area
            xanchor="center",
            x=0.5,
        ),
    )
    return fig
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bertopic==0.16.4
2
+ contractions==0.1.73
3
+ hdbscan==0.8.40
4
+ langid==1.1.6
5
+ nltk==3.9.1
6
+ numpy==2.2.3
7
+ openai==1.65.2
8
+ pandas==2.2.3
9
+ plotly==5.24.1
10
+ python_dateutil==2.9.0.post0
11
+ scikit_learn==1.6.1
12
+ sentence_transformers==3.3.1
13
+ spacy==3.8.2
14
+ streamlit==1.42.2
15
+ tenacity==9.0.0
16
+ textblob==0.19.0
17
+ torch==2.5.1
18
+ umap_learn==0.5.7