lynn-twinkl committed on
Commit ·
45c7ceb
1
Parent(s): e9cf842
first commit of tailored model
Browse files
src/models/topicModeling_contentRequests.py
CHANGED
|
@@ -1,45 +1,23 @@
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import re
|
| 3 |
import string
|
| 4 |
import torch
|
| 5 |
import spacy
|
| 6 |
-
|
| 7 |
-
from sentence_transformers import SentenceTransformer
|
| 8 |
-
import nltk
|
| 9 |
-
from nltk.corpus import stopwords
|
| 10 |
-
import contractions
|
| 11 |
from tqdm import tqdm
|
| 12 |
|
| 13 |
-
|
| 14 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 15 |
from bertopic import BERTopic
|
| 16 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
|
| 17 |
-
|
| 18 |
-
import numpy as np
|
| 19 |
|
| 20 |
import os
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
-
load_dotenv(
|
| 23 |
-
|
| 24 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 25 |
|
| 26 |
-
#################################
|
| 27 |
-
# OpenAI Topic Representation
|
| 28 |
-
#################################
|
| 29 |
-
def create_openai_model():
|
| 30 |
-
client = openai.OpenAI(api_key=OPENAI_API_KEY)
|
| 31 |
-
prompt = """
|
| 32 |
-
I have a topic that contains the following documents:
|
| 33 |
-
[DOCUMENTS]
|
| 34 |
-
|
| 35 |
-
The topic is described by the following keywords: [KEYWORDS]
|
| 36 |
-
|
| 37 |
-
Based on the information above, extract a short yet descriptive topic label of at most 4 words. The labels should be interpretable enough to stakeholders that don't have access to the raw data. Make sure it is in the following format:
|
| 38 |
-
|
| 39 |
-
topic: <topic label>
|
| 40 |
-
"""
|
| 41 |
-
openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)
|
| 42 |
-
return openai_model
|
| 43 |
|
| 44 |
#############################################
|
| 45 |
# Convert OpenAI Representation to CustomName
|
|
@@ -50,60 +28,13 @@ def ai_labeles_to_custom_name(model):
|
|
| 50 |
chatgpt_topic_labels[-1] = "Outlier Topic"
|
| 51 |
model.set_topic_labels(chatgpt_topic_labels)
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
-----------------------------------
|
| 57 |
-
|
| 58 |
-
"""
|
| 59 |
-
def topicModeling_preprocessing(df, spacy_model="en_core_web_lg"):
|
| 60 |
-
|
| 61 |
-
base_stopwords = set(stopwords.words('english'))
|
| 62 |
-
|
| 63 |
-
custom_stopwords = {
|
| 64 |
-
'material', 'materials', 'resources', 'resource', 'activity',
|
| 65 |
-
'activities', 'sheet', 'sheets', 'worksheet', 'worksheets',
|
| 66 |
-
'teacher', 'teachers', 'teach', 'high school', 'highschool',
|
| 67 |
-
'middle school', 'grade', 'grades', 'hs', 'level', 'age', 'ages',
|
| 68 |
-
'older', 'older kid', 'kid', 'student', "1st", "2nd", "3rd", "4th", '5th', '6th',
|
| 69 |
-
'7th', '8th', '9th'
|
| 70 |
-
}
|
| 71 |
-
|
| 72 |
-
stopword_set = base_stopwords.union(custom_stopwords)
|
| 73 |
-
|
| 74 |
-
stopword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stopword_set) + r')\b'
|
| 75 |
-
|
| 76 |
-
nlp = spacy.load(spacy_model)
|
| 77 |
|
| 78 |
-
def clean_lemmatize_text(text):
|
| 79 |
-
if not isinstance(text, str):
|
| 80 |
-
return None
|
| 81 |
-
|
| 82 |
-
text = contractions.fix(text)
|
| 83 |
-
text = re.sub(r'\s+', ' ', text).strip()
|
| 84 |
-
text = re.sub(stopword_pattern, '', text)
|
| 85 |
|
| 86 |
-
doc = nlp(text)
|
| 87 |
-
tokens = [token.lemma_ for token in doc]
|
| 88 |
-
|
| 89 |
-
clean_text = " ".join(tokens).strip()
|
| 90 |
-
clean_text = re.sub(r'\s+', ' ', clean_text)
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
df['processedForModeling'] = df['preprocessedBasic'].apply(clean_lemmatize_text)
|
| 96 |
-
|
| 97 |
-
# Drop rows where cleaned text is empty or None
|
| 98 |
-
df = df.dropna(subset=['processedForModeling'])
|
| 99 |
-
|
| 100 |
-
return df
|
| 101 |
-
|
| 102 |
-
"""
|
| 103 |
-
--------------------------
|
| 104 |
-
Load Transformer Model
|
| 105 |
-
--------------------------
|
| 106 |
-
"""
|
| 107 |
|
| 108 |
def load_embedding_model(model_name):
|
| 109 |
if torch.cuda.is_available():
|
|
@@ -117,11 +48,7 @@ def load_embedding_model(model_name):
|
|
| 117 |
return SentenceTransformer(model_name, device=device)
|
| 118 |
|
| 119 |
|
| 120 |
-
|
| 121 |
-
-------------------------
|
| 122 |
-
Batch Embedding Creation
|
| 123 |
-
-------------------------
|
| 124 |
-
"""
|
| 125 |
|
| 126 |
def encode_content_documents(embedding_model, content_documents, batch_size=20):
|
| 127 |
embeddings_batches = []
|
|
@@ -136,155 +63,79 @@ def encode_content_documents(embedding_model, content_documents, batch_size=20):
|
|
| 136 |
|
| 137 |
return np.vstack(embeddings_batches)
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
"""
|
| 144 |
-
|
| 145 |
-
try:
|
| 146 |
-
nltk.data.find("corpora/stopwords")
|
| 147 |
-
except LookupError:
|
| 148 |
-
nltk.download("stopwords")
|
| 149 |
-
|
| 150 |
-
stopwords = list(stopwords.words('english')) + [
|
| 151 |
-
'activities',
|
| 152 |
-
'activity',
|
| 153 |
-
'class',
|
| 154 |
-
'classroom',
|
| 155 |
-
'material',
|
| 156 |
-
'materials',
|
| 157 |
-
'membership',
|
| 158 |
-
'memberships',
|
| 159 |
-
'pupil',
|
| 160 |
-
'pupils',
|
| 161 |
-
'resource',
|
| 162 |
-
'resources',
|
| 163 |
-
'sheet',
|
| 164 |
-
'sheets',
|
| 165 |
-
'student',
|
| 166 |
-
'students',
|
| 167 |
-
'subscription',
|
| 168 |
-
'subscriptions',
|
| 169 |
-
'subscribe',
|
| 170 |
-
'subscribed',
|
| 171 |
-
'recommend',
|
| 172 |
-
'recommendation',
|
| 173 |
-
'teach',
|
| 174 |
-
'teacher',
|
| 175 |
-
'teachers',
|
| 176 |
-
'tutor',
|
| 177 |
-
'tutors',
|
| 178 |
-
'twinkl',
|
| 179 |
-
'twinkls',
|
| 180 |
-
'twinkle',
|
| 181 |
-
'worksheet',
|
| 182 |
-
'worksheets',
|
| 183 |
-
]
|
| 184 |
-
|
| 185 |
-
######### --------------- BERTOPIC ----------------- #############
|
| 186 |
-
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
|
| 187 |
-
|
| 188 |
-
main_representation_model = KeyBERTInspired()
|
| 189 |
-
aspect_representation_model1 = MaximalMarginalRelevance(diversity=.3)
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
|
| 197 |
-
|
| 198 |
|
| 199 |
-
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
representation_model = {
|
| 206 |
"Main": main_representation_model,
|
| 207 |
-
"
|
| 208 |
}
|
| 209 |
|
| 210 |
-
vectorizer_model = CountVectorizer(min_df=2, max_df=0.60, stop_words=stopwords)
|
| 211 |
|
| 212 |
-
|
| 213 |
-
["autism", "special needs", "special education needs", "special education", "adhd", "autistic", "dyslexia", "dyslexic", "sen"],
|
| 214 |
-
]
|
| 215 |
|
| 216 |
topic_model = BERTopic(
|
| 217 |
-
verbose=
|
| 218 |
-
embedding_model=_embedding_model,
|
| 219 |
umap_model=_umap_model,
|
| 220 |
-
|
| 221 |
vectorizer_model=vectorizer_model,
|
| 222 |
-
|
| 223 |
-
|
|
|
|
| 224 |
)
|
| 225 |
-
|
| 226 |
topics, probs = topic_model.fit_transform(docs, embeddings)
|
| 227 |
-
return topic_model, topics, probs
|
| 228 |
|
| 229 |
-
|
| 230 |
-
# TOPIC MERGING
|
| 231 |
-
##################################
|
| 232 |
-
|
| 233 |
-
def merge_specific_topics(topic_model, sentences,
|
| 234 |
-
cancellation_keywords=["cancel", "cancellation", "cancel", "canceled"],
|
| 235 |
-
thanks_keywords=["thank", "thanks", "thank you", "thankyou", "ty", "thx"],
|
| 236 |
-
expensive_keywords=["can't afford", "price", "expensive", "cost"]):
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
topic_info = topic_model.get_topic_info()
|
| 240 |
-
|
| 241 |
-
# Identify cancellation-related topics by checking if any cancellation keyword appears in the topic name.
|
| 242 |
-
cancellation_regex = '|'.join(cancellation_keywords)
|
| 243 |
-
cancellation_topics = topic_info[
|
| 244 |
-
topic_info['Name'].str.contains(cancellation_regex, case=False, na=False)
|
| 245 |
-
]['Topic'].tolist()
|
| 246 |
-
|
| 247 |
-
# Identify thank-you-related topics similarly.
|
| 248 |
-
thanks_regex = '|'.join(thanks_keywords)
|
| 249 |
-
thanks_topics = topic_info[
|
| 250 |
-
topic_info['Name'].str.contains(thanks_regex, case=False, na=False)
|
| 251 |
-
]['Topic'].tolist()
|
| 252 |
-
|
| 253 |
-
# Identify expensive-related topics.
|
| 254 |
-
expensive_regex = '|'.join(expensive_keywords)
|
| 255 |
-
expensive_topics = topic_info[
|
| 256 |
-
topic_info['Name'].str.contains(expensive_regex, case=False, na=False)
|
| 257 |
-
]['Topic'].tolist()
|
| 258 |
-
|
| 259 |
-
# Exclude the outlier topic (-1) if it appears.
|
| 260 |
-
cancellation_topics = [t for t in cancellation_topics if t != -1]
|
| 261 |
-
thanks_topics = [t for t in thanks_topics if t != -1]
|
| 262 |
-
expensive_topics = [t for t in expensive_topics if t != -1]
|
| 263 |
-
|
| 264 |
-
# Create a list of topics to merge
|
| 265 |
-
topics_to_merge = []
|
| 266 |
-
|
| 267 |
-
if len(cancellation_topics) > 1:
|
| 268 |
-
print(f"Merging cancellation topics: {cancellation_topics}")
|
| 269 |
-
topics_to_merge.append(cancellation_topics)
|
| 270 |
-
|
| 271 |
-
if len(thanks_topics) > 1:
|
| 272 |
-
print(f"Merging thank-you topics: {thanks_topics}")
|
| 273 |
-
topics_to_merge.append(thanks_topics)
|
| 274 |
-
|
| 275 |
-
if len(expensive_topics) > 1:
|
| 276 |
-
print(f"Merging expensive topics: {expensive_topics}")
|
| 277 |
-
topics_to_merge.append(expensive_topics)
|
| 278 |
-
|
| 279 |
-
# Call merge_topics
|
| 280 |
-
if topics_to_merge:
|
| 281 |
-
topic_model.merge_topics(sentences, topics_to_merge)
|
| 282 |
-
|
| 283 |
-
return topic_model
|
| 284 |
|
| 285 |
|
| 286 |
##################################
|
| 287 |
-
#
|
| 288 |
#################################
|
| 289 |
|
| 290 |
def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
|
|
|
|
| 1 |
+
import openai
|
| 2 |
+
import numpy as np
|
| 3 |
import streamlit as st
|
| 4 |
import re
|
| 5 |
import string
|
| 6 |
import torch
|
| 7 |
import spacy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
+
from sentence_transformers import SentenceTransformer
|
| 11 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 12 |
from bertopic import BERTopic
|
| 13 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
|
| 14 |
+
|
|
|
|
| 15 |
|
| 16 |
import os
|
| 17 |
from dotenv import load_dotenv
|
| 18 |
+
load_dotenv()
|
|
|
|
| 19 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
#############################################
|
| 23 |
# Convert OpenAI Representation to CustomName
|
|
|
|
| 28 |
chatgpt_topic_labels[-1] = "Outlier Topic"
|
| 29 |
model.set_topic_labels(chatgpt_topic_labels)
|
| 30 |
|
| 31 |
+
###################################
|
| 32 |
+
# HELPER FUNCTIONS
|
| 33 |
+
###################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
## -------- LOAD TRANSFORMER MODEL -------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def load_embedding_model(model_name):
|
| 40 |
if torch.cuda.is_available():
|
|
|
|
| 48 |
return SentenceTransformer(model_name, device=device)
|
| 49 |
|
| 50 |
|
| 51 |
+
## ------------ GENERATE EMBEDDINGS ------------
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
def encode_content_documents(embedding_model, content_documents, batch_size=20):
|
| 54 |
embeddings_batches = []
|
|
|
|
| 63 |
|
| 64 |
return np.vstack(embeddings_batches)
|
| 65 |
|
| 66 |
+
## ------- BUILDING STOPWORDS LIST ------
|
| 67 |
+
|
| 68 |
+
# Base stop-word list, copied into a plain list so it can be extended below.
# NOTE(review): no `nlp` binding is visible in this part of the module --
# presumably a loaded spaCy pipeline; confirm it is defined before this line
# runs, otherwise importing the module raises NameError.
stopwords = list(nlp.Defaults.stop_words)
|
| 69 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
# Domain-specific terms to filter out in addition to the base stop-word list.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which previously fused 'thank' and
# 'children' into the single useless entry 'thankchildren'.
# NOTE(review): 'thank you' is a two-word entry; scikit-learn's
# CountVectorizer appears to filter stop words token by token before building
# n-grams, so this entry may never match -- confirm.
custom_stopwords = [
    'thank you', 'thankyou', 'thanks', 'thank',
    'children', 'child', 'students',
    'twinkl',
    'funding',
]
|
| 78 |
|
| 79 |
+
# Append the custom terms in place; the combined list is what
# `bertopic_model` hands to CountVectorizer(stop_words=...).
stopwords.extend(custom_stopwords)
|
| 80 |
|
| 81 |
+
|
| 82 |
+
## --------- INSTANTIATE OPENAI MODEL ---------
|
| 83 |
+
|
| 84 |
+
def create_openai_model():
    """Build the OpenAI-backed BERTopic representation used to label topics.

    Returns:
        A `bertopic.representation.OpenAI` instance configured for the
        "gpt-4.1-nano" chat model with exponential backoff, a per-topic user
        prompt, and a system prompt describing the labelling task.
    """
    # The key is read from the environment at import time (OPENAI_API_KEY).
    api_client = openai.OpenAI(api_key=OPENAI_API_KEY)

    # Per-topic prompt. [DOCUMENTS] and [KEYWORDS] are the placeholder tags
    # documented by BERTopic's OpenAI representation.
    topic_prompt = """
This topic contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short yet descriptive topic label.
"""

    # Task-level instructions sent as the system prompt.
    labelling_instructions = """ **Task**: As a topic modeling expert, your responsibility is to generate concise yet comprehensive topic labels from rows in a BertTopic `topic_info` dataframe. These topics have been derived from grant application forms submitted by schools, tutors, or other institutions participating in Twinkl giveaways.\n\n**Objective**: Your goal is to create labels for the extracted topics that accurately and clearly describe each topic within the specified context. These labels should be easily interpretable by the members of the Community Collections team.\n\n**Instructions**: \n\n1. **Understand the Context**: The topics relate to grant applications and are relevant to educational institutions participating in Twinkl giveaways.\n\n2. **Generate Labels**:\n- Create labels that are short yet capture the essence of each topic.\n- Ensure that the labels are contextually appropriate and provide clarity.\n- Focus on making the labels easily understandable for the Community Collections team. \n\n3. **Considerations**:\n- Each label should succinctly convey the main idea of the topic.\n- Avoid overly technical language unless necessary for precision.\n- Ensure the labels align with the overall educational and grant-related context."""

    return OpenAI(
        api_client,
        model="gpt-4.1-nano",
        exponential_backoff=True,
        chat=True,
        prompt=topic_prompt,
        system_prompt=labelling_instructions,
    )
|
| 103 |
+
|
| 104 |
+
#############################
|
| 105 |
+
# BERTOPIC MODELING
|
| 106 |
+
#############################
|
| 107 |
+
|
| 108 |
+
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
    """Fit a BERTopic model on documents with pre-computed embeddings.

    Args:
        docs: Documents to model; passed to `fit_transform`.
        embeddings: Pre-computed embeddings for `docs`, passed to
            `fit_transform` alongside them.
        _embedding_model: Forwarded to BERTopic as `embedding_model`.
        _umap_model: Forwarded to BERTopic as `umap_model`.
        _hdbscan_model: Forwarded to BERTopic as `hdbscan_model`.
            NOTE(review): the leading underscores presumably keep Streamlit's
            cache from hashing these arguments -- confirm at the call site.

    Returns:
        A `(topic_model, topics, probs)` tuple: the fitted BERTopic instance
        and the two values returned by `fit_transform`.
    """
    main_representation_model = MaximalMarginalRelevance(diversity=.3)
    openai_model = create_openai_model()

    # "Main" supplies the default topic keywords; "OpenAI" is an additional
    # aspect holding the generated labels.
    representation_model = {
        "Main": main_representation_model,
        "OpenAI": openai_model,
    }

    # Uni- and bi-grams, filtered with the module-level stop-word list.
    vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1, 2))

    topic_model = BERTopic(
        verbose=True,  # was `true`, an undefined name in Python
        umap_model=_umap_model,
        representation_model=representation_model,  # trailing comma was missing
        vectorizer_model=vectorizer_model,
        hdbscan_model=_hdbscan_model,
        embedding_model=_embedding_model,
        nr_topics='auto',
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)

    return topic_model, topics, probs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
##################################
|
| 138 |
+
# TOPIC TO DATAFRAME MAPPING
|
| 139 |
#################################
|
| 140 |
|
| 141 |
def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
|