lynn-twinkl commited on
Commit
45c7ceb
·
1 Parent(s): e9cf842

first commit of tailored model

Browse files
src/models/topicModeling_contentRequests.py CHANGED
@@ -1,45 +1,23 @@
 
 
1
  import streamlit as st
2
  import re
3
  import string
4
  import torch
5
  import spacy
6
-
7
- from sentence_transformers import SentenceTransformer
8
- import nltk
9
- from nltk.corpus import stopwords
10
- import contractions
11
  from tqdm import tqdm
12
 
13
-
14
  from sklearn.feature_extraction.text import CountVectorizer
15
  from bertopic import BERTopic
16
  from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
17
- import openai
18
- import numpy as np
19
 
20
  import os
21
  from dotenv import load_dotenv
22
- load_dotenv(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../",".env")))
23
-
24
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
25
 
26
- #################################
27
- # OpenAI Topic Representation
28
- #################################
29
- def create_openai_model():
30
- client = openai.OpenAI(api_key=OPENAI_API_KEY)
31
- prompt = """
32
- I have a topic that contains the following documents:
33
- [DOCUMENTS]
34
-
35
- The topic is described by the following keywords: [KEYWORDS]
36
-
37
- Based on the information above, extract a short yet descriptive topic label of at most 4 words. The labels should be interpretable enough to stakeholders that don't have access to the raw data. Make sure it is in the following format:
38
-
39
- topic: <topic label>
40
- """
41
- openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)
42
- return openai_model
43
 
44
  #############################################
45
  # Convert OpenAI Representation to CustomName
@@ -50,60 +28,13 @@ def ai_labeles_to_custom_name(model):
50
  chatgpt_topic_labels[-1] = "Outlier Topic"
51
  model.set_topic_labels(chatgpt_topic_labels)
52
 
53
- """
54
- -----------------------------------
55
- Lemmatization & Stopword Removal
56
- -----------------------------------
57
-
58
- """
59
- def topicModeling_preprocessing(df, spacy_model="en_core_web_lg"):
60
-
61
- base_stopwords = set(stopwords.words('english'))
62
-
63
- custom_stopwords = {
64
- 'material', 'materials', 'resources', 'resource', 'activity',
65
- 'activities', 'sheet', 'sheets', 'worksheet', 'worksheets',
66
- 'teacher', 'teachers', 'teach', 'high school', 'highschool',
67
- 'middle school', 'grade', 'grades', 'hs', 'level', 'age', 'ages',
68
- 'older', 'older kid', 'kid', 'student', "1st", "2nd", "3rd", "4th", '5th', '6th',
69
- '7th', '8th', '9th'
70
- }
71
-
72
- stopword_set = base_stopwords.union(custom_stopwords)
73
-
74
- stopword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stopword_set) + r')\b'
75
-
76
- nlp = spacy.load(spacy_model)
77
 
78
- def clean_lemmatize_text(text):
79
- if not isinstance(text, str):
80
- return None
81
-
82
- text = contractions.fix(text)
83
- text = re.sub(r'\s+', ' ', text).strip()
84
- text = re.sub(stopword_pattern, '', text)
85
 
86
- doc = nlp(text)
87
- tokens = [token.lemma_ for token in doc]
88
-
89
- clean_text = " ".join(tokens).strip()
90
- clean_text = re.sub(r'\s+', ' ', clean_text)
91
 
92
- return clean_text if clean_text else None
93
-
94
-
95
- df['processedForModeling'] = df['preprocessedBasic'].apply(clean_lemmatize_text)
96
-
97
- # Drop rows where cleaned text is empty or None
98
- df = df.dropna(subset=['processedForModeling'])
99
-
100
- return df
101
-
102
- """
103
- --------------------------
104
- Load Transformer Model
105
- --------------------------
106
- """
107
 
108
  def load_embedding_model(model_name):
109
  if torch.cuda.is_available():
@@ -117,11 +48,7 @@ def load_embedding_model(model_name):
117
  return SentenceTransformer(model_name, device=device)
118
 
119
 
120
- """
121
- -------------------------
122
- Batch Embedding Creation
123
- -------------------------
124
- """
125
 
126
  def encode_content_documents(embedding_model, content_documents, batch_size=20):
127
  embeddings_batches = []
@@ -136,155 +63,79 @@ def encode_content_documents(embedding_model, content_documents, batch_size=20):
136
 
137
  return np.vstack(embeddings_batches)
138
 
139
- """
140
- -----------------------------
141
- Topic Modeling with BERTopic
142
- -----------------------------
143
- """
144
-
145
- try:
146
- nltk.data.find("corpora/stopwords")
147
- except LookupError:
148
- nltk.download("stopwords")
149
-
150
- stopwords = list(stopwords.words('english')) + [
151
- 'activities',
152
- 'activity',
153
- 'class',
154
- 'classroom',
155
- 'material',
156
- 'materials',
157
- 'membership',
158
- 'memberships',
159
- 'pupil',
160
- 'pupils',
161
- 'resource',
162
- 'resources',
163
- 'sheet',
164
- 'sheets',
165
- 'student',
166
- 'students',
167
- 'subscription',
168
- 'subscriptions',
169
- 'subscribe',
170
- 'subscribed',
171
- 'recommend',
172
- 'recommendation',
173
- 'teach',
174
- 'teacher',
175
- 'teachers',
176
- 'tutor',
177
- 'tutors',
178
- 'twinkl',
179
- 'twinkls',
180
- 'twinkle',
181
- 'worksheet',
182
- 'worksheets',
183
- ]
184
-
185
- ######### --------------- BERTOPIC ----------------- #############
186
- def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
187
-
188
- main_representation_model = KeyBERTInspired()
189
- aspect_representation_model1 = MaximalMarginalRelevance(diversity=.3)
190
 
191
- # OpenAI Representation Model
192
- client = openai.OpenAI(api_key=OPENAI_API_KEY)
193
- prompt = """
194
- I have a topic that contains the following documents:
195
- [DOCUMENTS]
196
 
197
- The topic is described by the following keywords: [KEYWORDS]
198
 
199
- Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
200
 
201
- topic: <topic label>
202
- """
203
- openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  representation_model = {
206
  "Main": main_representation_model,
207
- "Secondary Representation": aspect_representation_model1,
208
  }
209
 
210
- vectorizer_model = CountVectorizer(min_df=2, max_df=0.60, stop_words=stopwords)
211
 
212
- seed_topic_list = [
213
- ["autism", "special needs", "special education needs", "special education", "adhd", "autistic", "dyslexia", "dyslexic", "sen"],
214
- ]
215
 
216
  topic_model = BERTopic(
217
- verbose=True,
218
- embedding_model=_embedding_model,
219
  umap_model=_umap_model,
220
- hdbscan_model = _hdbscan_model,
221
  vectorizer_model=vectorizer_model,
222
- #seed_topic_list = seed_topic_list,
223
- representation_model=representation_model,
 
224
  )
225
-
226
  topics, probs = topic_model.fit_transform(docs, embeddings)
227
- return topic_model, topics, probs
228
 
229
- ##################################
230
- # TOPIC MERGING
231
- ##################################
232
-
233
- def merge_specific_topics(topic_model, sentences,
234
- cancellation_keywords=["cancel", "cancellation", "cancel", "canceled"],
235
- thanks_keywords=["thank", "thanks", "thank you", "thankyou", "ty", "thx"],
236
- expensive_keywords=["can't afford", "price", "expensive", "cost"]):
237
-
238
-
239
- topic_info = topic_model.get_topic_info()
240
-
241
- # Identify cancellation-related topics by checking if any cancellation keyword appears in the topic name.
242
- cancellation_regex = '|'.join(cancellation_keywords)
243
- cancellation_topics = topic_info[
244
- topic_info['Name'].str.contains(cancellation_regex, case=False, na=False)
245
- ]['Topic'].tolist()
246
-
247
- # Identify thank-you-related topics similarly.
248
- thanks_regex = '|'.join(thanks_keywords)
249
- thanks_topics = topic_info[
250
- topic_info['Name'].str.contains(thanks_regex, case=False, na=False)
251
- ]['Topic'].tolist()
252
-
253
- # Identify expensive-related topics.
254
- expensive_regex = '|'.join(expensive_keywords)
255
- expensive_topics = topic_info[
256
- topic_info['Name'].str.contains(expensive_regex, case=False, na=False)
257
- ]['Topic'].tolist()
258
-
259
- # Exclude the outlier topic (-1) if it appears.
260
- cancellation_topics = [t for t in cancellation_topics if t != -1]
261
- thanks_topics = [t for t in thanks_topics if t != -1]
262
- expensive_topics = [t for t in expensive_topics if t != -1]
263
-
264
- # Create a list of topics to merge
265
- topics_to_merge = []
266
-
267
- if len(cancellation_topics) > 1:
268
- print(f"Merging cancellation topics: {cancellation_topics}")
269
- topics_to_merge.append(cancellation_topics)
270
-
271
- if len(thanks_topics) > 1:
272
- print(f"Merging thank-you topics: {thanks_topics}")
273
- topics_to_merge.append(thanks_topics)
274
-
275
- if len(expensive_topics) > 1:
276
- print(f"Merging expensive topics: {expensive_topics}")
277
- topics_to_merge.append(expensive_topics)
278
-
279
- # Call merge_topics
280
- if topics_to_merge:
281
- topic_model.merge_topics(sentences, topics_to_merge)
282
-
283
- return topic_model
284
 
285
 
286
  ##################################
287
- # Topic to Dataframe Mapping
288
  #################################
289
 
290
  def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
 
1
+ import openai
2
+ import numpy as np
3
  import streamlit as st
4
  import re
5
  import string
6
  import torch
7
  import spacy
 
 
 
 
 
8
  from tqdm import tqdm
9
 
10
+ from sentence_transformers import SentenceTransformer
11
  from sklearn.feature_extraction.text import CountVectorizer
12
  from bertopic import BERTopic
13
  from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
14
+
 
15
 
16
  import os
17
  from dotenv import load_dotenv
18
+ load_dotenv()
 
19
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  #############################################
23
  # Convert OpenAI Representation to CustomName
 
28
  chatgpt_topic_labels[-1] = "Outlier Topic"
29
  model.set_topic_labels(chatgpt_topic_labels)
30
 
31
+ ###################################
32
+ # HELPER FUNCTIONS
33
+ ###################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
 
 
 
 
35
 
 
 
 
 
 
36
 
37
+ ## -------- LOAD TRANSFORMER MODEL -------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def load_embedding_model(model_name):
40
  if torch.cuda.is_available():
 
48
  return SentenceTransformer(model_name, device=device)
49
 
50
 
51
+ ## ------------ GENERATE EMBEDDINGS ------------
 
 
 
 
52
 
53
  def encode_content_documents(embedding_model, content_documents, batch_size=20):
54
  embeddings_batches = []
 
63
 
64
  return np.vstack(embeddings_batches)
65
 
66
## ------- BUILDING STOPWORDS LIST ------

# NOTE(review): relies on a module-level spaCy pipeline `nlp` (e.g. from
# spacy.load(...)) being defined earlier in this file — confirm it exists,
# otherwise this raises NameError at import time.
stopwords = list(nlp.Defaults.stop_words)

# Domain-specific terms that carry no topical signal in grant-application
# text, appended to spaCy's defaults.
custom_stopwords = [
    # fix: a comma was missing after 'thank', so 'thank' and 'children'
    # were implicitly concatenated into the single bogus token
    # 'thankchildren' and neither intended stopword was registered.
    'thank you', 'thankyou', 'thanks', 'thank',
    'children', 'child', 'students',
    'twinkl',
    'funding',
]

stopwords.extend(custom_stopwords)
80
 
81
+
82
## --------- INSTANTIATE OPENAI MODEL ---------

def create_openai_model():
    """Construct a BERTopic OpenAI representation model for topic labelling.

    Reads the module-level OPENAI_API_KEY, builds an openai client, and
    wires it into a ``bertopic.representation.OpenAI`` instance that asks
    gpt-4.1-nano (chat mode, with exponential backoff) for a short label
    per topic.

    Returns:
        bertopic.representation.OpenAI: configured representation model.
    """
    api_client = openai.OpenAI(api_key=OPENAI_API_KEY)

    # Per-topic user prompt; [DOCUMENTS] / [KEYWORDS] are BERTopic placeholders.
    label_prompt = """
This topic contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short yet descriptive topic label.
"""

    # System prompt framing the labelling task for the grant-application domain.
    labeler_system_prompt = """ **Task**: As a topic modeling expert, your responsibility is to generate concise yet comprehensive topic labels from rows in a BertTopic `topic_info` dataframe. These topics have been derived from grant application forms submitted by schools, tutors, or other institutions participating in Twinkl giveaways.\n\n**Objective**: Your goal is to create labels for the extracted topics that accurately and clearly describe each topic within the specified context. These labels should be easily interpretable by the members of the Community Collections team.\n\n**Instructions**: \n\n1. **Understand the Context**: The topics relate to grant applications and are relevant to educational institutions participating in Twinkl giveaways.\n\n2. **Generate Labels**:\n- Create labels that are short yet capture the essence of each topic.\n- Ensure that the labels are contextually appropriate and provide clarity.\n- Focus on making the labels easily understandable for the Community Collections team. \n\n3. **Considerations**:\n- Each label should succinctly convey the main idea of the topic.\n- Avoid overly technical language unless necessary for precision.\n- Ensure the labels align with the overall educational and grant-related context."""

    return OpenAI(
        api_client,
        model="gpt-4.1-nano",
        exponential_backoff=True,
        chat=True, prompt=label_prompt,
        system_prompt=labeler_system_prompt
    )
103
+
104
+ #############################
105
+ # BERTOPIC MODELING
106
+ #############################
107
+
108
+ def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
109
+
110
+ main_representation_model = MaximalMarginalRelevance(diversity=.3)
111
+
112
+ openai_model = create_openai_model()
113
 
114
  representation_model = {
115
  "Main": main_representation_model,
116
+ "OpenAI": openai_model
117
  }
118
 
 
119
 
120
+ vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1,2))
 
 
121
 
122
  topic_model = BERTopic(
123
+ verbose=true,
 
124
  umap_model=_umap_model,
125
+ representation_model=representation_model
126
  vectorizer_model=vectorizer_model,
127
+ hdbscan_model=_hdbscan_model,
128
+ embedding_model=_embedding_model,
129
+ nr_topics='auto'
130
  )
131
+
132
  topics, probs = topic_model.fit_transform(docs, embeddings)
 
133
 
134
+ return topic_model, topics, probs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
  ##################################
138
+ # TOPIC TO DATAFRAME MAPPING
139
  #################################
140
 
141
  def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):