lynn-twinkl committed on
Commit
2e31929
·
1 Parent(s): 11d9a88

changed to KeyBERTInspired model; light refactoring

Browse files
src/models/topic_modeling_pipeline.py CHANGED
@@ -12,7 +12,7 @@ from tqdm import tqdm
12
  from sentence_transformers import SentenceTransformer
13
  from sklearn.feature_extraction.text import CountVectorizer
14
  from bertopic import BERTopic
15
- from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
16
 
17
 
18
  import os
@@ -25,16 +25,11 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
25
  # Convert OpenAI Representation to CustomName
26
  #############################################
27
 
28
- def ai_labeles_to_custom_name(model):
29
- chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in model.topic_aspects_["OpenAI"].items()}
30
- chatgpt_topic_labels[-1] = "Outlier Topic"
31
- model.set_topic_labels(chatgpt_topic_labels)
32
-
33
  ###################################
34
  # HELPER FUNCTIONS
35
  ###################################
36
 
37
- ## ---------- LOAD SPACY MODEL -------
38
 
39
  def load_spacy_model(model_name="en_core_web_md"):
40
  """
@@ -47,6 +42,12 @@ def load_spacy_model(model_name="en_core_web_md"):
47
 
48
  return spacy.load(model_name)
49
 
 
 
 
 
 
 
50
 
51
  ## -------- LOAD TRANSFORMER MODEL -------
52
 
@@ -116,13 +117,21 @@ def create_openai_model():
116
  )
117
  return openai_model
118
 
 
 
 
 
 
 
 
 
119
  #############################
120
  # BERTOPIC MODELING
121
  #############################
122
 
123
  def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
124
 
125
- main_representation_model = MaximalMarginalRelevance(diversity=.3)
126
 
127
  openai_model = create_openai_model()
128
 
@@ -135,7 +144,7 @@ def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_mod
135
  vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1,2))
136
 
137
  topic_model = BERTopic(
138
- verbose=true,
139
  umap_model=_umap_model,
140
  representation_model=representation_model,
141
  vectorizer_model=vectorizer_model,
@@ -149,6 +158,8 @@ def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_mod
149
  return topic_model, topics, probs
150
 
151
 
 
 
152
  ##################################
153
  # TOPIC TO DATAFRAME MAPPING
154
  #################################
 
12
  from sentence_transformers import SentenceTransformer
13
  from sklearn.feature_extraction.text import CountVectorizer
14
  from bertopic import BERTopic
15
+ from bertopic.representation import KeyBERTInspired, OpenAI
16
 
17
 
18
  import os
 
25
  # Convert OpenAI Representation to CustomName
26
  #############################################
27
 
 
 
 
 
 
28
  ###################################
29
  # HELPER FUNCTIONS
30
  ###################################
31
 
32
+ ## ---------- LOAD SPACY MODEL ---------
33
 
34
  def load_spacy_model(model_name="en_core_web_md"):
35
  """
 
42
 
43
  return spacy.load(model_name)
44
 
45
+ ## -------- SENTENCE TOKENIZER -------
46
+
47
+ def spacy_sent_tokenize(text):
48
+ doc = nlp(text)
49
+ sentences = [sent.text.strip() for sent in doc.sents]
50
+ return sentences
51
 
52
  ## -------- LOAD TRANSFORMER MODEL -------
53
 
 
117
  )
118
  return openai_model
119
 
120
+ ## ---------- AI LABELS TO TOPIC NAME ----------
121
+
122
+ def ai_labels_to_custom_name(model):
123
+ chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in model.topic_aspects_["OpenAI"].items()}
124
+ chatgpt_topic_labels[-1] = "Outlier Topic"
125
+ model.set_topic_labels(chatgpt_topic_labels)
126
+
127
+
128
  #############################
129
  # BERTOPIC MODELING
130
  #############################
131
 
132
  def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
133
 
134
+ main_representation_model = KeyBERTInspired()
135
 
136
  openai_model = create_openai_model()
137
 
 
144
  vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1,2))
145
 
146
  topic_model = BERTopic(
147
+ verbose=True,
148
  umap_model=_umap_model,
149
  representation_model=representation_model,
150
  vectorizer_model=vectorizer_model,
 
158
  return topic_model, topics, probs
159
 
160
 
161
+
162
+
163
  ##################################
164
  # TOPIC TO DATAFRAME MAPPING
165
  #################################