lynn-twinkl committed on
Commit ·
2e31929
1
Parent(s): 11d9a88
changed to KeyBERTInspired model; light refactoring
Browse files
src/models/topic_modeling_pipeline.py
CHANGED
|
@@ -12,7 +12,7 @@ from tqdm import tqdm
|
|
| 12 |
from sentence_transformers import SentenceTransformer
|
| 13 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 14 |
from bertopic import BERTopic
|
| 15 |
-
from bertopic.representation import KeyBERTInspired,
|
| 16 |
|
| 17 |
|
| 18 |
import os
|
|
@@ -25,16 +25,11 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
| 25 |
# Convert OpenAI Representation to CustomName
|
| 26 |
#############################################
|
| 27 |
|
| 28 |
-
def ai_labeles_to_custom_name(model):
|
| 29 |
-
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in model.topic_aspects_["OpenAI"].items()}
|
| 30 |
-
chatgpt_topic_labels[-1] = "Outlier Topic"
|
| 31 |
-
model.set_topic_labels(chatgpt_topic_labels)
|
| 32 |
-
|
| 33 |
###################################
|
| 34 |
# HELPER FUNCTIONS
|
| 35 |
###################################
|
| 36 |
|
| 37 |
-
## ---------- LOAD SPACY MODEL -------
|
| 38 |
|
| 39 |
def load_spacy_model(model_name="en_core_web_md"):
|
| 40 |
"""
|
|
@@ -47,6 +42,12 @@ def load_spacy_model(model_name="en_core_web_md"):
|
|
| 47 |
|
| 48 |
return spacy.load(model_name)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
## -------- LOAD TRANSFORMER MODEL -------
|
| 52 |
|
|
@@ -116,13 +117,21 @@ def create_openai_model():
|
|
| 116 |
)
|
| 117 |
return openai_model
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
#############################
|
| 120 |
# BERTOPIC MODELING
|
| 121 |
#############################
|
| 122 |
|
| 123 |
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
|
| 124 |
|
| 125 |
-
main_representation_model =
|
| 126 |
|
| 127 |
openai_model = create_openai_model()
|
| 128 |
|
|
@@ -135,7 +144,7 @@ def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_mod
|
|
| 135 |
vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1,2))
|
| 136 |
|
| 137 |
topic_model = BERTopic(
|
| 138 |
-
verbose=
|
| 139 |
umap_model=_umap_model,
|
| 140 |
representation_model=representation_model,
|
| 141 |
vectorizer_model=vectorizer_model,
|
|
@@ -149,6 +158,8 @@ def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_mod
|
|
| 149 |
return topic_model, topics, probs
|
| 150 |
|
| 151 |
|
|
|
|
|
|
|
| 152 |
##################################
|
| 153 |
# TOPIC TO DATAFRAME MAPPING
|
| 154 |
#################################
|
|
|
|
| 12 |
from sentence_transformers import SentenceTransformer
|
| 13 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 14 |
from bertopic import BERTopic
|
| 15 |
+
from bertopic.representation import KeyBERTInspired, OpenAI
|
| 16 |
|
| 17 |
|
| 18 |
import os
|
|
|
|
| 25 |
# Convert OpenAI Representation to CustomName
|
| 26 |
#############################################
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
###################################
|
| 29 |
# HELPER FUNCTIONS
|
| 30 |
###################################
|
| 31 |
|
| 32 |
+
## ---------- LOAD SPACY MODEL ---------
|
| 33 |
|
| 34 |
def load_spacy_model(model_name="en_core_web_md"):
|
| 35 |
"""
|
|
|
|
| 42 |
|
| 43 |
return spacy.load(model_name)
|
| 44 |
|
| 45 |
+
## -------- SENTENCE TOKENIZER -------
|
| 46 |
+
|
| 47 |
+
def spacy_sent_tokenize(text):
|
| 48 |
+
doc = nlp(text)
|
| 49 |
+
sentences = [sent.text.strip() for sent in doc.sents]
|
| 50 |
+
return sentences
|
| 51 |
|
| 52 |
## -------- LOAD TRANSFORMER MODEL -------
|
| 53 |
|
|
|
|
| 117 |
)
|
| 118 |
return openai_model
|
| 119 |
|
| 120 |
+
## ---------- AI LABELS TO TOPIC NAME ----------
|
| 121 |
+
|
| 122 |
+
def ai_labels_to_custom_name(model):
|
| 123 |
+
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in model.topic_aspects_["OpenAI"].items()}
|
| 124 |
+
chatgpt_topic_labels[-1] = "Outlier Topic"
|
| 125 |
+
model.set_topic_labels(chatgpt_topic_labels)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
#############################
|
| 129 |
# BERTOPIC MODELING
|
| 130 |
#############################
|
| 131 |
|
| 132 |
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
|
| 133 |
|
| 134 |
+
main_representation_model = KeyBERTInspired()
|
| 135 |
|
| 136 |
openai_model = create_openai_model()
|
| 137 |
|
|
|
|
| 144 |
vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1,2))
|
| 145 |
|
| 146 |
topic_model = BERTopic(
|
| 147 |
+
verbose=True,
|
| 148 |
umap_model=_umap_model,
|
| 149 |
representation_model=representation_model,
|
| 150 |
vectorizer_model=vectorizer_model,
|
|
|
|
| 158 |
return topic_model, topics, probs
|
| 159 |
|
| 160 |
|
| 161 |
+
|
| 162 |
+
|
| 163 |
##################################
|
| 164 |
# TOPIC TO DATAFRAME MAPPING
|
| 165 |
#################################
|