Update BERTopic/bertopic/_bertopic.py
Browse files
BERTopic/bertopic/_bertopic.py
CHANGED
|
@@ -262,6 +262,8 @@ class BERTopic:
|
|
| 262 |
self.representative_images_ = None
|
| 263 |
self.representative_docs_ = {}
|
| 264 |
self.topic_aspects_ = {}
|
|
|
|
|
|
|
| 265 |
|
| 266 |
# Private attributes for internal tracking purposes
|
| 267 |
self._outliers = 1
|
|
@@ -535,7 +537,7 @@ class BERTopic:
|
|
| 535 |
logger.info("Clustering - Approximating new points with `hdbscan_model`")
|
| 536 |
if is_supported_hdbscan(self.hdbscan_model):
|
| 537 |
predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
|
| 538 |
-
|
| 539 |
# Show all proba of topic in one sentence
|
| 540 |
self.probabilities_transform = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
|
| 541 |
|
|
@@ -551,16 +553,19 @@ class BERTopic:
|
|
| 551 |
|
| 552 |
# Map probabilities and predictions
|
| 553 |
probabilities = self._map_probabilities(probabilities, original_topics=True)
|
|
|
|
|
|
|
| 554 |
self.probabilities_transform = self._map_probabilities(self.probabilities_transform, original_topics=True)
|
| 555 |
predictions = self._map_predictions(predictions)
|
| 556 |
-
|
| 557 |
self.predictions_transform = predictions
|
| 558 |
return predictions, probabilities
|
| 559 |
|
|
|
|
| 560 |
def get_result_transform(self):
|
| 561 |
return self.predictions_transform, self.probabilities_transform
|
| 562 |
-
|
| 563 |
-
|
| 564 |
def partial_fit(self,
|
| 565 |
documents: List[str],
|
| 566 |
embeddings: np.ndarray = None,
|
|
@@ -2013,7 +2018,7 @@ class BERTopic:
|
|
| 2013 |
to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
|
| 2014 |
of the topic embeddings.
|
| 2015 |
|
| 2016 |
-
If nr_topics is `"auto"`, then HDBSCAN is used to
|
| 2017 |
reduce the number of topics by running it on the topic embeddings.
|
| 2018 |
|
| 2019 |
The topics, their sizes, and representations are updated.
|
|
@@ -2055,6 +2060,8 @@ class BERTopic:
|
|
| 2055 |
self._merged_topics = None
|
| 2056 |
self._save_representative_docs(documents)
|
| 2057 |
self.probabilities_ = self._map_probabilities(self.probabilities_)
|
|
|
|
|
|
|
| 2058 |
|
| 2059 |
return self
|
| 2060 |
|
|
@@ -4259,12 +4266,22 @@ class BERTopic:
|
|
| 4259 |
|
| 4260 |
# Map array of probabilities (probability for assigned topic per document)
|
| 4261 |
if probabilities is not None:
|
| 4262 |
-
|
| 4263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4264 |
len(set(mappings.values())) - self._outliers))
|
| 4265 |
for from_topic, to_topic in mappings.items():
|
| 4266 |
if to_topic != -1 and from_topic != -1:
|
| 4267 |
-
mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
|
| 4268 |
|
| 4269 |
return mapped_probabilities
|
| 4270 |
|
|
|
|
| 262 |
self.representative_images_ = None
|
| 263 |
self.representative_docs_ = {}
|
| 264 |
self.topic_aspects_ = {}
|
| 265 |
+
self.predictions_transform = None
|
| 266 |
+
self.probabilities_transform = None
|
| 267 |
|
| 268 |
# Private attributes for internal tracking purposes
|
| 269 |
self._outliers = 1
|
|
|
|
| 537 |
logger.info("Clustering - Approximating new points with `hdbscan_model`")
|
| 538 |
if is_supported_hdbscan(self.hdbscan_model):
|
| 539 |
predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
|
| 540 |
+
|
| 541 |
# Show all proba of topic in one sentence
|
| 542 |
self.probabilities_transform = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
|
| 543 |
|
|
|
|
| 553 |
|
| 554 |
# Map probabilities and predictions
|
| 555 |
probabilities = self._map_probabilities(probabilities, original_topics=True)
|
| 556 |
+
|
| 557 |
+
print("Self.probabilities_transform: \n",self.probabilities_transform)
|
| 558 |
self.probabilities_transform = self._map_probabilities(self.probabilities_transform, original_topics=True)
|
| 559 |
predictions = self._map_predictions(predictions)
|
| 560 |
+
|
| 561 |
self.predictions_transform = predictions
|
| 562 |
return predictions, probabilities
|
| 563 |
|
| 564 |
+
|
| 565 |
def get_result_transform(self):
|
| 566 |
return self.predictions_transform, self.probabilities_transform
|
| 567 |
+
|
| 568 |
+
|
| 569 |
def partial_fit(self,
|
| 570 |
documents: List[str],
|
| 571 |
embeddings: np.ndarray = None,
|
|
|
|
| 2018 |
to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
|
| 2019 |
of the topic embeddings.
|
| 2020 |
|
| 2021 |
+
If nr_topics is `"auto"`, then HDBSCAN is used to automatically
|
| 2022 |
reduce the number of topics by running it on the topic embeddings.
|
| 2023 |
|
| 2024 |
The topics, their sizes, and representations are updated.
|
|
|
|
| 2060 |
self._merged_topics = None
|
| 2061 |
self._save_representative_docs(documents)
|
| 2062 |
self.probabilities_ = self._map_probabilities(self.probabilities_)
|
| 2063 |
+
self.probabilities_transform = self._map_probabilities(self.probabilities_transform)
|
| 2064 |
+
self.predictions_transform = self._map_predictions(self.predictions_transform)
|
| 2065 |
|
| 2066 |
return self
|
| 2067 |
|
|
|
|
| 4266 |
|
| 4267 |
# Map array of probabilities (probability for assigned topic per document)
|
| 4268 |
if probabilities is not None:
|
| 4269 |
+
arrs = []
|
| 4270 |
+
if isinstance(probabilities, tuple):
|
| 4271 |
+
for element in probabilities:
|
| 4272 |
+
arr = element[0].astype(float)
|
| 4273 |
+
arrs.append(arr)
|
| 4274 |
+
arrs = np.array(arrs)
|
| 4275 |
+
else:
|
| 4276 |
+
arrs = probabilities
|
| 4277 |
+
|
| 4278 |
+
|
| 4279 |
+
if len(arrs.shape) == 2:
|
| 4280 |
+
mapped_probabilities = np.zeros((arrs.shape[0],
|
| 4281 |
len(set(mappings.values())) - self._outliers))
|
| 4282 |
for from_topic, to_topic in mappings.items():
|
| 4283 |
if to_topic != -1 and from_topic != -1:
|
| 4284 |
+
mapped_probabilities[:, to_topic] += arrs[:, from_topic]
|
| 4285 |
|
| 4286 |
return mapped_probabilities
|
| 4287 |
|