kisejin committed on
Commit
6ce7df4
·
verified ·
1 Parent(s): b0f7b59

Update BERTopic/bertopic/_bertopic.py

Browse files
Files changed (1) hide show
  1. BERTopic/bertopic/_bertopic.py +25 -8
BERTopic/bertopic/_bertopic.py CHANGED
@@ -262,6 +262,8 @@ class BERTopic:
262
  self.representative_images_ = None
263
  self.representative_docs_ = {}
264
  self.topic_aspects_ = {}
 
 
265
 
266
  # Private attributes for internal tracking purposes
267
  self._outliers = 1
@@ -535,7 +537,7 @@ class BERTopic:
535
  logger.info("Clustering - Approximating new points with `hdbscan_model`")
536
  if is_supported_hdbscan(self.hdbscan_model):
537
  predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
538
-
539
  # Show all proba of topic in one sentence
540
  self.probabilities_transform = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
541
 
@@ -551,16 +553,19 @@ class BERTopic:
551
 
552
  # Map probabilities and predictions
553
  probabilities = self._map_probabilities(probabilities, original_topics=True)
 
 
554
  self.probabilities_transform = self._map_probabilities(self.probabilities_transform, original_topics=True)
555
  predictions = self._map_predictions(predictions)
556
-
557
  self.predictions_transform = predictions
558
  return predictions, probabilities
559
 
 
560
  def get_result_transform(self):
561
  return self.predictions_transform, self.probabilities_transform
562
-
563
-
564
  def partial_fit(self,
565
  documents: List[str],
566
  embeddings: np.ndarray = None,
@@ -2013,7 +2018,7 @@ class BERTopic:
2013
  to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
2014
  of the topic embeddings.
2015
 
2016
- If nr_topics is `"auto"`, then HDBSCAN is used to automatically
2017
  reduce the number of topics by running it on the topic embeddings.
2018
 
2019
  The topics, their sizes, and representations are updated.
@@ -2055,6 +2060,8 @@ class BERTopic:
2055
  self._merged_topics = None
2056
  self._save_representative_docs(documents)
2057
  self.probabilities_ = self._map_probabilities(self.probabilities_)
 
 
2058
 
2059
  return self
2060
 
@@ -4259,12 +4266,22 @@ class BERTopic:
4259
 
4260
  # Map array of probabilities (probability for assigned topic per document)
4261
  if probabilities is not None:
4262
- if len(probabilities.shape) == 2:
4263
- mapped_probabilities = np.zeros((probabilities.shape[0],
 
 
 
 
 
 
 
 
 
 
4264
  len(set(mappings.values())) - self._outliers))
4265
  for from_topic, to_topic in mappings.items():
4266
  if to_topic != -1 and from_topic != -1:
4267
- mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
4268
 
4269
  return mapped_probabilities
4270
 
 
262
  self.representative_images_ = None
263
  self.representative_docs_ = {}
264
  self.topic_aspects_ = {}
265
+ self.predictions_transform = None
266
+ self.probabilities_transform = None
267
 
268
  # Private attributes for internal tracking purposes
269
  self._outliers = 1
 
537
  logger.info("Clustering - Approximating new points with `hdbscan_model`")
538
  if is_supported_hdbscan(self.hdbscan_model):
539
  predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
540
+
541
  # Show all proba of topic in one sentence
542
  self.probabilities_transform = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
543
 
 
553
 
554
  # Map probabilities and predictions
555
  probabilities = self._map_probabilities(probabilities, original_topics=True)
556
+
557
+ print("Self.probabilities_transform: \n", self.probabilities_transform)
558
  self.probabilities_transform = self._map_probabilities(self.probabilities_transform, original_topics=True)
559
  predictions = self._map_predictions(predictions)
560
+
561
  self.predictions_transform = predictions
562
  return predictions, probabilities
563
 
564
+
565
  def get_result_transform(self):
566
  return self.predictions_transform, self.probabilities_transform
567
+
568
+
569
  def partial_fit(self,
570
  documents: List[str],
571
  embeddings: np.ndarray = None,
 
2018
  to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
2019
  of the topic embeddings.
2020
 
2021
+ If nr_topics is `"auto"`, then HDBSCAN is used to automatically
2022
  reduce the number of topics by running it on the topic embeddings.
2023
 
2024
  The topics, their sizes, and representations are updated.
 
2060
  self._merged_topics = None
2061
  self._save_representative_docs(documents)
2062
  self.probabilities_ = self._map_probabilities(self.probabilities_)
2063
+ self.probabilities_transform = self._map_probabilities(self.probabilities_transform)
2064
+ self.predictions_transform = self._map_predictions(self.predictions_transform)
2065
 
2066
  return self
2067
 
 
4266
 
4267
  # Map array of probabilities (probability for assigned topic per document)
4268
  if probabilities is not None:
4269
+ arrs = []
4270
+ if isinstance(probabilities, tuple):
4271
+ for element in probabilities:
4272
+ arr = element[0].astype(float)
4273
+ arrs.append(arr)
4274
+ arrs = np.array(arrs)
4275
+ else:
4276
+ arrs = probabilities
4277
+
4278
+
4279
+ if len(arrs.shape) == 2:
4280
+ mapped_probabilities = np.zeros((arrs.shape[0],
4281
  len(set(mappings.values())) - self._outliers))
4282
  for from_topic, to_topic in mappings.items():
4283
  if to_topic != -1 and from_topic != -1:
4284
+ mapped_probabilities[:, to_topic] += arrs[:, from_topic]
4285
 
4286
  return mapped_probabilities
4287