kisejin committed on
Commit
6ce7df4
·
verified ·
1 Parent(s): b0f7b59

Update BERTopic/bertopic/_bertopic.py

Browse files
Files changed (1) hide show
  1. BERTopic/bertopic/_bertopic.py +25 -8
BERTopic/bertopic/_bertopic.py CHANGED
@@ -262,6 +262,8 @@ class BERTopic:
262
  self.representative_images_ = None
263
  self.representative_docs_ = {}
264
  self.topic_aspects_ = {}
 
 
265
 
266
  # Private attributes for internal tracking purposes
267
  self._outliers = 1
@@ -535,7 +537,7 @@ class BERTopic:
535
  logger.info("Clustering - Approximating new points with `hdbscan_model`")
536
  if is_supported_hdbscan(self.hdbscan_model):
537
  predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
538
-
539
  # Show all proba of topic in one sentence
540
  self.probabilities_transform = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
541
 
@@ -551,16 +553,19 @@ class BERTopic:
551
 
552
  # Map probabilities and predictions
553
  probabilities = self._map_probabilities(probabilities, original_topics=True)
 
 
554
  self.probabilities_transform = self._map_probabilities(self.probabilities_transform, original_topics=True)
555
  predictions = self._map_predictions(predictions)
556
-
557
  self.predictions_transform = predictions
558
  return predictions, probabilities
559
 
 
560
  def get_result_transform(self):
561
  return self.predictions_transform, self.probabilities_transform
562
-
563
-
564
  def partial_fit(self,
565
  documents: List[str],
566
  embeddings: np.ndarray = None,
@@ -2013,7 +2018,7 @@ class BERTopic:
2013
  to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
2014
  of the topic embeddings.
2015
 
2016
- If nr_topics is `"auto"`, then HDBSCAN is used to automatically
2017
  reduce the number of topics by running it on the topic embeddings.
2018
 
2019
  The topics, their sizes, and representations are updated.
@@ -2055,6 +2060,8 @@ class BERTopic:
2055
  self._merged_topics = None
2056
  self._save_representative_docs(documents)
2057
  self.probabilities_ = self._map_probabilities(self.probabilities_)
 
 
2058
 
2059
  return self
2060
 
@@ -4259,12 +4266,22 @@ class BERTopic:
4259
 
4260
  # Map array of probabilities (probability for assigned topic per document)
4261
  if probabilities is not None:
4262
- if len(probabilities.shape) == 2:
4263
- mapped_probabilities = np.zeros((probabilities.shape[0],
 
 
 
 
 
 
 
 
 
 
4264
  len(set(mappings.values())) - self._outliers))
4265
  for from_topic, to_topic in mappings.items():
4266
  if to_topic != -1 and from_topic != -1:
4267
- mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
4268
 
4269
  return mapped_probabilities
4270
 
 
262
  self.representative_images_ = None
263
  self.representative_docs_ = {}
264
  self.topic_aspects_ = {}
265
+ self.predictions_transform = None
266
+ self.probabilities_transform = None
267
 
268
  # Private attributes for internal tracking purposes
269
  self._outliers = 1
 
537
  logger.info("Clustering - Approximating new points with `hdbscan_model`")
538
  if is_supported_hdbscan(self.hdbscan_model):
539
  predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings)
540
+
541
  # Show all proba of topic in one sentence
542
  self.probabilities_transform = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
543
 
 
553
 
554
  # Map probabilities and predictions
555
  probabilities = self._map_probabilities(probabilities, original_topics=True)
556
+
557
+ print("Self.probabilities_transform: \n", self.probabilities_transform)
558
  self.probabilities_transform = self._map_probabilities(self.probabilities_transform, original_topics=True)
559
  predictions = self._map_predictions(predictions)
560
+
561
  self.predictions_transform = predictions
562
  return predictions, probabilities
563
 
564
+
565
  def get_result_transform(self):
566
  return self.predictions_transform, self.probabilities_transform
567
+
568
+
569
  def partial_fit(self,
570
  documents: List[str],
571
  embeddings: np.ndarray = None,
 
2018
  to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
2019
  of the topic embeddings.
2020
 
2021
+ If nr_topics is `"auto"`, then HDBSCAN is used to automatically
2022
  reduce the number of topics by running it on the topic embeddings.
2023
 
2024
  The topics, their sizes, and representations are updated.
 
2060
  self._merged_topics = None
2061
  self._save_representative_docs(documents)
2062
  self.probabilities_ = self._map_probabilities(self.probabilities_)
2063
+ self.probabilities_transform = self._map_probabilities(self.probabilities_transform)
2064
+ self.predictions_transform = self._map_predictions(self.predictions_transform)
2065
 
2066
  return self
2067
 
 
4266
 
4267
  # Map array of probabilities (probability for assigned topic per document)
4268
  if probabilities is not None:
4269
+ arrs = []
4270
+ if isinstance(probabilities, tuple):
4271
+ for element in probabilities:
4272
+ arr = element[0].astype(float)
4273
+ arrs.append(arr)
4274
+ arrs = np.array(arrs)
4275
+ else:
4276
+ arrs = probabilities
4277
+
4278
+
4279
+ if len(arrs.shape) == 2:
4280
+ mapped_probabilities = np.zeros((arrs.shape[0],
4281
  len(set(mappings.values())) - self._outliers))
4282
  for from_topic, to_topic in mappings.items():
4283
  if to_topic != -1 and from_topic != -1:
4284
+ mapped_probabilities[:, to_topic] += arrs[:, from_topic]
4285
 
4286
  return mapped_probabilities
4287