Upload 255 files

Browse files

Files changed (7) hide show

BERTopic/.gitignore +3 -0
BERTopic/bertopic/_bertopic.py +40 -38
BERTopic/docs/api/plotting/document_datamap.md +1 -1
BERTopic/docs/faq.md +20 -0
BERTopic/docs/getting_started/visualization/visualize_documents.md +27 -0
BERTopic/docs/index.md +3 -1
BERTopic/mkdocs.yml +1 -0

BERTopic/.gitignore CHANGED Viewed

@@ -81,3 +81,6 @@ venv.bak/
 .idea/
 .vscode
 .DS_Store

 .idea/
 .vscode
 .DS_Store
+# mkdocs
+site/

BERTopic/bertopic/_bertopic.py CHANGED Viewed

@@ -158,7 +158,7 @@ class BERTopic:
                           NOTE: This param will not be used if you pass in your own
                           CountVectorizer.
             min_topic_size: The minimum size of the topic. Increasing this value will lead
-                            to a lower number of clusters/topics and vice versa.
                             It is the same parameter as `min_cluster_size` in HDBSCAN.
                             NOTE: This param will not be used if you are using `hdbscan_model`.
             nr_topics: Specifying the number of topics will reduce the initial
@@ -321,7 +321,7 @@ class BERTopic:
                       embeddings: np.ndarray = None,
                       images: List[str] = None,
                       y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
-                                                                       Union[np.ndarray, None]]:
         """ Fit the models on a collection of documents, generate topics,
         and return the probabilities and topic per document.
@@ -699,10 +699,11 @@ class BERTopic:
     def topics_over_time(self,
                          docs: List[str],
-                         timestamps: Union[List[str],
-                                           List[int]],
                          topics: List[int] = None,
                          nr_bins: int = None,
                          datetime_format: str = None,
                          evolution_tuning: bool = True,
                          global_tuning: bool = True) -> pd.DataFrame:
@@ -826,7 +827,8 @@ class BERTopic:
             # Fill dataframe with results
             topics_at_timestamp = [(topic,
-                                    ", ".join([words[0] for words in values][:5]),
                                     topic_frequency[topic],
                                     timestamp) for topic, values in words_per_topic.items()]
             topics_over_time.extend(topics_at_timestamp)
@@ -835,7 +837,7 @@ class BERTopic:
                 previous_topics = sorted(list(documents_per_topic.Topic.values))
                 previous_c_tf_idf = c_tf_idf.copy()
-        return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])
     def topics_per_class(self,
                          docs: List[str],
@@ -932,8 +934,8 @@ class BERTopic:
                               `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
             distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
                                `lambda x: 1 - cosine_similarity(x)`.
-                               You can pass any function that returns either a square matrix of
-                               shape (n_samples, n_samples) with zeros on the diagonal and
                                non-negative values or condensed distance matrix of shape
                                (n_samples * (n_samples - 1) / 2,) containing the upper
                                triangular of the distance matrix.
@@ -1067,7 +1069,7 @@ class BERTopic:
                                  use_embedding_model: bool = False,
                                  calculate_tokens: bool = False,
                                  separator: str = " ") -> Tuple[np.ndarray,
-                                                                Union[List[np.ndarray], None]]:
         """ A post-hoc approximation of topic distributions across documents.
         In order to perform this approximation, each document is split into tokens
@@ -1977,8 +1979,8 @@ class BERTopic:
         for key, val in sorted(mapping.items()):
             mappings[val].append(key)
         mappings = {topic_from:
-                    {"topics_to": topics_to,
-                     "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
                     for topic_from, topics_to in mappings.items()}
         # Update topics
@@ -2464,7 +2466,7 @@ class BERTopic:
                                  specific points. Helps to speed up generation of visualizations.
             nr_levels: The number of levels to be visualized in the hierarchy. First, the distances
                        in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with
-                       equal length. Then, for each list of distances, the merged topics, that have
                        a distance less or equal to the maximum distance of the selected list of distances, are selected.
                        NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to
                        the length of `hierarchical_topics`.
@@ -3264,7 +3266,7 @@ class BERTopic:
             sims = np.max(sim_matrix, axis=1)
             to_merge = {
                 a - selected_topics["_outliers"]:
-                b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims))
                 if val >= min_similarity
             }
             to_merge.update(new_topics_dict)
@@ -3295,7 +3297,7 @@ class BERTopic:
             serialization: str = "safetensors",
             save_embedding_model: Union[str, bool] = True,
             save_ctfidf: bool = False,
-            ):
         """ Push your BERTopic model to a HuggingFace Hub
         Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:
@@ -3469,7 +3471,7 @@ class BERTopic:
                             documents: pd.DataFrame,
                             partial_fit: bool = False,
                             y: np.ndarray = None) -> Tuple[pd.DataFrame,
-                                                           np.ndarray]:
         """ Cluster UMAP embeddings with HDBSCAN
         Arguments:
@@ -3520,7 +3522,7 @@ class BERTopic:
         return documents, probabilities
     def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array,
-                                                                                                 pd.DataFrame, np.array]:
         """ Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list
         We transform the topics in `self.zeroshot_topic_list` to embeddings and
@@ -3605,16 +3607,16 @@ class BERTopic:
         empty_dimensionality_model = BaseDimensionalityReduction()
         empty_cluster_model = BaseCluster()
         zeroshot_model = BERTopic(
-                n_gram_range=self.n_gram_range,
-                low_memory=self.low_memory,
-                calculate_probabilities=self.calculate_probabilities,
-                embedding_model=self.embedding_model,
-                umap_model=empty_dimensionality_model,
-                hdbscan_model=empty_cluster_model,
-                vectorizer_model=self.vectorizer_model,
-                ctfidf_model=self.ctfidf_model,
-                representation_model=self.representation_model,
-                verbose=self.verbose
         ).fit(docs, embeddings=embeddings, y=y)
         logger.info("Zeroshot Step 2 - Completed \u2713")
         logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model")
@@ -3824,9 +3826,9 @@ class BERTopic:
         # Sample documents per topic
         documents_per_topic = (
             documents.drop("Image", axis=1, errors="ignore")
-                     .groupby('Topic')
-                     .sample(n=nr_samples, replace=True, random_state=42)
-                     .drop_duplicates()
         )
         # Find and extract documents that are most similar to the topic
@@ -4007,7 +4009,7 @@ class BERTopic:
                                  documents: pd.DataFrame,
                                  c_tf_idf: csr_matrix = None,
                                  calculate_aspects: bool = True) -> Mapping[str,
-                                                                            List[Tuple[str, float]]]:
         """ Based on tf_idf scores per topic, extract the top n words per topic
         If the top words per topic need to be extracted, then only the `words` parameter
@@ -4126,8 +4128,8 @@ class BERTopic:
         for key, val in sorted(mapped_topics.items()):
             mappings[val].append(key)
         mappings = {topic_from:
-                    {"topics_to": topics_to,
-                     "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
                     for topic_from, topics_to in mappings.items()}
         # Map topics
@@ -4177,8 +4179,8 @@ class BERTopic:
         for key, val in sorted(mapped_topics.items()):
             mappings[val].append(key)
         mappings = {topic_from:
-                    {"topics_to": topics_to,
-                     "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
                     for topic_from, topics_to in mappings.items()}
         # Update documents and topics
@@ -4479,10 +4481,10 @@ def _create_model_from_files(
     # Fit BERTopic without actually performing any clustering
     topic_model = BERTopic(
-            embedding_model=embedding_model,
-            umap_model=empty_dimensionality_model,
-            hdbscan_model=empty_cluster_model,
-            **params
     )
     topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy()
     topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}

                           NOTE: This param will not be used if you pass in your own
                           CountVectorizer.
             min_topic_size: The minimum size of the topic. Increasing this value will lead
+                            to a lower number of clusters/topics and vice versa.
                             It is the same parameter as `min_cluster_size` in HDBSCAN.
                             NOTE: This param will not be used if you are using `hdbscan_model`.
             nr_topics: Specifying the number of topics will reduce the initial
                       embeddings: np.ndarray = None,
                       images: List[str] = None,
                       y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
+    Union[np.ndarray, None]]:
         """ Fit the models on a collection of documents, generate topics,
         and return the probabilities and topic per document.
     def topics_over_time(self,
                          docs: List[str],
+                         timestamps: Union[List[str]],
                          topics: List[int] = None,
+                         n_topics: int = 5,
                          nr_bins: int = None,
+                         n_keywords: int = 5,
                          datetime_format: str = None,
                          evolution_tuning: bool = True,
                          global_tuning: bool = True) -> pd.DataFrame:
             # Fill dataframe with results
             topics_at_timestamp = [(topic,
+                                    ", ".join([words[0] for words in values][:n_topics]),
+                                    [words[1] for weights in values][:n_topics],
                                     topic_frequency[topic],
                                     timestamp) for topic, values in words_per_topic.items()]
             topics_over_time.extend(topics_at_timestamp)
                 previous_topics = sorted(list(documents_per_topic.Topic.values))
                 previous_c_tf_idf = c_tf_idf.copy()
+        return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Weight", "Frequency", "Timestamp"])
     def topics_per_class(self,
                          docs: List[str],
                               `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
             distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
                                `lambda x: 1 - cosine_similarity(x)`.
+                               You can pass any function that returns either a square matrix of
+                               shape (n_samples, n_samples) with zeros on the diagonal and
                                non-negative values or condensed distance matrix of shape
                                (n_samples * (n_samples - 1) / 2,) containing the upper
                                triangular of the distance matrix.
                                  use_embedding_model: bool = False,
                                  calculate_tokens: bool = False,
                                  separator: str = " ") -> Tuple[np.ndarray,
+    Union[List[np.ndarray], None]]:
         """ A post-hoc approximation of topic distributions across documents.
         In order to perform this approximation, each document is split into tokens
         for key, val in sorted(mapping.items()):
             mappings[val].append(key)
         mappings = {topic_from:
+                        {"topics_to": topics_to,
+                         "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
                     for topic_from, topics_to in mappings.items()}
         # Update topics
                                  specific points. Helps to speed up generation of visualizations.
             nr_levels: The number of levels to be visualized in the hierarchy. First, the distances
                        in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with
+                       equal length. Then, for each list of distances, the merged topics, that have
                        a distance less or equal to the maximum distance of the selected list of distances, are selected.
                        NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to
                        the length of `hierarchical_topics`.
             sims = np.max(sim_matrix, axis=1)
             to_merge = {
                 a - selected_topics["_outliers"]:
+                    b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims))
                 if val >= min_similarity
             }
             to_merge.update(new_topics_dict)
             serialization: str = "safetensors",
             save_embedding_model: Union[str, bool] = True,
             save_ctfidf: bool = False,
+    ):
         """ Push your BERTopic model to a HuggingFace Hub
         Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:
                             documents: pd.DataFrame,
                             partial_fit: bool = False,
                             y: np.ndarray = None) -> Tuple[pd.DataFrame,
+    np.ndarray]:
         """ Cluster UMAP embeddings with HDBSCAN
         Arguments:
         return documents, probabilities
     def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array,
+    pd.DataFrame, np.array]:
         """ Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list
         We transform the topics in `self.zeroshot_topic_list` to embeddings and
         empty_dimensionality_model = BaseDimensionalityReduction()
         empty_cluster_model = BaseCluster()
         zeroshot_model = BERTopic(
+            n_gram_range=self.n_gram_range,
+            low_memory=self.low_memory,
+            calculate_probabilities=self.calculate_probabilities,
+            embedding_model=self.embedding_model,
+            umap_model=empty_dimensionality_model,
+            hdbscan_model=empty_cluster_model,
+            vectorizer_model=self.vectorizer_model,
+            ctfidf_model=self.ctfidf_model,
+            representation_model=self.representation_model,
+            verbose=self.verbose
         ).fit(docs, embeddings=embeddings, y=y)
         logger.info("Zeroshot Step 2 - Completed \u2713")
         logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model")
         # Sample documents per topic
         documents_per_topic = (
             documents.drop("Image", axis=1, errors="ignore")
+            .groupby('Topic')
+            .sample(n=nr_samples, replace=True, random_state=42)
+            .drop_duplicates()
         )
         # Find and extract documents that are most similar to the topic
                                  documents: pd.DataFrame,
                                  c_tf_idf: csr_matrix = None,
                                  calculate_aspects: bool = True) -> Mapping[str,
+    List[Tuple[str, float]]]:
         """ Based on tf_idf scores per topic, extract the top n words per topic
         If the top words per topic need to be extracted, then only the `words` parameter
         for key, val in sorted(mapped_topics.items()):
             mappings[val].append(key)
         mappings = {topic_from:
+                        {"topics_to": topics_to,
+                         "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
                     for topic_from, topics_to in mappings.items()}
         # Map topics
         for key, val in sorted(mapped_topics.items()):
             mappings[val].append(key)
         mappings = {topic_from:
+                        {"topics_to": topics_to,
+                         "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
                     for topic_from, topics_to in mappings.items()}
         # Update documents and topics
     # Fit BERTopic without actually performing any clustering
     topic_model = BERTopic(
+        embedding_model=embedding_model,
+        umap_model=empty_dimensionality_model,
+        hdbscan_model=empty_cluster_model,
+        **params
     )
     topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy()
     topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}

BERTopic/docs/api/plotting/document_datamap.md CHANGED Viewed

@@ -1,3 +1,3 @@
-# `Document Data Map`
 ::: bertopic.plotting._datamap.visualize_document_datamap


1	+ # `Documents with DataMapPlot`
2
3	::: bertopic.plotting._datamap.visualize_document_datamap

BERTopic/docs/faq.md CHANGED Viewed

@@ -311,3 +311,23 @@ are important in understanding the general topic of the document. Although this
 have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags
 typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply
 topic modeling to HTML-code to extract topics of code, then it becomes important.

 have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags
 typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply
 topic modeling to HTML-code to extract topics of code, then it becomes important.
+## **I run into issues running on Apple Silicon. What should I do?**
+Apple Silicon chips (M1 & M2) are based on the ARM64 architecture (aka [AArch64](https://apple.stackexchange.com/questions/451238/is-m1-chip-aarch64-or-amd64), not to be confused with AMD64). There are known issues with upstream dependencies for this architecture, for example [numba](https://github.com/numba/numba/issues/5520). You may not always run into these issues, depending on the extras that you need.
+One possible solution to this is to use [VS Code Dev Containers](https://code.visualstudio.com/docs/devcontainers/containers), which allow you to set up a Linux-based environment. To run BERTopic effectively you need to be aware of two things:
+- Make sure to use a Docker image specifically compiled for ARM64
+- Make sure to use a `volume` instead of a bind mount, since the latter significantly reduces I/O speeds to disk
+Using the pre-configured [Data Science Devcontainers](https://github.com/b-data/data-science-devcontainers) makes sure these settings are optimized. To start using them, do the following:
+- Install and run Docker
+- Install the `python-base` or `python-scipy` [devcontainer](https://github.com/b-data/data-science-devcontainers)
+- ℹ️ Change PYTHON_VERSION to 3.11 in the `devcontainer.json` to work with the latest version of Python 3.11 (currently 3.11.8)
+- Open VS Code, build the container and start working
+- Note that data is persisted in the container
+    - When using an unmodified devcontainer.json: work in `/home/vscode` which is the `home` directory of user `vscode`
+    - Python packages are installed to the home directory by default. This is due to env variable `PIP_USER=1`
+    - Note that the directory `/workspaces` is also persisted

BERTopic/docs/getting_started/visualization/visualize_documents.md CHANGED Viewed

@@ -1,3 +1,5 @@
 Using the `.visualize_topics`, we can visualize the topics and get insight into their relationships. However,
 you might want a more fine-grained approach where we can visualize the documents inside the topics to see
 if they were assigned correctly or whether they make sense. To do so, we can use the `topic_model.visualize_documents()`
@@ -43,6 +45,30 @@ When you visualize the documents, you might not always want to see the complete
 topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)
 ```
 ## **Visualize Probabilities or Distribution**
 We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if an HDBSCAN model is used:
@@ -100,3 +126,4 @@ df
     the distribution of the frequencies of topics across a document. It merely shows
     how confident BERTopic is that certain topics can be found in a document.

+## **Visualize documents with Plotly**
 Using the `.visualize_topics`, we can visualize the topics and get insight into their relationships. However,
 you might want a more fine-grained approach where we can visualize the documents inside the topics to see
 if they were assigned correctly or whether they make sense. To do so, we can use the `topic_model.visualize_documents()`
 topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)
 ```
+## **Visualize documents with DataMapPlot**
+`.visualize_document_datamap` provides an alternative way to visualize the documents inside the topics as a static [DataMapPlot](https://datamapplot.readthedocs.io/en/latest/intro_splash.html). Using the same pipeline as above, you can generate a DataMapPlot by running:
+```python
+# with the original embeddings
+topic_model.visualize_document_datamap(docs, embeddings=embeddings)
+# with the reduced embeddings
+topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
+```
+<br><br>
+<img src="./datamapplot.png">
+<br><br>
+Or if you want to save the resulting figure:
+```python
+fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
+fig.savefig("path/to/file.png", bbox_inches="tight")
+```
 ## **Visualize Probabilities or Distribution**
 We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if an HDBSCAN model is used:
     the distribution of the frequencies of topics across a document. It merely shows
     how confident BERTopic is that certain topics can be found in a document.

BERTopic/docs/index.md CHANGED Viewed

@@ -246,6 +246,7 @@ to tweak the model to your liking.
 |-----------------------|---|
 | Visualize Topics    |  `.visualize_topics()` |
 | Visualize Documents    |  `.visualize_documents()` |
 | Visualize Document Hierarchy    |  `.visualize_hierarchical_documents()` |
 | Visualize Topic Hierarchy    |  `.visualize_hierarchy()` |
 | Visualize Topic Tree   |  `.get_topic_tree(hierarchical_topics)` |
@@ -254,7 +255,8 @@ to tweak the model to your liking.
 | Visualize Term Score Decline  |  `.visualize_term_rank()` |
 | Visualize Topic Probability Distribution    |  `.visualize_distribution(probs[0])` |
 | Visualize Topics over Time   |  `.visualize_topics_over_time(topics_over_time)` |
-| Visualize Topics per Class | `.visualize_topics_per_class(topics_per_class)` |
 ## **Citation**

 |-----------------------|---|
 | Visualize Topics    |  `.visualize_topics()` |
 | Visualize Documents    |  `.visualize_documents()` |
+| Visualize Documents with DataMapPlot | `.visualize_document_datamap()` |
 | Visualize Document Hierarchy    |  `.visualize_hierarchical_documents()` |
 | Visualize Topic Hierarchy    |  `.visualize_hierarchy()` |
 | Visualize Topic Tree   |  `.get_topic_tree(hierarchical_topics)` |
 | Visualize Term Score Decline  |  `.visualize_term_rank()` |
 | Visualize Topic Probability Distribution    |  `.visualize_distribution(probs[0])` |
 | Visualize Topics over Time   |  `.visualize_topics_over_time(topics_over_time)` |
+| Visualize Topics per Class | `.visualize_topics_per_class(topics_per_class)` |
 ## **Citation**

BERTopic/mkdocs.yml CHANGED Viewed

@@ -83,6 +83,7 @@ nav:
       - Plotting:
         - Barchart: api/plotting/barchart.md
         - Documents: api/plotting/documents.md
         - DTM: api/plotting/dtm.md
         - Hierarchical documents: api/plotting/hierarchical_documents.md
         - Hierarchical topics: api/plotting/hierarchy.md

       - Plotting:
         - Barchart: api/plotting/barchart.md
         - Documents: api/plotting/documents.md
+        - Documents with DataMapPlot: api/plotting/document_datamap.md
         - DTM: api/plotting/dtm.md
         - Hierarchical documents: api/plotting/hierarchical_documents.md
         - Hierarchical topics: api/plotting/hierarchy.md