Upload 255 files
Browse files
BERTopic/.gitignore
CHANGED
|
@@ -81,3 +81,6 @@ venv.bak/
|
|
| 81 |
.idea/
|
| 82 |
.vscode
|
| 83 |
.DS_Store
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
.idea/
|
| 82 |
.vscode
|
| 83 |
.DS_Store
|
| 84 |
+
|
| 85 |
+
# mkdocs
|
| 86 |
+
site/
|
BERTopic/bertopic/_bertopic.py
CHANGED
|
@@ -158,7 +158,7 @@ class BERTopic:
|
|
| 158 |
NOTE: This param will not be used if you pass in your own
|
| 159 |
CountVectorizer.
|
| 160 |
min_topic_size: The minimum size of the topic. Increasing this value will lead
|
| 161 |
-
to a lower number of clusters/topics and vice versa.
|
| 162 |
It is the same parameter as `min_cluster_size` in HDBSCAN.
|
| 163 |
NOTE: This param will not be used if you are using `hdbscan_model`.
|
| 164 |
nr_topics: Specifying the number of topics will reduce the initial
|
|
@@ -321,7 +321,7 @@ class BERTopic:
|
|
| 321 |
embeddings: np.ndarray = None,
|
| 322 |
images: List[str] = None,
|
| 323 |
y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
|
| 324 |
-
|
| 325 |
""" Fit the models on a collection of documents, generate topics,
|
| 326 |
and return the probabilities and topic per document.
|
| 327 |
|
|
@@ -699,10 +699,11 @@ class BERTopic:
|
|
| 699 |
|
| 700 |
def topics_over_time(self,
|
| 701 |
docs: List[str],
|
| 702 |
-
timestamps: Union[List[str],
|
| 703 |
-
List[int]],
|
| 704 |
topics: List[int] = None,
|
|
|
|
| 705 |
nr_bins: int = None,
|
|
|
|
| 706 |
datetime_format: str = None,
|
| 707 |
evolution_tuning: bool = True,
|
| 708 |
global_tuning: bool = True) -> pd.DataFrame:
|
|
@@ -826,7 +827,8 @@ class BERTopic:
|
|
| 826 |
|
| 827 |
# Fill dataframe with results
|
| 828 |
topics_at_timestamp = [(topic,
|
| 829 |
-
", ".join([words[0] for words in values][:
|
|
|
|
| 830 |
topic_frequency[topic],
|
| 831 |
timestamp) for topic, values in words_per_topic.items()]
|
| 832 |
topics_over_time.extend(topics_at_timestamp)
|
|
@@ -835,7 +837,7 @@ class BERTopic:
|
|
| 835 |
previous_topics = sorted(list(documents_per_topic.Topic.values))
|
| 836 |
previous_c_tf_idf = c_tf_idf.copy()
|
| 837 |
|
| 838 |
-
return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])
|
| 839 |
|
| 840 |
def topics_per_class(self,
|
| 841 |
docs: List[str],
|
|
@@ -932,8 +934,8 @@ class BERTopic:
|
|
| 932 |
`lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
|
| 933 |
distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
|
| 934 |
`lambda x: 1 - cosine_similarity(x)`.
|
| 935 |
-
You can pass any function that returns either a square matrix of
|
| 936 |
-
shape (n_samples, n_samples) with zeros on the diagonal and
|
| 937 |
non-negative values or condensed distance matrix of shape
|
| 938 |
(n_samples * (n_samples - 1) / 2,) containing the upper
|
| 939 |
triangular of the distance matrix.
|
|
@@ -1067,7 +1069,7 @@ class BERTopic:
|
|
| 1067 |
use_embedding_model: bool = False,
|
| 1068 |
calculate_tokens: bool = False,
|
| 1069 |
separator: str = " ") -> Tuple[np.ndarray,
|
| 1070 |
-
|
| 1071 |
""" A post-hoc approximation of topic distributions across documents.
|
| 1072 |
|
| 1073 |
In order to perform this approximation, each document is split into tokens
|
|
@@ -1977,8 +1979,8 @@ class BERTopic:
|
|
| 1977 |
for key, val in sorted(mapping.items()):
|
| 1978 |
mappings[val].append(key)
|
| 1979 |
mappings = {topic_from:
|
| 1980 |
-
|
| 1981 |
-
|
| 1982 |
for topic_from, topics_to in mappings.items()}
|
| 1983 |
|
| 1984 |
# Update topics
|
|
@@ -2464,7 +2466,7 @@ class BERTopic:
|
|
| 2464 |
specific points. Helps to speed up generation of visualizations.
|
| 2465 |
nr_levels: The number of levels to be visualized in the hierarchy. First, the distances
|
| 2466 |
in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with
|
| 2467 |
-
equal length. Then, for each list of distances, the merged topics, that have
|
| 2468 |
a distance less than or equal to the maximum distance of the selected list of distances, are selected.
|
| 2469 |
NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to
|
| 2470 |
the length of `hierarchical_topics`.
|
|
@@ -3264,7 +3266,7 @@ class BERTopic:
|
|
| 3264 |
sims = np.max(sim_matrix, axis=1)
|
| 3265 |
to_merge = {
|
| 3266 |
a - selected_topics["_outliers"]:
|
| 3267 |
-
|
| 3268 |
if val >= min_similarity
|
| 3269 |
}
|
| 3270 |
to_merge.update(new_topics_dict)
|
|
@@ -3295,7 +3297,7 @@ class BERTopic:
|
|
| 3295 |
serialization: str = "safetensors",
|
| 3296 |
save_embedding_model: Union[str, bool] = True,
|
| 3297 |
save_ctfidf: bool = False,
|
| 3298 |
-
|
| 3299 |
""" Push your BERTopic model to a HuggingFace Hub
|
| 3300 |
|
| 3301 |
Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:
|
|
@@ -3469,7 +3471,7 @@ class BERTopic:
|
|
| 3469 |
documents: pd.DataFrame,
|
| 3470 |
partial_fit: bool = False,
|
| 3471 |
y: np.ndarray = None) -> Tuple[pd.DataFrame,
|
| 3472 |
-
|
| 3473 |
""" Cluster UMAP embeddings with HDBSCAN
|
| 3474 |
|
| 3475 |
Arguments:
|
|
@@ -3520,7 +3522,7 @@ class BERTopic:
|
|
| 3520 |
return documents, probabilities
|
| 3521 |
|
| 3522 |
def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array,
|
| 3523 |
-
|
| 3524 |
""" Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list
|
| 3525 |
|
| 3526 |
We transform the topics in `self.zeroshot_topic_list` to embeddings and
|
|
@@ -3605,16 +3607,16 @@ class BERTopic:
|
|
| 3605 |
empty_dimensionality_model = BaseDimensionalityReduction()
|
| 3606 |
empty_cluster_model = BaseCluster()
|
| 3607 |
zeroshot_model = BERTopic(
|
| 3608 |
-
|
| 3609 |
-
|
| 3610 |
-
|
| 3611 |
-
|
| 3612 |
-
|
| 3613 |
-
|
| 3614 |
-
|
| 3615 |
-
|
| 3616 |
-
|
| 3617 |
-
|
| 3618 |
).fit(docs, embeddings=embeddings, y=y)
|
| 3619 |
logger.info("Zeroshot Step 2 - Completed \u2713")
|
| 3620 |
logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model")
|
|
@@ -3824,9 +3826,9 @@ class BERTopic:
|
|
| 3824 |
# Sample documents per topic
|
| 3825 |
documents_per_topic = (
|
| 3826 |
documents.drop("Image", axis=1, errors="ignore")
|
| 3827 |
-
|
| 3828 |
-
|
| 3829 |
-
|
| 3830 |
)
|
| 3831 |
|
| 3832 |
# Find and extract documents that are most similar to the topic
|
|
@@ -4007,7 +4009,7 @@ class BERTopic:
|
|
| 4007 |
documents: pd.DataFrame,
|
| 4008 |
c_tf_idf: csr_matrix = None,
|
| 4009 |
calculate_aspects: bool = True) -> Mapping[str,
|
| 4010 |
-
|
| 4011 |
""" Based on tf_idf scores per topic, extract the top n words per topic
|
| 4012 |
|
| 4013 |
If the top words per topic need to be extracted, then only the `words` parameter
|
|
@@ -4126,8 +4128,8 @@ class BERTopic:
|
|
| 4126 |
for key, val in sorted(mapped_topics.items()):
|
| 4127 |
mappings[val].append(key)
|
| 4128 |
mappings = {topic_from:
|
| 4129 |
-
|
| 4130 |
-
|
| 4131 |
for topic_from, topics_to in mappings.items()}
|
| 4132 |
|
| 4133 |
# Map topics
|
|
@@ -4177,8 +4179,8 @@ class BERTopic:
|
|
| 4177 |
for key, val in sorted(mapped_topics.items()):
|
| 4178 |
mappings[val].append(key)
|
| 4179 |
mappings = {topic_from:
|
| 4180 |
-
|
| 4181 |
-
|
| 4182 |
for topic_from, topics_to in mappings.items()}
|
| 4183 |
|
| 4184 |
# Update documents and topics
|
|
@@ -4479,10 +4481,10 @@ def _create_model_from_files(
|
|
| 4479 |
|
| 4480 |
# Fit BERTopic without actually performing any clustering
|
| 4481 |
topic_model = BERTopic(
|
| 4482 |
-
|
| 4483 |
-
|
| 4484 |
-
|
| 4485 |
-
|
| 4486 |
)
|
| 4487 |
topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy()
|
| 4488 |
topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}
|
|
|
|
| 158 |
NOTE: This param will not be used if you pass in your own
|
| 159 |
CountVectorizer.
|
| 160 |
min_topic_size: The minimum size of the topic. Increasing this value will lead
|
| 161 |
+
to a lower number of clusters/topics and vice versa.
|
| 162 |
It is the same parameter as `min_cluster_size` in HDBSCAN.
|
| 163 |
NOTE: This param will not be used if you are using `hdbscan_model`.
|
| 164 |
nr_topics: Specifying the number of topics will reduce the initial
|
|
|
|
| 321 |
embeddings: np.ndarray = None,
|
| 322 |
images: List[str] = None,
|
| 323 |
y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
|
| 324 |
+
Union[np.ndarray, None]]:
|
| 325 |
""" Fit the models on a collection of documents, generate topics,
|
| 326 |
and return the probabilities and topic per document.
|
| 327 |
|
|
|
|
| 699 |
|
| 700 |
def topics_over_time(self,
|
| 701 |
docs: List[str],
|
| 702 |
+
timestamps: Union[List[str]],
|
|
|
|
| 703 |
topics: List[int] = None,
|
| 704 |
+
n_topics: int = 5,
|
| 705 |
nr_bins: int = None,
|
| 706 |
+
n_keywords: int = 5,
|
| 707 |
datetime_format: str = None,
|
| 708 |
evolution_tuning: bool = True,
|
| 709 |
global_tuning: bool = True) -> pd.DataFrame:
|
|
|
|
| 827 |
|
| 828 |
# Fill dataframe with results
|
| 829 |
topics_at_timestamp = [(topic,
|
| 830 |
+
", ".join([words[0] for words in values][:n_topics]),
|
| 831 |
+
[words[1] for weights in values][:n_topics],
|
| 832 |
topic_frequency[topic],
|
| 833 |
timestamp) for topic, values in words_per_topic.items()]
|
| 834 |
topics_over_time.extend(topics_at_timestamp)
|
|
|
|
| 837 |
previous_topics = sorted(list(documents_per_topic.Topic.values))
|
| 838 |
previous_c_tf_idf = c_tf_idf.copy()
|
| 839 |
|
| 840 |
+
return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Weight", "Frequency", "Timestamp"])
|
| 841 |
|
| 842 |
def topics_per_class(self,
|
| 843 |
docs: List[str],
|
|
|
|
| 934 |
`lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
|
| 935 |
distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
|
| 936 |
`lambda x: 1 - cosine_similarity(x)`.
|
| 937 |
+
You can pass any function that returns either a square matrix of
|
| 938 |
+
shape (n_samples, n_samples) with zeros on the diagonal and
|
| 939 |
non-negative values or condensed distance matrix of shape
|
| 940 |
(n_samples * (n_samples - 1) / 2,) containing the upper
|
| 941 |
triangular of the distance matrix.
|
|
|
|
| 1069 |
use_embedding_model: bool = False,
|
| 1070 |
calculate_tokens: bool = False,
|
| 1071 |
separator: str = " ") -> Tuple[np.ndarray,
|
| 1072 |
+
Union[List[np.ndarray], None]]:
|
| 1073 |
""" A post-hoc approximation of topic distributions across documents.
|
| 1074 |
|
| 1075 |
In order to perform this approximation, each document is split into tokens
|
|
|
|
| 1979 |
for key, val in sorted(mapping.items()):
|
| 1980 |
mappings[val].append(key)
|
| 1981 |
mappings = {topic_from:
|
| 1982 |
+
{"topics_to": topics_to,
|
| 1983 |
+
"topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
|
| 1984 |
for topic_from, topics_to in mappings.items()}
|
| 1985 |
|
| 1986 |
# Update topics
|
|
|
|
| 2466 |
specific points. Helps to speed up generation of visualizations.
|
| 2467 |
nr_levels: The number of levels to be visualized in the hierarchy. First, the distances
|
| 2468 |
in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with
|
| 2469 |
+
equal length. Then, for each list of distances, the merged topics, that have
|
| 2470 |
a distance less than or equal to the maximum distance of the selected list of distances, are selected.
|
| 2471 |
NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to
|
| 2472 |
the length of `hierarchical_topics`.
|
|
|
|
| 3266 |
sims = np.max(sim_matrix, axis=1)
|
| 3267 |
to_merge = {
|
| 3268 |
a - selected_topics["_outliers"]:
|
| 3269 |
+
b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims))
|
| 3270 |
if val >= min_similarity
|
| 3271 |
}
|
| 3272 |
to_merge.update(new_topics_dict)
|
|
|
|
| 3297 |
serialization: str = "safetensors",
|
| 3298 |
save_embedding_model: Union[str, bool] = True,
|
| 3299 |
save_ctfidf: bool = False,
|
| 3300 |
+
):
|
| 3301 |
""" Push your BERTopic model to a HuggingFace Hub
|
| 3302 |
|
| 3303 |
Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:
|
|
|
|
| 3471 |
documents: pd.DataFrame,
|
| 3472 |
partial_fit: bool = False,
|
| 3473 |
y: np.ndarray = None) -> Tuple[pd.DataFrame,
|
| 3474 |
+
np.ndarray]:
|
| 3475 |
""" Cluster UMAP embeddings with HDBSCAN
|
| 3476 |
|
| 3477 |
Arguments:
|
|
|
|
| 3522 |
return documents, probabilities
|
| 3523 |
|
| 3524 |
def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array,
|
| 3525 |
+
pd.DataFrame, np.array]:
|
| 3526 |
""" Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list
|
| 3527 |
|
| 3528 |
We transform the topics in `self.zeroshot_topic_list` to embeddings and
|
|
|
|
| 3607 |
empty_dimensionality_model = BaseDimensionalityReduction()
|
| 3608 |
empty_cluster_model = BaseCluster()
|
| 3609 |
zeroshot_model = BERTopic(
|
| 3610 |
+
n_gram_range=self.n_gram_range,
|
| 3611 |
+
low_memory=self.low_memory,
|
| 3612 |
+
calculate_probabilities=self.calculate_probabilities,
|
| 3613 |
+
embedding_model=self.embedding_model,
|
| 3614 |
+
umap_model=empty_dimensionality_model,
|
| 3615 |
+
hdbscan_model=empty_cluster_model,
|
| 3616 |
+
vectorizer_model=self.vectorizer_model,
|
| 3617 |
+
ctfidf_model=self.ctfidf_model,
|
| 3618 |
+
representation_model=self.representation_model,
|
| 3619 |
+
verbose=self.verbose
|
| 3620 |
).fit(docs, embeddings=embeddings, y=y)
|
| 3621 |
logger.info("Zeroshot Step 2 - Completed \u2713")
|
| 3622 |
logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model")
|
|
|
|
| 3826 |
# Sample documents per topic
|
| 3827 |
documents_per_topic = (
|
| 3828 |
documents.drop("Image", axis=1, errors="ignore")
|
| 3829 |
+
.groupby('Topic')
|
| 3830 |
+
.sample(n=nr_samples, replace=True, random_state=42)
|
| 3831 |
+
.drop_duplicates()
|
| 3832 |
)
|
| 3833 |
|
| 3834 |
# Find and extract documents that are most similar to the topic
|
|
|
|
| 4009 |
documents: pd.DataFrame,
|
| 4010 |
c_tf_idf: csr_matrix = None,
|
| 4011 |
calculate_aspects: bool = True) -> Mapping[str,
|
| 4012 |
+
List[Tuple[str, float]]]:
|
| 4013 |
""" Based on tf_idf scores per topic, extract the top n words per topic
|
| 4014 |
|
| 4015 |
If the top words per topic need to be extracted, then only the `words` parameter
|
|
|
|
| 4128 |
for key, val in sorted(mapped_topics.items()):
|
| 4129 |
mappings[val].append(key)
|
| 4130 |
mappings = {topic_from:
|
| 4131 |
+
{"topics_to": topics_to,
|
| 4132 |
+
"topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
|
| 4133 |
for topic_from, topics_to in mappings.items()}
|
| 4134 |
|
| 4135 |
# Map topics
|
|
|
|
| 4179 |
for key, val in sorted(mapped_topics.items()):
|
| 4180 |
mappings[val].append(key)
|
| 4181 |
mappings = {topic_from:
|
| 4182 |
+
{"topics_to": topics_to,
|
| 4183 |
+
"topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]}
|
| 4184 |
for topic_from, topics_to in mappings.items()}
|
| 4185 |
|
| 4186 |
# Update documents and topics
|
|
|
|
| 4481 |
|
| 4482 |
# Fit BERTopic without actually performing any clustering
|
| 4483 |
topic_model = BERTopic(
|
| 4484 |
+
embedding_model=embedding_model,
|
| 4485 |
+
umap_model=empty_dimensionality_model,
|
| 4486 |
+
hdbscan_model=empty_cluster_model,
|
| 4487 |
+
**params
|
| 4488 |
)
|
| 4489 |
topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy()
|
| 4490 |
topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}
|
BERTopic/docs/api/plotting/document_datamap.md
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
# `
|
| 2 |
|
| 3 |
::: bertopic.plotting._datamap.visualize_document_datamap
|
|
|
|
| 1 |
+
# `Documents with DataMapPlot`
|
| 2 |
|
| 3 |
::: bertopic.plotting._datamap.visualize_document_datamap
|
BERTopic/docs/faq.md
CHANGED
|
@@ -311,3 +311,23 @@ are important in understanding the general topic of the document. Although this
|
|
| 311 |
have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags
|
| 312 |
typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply
|
| 313 |
topic modeling to HTML-code to extract topics of code, then it becomes important.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags
|
| 312 |
typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply
|
| 313 |
topic modeling to HTML-code to extract topics of code, then it becomes important.
|
| 314 |
+
|
| 315 |
+
## **I run into issues running on Apple Silicon. What should I do?**
|
| 316 |
+
Apple Silicon chips (M1 & M2) are based on the ARM64 architecture (aka [AArch64](https://apple.stackexchange.com/questions/451238/is-m1-chip-aarch64-or-amd64), not to be confused with AMD64). There are known issues with upstream dependencies for this architecture, for example [numba](https://github.com/numba/numba/issues/5520). You may not always run into this issue, depending on the extras that you need.
|
| 317 |
+
|
| 318 |
+
One possible solution to this is to use [VS Code Dev Containers](https://code.visualstudio.com/docs/devcontainers/containers), which allows you to set up a Linux-based environment. To run BERTopic effectively you need to be aware of two things:
|
| 319 |
+
|
| 320 |
+
- Make sure to use a Docker image specifically compiled for ARM64
|
| 321 |
+
- Make sure to use `volume` instead of `mount-bind`, since the latter significantly reduces I/O speeds to disk
|
| 322 |
+
|
| 323 |
+
Using the pre-configured [Data Science Devcontainers](https://github.com/b-data/data-science-devcontainers) makes sure these settings are optimized. To start using them, do the following:
|
| 324 |
+
|
| 325 |
+
- Install and run Docker
|
| 326 |
+
- Install `python-base` or `python-scipy` [devcontainer](https://github.com/b-data/data-science-devcontainers)
|
| 327 |
+
- ℹ️ Change PYTHON_VERSION to 3.11 in the `devcontainer.json` to work with the latest version of Python 3.11 (currently 3.11.8)
|
| 328 |
+
- Open VS Code, build the container and start working
|
| 329 |
+
- Note that data is persisted in the container
|
| 330 |
+
- When using an unmodified devcontainer.json: work in `/home/vscode` which is the `home` directory of user `vscode`
|
| 331 |
+
- Python packages are installed to the home directory by default. This is due to env variable `PIP_USER=1`
|
| 332 |
+
- Note that the directory `/workspaces` is also persisted
|
| 333 |
+
|
BERTopic/docs/getting_started/visualization/visualize_documents.md
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
Using the `.visualize_topics`, we can visualize the topics and get insight into their relationships. However,
|
| 2 |
you might want a more fine-grained approach where we can visualize the documents inside the topics to see
|
| 3 |
if they were assigned correctly or whether they make sense. To do so, we can use the `topic_model.visualize_documents()`
|
|
@@ -43,6 +45,30 @@ When you visualize the documents, you might not always want to see the complete
|
|
| 43 |
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)
|
| 44 |
```
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
## **Visualize Probabilities or Distribution**
|
| 47 |
|
| 48 |
We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if a HDBSCAN model is used:
|
|
@@ -100,3 +126,4 @@ df
|
|
| 100 |
the distribution of the frequencies of topics across a document. It merely shows
|
| 101 |
how confident BERTopic is that certain topics can be found in a document.
|
| 102 |
|
|
|
|
|
|
| 1 |
+
## **Visualize documents with Plotly**
|
| 2 |
+
|
| 3 |
Using the `.visualize_topics`, we can visualize the topics and get insight into their relationships. However,
|
| 4 |
you might want a more fine-grained approach where we can visualize the documents inside the topics to see
|
| 5 |
if they were assigned correctly or whether they make sense. To do so, we can use the `topic_model.visualize_documents()`
|
|
|
|
| 45 |
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)
|
| 46 |
```
|
| 47 |
|
| 48 |
+
## **Visualize documents with DataMapPlot**
|
| 49 |
+
|
| 50 |
+
`.visualize_document_datamap` provides an alternative way to visualize the documents inside the topics as a static [DataMapPlot](https://datamapplot.readthedocs.io/en/latest/intro_splash.html). Using the same pipeline as above, you can generate a DataMapPlot by running:
|
| 51 |
+
|
| 52 |
+
```python
|
| 53 |
+
|
| 54 |
+
# with the original embeddings
|
| 55 |
+
topic_model.visualize_document_datamap(docs, embeddings=embeddings)
|
| 56 |
+
|
| 57 |
+
# with the reduced embeddings
|
| 58 |
+
topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
<br><br>
|
| 62 |
+
<img src="./datamapplot.png">
|
| 63 |
+
<br><br>
|
| 64 |
+
|
| 65 |
+
Or if you want to save the resulting figure:
|
| 66 |
+
|
| 67 |
+
```python
|
| 68 |
+
fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
|
| 69 |
+
fig.savefig("path/to/file.png", bbox_inches="tight")
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
## **Visualize Probabilities or Distribution**
|
| 73 |
|
| 74 |
We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if a HDBSCAN model is used:
|
|
|
|
| 126 |
the distribution of the frequencies of topics across a document. It merely shows
|
| 127 |
how confident BERTopic is that certain topics can be found in a document.
|
| 128 |
|
| 129 |
+
|
BERTopic/docs/index.md
CHANGED
|
@@ -246,6 +246,7 @@ to tweak the model to your liking.
|
|
| 246 |
|-----------------------|---|
|
| 247 |
| Visualize Topics | `.visualize_topics()` |
|
| 248 |
| Visualize Documents | `.visualize_documents()` |
|
|
|
|
| 249 |
| Visualize Document Hierarchy | `.visualize_hierarchical_documents()` |
|
| 250 |
| Visualize Topic Hierarchy | `.visualize_hierarchy()` |
|
| 251 |
| Visualize Topic Tree | `.get_topic_tree(hierarchical_topics)` |
|
|
@@ -254,7 +255,8 @@ to tweak the model to your liking.
|
|
| 254 |
| Visualize Term Score Decline | `.visualize_term_rank()` |
|
| 255 |
| Visualize Topic Probability Distribution | `.visualize_distribution(probs[0])` |
|
| 256 |
| Visualize Topics over Time | `.visualize_topics_over_time(topics_over_time)` |
|
| 257 |
-
| Visualize Topics per Class | `.visualize_topics_per_class(topics_per_class)` |
|
|
|
|
| 258 |
|
| 259 |
|
| 260 |
## **Citation**
|
|
|
|
| 246 |
|-----------------------|---|
|
| 247 |
| Visualize Topics | `.visualize_topics()` |
|
| 248 |
| Visualize Documents | `.visualize_documents()` |
|
| 249 |
+
| Visualize Documents with DataMapPlot | `.visualize_document_datamap()` |
|
| 250 |
| Visualize Document Hierarchy | `.visualize_hierarchical_documents()` |
|
| 251 |
| Visualize Topic Hierarchy | `.visualize_hierarchy()` |
|
| 252 |
| Visualize Topic Tree | `.get_topic_tree(hierarchical_topics)` |
|
|
|
|
| 255 |
| Visualize Term Score Decline | `.visualize_term_rank()` |
|
| 256 |
| Visualize Topic Probability Distribution | `.visualize_distribution(probs[0])` |
|
| 257 |
| Visualize Topics over Time | `.visualize_topics_over_time(topics_over_time)` |
|
| 258 |
+
| Visualize Topics per Class | `.visualize_topics_per_class(topics_per_class)` |
|
| 259 |
+
|
| 260 |
|
| 261 |
|
| 262 |
## **Citation**
|
BERTopic/mkdocs.yml
CHANGED
|
@@ -83,6 +83,7 @@ nav:
|
|
| 83 |
- Plotting:
|
| 84 |
- Barchart: api/plotting/barchart.md
|
| 85 |
- Documents: api/plotting/documents.md
|
|
|
|
| 86 |
- DTM: api/plotting/dtm.md
|
| 87 |
- Hierarchical documents: api/plotting/hierarchical_documents.md
|
| 88 |
- Hierarchical topics: api/plotting/hierarchy.md
|
|
|
|
| 83 |
- Plotting:
|
| 84 |
- Barchart: api/plotting/barchart.md
|
| 85 |
- Documents: api/plotting/documents.md
|
| 86 |
+
- Documents with DataMapPlot: api/plotting/document_datamap.md
|
| 87 |
- DTM: api/plotting/dtm.md
|
| 88 |
- Hierarchical documents: api/plotting/hierarchical_documents.md
|
| 89 |
- Hierarchical topics: api/plotting/hierarchy.md
|