Add BERTopic model

Browse files

Files changed (6) hide show

README.md +72 -0
config.json +17 -0
ctfidf.safetensors +3 -0
ctfidf_config.json +0 -0
topic_embeddings.safetensors +3 -0
topics.json +154 -0

README.md ADDED Viewed

	@@ -0,0 +1,72 @@

+---
+tags:
+- bertopic
+library_name: bertopic
+pipeline_tag: text-classification
+---
+# bertopic_openai_emb_model
+This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
+BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
+## Usage
+To use this model, please install BERTopic:
+```
+pip install -U bertopic
+```
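+You can use the model as follows. Note that `config.json` records `text-embedding-3-large` (an OpenAI embedding model, not a sentence-transformers checkpoint) as the embedding model, so to embed new documents you will likely need to pass a matching embedding backend through the `embedding_model` argument of `BERTopic.load`: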
+```python
+from bertopic import BERTopic
+topic_model = BERTopic.load("MaximSIMO/bertopic_openai_emb_model")
+topic_model.get_topic_info()
+```
+## Topic overview
+* Number of topics: 3
+* Number of training documents: 100
+<details>
+  <summary>Click here for an overview of all topics.</summary>
+
+| Topic ID | Topic Keywords | Topic Frequency | Label |
+|----------|----------------|-----------------|-------|
+| -1 | Evening TV Programming | 15 | -1_Evening TV Programming |
+| 0 | Elettrodotti e ambiente | 72 | 0_Elettrodotti e ambiente |
+| 1 | Political Tensions | 13 | 1_Political Tensions |
+</details>
+## Training hyperparameters
+* calculate_probabilities: False
+* language: multilingual
+* low_memory: False
+* min_topic_size: 10
+* n_gram_range: (1, 1)
+* nr_topics: None
+* seed_topic_list: None
+* top_n_words: 10
+* verbose: True
+* zeroshot_min_similarity: 0.7
+* zeroshot_topic_list: None
+## Framework versions
+* Numpy: 2.2.6
+* HDBSCAN: 0.8.41
+* UMAP: 0.5.11
+* Pandas: 2.3.3
+* Scikit-Learn: 1.7.2
+* Sentence-transformers: 5.2.2
+* Transformers: 5.1.0
+* Numba: 0.63.1
+* Plotly: 6.5.2
+* Python: 3.10.19

config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "calculate_probabilities": false,
+  "language": "multilingual",
+  "low_memory": false,
+  "min_topic_size": 10,
+  "n_gram_range": [
+    1,
+    1
+  ],
+  "nr_topics": null,
+  "seed_topic_list": null,
+  "top_n_words": 10,
+  "verbose": true,
+  "zeroshot_min_similarity": 0.7,
+  "zeroshot_topic_list": null,
+  "embedding_model": "text-embedding-3-large"
+}

ctfidf.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2acc8a20bd1102778a6dcc66b127684ac1e7596be40b95386a497a7a0851f6d
+size 749500

ctfidf_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

topic_embeddings.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df710c9fa4325239be3897c4c4e6e8d30b9c0fb62054ec4b30c0b5263888c520
+size 36952

topics.json ADDED Viewed

	@@ -0,0 +1,154 @@

+{
+  "topic_representations": {
+    "-1": [
+      [
+        "Evening TV Programming",
+        1
+      ]
+    ],
+    "0": [
+      [
+        "Elettrodotti e ambiente",
+        1
+      ]
+    ],
+    "1": [
+      [
+        "Political Tensions",
+        1
+      ]
+    ]
+  },
+  "topics": [
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    -1,
+    1,
+    -1,
+    -1,
+    1,
+    -1,
+    1,
+    1,
+    0,
+    0,
+    -1,
+    -1,
+    -1,
+    -1,
+    -1,
+    1,
+    0,
+    -1,
+    -1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    -1,
+    1,
+    1,
+    1,
+    1,
+    -1,
+    -1,
+    -1,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    1,
+    1,
+    0,
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0
+  ],
+  "topic_sizes": {
+    "0": 72,
+    "-1": 15,
+    "1": 13
+  },
+  "topic_mapper": [
+    [
+      -1,
+      -1,
+      -1
+    ],
+    [
+      0,
+      0,
+      1
+    ],
+    [
+      1,
+      1,
+      0
+    ]
+  ],
+  "topic_labels": {
+    "-1": "-1_Evening TV Programming",
+    "0": "0_Elettrodotti e ambiente",
+    "1": "1_Political Tensions"
+  },
+  "custom_labels": null,
+  "_outliers": 1,
+  "topic_aspects": {}
+}