ychu612 committed on
Commit
11c6f6a
·
verified ·
1 Parent(s): 7645cf8

Add BERTopic model

Browse files
Files changed (4) hide show
  1. README.md +72 -0
  2. config.json +17 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +265 -0
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # BERTopic_vafn
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("ychu612/BERTopic_vafn")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 3
34
+ * Number of training documents: 103
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | -1 | the - was - she - and - to | 55 | -1_the_was_she_and |
42
+ | 0 | the - she - was - and - her | 33 | 0_the_she_was_and |
43
+ | 1 | the - was - he - and - to | 15 | 1_the_was_he_and |
44
+
45
+ </details>
46
+
47
+ ## Training hyperparameters
48
+
49
+ * calculate_probabilities: False
50
+ * language: english
51
+ * low_memory: False
52
+ * min_topic_size: 10
53
+ * n_gram_range: (1, 1)
54
+ * nr_topics: None
55
+ * seed_topic_list: None
56
+ * top_n_words: 10
57
+ * verbose: False
58
+ * zeroshot_min_similarity: 0.7
59
+ * zeroshot_topic_list: None
60
+
61
+ ## Framework versions
62
+
63
+ * Numpy: 1.23.0
64
+ * HDBSCAN: 0.8.33
65
+ * UMAP: 0.5.5
66
+ * Pandas: 2.1.4
67
+ * Scikit-Learn: 1.1.0
68
+ * Sentence-transformers: 2.3.1
69
+ * Transformers: 4.38.1
70
+ * Numba: 0.56.4
71
+ * Plotly: 5.9.0
72
+ * Python: 3.10.9
config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": false,
3
+ "language": "english",
4
+ "low_memory": false,
5
+ "min_topic_size": 10,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 10,
13
+ "verbose": false,
14
+ "zeroshot_min_similarity": 0.7,
15
+ "zeroshot_topic_list": null,
16
+ "embedding_model": "sentence-transformers/all-MiniLM-L6-v2"
17
+ }
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cadfa8aa7011577bca017f470ee45e5693f6c388f52839fd1c2fd864b7781c32
3
+ size 4696
topics.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "topic_representations": {
3
+ "-1": [
4
+ [
5
+ "the",
6
+ 0.13742718218795796
7
+ ],
8
+ [
9
+ "was",
10
+ 0.10401424795221015
11
+ ],
12
+ [
13
+ "she",
14
+ 0.1015715645907204
15
+ ],
16
+ [
17
+ "and",
18
+ 0.07659591217201081
19
+ ],
20
+ [
21
+ "to",
22
+ 0.07592298564333361
23
+ ],
24
+ [
25
+ "baby",
26
+ 0.07139075689232881
27
+ ],
28
+ [
29
+ "her",
30
+ 0.0629814738419722
31
+ ],
32
+ [
33
+ "they",
34
+ 0.05443672238507993
35
+ ],
36
+ [
37
+ "that",
38
+ 0.04770398186197523
39
+ ],
40
+ [
41
+ "on",
42
+ 0.04728269721896559
43
+ ]
44
+ ],
45
+ "0": [
46
+ [
47
+ "the",
48
+ 0.13844740564012709
49
+ ],
50
+ [
51
+ "she",
52
+ 0.10199991617971646
53
+ ],
54
+ [
55
+ "was",
56
+ 0.08782635385923653
57
+ ],
58
+ [
59
+ "and",
60
+ 0.08475642666800337
61
+ ],
62
+ [
63
+ "her",
64
+ 0.08108600228917424
65
+ ],
66
+ [
67
+ "baby",
68
+ 0.07608382014852672
69
+ ],
70
+ [
71
+ "to",
72
+ 0.07486508934251049
73
+ ],
74
+ [
75
+ "that",
76
+ 0.05191060920808593
77
+ ],
78
+ [
79
+ "they",
80
+ 0.0476049013649591
81
+ ],
82
+ [
83
+ "not",
84
+ 0.04479719687171663
85
+ ]
86
+ ],
87
+ "1": [
88
+ [
89
+ "the",
90
+ 0.14362369546548528
91
+ ],
92
+ [
93
+ "was",
94
+ 0.12727289911502307
95
+ ],
96
+ [
97
+ "he",
98
+ 0.07768687174821065
99
+ ],
100
+ [
101
+ "and",
102
+ 0.0716539037862848
103
+ ],
104
+ [
105
+ "to",
106
+ 0.07042282208672329
107
+ ],
108
+ [
109
+ "child",
110
+ 0.06821516432006061
111
+ ],
112
+ [
113
+ "mother",
114
+ 0.06076109057121903
115
+ ],
116
+ [
117
+ "she",
118
+ 0.053563162541500516
119
+ ],
120
+ [
121
+ "born",
122
+ 0.051571267513334976
123
+ ],
124
+ [
125
+ "baby",
126
+ 0.05155810172983109
127
+ ]
128
+ ]
129
+ },
130
+ "topics": [
131
+ -1,
132
+ -1,
133
+ -1,
134
+ -1,
135
+ -1,
136
+ -1,
137
+ -1,
138
+ 1,
139
+ 1,
140
+ -1,
141
+ -1,
142
+ -1,
143
+ -1,
144
+ -1,
145
+ 0,
146
+ 0,
147
+ 0,
148
+ 0,
149
+ 0,
150
+ -1,
151
+ -1,
152
+ -1,
153
+ 0,
154
+ 1,
155
+ 1,
156
+ 0,
157
+ 1,
158
+ -1,
159
+ 0,
160
+ -1,
161
+ 0,
162
+ 0,
163
+ -1,
164
+ -1,
165
+ 0,
166
+ 0,
167
+ -1,
168
+ -1,
169
+ 0,
170
+ 0,
171
+ 0,
172
+ 0,
173
+ -1,
174
+ -1,
175
+ 0,
176
+ -1,
177
+ -1,
178
+ 0,
179
+ -1,
180
+ 0,
181
+ -1,
182
+ -1,
183
+ -1,
184
+ -1,
185
+ 1,
186
+ 0,
187
+ -1,
188
+ -1,
189
+ 0,
190
+ -1,
191
+ 0,
192
+ -1,
193
+ 1,
194
+ 0,
195
+ -1,
196
+ -1,
197
+ -1,
198
+ 0,
199
+ 1,
200
+ -1,
201
+ -1,
202
+ -1,
203
+ 0,
204
+ 1,
205
+ -1,
206
+ -1,
207
+ -1,
208
+ -1,
209
+ -1,
210
+ 1,
211
+ -1,
212
+ 1,
213
+ -1,
214
+ -1,
215
+ 0,
216
+ 0,
217
+ 1,
218
+ -1,
219
+ 0,
220
+ -1,
221
+ -1,
222
+ -1,
223
+ 0,
224
+ 0,
225
+ 1,
226
+ -1,
227
+ 1,
228
+ 1,
229
+ 0,
230
+ 0,
231
+ -1,
232
+ 0,
233
+ -1
234
+ ],
235
+ "topic_sizes": {
236
+ "-1": 55,
237
+ "1": 15,
238
+ "0": 33
239
+ },
240
+ "topic_mapper": [
241
+ [
242
+ -1,
243
+ -1,
244
+ -1
245
+ ],
246
+ [
247
+ 0,
248
+ 0,
249
+ 0
250
+ ],
251
+ [
252
+ 1,
253
+ 1,
254
+ 1
255
+ ]
256
+ ],
257
+ "topic_labels": {
258
+ "-1": "-1_the_was_she_and",
259
+ "0": "0_the_she_was_and",
260
+ "1": "1_the_was_he_and"
261
+ },
262
+ "custom_labels": null,
263
+ "_outliers": 1,
264
+ "topic_aspects": {}
265
+ }