AngelPanizo committed on
Commit
826c20a
·
verified ·
1 Parent(s): a2ea2e3

Add BERTopic model

Browse files
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # MARTINI_enrich_BERTopic_ZaferPartisiBilgi
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```bash
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("AIDA-UPM/MARTINI_enrich_BERTopic_ZaferPartisiBilgi")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 6
34
+ * Number of training documents: 375
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | -1 | akdeniz - sıgınmacılar - bakanlıgı - ataturk - osmaniye | 23 | -1_akdeniz_sıgınmacılar_bakanlıgı_ataturk |
42
+ | 0 | tasımacılıgında - merkezleri - burokratlar - sanayi - planı | 164 | 0_tasımacılıgında_merkezleri_burokratlar_sanayi |
43
+ | 1 | erdogan - bakanı - sıgınmacıları - ırkcı - sokak | 65 | 1_erdogan_bakanı_sıgınmacıları_ırkcı |
44
+ | 2 | konferansımıza - baskanlıgımızın - cankırı - trabzon - merkezi | 65 | 2_konferansımıza_baskanlıgımızın_cankırı_trabzon |
45
+ | 3 | kanalı - basdanısmanımız - programında - mehmet - yesiltepe | 31 | 3_kanalı_basdanısmanımız_programında_mehmet |
46
+ | 4 | erdogan - kılıcdaroglu - maliyetinin - ciftcilerimizin - milyonlarca | 27 | 4_erdogan_kılıcdaroglu_maliyetinin_ciftcilerimizin |
47
+
48
+ </details>
49
+
50
+ ## Training hyperparameters
51
+
52
+ * calculate_probabilities: True
53
+ * language: None
54
+ * low_memory: False
55
+ * min_topic_size: 10
56
+ * n_gram_range: (1, 1)
57
+ * nr_topics: None
58
+ * seed_topic_list: None
59
+ * top_n_words: 10
60
+ * verbose: False
61
+ * zeroshot_min_similarity: 0.7
62
+ * zeroshot_topic_list: None
63
+
64
+ ## Framework versions
65
+
66
+ * Numpy: 1.26.4
67
+ * HDBSCAN: 0.8.40
68
+ * UMAP: 0.5.7
69
+ * Pandas: 2.2.3
70
+ * Scikit-Learn: 1.5.2
71
+ * Sentence-transformers: 3.3.1
72
+ * Transformers: 4.46.3
73
+ * Numba: 0.60.0
74
+ * Plotly: 5.24.1
75
+ * Python: 3.10.12
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": true,
3
+ "language": null,
4
+ "low_memory": false,
5
+ "min_topic_size": 10,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 10,
13
+ "verbose": false,
14
+ "zeroshot_min_similarity": 0.7,
15
+ "zeroshot_topic_list": null
16
+ }
ctfidf.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d41f0549583fd135cf2d9e73e4bca44b614a3e19f725a557a7dbbed23d1e057d
3
+ size 112292
ctfidf_config.json ADDED
The diff for this file is too large to render. See raw diff
 
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f92f75bb9bd44905f9669497ec97fa7a1917f2fe845c6041b20420bb3ee2ada4
3
+ size 24664
topics.json ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "topic_representations": {
3
+ "-1": [
4
+ [
5
+ "akdeniz",
6
+ 0.585024356842041
7
+ ],
8
+ [
9
+ "s\u0131g\u0131nmac\u0131lar",
10
+ 0.583515465259552
11
+ ],
12
+ [
13
+ "bakanl\u0131g\u0131",
14
+ 0.5792111158370972
15
+ ],
16
+ [
17
+ "ataturk",
18
+ 0.5755276679992676
19
+ ],
20
+ [
21
+ "osmaniye",
22
+ 0.5471533536911011
23
+ ]
24
+ ],
25
+ "0": [
26
+ [
27
+ "tas\u0131mac\u0131l\u0131g\u0131nda",
28
+ 0.5838664770126343
29
+ ],
30
+ [
31
+ "merkezleri",
32
+ 0.5468962788581848
33
+ ],
34
+ [
35
+ "burokratlar",
36
+ 0.5457158088684082
37
+ ],
38
+ [
39
+ "sanayi",
40
+ 0.5442032814025879
41
+ ],
42
+ [
43
+ "plan\u0131",
44
+ 0.5412929058074951
45
+ ]
46
+ ],
47
+ "1": [
48
+ [
49
+ "erdogan",
50
+ 0.5959498882293701
51
+ ],
52
+ [
53
+ "bakan\u0131",
54
+ 0.5756554007530212
55
+ ],
56
+ [
57
+ "s\u0131g\u0131nmac\u0131lar\u0131",
58
+ 0.5751449465751648
59
+ ],
60
+ [
61
+ "\u0131rkc\u0131",
62
+ 0.5687888860702515
63
+ ],
64
+ [
65
+ "sokak",
66
+ 0.5347297191619873
67
+ ]
68
+ ],
69
+ "2": [
70
+ [
71
+ "konferans\u0131m\u0131za",
72
+ 0.6120175123214722
73
+ ],
74
+ [
75
+ "baskanl\u0131g\u0131m\u0131z\u0131n",
76
+ 0.5986858606338501
77
+ ],
78
+ [
79
+ "cank\u0131r\u0131",
80
+ 0.5532304644584656
81
+ ],
82
+ [
83
+ "trabzon",
84
+ 0.5171080231666565
85
+ ],
86
+ [
87
+ "merkezi",
88
+ 0.5090340375900269
89
+ ]
90
+ ],
91
+ "3": [
92
+ [
93
+ "kanal\u0131",
94
+ 0.5743088126182556
95
+ ],
96
+ [
97
+ "basdan\u0131sman\u0131m\u0131z",
98
+ 0.5417798757553101
99
+ ],
100
+ [
101
+ "program\u0131nda",
102
+ 0.5383641123771667
103
+ ],
104
+ [
105
+ "mehmet",
106
+ 0.5301561951637268
107
+ ],
108
+ [
109
+ "yesiltepe",
110
+ 0.4924757480621338
111
+ ]
112
+ ],
113
+ "4": [
114
+ [
115
+ "erdogan",
116
+ 0.6088460087776184
117
+ ],
118
+ [
119
+ "k\u0131l\u0131cdaroglu",
120
+ 0.6036186218261719
121
+ ],
122
+ [
123
+ "maliyetinin",
124
+ 0.5769613981246948
125
+ ],
126
+ [
127
+ "ciftcilerimizin",
128
+ 0.5688290596008301
129
+ ],
130
+ [
131
+ "milyonlarca",
132
+ 0.5389319658279419
133
+ ]
134
+ ]
135
+ },
136
+ "topics": [
137
+ -1,
138
+ 3,
139
+ 2,
140
+ -1,
141
+ 0,
142
+ -1,
143
+ -1,
144
+ 0,
145
+ 4,
146
+ 0,
147
+ 3,
148
+ -1,
149
+ 4,
150
+ 0,
151
+ 3,
152
+ 2,
153
+ 3,
154
+ 0,
155
+ 0,
156
+ 2,
157
+ -1,
158
+ -1,
159
+ 0,
160
+ 0,
161
+ -1,
162
+ 0,
163
+ -1,
164
+ 1,
165
+ 0,
166
+ 0,
167
+ 0,
168
+ 4,
169
+ -1,
170
+ 2,
171
+ -1,
172
+ 1,
173
+ 4,
174
+ -1,
175
+ 2,
176
+ -1,
177
+ 4,
178
+ 2,
179
+ -1,
180
+ -1,
181
+ 3,
182
+ -1,
183
+ -1,
184
+ -1,
185
+ 1,
186
+ 2,
187
+ 2,
188
+ 0,
189
+ -1,
190
+ 0,
191
+ 0,
192
+ 4,
193
+ 0,
194
+ -1,
195
+ 3,
196
+ 1,
197
+ 0,
198
+ -1,
199
+ 2,
200
+ -1,
201
+ -1,
202
+ 0,
203
+ 4,
204
+ 3,
205
+ 2,
206
+ 2,
207
+ -1,
208
+ 1,
209
+ -1,
210
+ 0,
211
+ -1,
212
+ 0,
213
+ 0,
214
+ 0,
215
+ -1,
216
+ 1,
217
+ 0,
218
+ 1,
219
+ -1,
220
+ 0,
221
+ 1,
222
+ 3,
223
+ -1,
224
+ -1,
225
+ -1,
226
+ -1,
227
+ -1,
228
+ -1,
229
+ -1,
230
+ 1,
231
+ 1,
232
+ -1,
233
+ -1,
234
+ -1,
235
+ -1,
236
+ -1,
237
+ -1,
238
+ -1,
239
+ 0,
240
+ 1,
241
+ 0,
242
+ 0,
243
+ 3,
244
+ 1,
245
+ -1,
246
+ 1,
247
+ 0,
248
+ 3,
249
+ 0,
250
+ 2,
251
+ -1,
252
+ 4,
253
+ -1,
254
+ 1,
255
+ 1,
256
+ 0,
257
+ -1,
258
+ 0,
259
+ -1,
260
+ -1,
261
+ -1,
262
+ 2,
263
+ 0,
264
+ 2,
265
+ 0,
266
+ 0,
267
+ 2,
268
+ 1,
269
+ 0,
270
+ 0,
271
+ 1,
272
+ 2,
273
+ -1,
274
+ 0,
275
+ 0,
276
+ -1,
277
+ 1,
278
+ 2,
279
+ 2,
280
+ 0,
281
+ 0,
282
+ 1,
283
+ -1,
284
+ 1,
285
+ -1,
286
+ -1,
287
+ -1,
288
+ 1,
289
+ 0,
290
+ -1,
291
+ -1,
292
+ -1,
293
+ 0,
294
+ 4,
295
+ 4,
296
+ 0,
297
+ 4,
298
+ -1,
299
+ -1,
300
+ 1,
301
+ -1,
302
+ 2,
303
+ -1,
304
+ -1,
305
+ -1,
306
+ 3,
307
+ -1,
308
+ 0,
309
+ 2,
310
+ 0,
311
+ 0,
312
+ -1,
313
+ -1,
314
+ -1,
315
+ 0,
316
+ -1,
317
+ 0,
318
+ 0,
319
+ 3,
320
+ 0,
321
+ 2,
322
+ 3,
323
+ -1,
324
+ 3,
325
+ -1,
326
+ 0,
327
+ 4,
328
+ 0,
329
+ -1,
330
+ 1,
331
+ -1,
332
+ 2,
333
+ -1,
334
+ -1,
335
+ -1,
336
+ 1,
337
+ -1,
338
+ 3,
339
+ -1,
340
+ -1,
341
+ 0,
342
+ -1,
343
+ -1,
344
+ 1,
345
+ 0,
346
+ 4,
347
+ 0,
348
+ -1,
349
+ -1,
350
+ 1,
351
+ 0,
352
+ -1,
353
+ 1,
354
+ -1,
355
+ -1,
356
+ 3,
357
+ 1,
358
+ 0,
359
+ 1,
360
+ -1,
361
+ -1,
362
+ 1,
363
+ -1,
364
+ 0,
365
+ -1,
366
+ 0,
367
+ 1,
368
+ 2,
369
+ 2,
370
+ -1,
371
+ 1,
372
+ -1,
373
+ 0,
374
+ -1,
375
+ -1,
376
+ 0,
377
+ -1,
378
+ 0,
379
+ 1,
380
+ -1,
381
+ -1,
382
+ 0,
383
+ 3,
384
+ -1,
385
+ -1,
386
+ 0,
387
+ -1,
388
+ -1,
389
+ -1,
390
+ -1,
391
+ -1,
392
+ 1,
393
+ -1,
394
+ -1,
395
+ -1,
396
+ -1,
397
+ 2,
398
+ 0,
399
+ -1,
400
+ -1,
401
+ 1,
402
+ -1,
403
+ -1,
404
+ -1,
405
+ 4,
406
+ -1,
407
+ -1,
408
+ -1,
409
+ -1,
410
+ 4,
411
+ -1,
412
+ -1,
413
+ 4,
414
+ 2,
415
+ -1,
416
+ -1,
417
+ -1,
418
+ -1,
419
+ -1,
420
+ 1,
421
+ -1,
422
+ 2,
423
+ 4,
424
+ -1,
425
+ -1,
426
+ 3,
427
+ -1,
428
+ 2,
429
+ 3,
430
+ -1,
431
+ -1,
432
+ -1,
433
+ -1,
434
+ 3,
435
+ 1,
436
+ 1,
437
+ -1,
438
+ -1,
439
+ 1,
440
+ 4,
441
+ 3,
442
+ 1,
443
+ -1,
444
+ 4,
445
+ -1,
446
+ -1,
447
+ 1,
448
+ 3,
449
+ -1,
450
+ -1,
451
+ 3,
452
+ 1,
453
+ 1,
454
+ 1,
455
+ -1,
456
+ -1,
457
+ 1,
458
+ 3,
459
+ 1,
460
+ 1,
461
+ -1,
462
+ -1,
463
+ 2,
464
+ -1,
465
+ 2,
466
+ -1,
467
+ -1,
468
+ 1,
469
+ 1,
470
+ 1,
471
+ -1,
472
+ 1,
473
+ 4,
474
+ -1,
475
+ -1,
476
+ 1,
477
+ 1,
478
+ -1,
479
+ 3,
480
+ -1,
481
+ 4,
482
+ -1,
483
+ -1,
484
+ 4,
485
+ -1,
486
+ -1,
487
+ 1,
488
+ 1,
489
+ -1,
490
+ -1,
491
+ 1,
492
+ -1,
493
+ 3,
494
+ 1,
495
+ 1,
496
+ -1,
497
+ 1,
498
+ -1,
499
+ 3,
500
+ 1,
501
+ 1,
502
+ 1,
503
+ -1,
504
+ 4,
505
+ -1,
506
+ -1,
507
+ 2,
508
+ 1,
509
+ 1,
510
+ 1,
511
+ -1
512
+ ],
513
+ "topic_sizes": {
514
+ "-1": 164,
515
+ "3": 27,
516
+ "2": 31,
517
+ "0": 65,
518
+ "4": 23,
519
+ "1": 65
520
+ },
521
+ "topic_mapper": [
522
+ [
523
+ -1,
524
+ -1,
525
+ -1
526
+ ],
527
+ [
528
+ 0,
529
+ 0,
530
+ 3
531
+ ],
532
+ [
533
+ 1,
534
+ 1,
535
+ 2
536
+ ],
537
+ [
538
+ 2,
539
+ 2,
540
+ 0
541
+ ],
542
+ [
543
+ 3,
544
+ 3,
545
+ 4
546
+ ],
547
+ [
548
+ 4,
549
+ 4,
550
+ 1
551
+ ]
552
+ ],
553
+ "topic_labels": {
554
+ "-1": "-1_akdeniz_s\u0131g\u0131nmac\u0131lar_bakanl\u0131g\u0131_ataturk",
555
+ "0": "0_tas\u0131mac\u0131l\u0131g\u0131nda_merkezleri_burokratlar_sanayi",
556
+ "1": "1_erdogan_bakan\u0131_s\u0131g\u0131nmac\u0131lar\u0131_\u0131rkc\u0131",
557
+ "2": "2_konferans\u0131m\u0131za_baskanl\u0131g\u0131m\u0131z\u0131n_cank\u0131r\u0131_trabzon",
558
+ "3": "3_kanal\u0131_basdan\u0131sman\u0131m\u0131z_program\u0131nda_mehmet",
559
+ "4": "4_erdogan_k\u0131l\u0131cdaroglu_maliyetinin_ciftcilerimizin"
560
+ },
561
+ "custom_labels": null,
562
+ "_outliers": 1,
563
+ "topic_aspects": {}
564
+ }