MARTINI_enrich_BERTopic_CI_Lib / ctfidf_config.json
AngelPanizo's picture
Add BERTopic model
9b9b1ab verified
{
"ctfidf_model": {
"bm25_weighting": false,
"reduce_frequent_words": true
},
"vectorizer_model": {
"params": {
"analyzer": "word",
"binary": false,
"decode_error": "strict",
"encoding": "utf-8",
"input": "content",
"lowercase": true,
"max_df": 0.8,
"max_features": null,
"min_df": 1,
"ngram_range": [
1,
1
],
"stop_words": [
"i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
"you",
"you're",
"you've",
"you'll",
"you'd",
"your",
"yours",
"yourself",
"yourselves",
"he",
"him",
"his",
"himself",
"she",
"she's",
"her",
"hers",
"herself",
"it",
"it's",
"its",
"itself",
"they",
"them",
"their",
"theirs",
"themselves",
"what",
"which",
"who",
"whom",
"this",
"that",
"that'll",
"these",
"those",
"am",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
"do",
"does",
"did",
"doing",
"a",
"an",
"the",
"and",
"but",
"if",
"or",
"because",
"as",
"until",
"while",
"of",
"at",
"by",
"for",
"with",
"about",
"against",
"between",
"into",
"through",
"during",
"before",
"after",
"above",
"below",
"to",
"from",
"up",
"down",
"in",
"out",
"on",
"off",
"over",
"under",
"again",
"further",
"then",
"once",
"here",
"there",
"when",
"where",
"why",
"how",
"all",
"any",
"both",
"each",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"s",
"t",
"can",
"will",
"just",
"don",
"don't",
"should",
"should've",
"now",
"d",
"ll",
"m",
"o",
"re",
"ve",
"y",
"ain",
"aren",
"aren't",
"couldn",
"couldn't",
"didn",
"didn't",
"doesn",
"doesn't",
"hadn",
"hadn't",
"hasn",
"hasn't",
"haven",
"haven't",
"isn",
"isn't",
"ma",
"mightn",
"mightn't",
"mustn",
"mustn't",
"needn",
"needn't",
"shan",
"shan't",
"shouldn",
"shouldn't",
"wasn",
"wasn't",
"weren",
"weren't",
"won",
"won't",
"wouldn",
"wouldn't",
"CI_Lib"
],
"strip_accents": "unicode",
"token_pattern": "(?u)\\b\\w\\w+\\b",
"vocabulary": null
},
"vocab": {
"1855": 95,
"17": 77,
"lines": 338,
"184": 93,
"167": 74,
"153": 59,
"152": 58,
"151": 57,
"150": 56,
"148": 53,
"144": 49,
"142": 47,
"141": 46,
"140": 45,
"139": 43,
"138": 42,
"137": 41,
"133": 37,
"130": 34,
"127": 30,
"125": 28,
"124": 27,
"122": 25,
"121": 24,
"120": 23,
"112": 14,
"110": 12,
"102": 3,
"101": 2,
"100": 1,
"99": 228,
"98": 227,
"97": 226,
"96": 225,
"95": 224,
"94": 223,
"93": 222,
"92": 221,
"91": 220,
"90": 219,
"87": 216,
"84": 213,
"76": 205,
"54": 183,
"53": 182,
"52": 181,
"51": 180,
"50": 179,
"49": 178,
"48": 177,
"47": 176,
"45": 174,
"44": 173,
"43": 172,
"42": 171,
"41": 170,
"40": 169,
"35": 164,
"34": 163,
"33": 162,
"32": 161,
"31": 160,
"30": 159,
"15": 55,
"14": 44,
"13": 33,
"12": 22,
"10": 0,
"patrologi\u00e6": 370,
"entire": 292,
"volumes": 431,
"graeca": 305,
"161": 68,
"vols": 429,
"added": 237,
"enormous": 291,
"collections": 265,
"greek": 306,
"texts": 412,
"spanning": 407,
"previous": 381,
"2000": 114,
"years": 436,
"recommend": 388,
"using": 424,
"digitised": 280,
"text": 411,
"files": 298,
"linked": 340,
"navigate": 357,
"use": 423,
"search": 397,
"queries": 384,
"point": 376,
"book": 251,
"page": 368,
"scanned": 395,
"find": 300,
"looking": 343,
"note": 363,
"ocr": 364,
"inaccurate": 318,
"mispelled": 352,
"keep": 333,
"mind": 350,
"searching": 398,
"include": 319,
"preface": 380,
"basic": 246,
"information": 322,
"volume": 430,
"indexes": 320,
"1866": 97,
"160": 67,
"159": 65,
"158": 64,
"157": 63,
"156": 62,
"155": 61,
"154": 60,
"149": 54,
"147": 52,
"146": 51,
"145": 50,
"143": 48,
"136": 40,
"135": 39,
"134": 38,
"132": 36,
"131": 35,
"129": 32,
"128": 31,
"126": 29,
"123": 26,
"119": 21,
"118": 20,
"117": 19,
"116": 18,
"115": 17,
"114": 16,
"113": 15,
"111": 13,
"109": 10,
"108": 9,
"107": 8,
"106": 7,
"105": 6,
"104": 5,
"103": 4,
"89": 218,
"88": 217,
"86": 215,
"85": 214,
"83": 212,
"82": 211,
"81": 210,
"80": 209,
"79": 208,
"78": 207,
"77": 206,
"75": 204,
"74": 203,
"73": 202,
"72": 201,
"71": 200,
"70": 199,
"69": 198,
"68": 197,
"67": 196,
"66": 195,
"65": 194,
"64": 193,
"63": 192,
"62": 191,
"61": 190,
"60": 189,
"59": 188,
"58": 187,
"57": 186,
"56": 185,
"55": 184,
"46": 175,
"39": 168,
"38": 167,
"37": 166,
"36": 165,
"29": 158,
"28": 157,
"27": 156,
"26": 155,
"25": 154,
"24": 153,
"23": 152,
"22": 150,
"21": 139,
"20": 112,
"19": 101,
"18": 88,
"16": 66,
"11": 11,
"219": 149,
"218": 148,
"209": 138,
"208": 137,
"199": 111,
"198": 110,
"195": 107,
"194": 106,
"193": 105,
"192": 104,
"191": 103,
"190": 102,
"189": 100,
"188": 99,
"185": 94,
"183": 92,
"182": 91,
"181": 90,
"180": 89,
"216": 146,
"206": 135,
"196": 108,
"186": 96,
"176": 84,
"169": 76,
"168": 75,
"166": 73,
"165": 72,
"164": 71,
"163": 70,
"162": 69,
"217": 147,
"207": 136,
"197": 109,
"187": 98,
"179": 87,
"178": 86,
"177": 85,
"175": 83,
"174": 82,
"173": 81,
"172": 80,
"171": 79,
"170": 78,
"welcome": 432,
"christian": 260,
"identity": 314,
"library": 336,
"immediate": 316,
"link": 339,
"one": 365,
"important": 317,
"archives": 245,
"understanding": 420,
"race": 385,
"mixing": 353,
"anti": 242,
"biblical": 248,
"evil": 296,
"would": 435,
"like": 337,
"contribute": 272,
"materials": 347,
"archive": 244,
"please": 375,
"post": 378,
"chatroom": 259,
"try": 419,
"metadata": 349,
"filled": 299,
"caption": 256,
"makes": 346,
"easier": 286,
"add": 236,
"currently": 273,
"private": 382,
"access": 230,
"disclaimers": 282,
"purely": 383,
"academic": 229,
"condone": 269,
"views": 426,
"potential": 379,
"calls": 253,
"violence": 427,
"contained": 270,
"within": 434,
"true": 418,
"exhaustive": 297,
"unedited": 421,
"means": 348,
"none": 362,
"content": 271,
"vetted": 425,
"accuracy": 231,
"modified": 354,
"unfaithful": 422,
"endorse": 289,
"theological": 413,
"positions": 377,
"disagreement": 281,
"different": 279,
"every": 295,
"topic": 416,
"thinkable": 414,
"treat": 417,
"list": 341,
"canon": 255,
"studies": 408,
"read": 387,
"make": 345,
"informed": 323,
"decisions": 275,
"christianity": 261,
"desert": 277,
"religion": 390,
"sloan": 406,
"sutherland": 409,
"2020102": 130,
"english": 290,
"company": 267,
"nations": 356,
"acon": 232,
"teknatoutheou": 410,
"home": 311,
"blog": 250,
"sect": 399,
"ci": 263,
"adamic": 234,
"genesis": 303,
"flood": 301,
"global": 304,
"local": 342,
"2021013": 131,
"identifying": 313,
"sin": 405,
"antediluvian": 241,
"adamites": 235,
"jewish": 329,
"ties": 415,
"infamous": 321,
"opponents": 366,
"early": 285,
"church": 262,
"ethiopian": 293,
"eunuch": 294,
"simeon": 404,
"niger": 359,
"negroe": 358,
"saints": 393,
"perversion": 372,
"circumcision": 264,
"rite": 392,
"judaizers": 332,
"2020082": 128,
"refuting": 389,
"black": 249,
"hebrew": 308,
"israelites": 325,
"2020041": 123,
"killed": 335,
"jesus": 328,
"2020032": 122,
"fornication": 302,
"adultery": 238,
"idolatry": 315,
"case": 257,
"miscegenation": 351,
"multiculturalism": 355,
"2019112": 117,
"lying": 344,
"pen": 371,
"scribes": 396,
"2020091": 129,
"deuteronomy": 278,
"reuelite": 391,
"edomites": 288,
"2020052": 127,
"satanic": 394,
"origins": 367,
"kenite": 334,
"canaanite": 254,
"edomite": 287,
"jews": 330,
"brood": 252,
"vipers": 428,
"serpent": 402,
"seed": 400,
"concerning": 268,
"ancient": 240,
"aethiops": 239,
"caucasian": 258,
"hamites": 307,
"2020051": 126,
"house": 312,
"pharoahs": 373,
"hebrews": 309,
"white": 433,
"aram": 243,
"descriptions": 276,
"2020010": 120,
"non": 361,
"races": 386,
"dispersions": 283,
"israel": 324,
"phoenicians": 374,
"danaans": 274,
"dorians": 284,
"2020042": 124,
"noahite": 360,
"shemites": 403,
"2020013": 121,
"2020050": 125,
"japhethites": 327,
"2019120": 118,
"adam": 233,
"patriarch": 369,
"bible": 247,
"john": 331,
"herrell": 310,
"2007032": 115,
"jacobsheirs": 326,
"com": 266,
"separatist": 401,
"220": 151,
"215": 145,
"214": 144,
"213": 143,
"212": 142,
"211": 141,
"210": 140,
"205": 134,
"204": 133,
"203": 132,
"202": 119,
"201": 116,
"200": 113
}
}
}