Transformers
common_voice_uni500 / tokenizer.json
Lakoc's picture
Upload tokenizer
2c2010f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "([bos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "([eos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "([unk])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "([pad])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "([mask])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "▁",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Metaspace",
"replacement": "▁",
"add_prefix_space": true,
"prepend_scheme": "always"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 1
}
}
],
"special_tokens": {
"([bos])": {
"id": "([bos])",
"ids": [
0
],
"tokens": [
"([bos])"
]
},
"([eos])": {
"id": "([eos])",
"ids": [
1
],
"tokens": [
"([eos])"
]
}
}
},
"decoder": {
"type": "Metaspace",
"replacement": "▁",
"add_prefix_space": true,
"prepend_scheme": "always"
},
"model": {
"type": "Unigram",
"unk_id": 2,
"vocab": [
[
"([bos])",
0.0
],
[
"([eos])",
0.0
],
[
"([unk])",
0.0
],
[
"([pad])",
0.0
],
[
"([mask])",
0.0
],
[
"s",
-2.7087862142460697
],
[
"▁",
-3.159713564956691
],
[
"t",
-3.247934739073319
],
[
"e",
-3.284542651100784
],
[
"▁the",
-3.458398141767942
],
[
"d",
-3.5931812444772806
],
[
"a",
-3.6366062534936177
],
[
"i",
-3.8026230161282193
],
[
"r",
-3.894013105983418
],
[
"n",
-3.9598806426474606
],
[
"o",
-4.108208965984497
],
[
"▁a",
-4.123017148530064
],
[
"l",
-4.143189339085515
],
[
"ed",
-4.185019838756677
],
[
"y",
-4.417883797200398
],
[
"▁of",
-4.418687866578182
],
[
"▁in",
-4.419989440610058
],
[
"c",
-4.4928092728880475
],
[
"m",
-4.549063923185756
],
[
"p",
-4.632586952754286
],
[
"▁to",
-4.718095797580903
],
[
"ing",
-4.718473047785871
],
[
"▁is",
-4.728594972551644
],
[
"b",
-4.766989553904384
],
[
"▁and",
-4.77880181362533
],
[
"g",
-4.8105314868809845
],
[
"er",
-4.838757388268354
],
[
"in",
-4.8881845890278015
],
[
"u",
-4.926181255679008
],
[
"f",
-4.950396547385262
],
[
"▁was",
-4.968557781850629
],
[
"al",
-5.0589039874057224
],
[
"▁he",
-5.063578475115438
],
[
"le",
-5.081968948387759
],
[
"or",
-5.12773130976289
],
[
"an",
-5.154499608378208
],
[
"ar",
-5.228999932874652
],
[
"h",
-5.239710728899752
],
[
"en",
-5.276913049118749
],
[
"on",
-5.287257314930473
],
[
"▁re",
-5.29170807429036
],
[
"re",
-5.376911310235307
],
[
"w",
-5.37736139685706
],
[
"▁it",
-5.438829496188923
],
[
"k",
-5.44315681092983
],
[
"ly",
-5.450308027570038
],
[
"it",
-5.450545322048562
],
[
"▁be",
-5.468488006995733
],
[
"ch",
-5.521802761779233
],
[
"▁for",
-5.548513141901553
],
[
"is",
-5.596158258362125
],
[
"ic",
-5.635561935799826
],
[
"▁are",
-5.665579329263476
],
[
"▁w",
-5.671906510671381
],
[
"ter",
-5.710373423850561
],
[
"ur",
-5.712613548234113
],
[
"ve",
-5.795164697207461
],
[
"▁his",
-5.81351636418619
],
[
"▁de",
-5.848193572865624
],
[
"▁g",
-5.858931530242998
],
[
"-",
-5.865170099410005
],
[
"th",
-5.912270797510585
],
[
"v",
-5.916207085762094
],
[
"▁ma",
-5.925417651109351
],
[
"▁on",
-5.928735685698497
],
[
"▁by",
-5.929119253111821
],
[
"▁with",
-5.939965393017241
],
[
"ce",
-5.944001428238662
],
[
"▁this",
-5.9817138264670735
],
[
"▁also",
-5.983956225982473
],
[
"▁di",
-5.98632093594669
],
[
"ation",
-5.989407901945238
],
[
"▁se",
-5.991591319651997
],
[
"ck",
-6.069032952442802
],
[
"▁ba",
-6.076568601953877
],
[
"ent",
-6.089212892005653
],
[
"▁con",
-6.111962249346897
],
[
"ng",
-6.143275412549821
],
[
"▁as",
-6.147747034810779
],
[
"▁an",
-6.1691273738919055
],
[
"▁mo",
-6.176024423820149
],
[
"ul",
-6.193235263628999
],
[
"▁co",
-6.207722089411254
],
[
"▁po",
-6.236389870862725
],
[
"▁li",
-6.244125340728081
],
[
"▁c",
-6.24659314699014
],
[
"▁so",
-6.2584892075074325
],
[
"z",
-6.262350036105108
],
[
"▁me",
-6.265813448196459
],
[
"▁pa",
-6.274214368513942
],
[
"ke",
-6.284788373891308
],
[
"ll",
-6.29727599976812
],
[
"vi",
-6.297314723995699
],
[
"▁su",
-6.312873884063279
],
[
"▁ch",
-6.316733133214962
],
[
"▁were",
-6.317001674657302
],
[
"ion",
-6.32961679439499
],
[
"ge",
-6.365805102591967
],
[
"▁at",
-6.366347577865874
],
[
"▁lo",
-6.368448533632174
],
[
"▁ro",
-6.379973682438701
],
[
"▁has",
-6.381738025259727
],
[
"ment",
-6.385379137103088
],
[
"▁k",
-6.386969828185796
],
[
"ver",
-6.38992638519708
],
[
"▁bo",
-6.4113570767182395
],
[
"un",
-6.413821832531996
],
[
"▁le",
-6.416708463357619
],
[
"▁from",
-6.422667892332576
],
[
"ate",
-6.4282492634575
],
[
"▁fa",
-6.432844147368696
],
[
"x",
-6.433501289905884
],
[
"am",
-6.436652615309573
],
[
"▁ha",
-6.439630948443641
],
[
"▁ex",
-6.444623596405842
],
[
"ow",
-6.444696534920828
],
[
"est",
-6.445636255562283
],
[
"▁that",
-6.455257224426214
],
[
"▁ca",
-6.455280496823114
],
[
"▁she",
-6.47833123379923
],
[
"▁la",
-6.48007668633581
],
[
"at",
-6.495527161682395
],
[
"us",
-6.51192066539333
],
[
"ies",
-6.514049205315091
],
[
"▁fi",
-6.52545677841473
],
[
"▁sp",
-6.52685820151644
],
[
"▁pro",
-6.528963178196921
],
[
"mp",
-6.538279720909209
],
[
"▁not",
-6.54785989826822
],
[
"▁ho",
-6.563725279244
],
[
"▁ne",
-6.573235986746392
],
[
"▁sta",
-6.57398357678378
],
[
"▁ra",
-6.5807826647107
],
[
"▁th",
-6.58745411057925
],
[
"▁do",
-6.5949293099703254
],
[
"ut",
-6.6132785145605
],
[
"om",
-6.629934340802123
],
[
"ive",
-6.641840396749629
],
[
"▁no",
-6.653866998404455
],
[
"▁or",
-6.655009632782145
],
[
"▁mi",
-6.65771141558985
],
[
"▁sh",
-6.661553283522425
],
[
"im",
-6.664912974933774
],
[
"lo",
-6.683941877897254
],
[
"per",
-6.6936353152327115
],
[
"ther",
-6.724802931319967
],
[
"▁un",
-6.738575581758305
],
[
"▁fr",
-6.738822448715048
],
[
"ide",
-6.740660495044171
],
[
"ers",
-6.7432945338476475
],
[
"ry",
-6.746615494127802
],
[
"▁her",
-6.747959730078813
],
[
"qu",
-6.754441987038513
],
[
"ight",
-6.756453568565393
],
[
"tion",
-6.763906947562896
],
[
"▁have",
-6.7661222733482465
],
[
"▁two",
-6.769677687251608
],
[
"man",
-6.778532527455926
],
[
"ph",
-6.788187691225737
],
[
"ated",
-6.791664740264787
],
[
"land",
-6.806277135033374
],
[
"pp",
-6.812916060110025
],
[
"▁wa",
-6.815750988179804
],
[
"▁can",
-6.819250477743541
],
[
"um",
-6.855212810060099
],
[
"▁all",
-6.863064196733795
],
[
"▁one",
-6.883919766877581
],
[
"ally",
-6.894574461770981
],
[
"▁we",
-6.899248968878656
],
[
"▁there",
-6.900372415768
],
[
"▁go",
-6.901571026299864
],
[
"if",
-6.9039358488068565
],
[
"▁i",
-6.90424414037885
],
[
"▁you",
-6.912114395683812
],
[
"▁they",
-6.919467977755152
],
[
"▁part",
-6.9361168002465785
],
[
"▁name",
-6.941953920415864
],
[
"▁bu",
-6.944066485594995
],
[
"ian",
-6.963794923353964
],
[
"tic",
-6.970500057495697
],
[
"▁play",
-6.9790510011425315
],
[
"▁pre",
-6.981428022342056
],
[
"▁com",
-6.987803687852885
],
[
"ction",
-7.000168263913396
],
[
"▁had",
-7.011058670335936
],
[
"▁new",
-7.024996664700733
],
[
"age",
-7.025595740271836
],
[
"▁vi",
-7.027038997638414
],
[
"ous",
-7.028303776740474
],
[
"▁mu",
-7.029504161358892
],
[
"▁tra",
-7.038938213895344
],
[
"▁after",
-7.044631576412625
],
[
"▁first",
-7.056126055402661
],
[
"lu",
-7.062308639897436
],
[
"▁been",
-7.064865376182913
],
[
"▁comp",
-7.066245780002827
],
[
"▁fe",
-7.079193763985936
],
[
"ity",
-7.080227628541172
],
[
"ial",
-7.080810993511831
],
[
"hi",
-7.084277518187218
],
[
"vo",
-7.093572325435632
],
[
"▁school",
-7.09957083097669
],
[
"▁ar",
-7.105511095191675
],
[
"▁fl",
-7.1078982175516074
],
[
"▁their",
-7.1150041203253735
],
[
"ance",
-7.1176754041850945
],
[
"▁cl",
-7.118630542416138
],
[
"▁year",
-7.120028089606139
],
[
"ition",
-7.124910466647542
],
[
"▁its",
-7.1355199302022445
],
[
"ical",
-7.1428101778764095
],
[
"▁work",
-7.144270629429425
],
[
"ence",
-7.144764437238724
],
[
"▁other",
-7.147743511062384
],
[
"▁str",
-7.15178771128703
],
[
"▁car",
-7.160388944412565
],
[
"min",
-7.166712819426699
],
[
"▁some",
-7.172070182565264
],
[
"▁time",
-7.18110890535751
],
[
"ture",
-7.186357992243128
],
[
"j",
-7.190662353563619
],
[
"tri",
-7.198171903624486
],
[
"▁mar",
-7.202127789306614
],
[
"▁sc",
-7.209850737464274
],
[
"▁pri",
-7.2180317110096865
],
[
"ard",
-7.221767758579366
],
[
"ill",
-7.235003472921441
],
[
"tro",
-7.243444688167287
],
[
"ary",
-7.243461404640545
],
[
"port",
-7.281185317840659
],
[
"cu",
-7.287158066523228
],
[
"▁man",
-7.291792484051358
],
[
"ell",
-7.291801906698197
],
[
"day",
-7.301005768455026
],
[
"▁ru",
-7.303300816069839
],
[
"▁up",
-7.308648049018128
],
[
"▁bi",
-7.320100963035362
],
[
"▁count",
-7.320844720582148
],
[
"ugh",
-7.325578420982355
],
[
"way",
-7.325586345677872
],
[
"▁these",
-7.326853317808698
],
[
"▁but",
-7.327670200652525
],
[
"▁most",
-7.345183341289596
],
[
"▁later",
-7.35122830418946
],
[
"▁pe",
-7.357512849956709
],
[
"▁bro",
-7.366609334491269
],
[
"▁fu",
-7.368805431506818
],
[
"▁many",
-7.373823250648902
],
[
"▁va",
-7.375833538931573
],
[
"▁out",
-7.381536894704848
],
[
"▁gra",
-7.384781529137044
],
[
"▁him",
-7.393000388890625
],
[
"able",
-7.403376105408167
],
[
"rie",
-7.408729531470341
],
[
"▁three",
-7.42606380204095
],
[
"▁town",
-7.427183101702385
],
[
"▁used",
-7.429946275883083
],
[
"cent",
-7.430185019770578
],
[
"came",
-7.4366302808378
],
[
"pla",
-7.441968222063206
],
[
"▁pi",
-7.443664632996766
],
[
"▁ju",
-7.4494340210691234
],
[
"▁el",
-7.459397493055167
],
[
"▁high",
-7.462373722776693
],
[
"▁sto",
-7.4846231391256595
],
[
"▁however",
-7.485133824745827
],
[
"ctor",
-7.487695780691876
],
[
"▁jo",
-7.495143111156208
],
[
"▁ja",
-7.525092312225434
],
[
"▁city",
-7.534486590732598
],
[
"▁hu",
-7.535640061380375
],
[
"gre",
-7.537763191129182
],
[
"ship",
-7.541985965754442
],
[
"▁known",
-7.560567236477107
],
[
"▁state",
-7.569655102775105
],
[
"▁ri",
-7.587086252535897
],
[
"ward",
-7.590294044961327
],
[
"produc",
-7.599503988032209
],
[
"▁into",
-7.600409279475954
],
[
"▁over",
-7.606763657533852
],
[
"▁will",
-7.609507506403235
],
[
"▁born",
-7.625680579045742
],
[
"▁pu",
-7.634136634897592
],
[
"▁both",
-7.638833495872163
],
[
"▁north",
-7.640978527524354
],
[
"▁acc",
-7.649090219623101
],
[
"▁bri",
-7.650634774682942
],
[
"▁several",
-7.65607141655474
],
[
"line",
-7.674195552978695
],
[
"▁du",
-7.674919488957654
],
[
"during",
-7.67722606134674
],
[
"▁south",
-7.677812087373661
],
[
"▁act",
-7.711595022299397
],
[
"▁include",
-7.712805810284968
],
[
"▁call",
-7.720985117787054
],
[
"▁fo",
-7.725039496806479
],
[
"ign",
-7.728179065376409
],
[
"▁through",
-7.7453947359728765
],
[
"▁four",
-7.76434808542699
],
[
"▁found",
-7.797665609419493
],
[
"▁large",
-7.798877478656337
],
[
"▁film",
-7.805622653996474
],
[
"▁under",
-7.819562251260857
],
[
"▁would",
-7.820400125605291
],
[
"▁who",
-7.831715157199552
],
[
"▁located",
-7.8341538978384335
],
[
"▁follow",
-7.839028375712218
],
[
"▁serve",
-7.846505271839957
],
[
"▁music",
-7.876533841393858
],
[
"▁member",
-7.887007719205323
],
[
"▁made",
-7.892064035760038
],
[
"▁game",
-7.907399306900546
],
[
"▁when",
-7.919178671382307
],
[
"▁team",
-7.9396147754433954
],
[
"▁current",
-7.9438600228097425
],
[
"▁second",
-8.00484816543743
],
[
"▁each",
-8.006456901441448
],
[
"▁university",
-8.007102176509145
],
[
"▁people",
-8.01003188400972
],
[
"▁album",
-8.031629388314958
],
[
"▁group",
-8.035099266998278
],
[
"▁which",
-8.043699536635295
],
[
"▁very",
-8.04658253705172
],
[
"▁park",
-8.052210607923111
],
[
"▁remain",
-8.060677347794147
],
[
"▁record",
-8.060683068624387
],
[
"▁house",
-8.062960860388575
],
[
"▁plan",
-8.063101760194963
],
[
"▁about",
-8.066841585630506
],
[
"▁cri",
-8.07267117759941
],
[
"▁appear",
-8.10014737460207
],
[
"▁opera",
-8.10838607600883
],
[
"▁number",
-8.123578901880322
],
[
"▁children",
-8.126500399957528
],
[
"▁bra",
-8.13205521993703
],
[
"▁small",
-8.146106911164622
],
[
"▁place",
-8.14671201920939
],
[
"▁family",
-8.156412204279727
],
[
"▁world",
-8.161354359029843
],
[
"▁take",
-8.166237382028916
],
[
"self",
-8.168209060426628
],
[
"▁public",
-8.193281029714244
],
[
"▁still",
-8.197891609099521
],
[
"▁what",
-8.198255226895526
],
[
"▁old",
-8.200644872829253
],
[
"▁local",
-8.202687765772456
],
[
"▁national",
-8.206761519181546
],
[
"field",
-8.208413299368788
],
[
"’",
-8.210447064991602
],
[
"▁same",
-8.221385696641576
],
[
"▁east",
-8.227046672948772
],
[
"▁village",
-8.240148070680647
],
[
"▁college",
-8.24099685302726
],
[
"▁general",
-8.243282309916683
],
[
"▁release",
-8.252196761643082
],
[
"▁feature",
-8.255292351555504
],
[
"▁said",
-8.274305489790384
],
[
"▁service",
-8.275617917046828
],
[
"▁develop",
-8.278382172206454
],
[
"▁america",
-8.28866665312544
],
[
"▁perform",
-8.292972514805687
],
[
"▁system",
-8.297304797335745
],
[
"▁district",
-8.302092271380655
],
[
"▁receive",
-8.314701158197984
],
[
"▁building",
-8.316371990568316
],
[
"wood",
-8.334157204402871
],
[
"▁major",
-8.339930648867668
],
[
"▁continue",
-8.354719374662293
],
[
"▁before",
-8.355901431136036
],
[
"▁church",
-8.373864752064762
],
[
"cause",
-8.374994523725174
],
[
"▁different",
-8.378677909559569
],
[
"▁consider",
-8.379644319096371
],
[
"▁return",
-8.398236208111976
],
[
"▁married",
-8.403561309816
],
[
"▁between",
-8.405165503565001
],
[
"▁office",
-8.432822246570016
],
[
"▁while",
-8.439216722942351
],
[
"▁black",
-8.455245051296673
],
[
"▁white",
-8.456305352690476
],
[
"▁where",
-8.489684984231863
],
[
"▁success",
-8.508232115728687
],
[
"▁community",
-8.58451658089278
],
[
"▁popular",
-8.618055276521458
],
[
"▁government",
-8.626466578878379
],
[
"ground",
-8.632522706351363
],
[
"▁publish",
-8.642585807118772
],
[
"▁around",
-8.65245225770258
],
[
"▁character",
-8.67372648620668
],
[
"▁daughter",
-8.690311524289319
],
[
"▁studie",
-8.701632852931988
],
[
"▁student",
-8.712422518543383
],
[
"▁language",
-8.721007489965347
],
[
"▁written",
-8.732011646611262
],
[
"▁english",
-8.735706145750054
],
[
"▁availabl",
-8.76611508104501
],
[
"▁council",
-8.783614175792328
],
[
"▁represent",
-8.81992241306137
],
[
"against",
-8.828021082322095
],
[
"▁international",
-8.828747745472203
],
[
"▁president",
-8.830966865525369
],
[
"▁business",
-8.83133702015181
],
[
"▁similar",
-8.861789608880173
],
[
"▁important",
-8.873694541232751
],
[
"▁football",
-8.889268058772672
],
[
"▁project",
-8.924414574369045
],
[
"▁describe",
-8.988608347920229
],
[
"▁construct",
-9.021653077852411
],
[
"▁australia",
-9.04109987847347
],
[
"▁effect",
-9.04247092997062
],
[
"▁subsequent",
-9.116983546757709
],
[
"▁california",
-9.175316509902466
],
[
"▁independen",
-9.19749858554971
],
[
"▁establish",
-9.292088040742566
],
[
"”",
-9.573625240210962
],
[
"“",
-9.591679104840328
],
[
"‘",
-9.76603248107811
],
[
"q",
-10.074918108580825
],
[
"—",
-10.303615482492846
],
[
"é",
-11.36889054536739
],
[
"–",
-11.648021551968636
],
[
"ü",
-11.718563368388722
],
[
"ä",
-12.28400621656532
],
[
"ö",
-12.491192913680244
],
[
"á",
-12.629873544080864
],
[
"í",
-12.771700006335152
],
[
"ó",
-12.9370180637035
],
[
"ç",
-13.31475682611221
],
[
"ß",
-13.533785779279093
],
[
"â",
-13.533785779279093
],
[
"à",
-13.57545244594576
],
[
"ō",
-13.929014963803612
],
[
"ú",
-13.991514963803615
],
[
"ô",
-14.28986661215544
],
[
"ï",
-14.3807757030648
],
[
"…",
-14.716886814188262
],
[
"ã",
-14.716886814188262
],
[
"ł",
-14.716886814188264
],
[
"ë",
-14.859743957085366
],
[
"ø",
-15.026410623712072
],
[
"č",
-15.026410623712072
],
[
"ć",
-15.026410623752032
],
[
"ă",
-15.226410623712075
],
[
"´",
-15.476410623712074
],
[
"š",
-15.476410623712075
],
[
"î",
-15.476410623752033
],
[
"ā",
-15.809743957045407
],
[
"ș",
-15.809743957045407
],
[
"ò",
-15.809743957085365
],
[
"û",
-15.809743957085365
],
[
"]",
-15.809743957085365
],
[
"ž",
-15.809743957085365
],
[
"ş",
-16.30974395708536
],
[
"ʻ",
-16.309743957085367
],
[
"ř",
-16.309743957085367
],
[
"α",
-16.309743957085367
],
[
"ı",
-16.309743957085367
],
[
"å",
-16.309743957085367
],
[
"»",
-16.309743957085367
],
[
"·",
-17.308943957085365
],
[
"ñ",
-17.30904395708537
],
[
"[",
-17.309143957085368
],
[
"œ",
-17.309243957085368
],
[
"ê",
-17.309343957085368
],
[
"ū",
-17.309443957085367
],
[
"«",
-17.309543957085367
],
[
"è",
-17.309643957085367
],
[
"ả",
-17.309743957045416
],
[
"尚",
-17.30974395708534
],
[
"先",
-17.30974395708534
],
[
"ạ",
-17.30974395708536
],
[
"π",
-17.309743957085367
],
[
"都",
-17.309743957085367
],
[
"大",
-17.309743957085367
],
[
"€",
-17.309743957085367
],
[
"奔",
-17.309743957085367
],
[
"נ",
-17.309743957085367
],
[
"ň",
-17.309743957085367
],
[
"ő",
-17.309743957085367
],
[
"„",
-17.309743957085367
],
[
"ð",
-17.309743957085367
],
[
"ị",
-17.309743957085367
],
[
"熊",
-17.309743957085367
],
[
"阪",
-17.309743957085367
],
[
"生",
-17.309743957085367
],
[
"京",
-17.309743957085367
],
[
"ý",
-17.309743957085367
],
[
"а",
-17.309743957085367
],
[
"¡",
-17.309743957085367
],
[
"ń",
-17.309743957085367
],
[
"χ",
-17.309743957085367
],
[
"时",
-17.309743957085367
],
[
"→",
-17.309743957085367
],
[
"ī",
-17.309743957085367
],
[
"ע",
-17.309743957085367
]
],
"byte_fallback": false
}
}