diff --git "a/tokenizer.json" "b/tokenizer.json" --- "a/tokenizer.json" +++ "b/tokenizer.json" @@ -2,18 +2,11 @@ "version": "1.0", "truncation": { "direction": "Right", - "max_length": 64, + "max_length": 768, "strategy": "LongestFirst", "stride": 0 }, - "padding": { - "strategy": "BatchLongest", - "direction": "Left", - "pad_to_multiple_of": null, - "pad_id": 0, - "pad_type_id": 0, - "pad_token": "" - }, + "padding": null, "added_tokens": [ { "id": 0, @@ -21,7 +14,7 @@ "single_word": false, "lstrip": false, "rstrip": false, - "normalized": true, + "normalized": false, "special": true }, { @@ -30,7 +23,7 @@ "single_word": false, "lstrip": false, "rstrip": false, - "normalized": true, + "normalized": false, "special": true }, { @@ -39,8 +32,27080 @@ "single_word": false, "lstrip": false, "rstrip": false, - "normalized": true, + "normalized": false, "special": true + }, + { + "id": 32000, + "content": "kanggo", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32001, + "content": "bisa", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32002, + "content": "saka", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32003, + "content": "dina", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32004, + "content": "sareng", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32005, + "content": "bakal", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32006, + "content": "karo", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32007, + "content": "pikeun", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32008, + "content": "kuwi", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32009, + "content": "jeung", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32010, + "content": "wong", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32011, + "content": "paling", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32012, + "content": "wis", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32013, + "content": "utawa", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32014, + "content": "sampeyan", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32015, + "content": "dadi", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32016, + "content": "anjeun", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32017, + "content": "kang", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32018, + "content": "kanthi", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32019, + "content": "luwih", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32020, + "content": "mung", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32021, + "content": "taun", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32022, + "content": "minangka", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32023, + "content": "Aku", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32024, + "content": "tapi", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32025, + "content": "meuble", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32026, + "content": "nanging", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32027, + "content": "urang", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32028, + "content": "kita", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32029, + "content": "salah", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32030, + "content": "kudu", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32031, + "content": "bokep", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32032, + "content": "nganggo", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32033, + "content": "sarta", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32034, + "content": "teu", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32035, + "content": "tina", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32036, + "content": "tak", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32037, + "content": "apik", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32038, + "content": "siji", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32039, + "content": "ngentot", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32040, + "content": "nalika", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32041, + "content": "kaya", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32042, + "content": "jadi", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32043, + "content": "padha", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32044, + "content": "kana", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32045, + "content": "cara", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32046, + "content": "hiji", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": false + }, + { + "id": 32047, + "content": "/>