ctaguchi committed on
Commit
9db5ebe
·
verified ·
1 Parent(s): 10d85a9

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +17 -17
  2. vocab.json +92 -92
tokenizer_config.json CHANGED
@@ -1,38 +1,38 @@
1
  {
2
  "added_tokens_decoder": {
3
- "5": {
4
- "content": "ll",
5
  "lstrip": true,
6
  "normalized": false,
7
  "rstrip": true,
8
  "single_word": false,
9
  "special": false
10
  },
11
- "11": {
12
- "content": "zh",
13
  "lstrip": true,
14
  "normalized": false,
15
  "rstrip": true,
16
  "single_word": false,
17
  "special": false
18
  },
19
- "26": {
20
- "content": "sh",
21
  "lstrip": true,
22
  "normalized": false,
23
  "rstrip": true,
24
  "single_word": false,
25
  "special": false
26
  },
27
- "31": {
28
- "content": "nj",
29
  "lstrip": true,
30
  "normalized": false,
31
  "rstrip": true,
32
  "single_word": false,
33
  "special": false
34
  },
35
- "40": {
36
  "content": "th",
37
  "lstrip": true,
38
  "normalized": false,
@@ -40,32 +40,32 @@
40
  "single_word": false,
41
  "special": false
42
  },
43
- "44": {
44
- "content": "xh",
45
  "lstrip": true,
46
  "normalized": false,
47
  "rstrip": true,
48
  "single_word": false,
49
  "special": false
50
  },
51
- "48": {
52
- "content": "gj",
53
  "lstrip": true,
54
  "normalized": false,
55
  "rstrip": true,
56
  "single_word": false,
57
  "special": false
58
  },
59
- "61": {
60
- "content": "dh",
61
  "lstrip": true,
62
  "normalized": false,
63
  "rstrip": true,
64
  "single_word": false,
65
  "special": false
66
  },
67
- "75": {
68
- "content": "rr",
69
  "lstrip": true,
70
  "normalized": false,
71
  "rstrip": true,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "10": {
4
+ "content": "sh",
5
  "lstrip": true,
6
  "normalized": false,
7
  "rstrip": true,
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "15": {
12
+ "content": "dh",
13
  "lstrip": true,
14
  "normalized": false,
15
  "rstrip": true,
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "19": {
20
+ "content": "zh",
21
  "lstrip": true,
22
  "normalized": false,
23
  "rstrip": true,
24
  "single_word": false,
25
  "special": false
26
  },
27
+ "23": {
28
+ "content": "rr",
29
  "lstrip": true,
30
  "normalized": false,
31
  "rstrip": true,
32
  "single_word": false,
33
  "special": false
34
  },
35
+ "25": {
36
  "content": "th",
37
  "lstrip": true,
38
  "normalized": false,
 
40
  "single_word": false,
41
  "special": false
42
  },
43
+ "28": {
44
+ "content": "ll",
45
  "lstrip": true,
46
  "normalized": false,
47
  "rstrip": true,
48
  "single_word": false,
49
  "special": false
50
  },
51
+ "41": {
52
+ "content": "nj",
53
  "lstrip": true,
54
  "normalized": false,
55
  "rstrip": true,
56
  "single_word": false,
57
  "special": false
58
  },
59
+ "70": {
60
+ "content": "gj",
61
  "lstrip": true,
62
  "normalized": false,
63
  "rstrip": true,
64
  "single_word": false,
65
  "special": false
66
  },
67
+ "93": {
68
+ "content": "xh",
69
  "lstrip": true,
70
  "normalized": false,
71
  "rstrip": true,
vocab.json CHANGED
@@ -1,100 +1,100 @@
1
  {
2
  "aln": {
3
- "'": 66,
4
- "(": 88,
5
- ")": 12,
6
- "-": 51,
7
- ".": 90,
8
- "/": 78,
9
- "=": 9,
10
- "A": 67,
11
- "B": 83,
12
- "C": 33,
13
- "D": 13,
14
- "E": 64,
15
- "F": 25,
16
- "G": 84,
17
- "H": 72,
18
- "I": 0,
19
- "J": 49,
20
- "K": 91,
21
- "L": 73,
22
- "M": 65,
23
- "N": 1,
24
- "O": 42,
25
- "P": 15,
26
- "Q": 93,
27
- "R": 76,
28
- "S": 62,
29
- "T": 20,
30
- "U": 17,
31
- "V": 22,
32
- "W": 38,
33
- "X": 55,
34
- "Y": 18,
35
- "Z": 7,
36
- "[": 50,
37
  "[PAD]": 95,
38
  "[UNK]": 94,
39
- "a": 68,
40
- "b": 82,
41
- "c": 77,
42
- "d": 27,
43
- "dh": 61,
44
- "e": 87,
45
- "f": 80,
46
- "g": 92,
47
- "gj": 48,
48
- "h": 32,
49
- "i": 63,
50
- "j": 30,
51
- "k": 6,
52
- "l": 41,
53
- "ll": 5,
54
- "m": 59,
55
- "n": 2,
56
- "nj": 31,
57
- "o": 46,
58
- "p": 21,
59
- "q": 79,
60
- "r": 86,
61
- "rr": 75,
62
- "s": 37,
63
- "sh": 26,
64
- "t": 35,
65
- "th": 40,
66
  "u": 3,
67
- "v": 34,
68
- "w": 4,
69
- "x": 60,
70
- "xh": 44,
71
- "y": 24,
72
- "z": 71,
73
- "zh": 11,
74
- "{": 43,
75
- "|": 57,
76
- "}": 14,
77
- "Ε": 69,
78
- "Τ": 8,
79
- "ά": 23,
80
- "έ": 47,
81
- "ή": 53,
82
- "ί": 29,
83
- "α": 36,
84
- "ε": 16,
85
- "ζ": 89,
86
- "θ": 85,
87
- "κ": 10,
88
- "λ": 74,
89
- "ν": 81,
90
- "ο": 58,
91
- "π": 52,
92
- "ρ": 45,
93
- "ς": 70,
94
  "σ": 54,
95
- "τ": 56,
96
- "υ": 28,
97
- "φ": 19,
98
- "ω": 39
99
  }
100
  }
 
1
  {
2
  "aln": {
3
+ "'": 68,
4
+ "(": 38,
5
+ ")": 84,
6
+ "-": 9,
7
+ ".": 52,
8
+ "/": 1,
9
+ "=": 60,
10
+ "A": 59,
11
+ "B": 46,
12
+ "C": 90,
13
+ "D": 35,
14
+ "E": 82,
15
+ "F": 72,
16
+ "G": 79,
17
+ "H": 51,
18
+ "I": 24,
19
+ "J": 45,
20
+ "K": 71,
21
+ "L": 89,
22
+ "M": 58,
23
+ "N": 69,
24
+ "O": 57,
25
+ "P": 86,
26
+ "Q": 83,
27
+ "R": 6,
28
+ "S": 11,
29
+ "T": 81,
30
+ "U": 91,
31
+ "V": 50,
32
+ "W": 13,
33
+ "X": 75,
34
+ "Y": 26,
35
+ "Z": 64,
36
+ "[": 37,
37
  "[PAD]": 95,
38
  "[UNK]": 94,
39
+ "a": 0,
40
+ "b": 88,
41
+ "c": 87,
42
+ "d": 92,
43
+ "dh": 15,
44
+ "e": 30,
45
+ "f": 8,
46
+ "g": 80,
47
+ "gj": 70,
48
+ "h": 20,
49
+ "i": 43,
50
+ "j": 32,
51
+ "k": 65,
52
+ "l": 16,
53
+ "ll": 28,
54
+ "m": 67,
55
+ "n": 48,
56
+ "nj": 41,
57
+ "o": 55,
58
+ "p": 2,
59
+ "q": 18,
60
+ "r": 27,
61
+ "rr": 23,
62
+ "s": 42,
63
+ "sh": 10,
64
+ "t": 62,
65
+ "th": 25,
66
  "u": 3,
67
+ "v": 4,
68
+ "w": 5,
69
+ "x": 47,
70
+ "xh": 93,
71
+ "y": 44,
72
+ "z": 74,
73
+ "zh": 19,
74
+ "{": 7,
75
+ "|": 29,
76
+ "}": 61,
77
+ "Ε": 21,
78
+ "Τ": 17,
79
+ "ά": 56,
80
+ "έ": 40,
81
+ "ή": 73,
82
+ "ί": 31,
83
+ "α": 66,
84
+ "ε": 85,
85
+ "ζ": 78,
86
+ "θ": 12,
87
+ "κ": 14,
88
+ "λ": 76,
89
+ "ν": 49,
90
+ "ο": 63,
91
+ "π": 39,
92
+ "ρ": 22,
93
+ "ς": 53,
94
  "σ": 54,
95
+ "τ": 34,
96
+ "υ": 77,
97
+ "φ": 33,
98
+ "ω": 36
99
  }
100
  }