gsaltintas committed on
Commit
8d2b461
·
verified ·
1 Parent(s): 7fef727

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. vocab.json +88 -88
README.md CHANGED
@@ -19,7 +19,7 @@ A **Byte-Level BPE** tokenizer trained on **numeric** data from Fineweb-2-HQ.
19
  |-----------|-------|
20
  | Algorithm | Byte-Level BPE |
21
  | Language | `numeric` |
22
- | Target Vocab Size | 107 |
23
  | Final Vocab Size | 107 |
24
  | Pre-tokenizer | byte_level |
25
  | Number handling | ltr_2digit |
 
19
  |-----------|-------|
20
  | Algorithm | Byte-Level BPE |
21
  | Language | `numeric` |
22
+ | Target Vocab Size | 116 |
23
  | Final Vocab Size | 107 |
24
  | Pre-tokenizer | byte_level |
25
  | Number handling | ltr_2digit |
vocab.json CHANGED
@@ -1,109 +1,109 @@
1
  {
2
- "22": 29,
3
- "99": 106,
4
- "48": 55,
5
  "52": 59,
6
- "41": 48,
7
  "15": 22,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "34": 41,
 
 
 
9
  "50": 57,
 
 
 
 
10
  "67": 74,
11
- "5": 12,
12
- "72": 79,
 
 
 
 
 
 
 
 
 
13
  "95": 102,
 
 
 
 
 
 
 
 
 
 
 
 
14
  "85": 92,
15
- "65": 72,
 
16
  "78": 85,
17
- "73": 80,
18
- "62": 69,
19
- "51": 58,
20
- "74": 81,
21
- "16": 23,
22
- "39": 46,
23
- "12": 19,
24
- "</s>": 2,
25
  "47": 54,
26
- "75": 82,
27
- "46": 53,
28
- "4": 11,
29
- "64": 71,
30
- "40": 47,
31
- "36": 43,
32
- "35": 42,
33
- "19": 26,
34
- "9": 16,
35
- "38": 45,
36
- "69": 76,
37
  "28": 35,
38
- "45": 52,
 
 
 
39
  "90": 97,
40
- "94": 101,
41
- "86": 93,
42
- "98": 105,
43
- "21": 28,
44
- "<s>": 1,
45
- "54": 61,
46
- "77": 84,
47
  "<pad>": 3,
48
- "24": 31,
49
- "7": 14,
50
- "43": 50,
51
- "44": 51,
52
- "mod": 4,
53
- "61": 68,
54
- "60": 67,
55
- "81": 88,
56
- "11": 18,
57
- "17": 24,
58
- "<unk>": 0,
59
- "92": 99,
60
- "31": 38,
61
  "23": 30,
62
- "96": 103,
63
- "68": 75,
64
- "42": 49,
65
- "93": 100,
66
- "97": 104,
67
- "32": 39,
68
- "=": 5,
69
- "20": 27,
70
- "13": 20,
71
  "1": 8,
72
- "10": 17,
73
- "56": 63,
74
- "26": 33,
75
- "3": 10,
76
- "79": 86,
77
- "0": 7,
78
- "83": 90,
79
- "82": 89,
80
- "87": 94,
81
- "8": 15,
82
- "53": 60,
83
- "29": 36,
84
- "80": 87,
85
- "91": 98,
86
  "49": 56,
87
- "30": 37,
 
 
 
 
 
 
 
88
  "6": 13,
89
- "59": 66,
90
  "14": 21,
91
- "66": 73,
92
- "55": 62,
93
- "25": 32,
94
- "58": 65,
95
- "27": 34,
96
- "76": 83,
97
- "89": 96,
98
- "63": 70,
99
- " ": 6,
100
  "57": 64,
101
- "37": 44,
102
- "33": 40,
103
- "88": 95,
104
- "18": 25,
105
- "71": 78,
106
  "84": 91,
107
- "2": 9,
108
- "70": 77
 
 
 
 
109
  }
 
1
  {
 
 
 
2
  "52": 59,
3
+ "66": 73,
4
  "15": 22,
5
+ "<s>": 1,
6
+ "62": 69,
7
+ "68": 75,
8
+ "29": 36,
9
+ "8": 15,
10
+ "82": 89,
11
+ "26": 33,
12
+ "61": 68,
13
+ "17": 24,
14
+ "91": 98,
15
+ "2": 9,
16
+ "9": 16,
17
+ "5": 12,
18
+ "21": 28,
19
+ "94": 101,
20
+ "30": 37,
21
+ "42": 49,
22
+ "38": 45,
23
+ "31": 38,
24
+ "45": 52,
25
+ "53": 60,
26
+ "40": 47,
27
  "34": 41,
28
+ "mod": 4,
29
+ "3": 10,
30
+ "33": 40,
31
  "50": 57,
32
+ "51": 58,
33
+ "<unk>": 0,
34
+ "83": 90,
35
+ "11": 18,
36
  "67": 74,
37
+ "24": 31,
38
+ "59": 66,
39
+ "76": 83,
40
+ "56": 63,
41
+ "18": 25,
42
+ "79": 86,
43
+ "99": 106,
44
+ "81": 88,
45
+ "92": 99,
46
+ "13": 20,
47
+ "69": 76,
48
  "95": 102,
49
+ "96": 103,
50
+ "64": 71,
51
+ "19": 26,
52
+ "98": 105,
53
+ "4": 11,
54
+ "97": 104,
55
+ "54": 61,
56
+ "60": 67,
57
+ "71": 78,
58
+ "86": 93,
59
+ "87": 94,
60
+ "16": 23,
61
  "85": 92,
62
+ "93": 100,
63
+ "48": 55,
64
  "78": 85,
 
 
 
 
 
 
 
 
65
  "47": 54,
66
+ "74": 81,
67
+ "27": 34,
68
+ "0": 7,
 
 
 
 
 
 
 
 
69
  "28": 35,
70
+ "25": 32,
71
+ "7": 14,
72
+ "10": 17,
73
+ "72": 79,
74
  "90": 97,
75
+ "12": 19,
 
 
 
 
 
 
76
  "<pad>": 3,
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  "23": 30,
78
+ "41": 48,
79
+ "75": 82,
80
+ "88": 95,
 
 
 
 
 
 
81
  "1": 8,
82
+ "65": 72,
83
+ "35": 42,
84
+ "55": 62,
85
+ "32": 39,
 
 
 
 
 
 
 
 
 
 
86
  "49": 56,
87
+ "37": 44,
88
+ "44": 51,
89
+ "22": 29,
90
+ "39": 46,
91
+ "</s>": 2,
92
+ "63": 70,
93
+ "77": 84,
94
+ "58": 65,
95
  "6": 13,
96
+ "=": 5,
97
  "14": 21,
98
+ "70": 77,
99
+ "20": 27,
100
+ "36": 43,
 
 
 
 
 
 
101
  "57": 64,
 
 
 
 
 
102
  "84": 91,
103
+ "89": 96,
104
+ "43": 50,
105
+ " ": 6,
106
+ "80": 87,
107
+ "73": 80,
108
+ "46": 53
109
  }