iDSLR committed on
Commit
10c6436
·
verified ·
1 Parent(s): fb3b4fc

사용자 정의 특수 토큰 제거: 해당 토큰을 학습하는 것은 매우 비효율적.

Browse files
added_tokens.json CHANGED
@@ -1,14 +1,6 @@
1
  {
2
- "$~bos$": 50257,
3
- "$~dev$": 50260,
4
- "$~eos$": 50256,
5
- "$~func-continue$": 50263,
6
- "$~func-time$": 50262,
7
- "$~info$": 50264,
8
- "$~me$": 50265,
9
- "$~pad$": 50258,
10
- "$~somebody$": 50266,
11
- "$~tfot$": 50261,
12
- "$~time$": 50267,
13
- "$~unk$": 50259
14
  }
 
1
  {
2
+ "$~^bos$": 50257,
3
+ "$~^eos$": 50256,
4
+ "$~^pad$": 50258,
5
+ "$~^unk$": 50259
 
 
 
 
 
 
 
 
6
  }
special_tokens_map.json CHANGED
@@ -1,85 +1,27 @@
1
  {
2
- "additional_special_tokens": [
3
- {
4
- "content": "$~dev$",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "$~tfot$",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- {
18
- "content": "$~func-time$",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- {
25
- "content": "$~func-continue$",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- {
32
- "content": "$~info$",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false
37
- },
38
- {
39
- "content": "$~me$",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": false
44
- },
45
- {
46
- "content": "$~somebody$",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false
51
- },
52
- {
53
- "content": "$~time$",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": false,
57
- "single_word": false
58
- }
59
- ],
60
  "bos_token": {
61
- "content": "$~bos$",
62
  "lstrip": false,
63
  "normalized": false,
64
  "rstrip": false,
65
  "single_word": false
66
  },
67
  "eos_token": {
68
- "content": "$~eos$",
69
  "lstrip": false,
70
  "normalized": true,
71
  "rstrip": false,
72
  "single_word": false
73
  },
74
  "pad_token": {
75
- "content": "$~pad$",
76
  "lstrip": false,
77
  "normalized": false,
78
  "rstrip": false,
79
  "single_word": false
80
  },
81
  "unk_token": {
82
- "content": "$~unk$",
83
  "lstrip": false,
84
  "normalized": false,
85
  "rstrip": false,
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
+ "content": "$~^bos$",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "$~^eos$",
11
  "lstrip": false,
12
  "normalized": true,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "$~^pad$",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
21
  "single_word": false
22
  },
23
  "unk_token": {
24
+ "content": "$~^unk$",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
5
  "added_tokens": [
6
  {
7
  "id": 50256,
8
- "content": "$~eos$",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "id": 50257,
17
- "content": "$~bos$",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
@@ -23,7 +23,7 @@
23
  },
24
  {
25
  "id": 50258,
26
- "content": "$~pad$",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
@@ -32,79 +32,7 @@
32
  },
33
  {
34
  "id": 50259,
35
- "content": "$~unk$",
36
- "single_word": false,
37
- "lstrip": false,
38
- "rstrip": false,
39
- "normalized": false,
40
- "special": true
41
- },
42
- {
43
- "id": 50260,
44
- "content": "$~dev$",
45
- "single_word": false,
46
- "lstrip": false,
47
- "rstrip": false,
48
- "normalized": false,
49
- "special": true
50
- },
51
- {
52
- "id": 50261,
53
- "content": "$~tfot$",
54
- "single_word": false,
55
- "lstrip": false,
56
- "rstrip": false,
57
- "normalized": false,
58
- "special": true
59
- },
60
- {
61
- "id": 50262,
62
- "content": "$~func-time$",
63
- "single_word": false,
64
- "lstrip": false,
65
- "rstrip": false,
66
- "normalized": false,
67
- "special": true
68
- },
69
- {
70
- "id": 50263,
71
- "content": "$~func-continue$",
72
- "single_word": false,
73
- "lstrip": false,
74
- "rstrip": false,
75
- "normalized": false,
76
- "special": true
77
- },
78
- {
79
- "id": 50264,
80
- "content": "$~info$",
81
- "single_word": false,
82
- "lstrip": false,
83
- "rstrip": false,
84
- "normalized": false,
85
- "special": true
86
- },
87
- {
88
- "id": 50265,
89
- "content": "$~me$",
90
- "single_word": false,
91
- "lstrip": false,
92
- "rstrip": false,
93
- "normalized": false,
94
- "special": true
95
- },
96
- {
97
- "id": 50266,
98
- "content": "$~somebody$",
99
- "single_word": false,
100
- "lstrip": false,
101
- "rstrip": false,
102
- "normalized": false,
103
- "special": true
104
- },
105
- {
106
- "id": 50267,
107
- "content": "$~time$",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
5
  "added_tokens": [
6
  {
7
  "id": 50256,
8
+ "content": "$~^eos$",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
 
14
  },
15
  {
16
  "id": 50257,
17
+ "content": "$~^bos$",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
 
23
  },
24
  {
25
  "id": 50258,
26
+ "content": "$~^pad$",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
 
32
  },
33
  {
34
  "id": 50259,
35
+ "content": "$~^unk$",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
  "50256": {
6
- "content": "$~eos$",
7
  "lstrip": false,
8
  "normalized": true,
9
  "rstrip": false,
@@ -11,7 +11,7 @@
11
  "special": true
12
  },
13
  "50257": {
14
- "content": "$~bos$",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
@@ -19,7 +19,7 @@
19
  "special": true
20
  },
21
  "50258": {
22
- "content": "$~pad$",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
@@ -27,71 +27,7 @@
27
  "special": true
28
  },
29
  "50259": {
30
- "content": "$~unk$",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "50260": {
38
- "content": "$~dev$",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "50261": {
46
- "content": "$~tfot$",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "50262": {
54
- "content": "$~func-time$",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "50263": {
62
- "content": "$~func-continue$",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "50264": {
70
- "content": "$~info$",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "50265": {
78
- "content": "$~me$",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "50266": {
86
- "content": "$~somebody$",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "50267": {
94
- "content": "$~time$",
95
  "lstrip": false,
96
  "normalized": false,
97
  "rstrip": false,
@@ -99,22 +35,12 @@
99
  "special": true
100
  }
101
  },
102
- "additional_special_tokens": [
103
- "$~dev$",
104
- "$~tfot$",
105
- "$~func-time$",
106
- "$~func-continue$",
107
- "$~info$",
108
- "$~me$",
109
- "$~somebody$",
110
- "$~time$"
111
- ],
112
- "bos_token": "$~bos$",
113
  "clean_up_tokenization_spaces": true,
114
- "eos_token": "$~eos$",
115
  "errors": "replace",
116
  "model_max_length": 8192,
117
- "pad_token": "$~pad$",
118
  "tokenizer_class": "GPT2Tokenizer",
119
- "unk_token": "$~unk$"
120
  }
 
3
  "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
  "50256": {
6
+ "content": "$~^eos$",
7
  "lstrip": false,
8
  "normalized": true,
9
  "rstrip": false,
 
11
  "special": true
12
  },
13
  "50257": {
14
+ "content": "$~^bos$",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
 
19
  "special": true
20
  },
21
  "50258": {
22
+ "content": "$~^pad$",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
 
27
  "special": true
28
  },
29
  "50259": {
30
+ "content": "$~^unk$",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
 
35
  "special": true
36
  }
37
  },
38
+ "bos_token": "$~^bos$",
 
 
 
 
 
 
 
 
 
 
39
  "clean_up_tokenization_spaces": true,
40
+ "eos_token": "$~^eos$",
41
  "errors": "replace",
42
  "model_max_length": 8192,
43
+ "pad_token": "$~^pad$",
44
  "tokenizer_class": "GPT2Tokenizer",
45
+ "unk_token": "$~^unk$"
46
  }