onlydj96 committed on
Commit
0baa47d
·
1 Parent(s): 5ec9643

add tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +4 -2
  2. vocab.txt +106 -104
tokenizer_config.json CHANGED
@@ -3,11 +3,13 @@
3
  "do_basic_tokenize": true,
4
  "do_lower_case": true,
5
  "mask_token": "[MASK]",
6
- "name_or_path": "vocab.txt",
 
7
  "never_split": null,
8
  "pad_token": "[PAD]",
9
  "sep_token": "[SEP]",
10
- "strip_accents": null,
 
11
  "tokenize_chinese_chars": true,
12
  "tokenizer_class": "BertTokenizer",
13
  "unk_token": "[UNK]"
 
3
  "do_basic_tokenize": true,
4
  "do_lower_case": true,
5
  "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "name_or_path": "kykim/bert-kor-base",
8
  "never_split": null,
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
+ "special_tokens_map_file": null,
12
+ "strip_accents": false,
13
  "tokenize_chinese_chars": true,
14
  "tokenizer_class": "BertTokenizer",
15
  "unk_token": "[UNK]"
vocab.txt CHANGED
@@ -3,109 +3,111 @@
3
  [CLS]
4
  [SEP]
5
  [MASK]
6
- 자릿수
7
- 가압류
8
- 공채
9
- 과태료
10
- 구매자
11
- 등록원부
12
- ##등록원부
13
- 근저당권
14
- 말소
15
- 자동차등록증
16
- 대리인
17
- 신분증
18
- 지참
19
- 개인형
20
- 등본
21
- 등기부등본
22
- ##등기부등본
23
- 발급처
24
- 범칙금
25
- 계약서
26
- ##계약서
27
- 명의변경
28
- 미납
29
- 미인식
30
- 민증
31
- 번호판
32
- 사용인감
33
- 인감
34
- 인감도장
35
- ##증명서
36
- 증명서
37
- 동의서
38
- ##동의서
39
- ##포기각서
40
- 세금계산서
41
- 소유권
42
- 압류
43
- 저당
44
- 잔금
45
- 지급일
46
- ##지급일
47
- 주민등록증
48
- 주민등록번호
49
- 주행거리
50
- 소유자
51
- 카드결제
52
- 취득세
53
- 취등록세
54
- 폐차
55
- ##기관에서
56
- ##자릿수
57
- 등록증
58
- ##범칙금
59
- ##지연
60
- ##몇시
61
- ##문장
62
- 사본
63
- 상담원
64
- ##이전
65
- 복사본
66
- 상속인
67
- 상속포기서
68
- 양수인
69
- 저공해
70
- 어쩌구
71
- 저쩌구
72
- 국적
73
- 운전면허증
74
- 위임장
75
- 원부
76
- 인감증명서
77
- 기입
78
- 개명
79
- 아니요
80
- 등록세
81
- 여보세요
82
- 상위권
83
- 알겠습니다
84
- AS
85
- 양도인
86
- 양도증
87
- 주소지
88
- 규명
89
- 매수인
90
- 준비서류
91
- 매수자
92
- 재발급
93
- 승계
94
- 공매
95
- 초상권
96
- 증여
97
- 영치
98
- 재교부
99
- 사업소
100
- 상담사
101
- 영업용
102
- 수기
103
- 상담자
104
- 전라남도
105
- 차체
106
- 취등록
107
- ##한테서
108
- 양수
 
 
109
  [unused105]
110
  [unused106]
111
  [unused107]
@@ -28305,7 +28307,7 @@ can
28305
  이의
28306
  오피스텔
28307
  가는데
28308
- 가슴을납
28309
  수분감이
28310
  우엉
28311
  ##범죄
 
3
  [CLS]
4
  [SEP]
5
  [MASK]
6
+ [unused0]
7
+ [unused1]
8
+ [unused2]
9
+ [unused3]
10
+ [unused4]
11
+ [unused5]
12
+ [unused6]
13
+ [unused7]
14
+ [unused8]
15
+ [unused9]
16
+ [unused10]
17
+ [unused11]
18
+ [unused12]
19
+ [unused13]
20
+ [unused14]
21
+ [unused15]
22
+ [unused16]
23
+ [unused17]
24
+ [unused18]
25
+ [unused19]
26
+ [unused20]
27
+ [unused21]
28
+ [unused22]
29
+ [unused23]
30
+ [unused24]
31
+ [unused25]
32
+ [unused26]
33
+ [unused27]
34
+ [unused28]
35
+ [unused29]
36
+ [unused30]
37
+ [unused31]
38
+ [unused32]
39
+ [unused33]
40
+ [unused34]
41
+ [unused35]
42
+ [unused36]
43
+ [unused37]
44
+ [unused38]
45
+ [unused39]
46
+ [unused40]
47
+ [unused41]
48
+ [unused42]
49
+ [unused43]
50
+ [unused44]
51
+ [unused45]
52
+ [unused46]
53
+ [unused47]
54
+ [unused48]
55
+ [unused49]
56
+ [unused50]
57
+ [unused51]
58
+ [unused52]
59
+ [unused53]
60
+ [unused54]
61
+ [unused55]
62
+ [unused56]
63
+ [unused57]
64
+ [unused58]
65
+ [unused59]
66
+ [unused60]
67
+ [unused61]
68
+ [unused62]
69
+ [unused63]
70
+ [unused64]
71
+ [unused65]
72
+ [unused66]
73
+ [unused67]
74
+ [unused68]
75
+ [unused69]
76
+ [unused70]
77
+ [unused71]
78
+ [unused72]
79
+ [unused73]
80
+ [unused74]
81
+ [unused75]
82
+ [unused76]
83
+ [unused77]
84
+ [unused78]
85
+ [unused79]
86
+ [unused80]
87
+ [unused81]
88
+ [unused82]
89
+ [unused83]
90
+ [unused84]
91
+ [unused85]
92
+ [unused86]
93
+ [unused87]
94
+ [unused88]
95
+ [unused89]
96
+ [unused90]
97
+ [unused91]
98
+ [unused92]
99
+ [unused93]
100
+ [unused94]
101
+ [unused95]
102
+ [unused96]
103
+ [unused97]
104
+ [unused98]
105
+ [unused99]
106
+ [unused100]
107
+ [unused101]
108
+ [unused102]
109
+ [unused103]
110
+ [unused104]
111
  [unused105]
112
  [unused106]
113
  [unused107]
 
28307
  이의
28308
  오피스텔
28309
  가는데
28310
+ 가슴을
28311
  수분감이
28312
  우엉
28313
  ##범죄