prismdata committed on
Commit
34811aa
·
verified ·
1 Parent(s): 3d81b89

Add tokenizer and update model

Browse files
README.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean GPT
2
+
3
+ 한국어 GPT 모델입니다.
4
+
5
+ ## 사용법
6
+ ```python
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+
9
+ tokenizer = AutoTokenizer.from_pretrained(
10
+ "oz1115/korean-gpt-quick-test",
11
+ trust_remote_code=True
12
+ )
13
+
14
+ model = AutoModelForCausalLM.from_pretrained(
15
+ "oz1115/korean-gpt-quick-test",
16
+ trust_remote_code=True
17
+ )
18
+
19
+ inputs = tokenizer("안녕하세요", return_tensors="pt")
20
+ outputs = model.generate(**inputs, max_length=50)
21
+ print(tokenizer.decode(outputs[0]))
22
+ ```
23
+
24
+ ## 모델 정보
25
+ - Vocabulary: 32,000
26
+ - Hidden Size: 256
27
+ - Layers: 4
28
+ - Attention Heads: 4
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "KoreanGPTModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 2,
7
+ "dtype": "float32",
8
+ "eos_token_id": 3,
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 256,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1024,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 256,
15
+ "model_type": "korean_gpt",
16
+ "num_attention_heads": 4,
17
+ "num_hidden_layers": 4,
18
+ "pad_token_id": 0,
19
+ "transformers_version": "4.57.3",
20
+ "use_cache": true,
21
+ "vocab_size": 32000,
22
+ "auto_map": {
23
+ "AutoModelForCausalLM": "modeling_korean_gpt.KoreanGPTModel",
24
+ "AutoTokenizer": [
25
+ "tokenization_korean_gpt.KoreanGPTTokenizer",
26
+ null
27
+ ]
28
+ }
29
+ }
korean_sp_32k.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3ed6db693f545bd90ef9fed4d6415a511a7f59ac168c7f5e757a75c06d704fa
3
+ size 836193
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd8b533a5c3e7b5af23c7587f0d2085543f8e0c0634610e7dc8dd2a3447ac7e0
3
+ size 46739615
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>",
5
+ "pad_token": "<pad>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "KoreanGPTTokenizer",
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ "tokenization_korean_gpt.KoreanGPTTokenizer",
6
+ null
7
+ ]
8
+ },
9
+ "model_max_length": 256,
10
+ "bos_token": "<s>",
11
+ "eos_token": "</s>",
12
+ "unk_token": "<unk>",
13
+ "pad_token": "<pad>"
14
+ }