Upload folder using huggingface_hub
Browse files- .gitattributes +1 -35
- README.md +72 -53
- config.json +12 -11
- model.py +118 -0
- model.safetensors +3 -0
- tokenizer_config.json +8 -0
.gitattributes
CHANGED
|
@@ -1,35 +1 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -9,6 +9,7 @@ tags:
|
|
| 9 |
- from-scratch
|
| 10 |
- korean
|
| 11 |
- gpt
|
|
|
|
| 12 |
model-index:
|
| 13 |
- name: SOVYN-85M
|
| 14 |
results:
|
|
@@ -23,88 +24,106 @@ model-index:
|
|
| 23 |
|
| 24 |
# SOVYN-85M
|
| 25 |
|
| 26 |
-
|
| 27 |
|
| 28 |
-
|
| 29 |
-
수학, 코딩, 논리, 과학 등 다양한 추론 문제를 단계별로 풀이합니다.
|
| 30 |
|
| 31 |
-
##
|
| 32 |
|
| 33 |
-
|
|
| 34 |
-
|---
|
| 35 |
-
|
|
| 36 |
-
|
|
| 37 |
-
|
|
| 38 |
-
|
|
| 39 |
-
|
|
| 40 |
-
|
|
| 41 |
-
|
|
| 42 |
-
|
|
|
|
|
| 43 |
|
| 44 |
-
## 학습
|
| 45 |
|
| 46 |
-
-
|
| 47 |
-
-
|
| 48 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
##
|
| 51 |
|
| 52 |
-
|
| 53 |
-
- Schedule: Cosine decay with warmup (500 steps)
|
| 54 |
-
- Batch: 16 × 4 grad_accum = effective 64
|
| 55 |
-
- Steps: 20,000
|
| 56 |
-
- Mixed Precision: AMP + GradScaler
|
| 57 |
-
- Hardware: NVIDIA RTX 5080 (16GB)
|
| 58 |
-
|
| 59 |
-
## 벤치마크 결과
|
| 60 |
|
| 61 |
| 카테고리 | 정확도 |
|
| 62 |
|---------|--------|
|
| 63 |
-
| 산술
|
| 64 |
-
| 코드
|
| 65 |
-
| 숫자
|
| 66 |
| 서술형 | 100% |
|
| 67 |
-
| 연산
|
| 68 |
-
| 리스트
|
| 69 |
-
| 괄호
|
| 70 |
| 방정식 | 80% |
|
| 71 |
| 논리 | 80% |
|
| 72 |
| 수열 | 33% |
|
| 73 |
-
| **전체** | **86.5%
|
| 74 |
|
| 75 |
## 사용법
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
```python
|
| 78 |
import torch
|
|
|
|
| 79 |
from tokenizers import Tokenizer
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
model.load_state_dict(state_dict)
|
| 88 |
model.eval()
|
| 89 |
|
| 90 |
-
|
| 91 |
-
tokenizer = Tokenizer.from_file("tokenizer.json")
|
| 92 |
|
| 93 |
# 추론
|
| 94 |
prompt = "문제: 3x + 7 = 22일 때, x의 값을 구하시오.\n풀이:\n"
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
print(result)
|
| 102 |
```
|
| 103 |
|
| 104 |
-
|
| 105 |
|
| 106 |
-
|
| 107 |
|
| 108 |
-
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
|
|
|
|
|
|
|
|
|
| 9 |
- from-scratch
|
| 10 |
- korean
|
| 11 |
- gpt
|
| 12 |
+
pipeline_tag: text-generation
|
| 13 |
model-index:
|
| 14 |
- name: SOVYN-85M
|
| 15 |
results:
|
|
|
|
| 24 |
|
| 25 |
# SOVYN-85M
|
| 26 |
|
| 27 |
+
처음부터 학습한 85M 파라미터 한국어 추론 모델.
|
| 28 |
|
| 29 |
+
수학, 코드 트레이싱, 논리, 물리, 화학, 생물, 지구과학, 한국사, 미적분 등 119개 카테고리의 문제를 단계별로 풀이한다.
|
|
|
|
| 30 |
|
| 31 |
+
## 스펙
|
| 32 |
|
| 33 |
+
| | |
|
| 34 |
+
|---|---|
|
| 35 |
+
| 파라미터 | 85.4M |
|
| 36 |
+
| 아키텍처 | GPT (Decoder-only) |
|
| 37 |
+
| 레이어 | 12 |
|
| 38 |
+
| 어텐션 헤드 | 12 |
|
| 39 |
+
| 임베딩 차원 | 768 |
|
| 40 |
+
| 컨텍스트 길이 | 512 |
|
| 41 |
+
| 어휘 크기 | 16,384 (BPE) |
|
| 42 |
+
| 어텐션 | Flash Attention (SDPA) |
|
| 43 |
+
| 정밀도 | float16 |
|
| 44 |
|
| 45 |
+
## 학습
|
| 46 |
|
| 47 |
+
- 데이터: 591,261개 합성 추론 문제 (119 카테고리), 27.97M 토큰
|
| 48 |
+
- 옵티마이저: AdamW (lr=3e-4, weight_decay=0.1)
|
| 49 |
+
- 스케줄: Cosine decay + warmup 500 steps
|
| 50 |
+
- 배치: 16 x 4 grad_accum = effective 64
|
| 51 |
+
- 스텝: 20,000
|
| 52 |
+
- GPU: RTX 5080 16GB
|
| 53 |
+
- 학습 시간: ~4시간
|
| 54 |
|
| 55 |
+
## 벤치마크
|
| 56 |
|
| 57 |
+
자체 벤치마크 52문제, 10개 카테고리.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
| 카테고리 | 정확도 |
|
| 60 |
|---------|--------|
|
| 61 |
+
| 산술 | 100% |
|
| 62 |
+
| 코드 트레이싱 | 100% |
|
| 63 |
+
| 숫자 성질 | 100% |
|
| 64 |
| 서술형 | 100% |
|
| 65 |
+
| 연산 우선순위 | 88% |
|
| 66 |
+
| 리스트 연산 | 83% |
|
| 67 |
+
| 괄호 연산 | 80% |
|
| 68 |
| 방정식 | 80% |
|
| 69 |
| 논리 | 80% |
|
| 70 |
| 수열 | 33% |
|
| 71 |
+
| **전체** | **86.5%** |
|
| 72 |
|
| 73 |
## 사용법
|
| 74 |
|
| 75 |
+
```bash
|
| 76 |
+
pip install torch safetensors tokenizers huggingface_hub
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
```python
|
| 80 |
import torch
|
| 81 |
+
from safetensors.torch import load_file
|
| 82 |
from tokenizers import Tokenizer
|
| 83 |
+
from huggingface_hub import hf_hub_download
|
| 84 |
+
|
| 85 |
+
# 다운로드
|
| 86 |
+
model_path = hf_hub_download("SOVYN/SOVYN-85M", "model.safetensors")
|
| 87 |
+
tok_path = hf_hub_download("SOVYN/SOVYN-85M", "tokenizer.json")
|
| 88 |
+
code_path = hf_hub_download("SOVYN/SOVYN-85M", "model.py")
|
| 89 |
+
|
| 90 |
+
# 아키텍처 로드
|
| 91 |
+
import importlib.util
|
| 92 |
+
spec = importlib.util.spec_from_file_location("model", code_path)
|
| 93 |
+
mod = importlib.util.module_from_spec(spec)
|
| 94 |
+
spec.loader.exec_module(mod)
|
| 95 |
+
|
| 96 |
+
# 모델 로드
|
| 97 |
+
model = mod.SOVYN85M()
|
| 98 |
+
state_dict = load_file(model_path)
|
| 99 |
+
state_dict = {k: v.float() for k, v in state_dict.items()}
|
| 100 |
model.load_state_dict(state_dict)
|
| 101 |
model.eval()
|
| 102 |
|
| 103 |
+
tokenizer = Tokenizer.from_file(tok_path)
|
|
|
|
| 104 |
|
| 105 |
# 추론
|
| 106 |
prompt = "문제: 3x + 7 = 22일 때, x의 값을 구하시오.\n풀이:\n"
|
| 107 |
+
ids = torch.tensor([tokenizer.encode(prompt).ids])
|
| 108 |
+
out = model.generate(ids, max_new_tokens=200, temperature=0.3)
|
| 109 |
+
print(tokenizer.decode(out[0].tolist()))
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
## 프롬프트 형식
|
| 113 |
|
| 114 |
+
```
|
| 115 |
+
문제: {내용}
|
| 116 |
+
풀이:
|
|
|
|
| 117 |
```
|
| 118 |
|
| 119 |
+
"풀이:" 이후를 생성. 단계별 풀이 + "답: {정답}" 형태로 출력.
|
| 120 |
|
| 121 |
+
## 제한사항
|
| 122 |
|
| 123 |
+
- 합성 데이터로만 학습. 자유 대화 불가.
|
| 124 |
+
- 수열(등비/피보나치) 약함.
|
| 125 |
+
- 컨텍스트 512 토큰 제한.
|
| 126 |
|
| 127 |
+
## 라이선스
|
| 128 |
+
|
| 129 |
+
Apache-2.0
|
config.json
CHANGED
|
@@ -1,16 +1,17 @@
|
|
| 1 |
{
|
| 2 |
-
"model_type": "sovyn-gpt",
|
| 3 |
"architectures": [
|
| 4 |
-
"
|
| 5 |
],
|
|
|
|
| 6 |
"vocab_size": 16384,
|
| 7 |
-
"
|
| 8 |
-
"
|
| 9 |
-
"
|
| 10 |
-
"
|
| 11 |
-
"
|
| 12 |
-
"
|
| 13 |
-
"
|
| 14 |
-
"
|
| 15 |
-
"
|
|
|
|
| 16 |
}
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
+
"SOVYN85M"
|
| 4 |
],
|
| 5 |
+
"model_type": "sovyn-gpt",
|
| 6 |
"vocab_size": 16384,
|
| 7 |
+
"max_position_embeddings": 512,
|
| 8 |
+
"hidden_size": 768,
|
| 9 |
+
"num_attention_heads": 12,
|
| 10 |
+
"num_hidden_layers": 12,
|
| 11 |
+
"intermediate_size": 3072,
|
| 12 |
+
"hidden_act": "gelu",
|
| 13 |
+
"hidden_dropout_prob": 0.1,
|
| 14 |
+
"attention_probs_dropout_prob": 0.1,
|
| 15 |
+
"tie_word_embeddings": true,
|
| 16 |
+
"torch_dtype": "float16"
|
| 17 |
}
|
model.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SOVYN-85M 모델 아키텍처
|
| 3 |
+
https://huggingface.co/SOVYN/SOVYN-85M
|
| 4 |
+
"""
|
| 5 |
+
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class ModelConfig:
    """Hyperparameters for the SOVYN-85M GPT architecture.

    Defaults match the released 85M checkpoint (12 layers, 12 heads,
    768-dim embeddings, 512-token context, 16,384-token BPE vocabulary).

    Converted to a dataclass so callers can override individual fields
    per instance (``ModelConfig(embed_dim=256)``) instead of mutating
    shared class attributes; also gains a useful ``repr`` for free.
    """

    vocab_size: int = 16384     # BPE vocabulary size
    context_length: int = 512   # max sequence length (size of the positional table)
    embed_dim: int = 768        # model width; must be divisible by num_heads
    num_heads: int = 12         # attention heads per layer
    num_layers: int = 12        # transformer blocks
    dropout: float = 0.1        # shared dropout rate (embeddings, attention, FFN)
    bias: bool = False          # whether Linear layers carry bias terms
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention via PyTorch SDPA (Flash Attention path).

    Uses a single fused linear projection for Q, K and V, applies attention
    dropout only while training, and finishes with an output projection plus
    residual dropout.
    """

    def __init__(self, cfg):
        super().__init__()
        self.num_heads = cfg.num_heads
        self.head_dim = cfg.embed_dim // cfg.num_heads
        self.embed_dim = cfg.embed_dim
        # One matmul produces Q, K and V together (3 * embed_dim outputs).
        self.qkv = nn.Linear(cfg.embed_dim, 3 * cfg.embed_dim, bias=cfg.bias)
        self.proj = nn.Linear(cfg.embed_dim, cfg.embed_dim, bias=cfg.bias)
        self.resid_drop = nn.Dropout(cfg.dropout)
        self.dropout_p = cfg.dropout

    def forward(self, x):
        batch, seq_len, width = x.shape
        # (B, T, 3*C) -> (B, T, 3, H, D) -> (3, B, H, T, D), then split.
        fused = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, self.head_dim)
        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(dim=0)
        # Attention dropout is active only in training mode.
        attn_dropout = self.dropout_p if self.training else 0.0
        context = F.scaled_dot_product_attention(
            query, key, value,
            dropout_p=attn_dropout,
            is_causal=True,
        )
        # (B, H, T, D) -> (B, T, C), then output projection + residual dropout.
        merged = context.transpose(1, 2).reshape(batch, seq_len, width)
        return self.resid_drop(self.proj(merged))
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Linear, with dropout on the output.

    Uses the conventional 4x hidden expansion over the embedding width.
    """

    def __init__(self, cfg):
        super().__init__()
        inner_dim = 4 * cfg.embed_dim  # standard transformer expansion factor
        self.fc1 = nn.Linear(cfg.embed_dim, inner_dim, bias=cfg.bias)
        self.fc2 = nn.Linear(inner_dim, cfg.embed_dim, bias=cfg.bias)
        self.drop = nn.Dropout(cfg.dropout)

    def forward(self, x):
        hidden = F.gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class Block(nn.Module):
    """One pre-norm transformer layer.

    LayerNorm -> causal self-attention, then LayerNorm -> feed-forward MLP,
    each sub-layer wrapped in a residual connection.
    """

    def __init__(self, cfg):
        super().__init__()
        self.ln1 = nn.LayerNorm(cfg.embed_dim)
        self.attn = CausalSelfAttention(cfg)
        self.ln2 = nn.LayerNorm(cfg.embed_dim)
        self.ffn = FeedForward(cfg)

    def forward(self, x):
        # Residual around attention, then residual around the MLP.
        x = self.attn(self.ln1(x)) + x
        return self.ffn(self.ln2(x)) + x
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class SOVYN85M(nn.Module):
    """GPT-style decoder-only language model.

    Token embeddings plus learned positional embeddings, a stack of pre-norm
    transformer blocks, a final LayerNorm, and a weight-tied LM head.
    """

    def __init__(self, cfg=None):
        """Build the model; ``cfg`` defaults to the released 85M configuration."""
        super().__init__()
        if cfg is None:
            cfg = ModelConfig()
        self.cfg = cfg
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.embed_dim)
        self.pos_emb = nn.Embedding(cfg.context_length, cfg.embed_dim)
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.num_layers)])
        self.ln_f = nn.LayerNorm(cfg.embed_dim)
        self.head = nn.Linear(cfg.embed_dim, cfg.vocab_size, bias=False)
        # Weight tying: the LM head shares the token-embedding matrix.
        self.head.weight = self.tok_emb.weight

    @property
    def num_params(self):
        """Parameter count excluding the token-embedding matrix.

        ``parameters()`` yields the tied embedding/head tensor only once,
        so subtracting it once gives the non-embedding count.
        """
        return sum(p.numel() for p in self.parameters()) - self.tok_emb.weight.numel()

    def forward(self, idx, targets=None):
        """Run the LM.

        Args:
            idx: (B, T) integer token ids with T <= cfg.context_length.
            targets: optional (B, T) ids for next-token loss; positions with
                id 0 (PAD) are ignored via ``ignore_index=0``.

        Returns:
            ``(logits, loss)`` — logits is (B, T, vocab_size); loss is None
            when ``targets`` is None.
        """
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        x = self.drop(self.tok_emb(idx) + self.pos_emb(pos))
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1),
                ignore_index=0,
            )
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=200, temperature=0.8, top_k=50,
                 eos_token_id=2):
        """Autoregressively sample up to ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) prompt token ids; appended to in place of a return copy.
            temperature: softmax temperature (clamped away from zero).
            top_k: keep only the k most likely tokens each step (0 disables).
            eos_token_id: stop early when this id is emitted (default 2,
                the EOS id the original hard-coded). New parameter with a
                backward-compatible default.

        Returns:
            (B, T') ids: prompt plus generated continuation.

        Note: switches the module to eval mode as a side effect.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # Crop to the positional table's capacity.
            ctx = idx[:, -self.cfg.context_length:]
            logits, _ = self(ctx)
            logits = logits[:, -1, :] / max(temperature, 1e-8)
            if top_k > 0:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, -1:]] = float('-inf')
            probs = F.softmax(logits, dim=-1)
            nxt = torch.multinomial(probs, 1)
            idx = torch.cat([idx, nxt], dim=1)
            # Early EOS stop is only well-defined for batch size 1; the
            # original unconditional ``.item()`` crashed for larger batches.
            if idx.size(0) == 1 and nxt.item() == eos_token_id:
                break
        return idx
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0dce9dbb79b774716a7b2bf3962834dcc62295f653fd45c468b5539c9a3b0ee
|
| 3 |
+
size 221073472
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 3 |
+
"model_max_length": 512,
|
| 4 |
+
"bos_token": "<BOS>",
|
| 5 |
+
"eos_token": "<EOS>",
|
| 6 |
+
"pad_token": "<PAD>",
|
| 7 |
+
"unk_token": "<UNK>"
|
| 8 |
+
}
|