Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- checkpoint-1000/config.json +30 -0
- checkpoint-1000/generation_config.json +7 -0
- checkpoint-1000/model.safetensors +3 -0
- checkpoint-1000/optimizer.pt +3 -0
- checkpoint-1000/rng_state.pth +3 -0
- checkpoint-1000/scheduler.pt +3 -0
- checkpoint-1000/special_tokens_map.json +24 -0
- checkpoint-1000/tokenizer.json +0 -0
- checkpoint-1000/tokenizer.model +3 -0
- checkpoint-1000/tokenizer_config.json +42 -0
- checkpoint-1000/trainer_state.json +934 -0
- checkpoint-1000/training_args.bin +3 -0
- checkpoint-10000/config.json +30 -0
- checkpoint-10000/generation_config.json +7 -0
- checkpoint-10000/model.safetensors +3 -0
- checkpoint-10000/optimizer.pt +3 -0
- checkpoint-10000/rng_state.pth +3 -0
- checkpoint-10000/scheduler.pt +3 -0
- checkpoint-10000/special_tokens_map.json +24 -0
- checkpoint-10000/tokenizer.json +0 -0
- checkpoint-10000/tokenizer.model +3 -0
- checkpoint-10000/tokenizer_config.json +42 -0
- checkpoint-10000/trainer_state.json +0 -0
- checkpoint-10000/training_args.bin +3 -0
- checkpoint-10500/config.json +30 -0
- checkpoint-10500/generation_config.json +7 -0
- checkpoint-10500/model.safetensors +3 -0
- checkpoint-10500/optimizer.pt +3 -0
- checkpoint-10500/rng_state.pth +3 -0
- checkpoint-10500/scheduler.pt +3 -0
- checkpoint-10500/special_tokens_map.json +24 -0
- checkpoint-10500/tokenizer.json +0 -0
- checkpoint-10500/tokenizer.model +3 -0
- checkpoint-10500/tokenizer_config.json +42 -0
- checkpoint-10500/trainer_state.json +0 -0
- checkpoint-10500/training_args.bin +3 -0
- checkpoint-11000/config.json +30 -0
- checkpoint-11000/generation_config.json +7 -0
- checkpoint-11000/model.safetensors +3 -0
- checkpoint-11000/optimizer.pt +3 -0
- checkpoint-11000/rng_state.pth +3 -0
- checkpoint-11000/scheduler.pt +3 -0
- checkpoint-11000/special_tokens_map.json +24 -0
- checkpoint-11000/tokenizer.json +0 -0
- checkpoint-11000/tokenizer.model +3 -0
- checkpoint-11000/tokenizer_config.json +42 -0
- checkpoint-11000/trainer_state.json +0 -0
- checkpoint-11000/training_args.bin +3 -0
- checkpoint-11500/config.json +30 -0
- checkpoint-11500/generation_config.json +7 -0
checkpoint-1000/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 1024,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"mlp_bias": false,
|
| 16 |
+
"model_type": "llama",
|
| 17 |
+
"num_attention_heads": 6,
|
| 18 |
+
"num_hidden_layers": 6,
|
| 19 |
+
"num_key_value_heads": 6,
|
| 20 |
+
"pad_token_id": 0,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"torch_dtype": "float32",
|
| 27 |
+
"transformers_version": "4.51.3",
|
| 28 |
+
"use_cache": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
checkpoint-1000/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.51.3"
|
| 7 |
+
}
|
checkpoint-1000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e145ec4cb5b1c6fe568db7242a666c165cc4f4486b6c483180464bc77839d7f
|
| 3 |
+
size 309900448
|
checkpoint-1000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:120cb0400da1c194dd0c3d5e7d8d348a540146047d251fd82e0cf02175099d30
|
| 3 |
+
size 619836730
|
checkpoint-1000/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
|
| 3 |
+
size 14244
|
checkpoint-1000/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83c694d65a347a9628fbbeee3ca54f077b8171008c69915415bd49f7d02ea9bc
|
| 3 |
+
size 1064
|
checkpoint-1000/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
checkpoint-1000/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-1000/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
checkpoint-1000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": true,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": true,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": true,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 37 |
+
"pad_token": "</s>",
|
| 38 |
+
"sp_model_kwargs": {},
|
| 39 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 40 |
+
"unk_token": "<unk>",
|
| 41 |
+
"use_default_system_prompt": false
|
| 42 |
+
}
|
checkpoint-1000/trainer_state.json
ADDED
|
@@ -0,0 +1,934 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.08945143905002571,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1000,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.0008945143905002571,
|
| 14 |
+
"grad_norm": 4.029027938842773,
|
| 15 |
+
"learning_rate": 1.9996779676178552e-05,
|
| 16 |
+
"loss": 6.931,
|
| 17 |
+
"mean_token_accuracy": 0.1403668148443103,
|
| 18 |
+
"num_tokens": 224900.0,
|
| 19 |
+
"step": 10
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"epoch": 0.0017890287810005143,
|
| 23 |
+
"grad_norm": 3.8816776275634766,
|
| 24 |
+
"learning_rate": 1.999320153859916e-05,
|
| 25 |
+
"loss": 6.6428,
|
| 26 |
+
"mean_token_accuracy": 0.15601451508700848,
|
| 27 |
+
"num_tokens": 449350.0,
|
| 28 |
+
"step": 20
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"epoch": 0.0026835431715007715,
|
| 32 |
+
"grad_norm": 3.696845531463623,
|
| 33 |
+
"learning_rate": 1.9989623401019772e-05,
|
| 34 |
+
"loss": 6.3909,
|
| 35 |
+
"mean_token_accuracy": 0.17340584620833396,
|
| 36 |
+
"num_tokens": 673110.0,
|
| 37 |
+
"step": 30
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.0035780575620010286,
|
| 41 |
+
"grad_norm": 3.57285213470459,
|
| 42 |
+
"learning_rate": 1.998604526344038e-05,
|
| 43 |
+
"loss": 6.2,
|
| 44 |
+
"mean_token_accuracy": 0.20353572219610214,
|
| 45 |
+
"num_tokens": 897311.0,
|
| 46 |
+
"step": 40
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"epoch": 0.004472571952501286,
|
| 50 |
+
"grad_norm": 3.2977962493896484,
|
| 51 |
+
"learning_rate": 1.9982467125860992e-05,
|
| 52 |
+
"loss": 5.9805,
|
| 53 |
+
"mean_token_accuracy": 0.21226616874337195,
|
| 54 |
+
"num_tokens": 1121992.0,
|
| 55 |
+
"step": 50
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"epoch": 0.005367086343001543,
|
| 59 |
+
"grad_norm": 3.0260677337646484,
|
| 60 |
+
"learning_rate": 1.99788889882816e-05,
|
| 61 |
+
"loss": 5.7785,
|
| 62 |
+
"mean_token_accuracy": 0.22370390295982362,
|
| 63 |
+
"num_tokens": 1345684.0,
|
| 64 |
+
"step": 60
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"epoch": 0.0062616007335018,
|
| 68 |
+
"grad_norm": 2.671483039855957,
|
| 69 |
+
"learning_rate": 1.997531085070221e-05,
|
| 70 |
+
"loss": 5.5885,
|
| 71 |
+
"mean_token_accuracy": 0.23743247389793395,
|
| 72 |
+
"num_tokens": 1568546.0,
|
| 73 |
+
"step": 70
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.007156115124002057,
|
| 77 |
+
"grad_norm": 2.5646092891693115,
|
| 78 |
+
"learning_rate": 1.9971732713122823e-05,
|
| 79 |
+
"loss": 5.4085,
|
| 80 |
+
"mean_token_accuracy": 0.24720929898321628,
|
| 81 |
+
"num_tokens": 1791562.0,
|
| 82 |
+
"step": 80
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 0.008050629514502314,
|
| 86 |
+
"grad_norm": 4.9957194328308105,
|
| 87 |
+
"learning_rate": 1.996815457554343e-05,
|
| 88 |
+
"loss": 5.2511,
|
| 89 |
+
"mean_token_accuracy": 0.25344080217182635,
|
| 90 |
+
"num_tokens": 2016530.0,
|
| 91 |
+
"step": 90
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"epoch": 0.008945143905002572,
|
| 95 |
+
"grad_norm": 2.079987049102783,
|
| 96 |
+
"learning_rate": 1.996457643796404e-05,
|
| 97 |
+
"loss": 5.1385,
|
| 98 |
+
"mean_token_accuracy": 0.2677029874175787,
|
| 99 |
+
"num_tokens": 2240898.0,
|
| 100 |
+
"step": 100
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.009839658295502828,
|
| 104 |
+
"grad_norm": 1.875989556312561,
|
| 105 |
+
"learning_rate": 1.996099830038465e-05,
|
| 106 |
+
"loss": 4.9731,
|
| 107 |
+
"mean_token_accuracy": 0.2723236083984375,
|
| 108 |
+
"num_tokens": 2464033.0,
|
| 109 |
+
"step": 110
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"epoch": 0.010734172686003086,
|
| 113 |
+
"grad_norm": 1.7837793827056885,
|
| 114 |
+
"learning_rate": 1.995742016280526e-05,
|
| 115 |
+
"loss": 4.8349,
|
| 116 |
+
"mean_token_accuracy": 0.27978694066405296,
|
| 117 |
+
"num_tokens": 2687104.0,
|
| 118 |
+
"step": 120
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"epoch": 0.011628687076503344,
|
| 122 |
+
"grad_norm": 1.6277521848678589,
|
| 123 |
+
"learning_rate": 1.995384202522587e-05,
|
| 124 |
+
"loss": 4.6858,
|
| 125 |
+
"mean_token_accuracy": 0.29452711045742036,
|
| 126 |
+
"num_tokens": 2911815.0,
|
| 127 |
+
"step": 130
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"epoch": 0.0125232014670036,
|
| 131 |
+
"grad_norm": 1.5478984117507935,
|
| 132 |
+
"learning_rate": 1.9950263887646483e-05,
|
| 133 |
+
"loss": 4.547,
|
| 134 |
+
"mean_token_accuracy": 0.3018424347043037,
|
| 135 |
+
"num_tokens": 3136290.0,
|
| 136 |
+
"step": 140
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.013417715857503858,
|
| 140 |
+
"grad_norm": 1.314953327178955,
|
| 141 |
+
"learning_rate": 1.994668575006709e-05,
|
| 142 |
+
"loss": 4.4302,
|
| 143 |
+
"mean_token_accuracy": 0.3075145646929741,
|
| 144 |
+
"num_tokens": 3361157.0,
|
| 145 |
+
"step": 150
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"epoch": 0.014312230248004114,
|
| 149 |
+
"grad_norm": 1.1404365301132202,
|
| 150 |
+
"learning_rate": 1.9943107612487703e-05,
|
| 151 |
+
"loss": 4.329,
|
| 152 |
+
"mean_token_accuracy": 0.32004141956567767,
|
| 153 |
+
"num_tokens": 3585994.0,
|
| 154 |
+
"step": 160
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 0.015206744638504372,
|
| 158 |
+
"grad_norm": 1.1736400127410889,
|
| 159 |
+
"learning_rate": 1.993952947490831e-05,
|
| 160 |
+
"loss": 4.2576,
|
| 161 |
+
"mean_token_accuracy": 0.3258902974426746,
|
| 162 |
+
"num_tokens": 3810406.0,
|
| 163 |
+
"step": 170
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 0.01610125902900463,
|
| 167 |
+
"grad_norm": 0.945701003074646,
|
| 168 |
+
"learning_rate": 1.9935951337328923e-05,
|
| 169 |
+
"loss": 4.1854,
|
| 170 |
+
"mean_token_accuracy": 0.33087451085448266,
|
| 171 |
+
"num_tokens": 4033558.0,
|
| 172 |
+
"step": 180
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"epoch": 0.016995773419504888,
|
| 176 |
+
"grad_norm": 0.8116940855979919,
|
| 177 |
+
"learning_rate": 1.9932373199749534e-05,
|
| 178 |
+
"loss": 4.1516,
|
| 179 |
+
"mean_token_accuracy": 0.33102416023612025,
|
| 180 |
+
"num_tokens": 4257791.0,
|
| 181 |
+
"step": 190
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"epoch": 0.017890287810005144,
|
| 185 |
+
"grad_norm": 1.1380091905593872,
|
| 186 |
+
"learning_rate": 1.9928795062170142e-05,
|
| 187 |
+
"loss": 4.0638,
|
| 188 |
+
"mean_token_accuracy": 0.3386186122894287,
|
| 189 |
+
"num_tokens": 4481532.0,
|
| 190 |
+
"step": 200
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"epoch": 0.0187848022005054,
|
| 194 |
+
"grad_norm": 0.690470814704895,
|
| 195 |
+
"learning_rate": 1.992521692459075e-05,
|
| 196 |
+
"loss": 4.0419,
|
| 197 |
+
"mean_token_accuracy": 0.33742879405617715,
|
| 198 |
+
"num_tokens": 4707304.0,
|
| 199 |
+
"step": 210
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.019679316591005656,
|
| 203 |
+
"grad_norm": 0.6643151044845581,
|
| 204 |
+
"learning_rate": 1.9921638787011362e-05,
|
| 205 |
+
"loss": 3.9833,
|
| 206 |
+
"mean_token_accuracy": 0.34320330172777175,
|
| 207 |
+
"num_tokens": 4933695.0,
|
| 208 |
+
"step": 220
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"epoch": 0.020573830981505916,
|
| 212 |
+
"grad_norm": 0.7354695200920105,
|
| 213 |
+
"learning_rate": 1.991806064943197e-05,
|
| 214 |
+
"loss": 3.9584,
|
| 215 |
+
"mean_token_accuracy": 0.34674171581864355,
|
| 216 |
+
"num_tokens": 5157252.0,
|
| 217 |
+
"step": 230
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
"epoch": 0.021468345372006172,
|
| 221 |
+
"grad_norm": 0.6649508476257324,
|
| 222 |
+
"learning_rate": 1.9914482511852582e-05,
|
| 223 |
+
"loss": 3.8969,
|
| 224 |
+
"mean_token_accuracy": 0.3512790575623512,
|
| 225 |
+
"num_tokens": 5382779.0,
|
| 226 |
+
"step": 240
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.02236285976250643,
|
| 230 |
+
"grad_norm": 0.68132483959198,
|
| 231 |
+
"learning_rate": 1.9910904374273194e-05,
|
| 232 |
+
"loss": 3.8822,
|
| 233 |
+
"mean_token_accuracy": 0.35188654661178587,
|
| 234 |
+
"num_tokens": 5606727.0,
|
| 235 |
+
"step": 250
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"epoch": 0.023257374153006688,
|
| 239 |
+
"grad_norm": 0.5945841670036316,
|
| 240 |
+
"learning_rate": 1.9907326236693802e-05,
|
| 241 |
+
"loss": 3.845,
|
| 242 |
+
"mean_token_accuracy": 0.3524425096809864,
|
| 243 |
+
"num_tokens": 5830914.0,
|
| 244 |
+
"step": 260
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"epoch": 0.024151888543506944,
|
| 248 |
+
"grad_norm": 0.5904633402824402,
|
| 249 |
+
"learning_rate": 1.9903748099114414e-05,
|
| 250 |
+
"loss": 3.8385,
|
| 251 |
+
"mean_token_accuracy": 0.3543414056301117,
|
| 252 |
+
"num_tokens": 6054926.0,
|
| 253 |
+
"step": 270
|
| 254 |
+
},
|
| 255 |
+
{
|
| 256 |
+
"epoch": 0.0250464029340072,
|
| 257 |
+
"grad_norm": 0.5699465870857239,
|
| 258 |
+
"learning_rate": 1.9900169961535022e-05,
|
| 259 |
+
"loss": 3.7648,
|
| 260 |
+
"mean_token_accuracy": 0.36298312023282053,
|
| 261 |
+
"num_tokens": 6279612.0,
|
| 262 |
+
"step": 280
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 0.025940917324507456,
|
| 266 |
+
"grad_norm": 0.55901038646698,
|
| 267 |
+
"learning_rate": 1.9896591823955633e-05,
|
| 268 |
+
"loss": 3.7746,
|
| 269 |
+
"mean_token_accuracy": 0.36002359017729757,
|
| 270 |
+
"num_tokens": 6505051.0,
|
| 271 |
+
"step": 290
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"epoch": 0.026835431715007716,
|
| 275 |
+
"grad_norm": 0.6177819967269897,
|
| 276 |
+
"learning_rate": 1.9893013686376245e-05,
|
| 277 |
+
"loss": 3.726,
|
| 278 |
+
"mean_token_accuracy": 0.366513279825449,
|
| 279 |
+
"num_tokens": 6728046.0,
|
| 280 |
+
"step": 300
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"epoch": 0.027729946105507972,
|
| 284 |
+
"grad_norm": 0.525486409664154,
|
| 285 |
+
"learning_rate": 1.9889435548796853e-05,
|
| 286 |
+
"loss": 3.7335,
|
| 287 |
+
"mean_token_accuracy": 0.3637064002454281,
|
| 288 |
+
"num_tokens": 6952250.0,
|
| 289 |
+
"step": 310
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"epoch": 0.02862446049600823,
|
| 293 |
+
"grad_norm": 0.5619438886642456,
|
| 294 |
+
"learning_rate": 1.9885857411217465e-05,
|
| 295 |
+
"loss": 3.7086,
|
| 296 |
+
"mean_token_accuracy": 0.3649679072201252,
|
| 297 |
+
"num_tokens": 7177356.0,
|
| 298 |
+
"step": 320
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 0.029518974886508488,
|
| 302 |
+
"grad_norm": 0.5715098977088928,
|
| 303 |
+
"learning_rate": 1.9882279273638073e-05,
|
| 304 |
+
"loss": 3.7003,
|
| 305 |
+
"mean_token_accuracy": 0.36583819389343264,
|
| 306 |
+
"num_tokens": 7401855.0,
|
| 307 |
+
"step": 330
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"epoch": 0.030413489277008744,
|
| 311 |
+
"grad_norm": 0.5622343420982361,
|
| 312 |
+
"learning_rate": 1.987870113605868e-05,
|
| 313 |
+
"loss": 3.6937,
|
| 314 |
+
"mean_token_accuracy": 0.36574283242225647,
|
| 315 |
+
"num_tokens": 7627253.0,
|
| 316 |
+
"step": 340
|
| 317 |
+
},
|
| 318 |
+
{
|
| 319 |
+
"epoch": 0.031308003667509,
|
| 320 |
+
"grad_norm": 0.4998467266559601,
|
| 321 |
+
"learning_rate": 1.9875122998479293e-05,
|
| 322 |
+
"loss": 3.6644,
|
| 323 |
+
"mean_token_accuracy": 0.3698362477123737,
|
| 324 |
+
"num_tokens": 7851897.0,
|
| 325 |
+
"step": 350
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 0.03220251805800926,
|
| 329 |
+
"grad_norm": 0.6738699078559875,
|
| 330 |
+
"learning_rate": 1.98715448608999e-05,
|
| 331 |
+
"loss": 3.643,
|
| 332 |
+
"mean_token_accuracy": 0.3715419560670853,
|
| 333 |
+
"num_tokens": 8076130.0,
|
| 334 |
+
"step": 360
|
| 335 |
+
},
|
| 336 |
+
{
|
| 337 |
+
"epoch": 0.03309703244850951,
|
| 338 |
+
"grad_norm": 0.584710955619812,
|
| 339 |
+
"learning_rate": 1.9867966723320513e-05,
|
| 340 |
+
"loss": 3.6288,
|
| 341 |
+
"mean_token_accuracy": 0.3715605862438679,
|
| 342 |
+
"num_tokens": 8300267.0,
|
| 343 |
+
"step": 370
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"epoch": 0.033991546839009776,
|
| 347 |
+
"grad_norm": 0.6669703722000122,
|
| 348 |
+
"learning_rate": 1.9864388585741125e-05,
|
| 349 |
+
"loss": 3.608,
|
| 350 |
+
"mean_token_accuracy": 0.374962493032217,
|
| 351 |
+
"num_tokens": 8523697.0,
|
| 352 |
+
"step": 380
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"epoch": 0.03488606122951003,
|
| 356 |
+
"grad_norm": 0.7256543040275574,
|
| 357 |
+
"learning_rate": 1.9860810448161733e-05,
|
| 358 |
+
"loss": 3.5934,
|
| 359 |
+
"mean_token_accuracy": 0.3755581140518188,
|
| 360 |
+
"num_tokens": 8747098.0,
|
| 361 |
+
"step": 390
|
| 362 |
+
},
|
| 363 |
+
{
|
| 364 |
+
"epoch": 0.03578057562001029,
|
| 365 |
+
"grad_norm": 0.6731703281402588,
|
| 366 |
+
"learning_rate": 1.9857232310582344e-05,
|
| 367 |
+
"loss": 3.5885,
|
| 368 |
+
"mean_token_accuracy": 0.37688973248004914,
|
| 369 |
+
"num_tokens": 8971344.0,
|
| 370 |
+
"step": 400
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 0.036675090010510544,
|
| 374 |
+
"grad_norm": 0.9010092616081238,
|
| 375 |
+
"learning_rate": 1.9853654173002953e-05,
|
| 376 |
+
"loss": 3.5777,
|
| 377 |
+
"mean_token_accuracy": 0.37828439101576805,
|
| 378 |
+
"num_tokens": 9197198.0,
|
| 379 |
+
"step": 410
|
| 380 |
+
},
|
| 381 |
+
{
|
| 382 |
+
"epoch": 0.0375696044010108,
|
| 383 |
+
"grad_norm": 0.4792615473270416,
|
| 384 |
+
"learning_rate": 1.9850076035423564e-05,
|
| 385 |
+
"loss": 3.5561,
|
| 386 |
+
"mean_token_accuracy": 0.3797303937375546,
|
| 387 |
+
"num_tokens": 9421420.0,
|
| 388 |
+
"step": 420
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 0.03846411879151106,
|
| 392 |
+
"grad_norm": 0.5398027896881104,
|
| 393 |
+
"learning_rate": 1.9846497897844176e-05,
|
| 394 |
+
"loss": 3.5485,
|
| 395 |
+
"mean_token_accuracy": 0.38109720274806025,
|
| 396 |
+
"num_tokens": 9646007.0,
|
| 397 |
+
"step": 430
|
| 398 |
+
},
|
| 399 |
+
{
|
| 400 |
+
"epoch": 0.03935863318201131,
|
| 401 |
+
"grad_norm": 3.100107431411743,
|
| 402 |
+
"learning_rate": 1.9842919760264784e-05,
|
| 403 |
+
"loss": 3.5545,
|
| 404 |
+
"mean_token_accuracy": 0.3794835329055786,
|
| 405 |
+
"num_tokens": 9869980.0,
|
| 406 |
+
"step": 440
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"epoch": 0.040253147572511576,
|
| 410 |
+
"grad_norm": 0.646981418132782,
|
| 411 |
+
"learning_rate": 1.9839341622685392e-05,
|
| 412 |
+
"loss": 3.536,
|
| 413 |
+
"mean_token_accuracy": 0.382675875723362,
|
| 414 |
+
"num_tokens": 10093289.0,
|
| 415 |
+
"step": 450
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"epoch": 0.04114766196301183,
|
| 419 |
+
"grad_norm": 0.6310556530952454,
|
| 420 |
+
"learning_rate": 1.9835763485106004e-05,
|
| 421 |
+
"loss": 3.5114,
|
| 422 |
+
"mean_token_accuracy": 0.3832168258726597,
|
| 423 |
+
"num_tokens": 10318254.0,
|
| 424 |
+
"step": 460
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"epoch": 0.04204217635351209,
|
| 428 |
+
"grad_norm": 0.5271363258361816,
|
| 429 |
+
"learning_rate": 1.9832185347526612e-05,
|
| 430 |
+
"loss": 3.4967,
|
| 431 |
+
"mean_token_accuracy": 0.3867050640285015,
|
| 432 |
+
"num_tokens": 10541404.0,
|
| 433 |
+
"step": 470
|
| 434 |
+
},
|
| 435 |
+
{
|
| 436 |
+
"epoch": 0.042936690744012344,
|
| 437 |
+
"grad_norm": 0.5193490982055664,
|
| 438 |
+
"learning_rate": 1.9828607209947224e-05,
|
| 439 |
+
"loss": 3.5113,
|
| 440 |
+
"mean_token_accuracy": 0.3860153049230576,
|
| 441 |
+
"num_tokens": 10765992.0,
|
| 442 |
+
"step": 480
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"epoch": 0.0438312051345126,
|
| 446 |
+
"grad_norm": 0.5134871006011963,
|
| 447 |
+
"learning_rate": 1.9825029072367835e-05,
|
| 448 |
+
"loss": 3.5039,
|
| 449 |
+
"mean_token_accuracy": 0.38554045259952546,
|
| 450 |
+
"num_tokens": 10992282.0,
|
| 451 |
+
"step": 490
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 0.04472571952501286,
|
| 455 |
+
"grad_norm": 0.4887460768222809,
|
| 456 |
+
"learning_rate": 1.9821450934788444e-05,
|
| 457 |
+
"loss": 3.4895,
|
| 458 |
+
"mean_token_accuracy": 0.3860923834145069,
|
| 459 |
+
"num_tokens": 11217294.0,
|
| 460 |
+
"step": 500
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"epoch": 0.04562023391551311,
|
| 464 |
+
"grad_norm": 0.5546553730964661,
|
| 465 |
+
"learning_rate": 1.9817872797209055e-05,
|
| 466 |
+
"loss": 3.4938,
|
| 467 |
+
"mean_token_accuracy": 0.38622146248817446,
|
| 468 |
+
"num_tokens": 11442608.0,
|
| 469 |
+
"step": 510
|
| 470 |
+
},
|
| 471 |
+
{
|
| 472 |
+
"epoch": 0.046514748306013376,
|
| 473 |
+
"grad_norm": 0.5706290602684021,
|
| 474 |
+
"learning_rate": 1.9814294659629664e-05,
|
| 475 |
+
"loss": 3.4489,
|
| 476 |
+
"mean_token_accuracy": 0.39343543276190757,
|
| 477 |
+
"num_tokens": 11665939.0,
|
| 478 |
+
"step": 520
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"epoch": 0.04740926269651363,
|
| 482 |
+
"grad_norm": 0.4757273197174072,
|
| 483 |
+
"learning_rate": 1.9810716522050275e-05,
|
| 484 |
+
"loss": 3.4634,
|
| 485 |
+
"mean_token_accuracy": 0.3893909424543381,
|
| 486 |
+
"num_tokens": 11891171.0,
|
| 487 |
+
"step": 530
|
| 488 |
+
},
|
| 489 |
+
{
|
| 490 |
+
"epoch": 0.04830377708701389,
|
| 491 |
+
"grad_norm": 0.5140799283981323,
|
| 492 |
+
"learning_rate": 1.9807138384470887e-05,
|
| 493 |
+
"loss": 3.4538,
|
| 494 |
+
"mean_token_accuracy": 0.3925728119909763,
|
| 495 |
+
"num_tokens": 12115773.0,
|
| 496 |
+
"step": 540
|
| 497 |
+
},
|
| 498 |
+
{
|
| 499 |
+
"epoch": 0.049198291477514144,
|
| 500 |
+
"grad_norm": 0.4599289894104004,
|
| 501 |
+
"learning_rate": 1.9803560246891495e-05,
|
| 502 |
+
"loss": 3.4482,
|
| 503 |
+
"mean_token_accuracy": 0.39118969812989235,
|
| 504 |
+
"num_tokens": 12339933.0,
|
| 505 |
+
"step": 550
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"epoch": 0.0500928058680144,
|
| 509 |
+
"grad_norm": 0.517117440700531,
|
| 510 |
+
"learning_rate": 1.9799982109312103e-05,
|
| 511 |
+
"loss": 3.4205,
|
| 512 |
+
"mean_token_accuracy": 0.3933353215456009,
|
| 513 |
+
"num_tokens": 12564950.0,
|
| 514 |
+
"step": 560
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 0.05098732025851466,
|
| 518 |
+
"grad_norm": 0.7124619483947754,
|
| 519 |
+
"learning_rate": 1.9796403971732715e-05,
|
| 520 |
+
"loss": 3.4234,
|
| 521 |
+
"mean_token_accuracy": 0.39552380964159967,
|
| 522 |
+
"num_tokens": 12790863.0,
|
| 523 |
+
"step": 570
|
| 524 |
+
},
|
| 525 |
+
{
|
| 526 |
+
"epoch": 0.05188183464901491,
|
| 527 |
+
"grad_norm": 10.448816299438477,
|
| 528 |
+
"learning_rate": 1.9792825834153323e-05,
|
| 529 |
+
"loss": 3.4277,
|
| 530 |
+
"mean_token_accuracy": 0.3924214608967304,
|
| 531 |
+
"num_tokens": 13017450.0,
|
| 532 |
+
"step": 580
|
| 533 |
+
},
|
| 534 |
+
{
|
| 535 |
+
"epoch": 0.052776349039515176,
|
| 536 |
+
"grad_norm": 0.5431159734725952,
|
| 537 |
+
"learning_rate": 1.9789247696573935e-05,
|
| 538 |
+
"loss": 3.4328,
|
| 539 |
+
"mean_token_accuracy": 0.39353245720267294,
|
| 540 |
+
"num_tokens": 13241608.0,
|
| 541 |
+
"step": 590
|
| 542 |
+
},
|
| 543 |
+
{
|
| 544 |
+
"epoch": 0.05367086343001543,
|
| 545 |
+
"grad_norm": 0.5687503814697266,
|
| 546 |
+
"learning_rate": 1.9785669558994546e-05,
|
| 547 |
+
"loss": 3.3989,
|
| 548 |
+
"mean_token_accuracy": 0.39768306240439416,
|
| 549 |
+
"num_tokens": 13465104.0,
|
| 550 |
+
"step": 600
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"epoch": 0.05456537782051569,
|
| 554 |
+
"grad_norm": 0.5452563166618347,
|
| 555 |
+
"learning_rate": 1.9782091421415155e-05,
|
| 556 |
+
"loss": 3.3754,
|
| 557 |
+
"mean_token_accuracy": 0.39803339168429375,
|
| 558 |
+
"num_tokens": 13689303.0,
|
| 559 |
+
"step": 610
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"epoch": 0.055459892211015945,
|
| 563 |
+
"grad_norm": 0.4787168800830841,
|
| 564 |
+
"learning_rate": 1.9778513283835766e-05,
|
| 565 |
+
"loss": 3.3908,
|
| 566 |
+
"mean_token_accuracy": 0.3983615793287754,
|
| 567 |
+
"num_tokens": 13913069.0,
|
| 568 |
+
"step": 620
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"epoch": 0.0563544066015162,
|
| 572 |
+
"grad_norm": 0.533787190914154,
|
| 573 |
+
"learning_rate": 1.9774935146256374e-05,
|
| 574 |
+
"loss": 3.4083,
|
| 575 |
+
"mean_token_accuracy": 0.3976218432188034,
|
| 576 |
+
"num_tokens": 14136873.0,
|
| 577 |
+
"step": 630
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 0.05724892099201646,
|
| 581 |
+
"grad_norm": 0.6915440559387207,
|
| 582 |
+
"learning_rate": 1.9771357008676986e-05,
|
| 583 |
+
"loss": 3.3768,
|
| 584 |
+
"mean_token_accuracy": 0.40016965195536613,
|
| 585 |
+
"num_tokens": 14359757.0,
|
| 586 |
+
"step": 640
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"epoch": 0.05814343538251671,
|
| 590 |
+
"grad_norm": 0.5388856530189514,
|
| 591 |
+
"learning_rate": 1.9767778871097598e-05,
|
| 592 |
+
"loss": 3.3652,
|
| 593 |
+
"mean_token_accuracy": 0.40155375823378564,
|
| 594 |
+
"num_tokens": 14583389.0,
|
| 595 |
+
"step": 650
|
| 596 |
+
},
|
| 597 |
+
{
|
| 598 |
+
"epoch": 0.059037949773016976,
|
| 599 |
+
"grad_norm": 0.5853003263473511,
|
| 600 |
+
"learning_rate": 1.9764200733518206e-05,
|
| 601 |
+
"loss": 3.401,
|
| 602 |
+
"mean_token_accuracy": 0.39647991508245467,
|
| 603 |
+
"num_tokens": 14806251.0,
|
| 604 |
+
"step": 660
|
| 605 |
+
},
|
| 606 |
+
{
|
| 607 |
+
"epoch": 0.05993246416351723,
|
| 608 |
+
"grad_norm": 0.6135736703872681,
|
| 609 |
+
"learning_rate": 1.9760622595938818e-05,
|
| 610 |
+
"loss": 3.4031,
|
| 611 |
+
"mean_token_accuracy": 0.3982516027987003,
|
| 612 |
+
"num_tokens": 15031471.0,
|
| 613 |
+
"step": 670
|
| 614 |
+
},
|
| 615 |
+
{
|
| 616 |
+
"epoch": 0.06082697855401749,
|
| 617 |
+
"grad_norm": 0.45374274253845215,
|
| 618 |
+
"learning_rate": 1.9757044458359426e-05,
|
| 619 |
+
"loss": 3.3841,
|
| 620 |
+
"mean_token_accuracy": 0.3975381299853325,
|
| 621 |
+
"num_tokens": 15256769.0,
|
| 622 |
+
"step": 680
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"epoch": 0.061721492944517745,
|
| 626 |
+
"grad_norm": 0.5736910700798035,
|
| 627 |
+
"learning_rate": 1.9753466320780034e-05,
|
| 628 |
+
"loss": 3.383,
|
| 629 |
+
"mean_token_accuracy": 0.3996949538588524,
|
| 630 |
+
"num_tokens": 15479684.0,
|
| 631 |
+
"step": 690
|
| 632 |
+
},
|
| 633 |
+
{
|
| 634 |
+
"epoch": 0.062616007335018,
|
| 635 |
+
"grad_norm": 0.5454510450363159,
|
| 636 |
+
"learning_rate": 1.9749888183200646e-05,
|
| 637 |
+
"loss": 3.352,
|
| 638 |
+
"mean_token_accuracy": 0.40254419967532157,
|
| 639 |
+
"num_tokens": 15704356.0,
|
| 640 |
+
"step": 700
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 0.06351052172551826,
|
| 644 |
+
"grad_norm": 0.5370535850524902,
|
| 645 |
+
"learning_rate": 1.9746310045621254e-05,
|
| 646 |
+
"loss": 3.3559,
|
| 647 |
+
"mean_token_accuracy": 0.40261620208621024,
|
| 648 |
+
"num_tokens": 15928903.0,
|
| 649 |
+
"step": 710
|
| 650 |
+
},
|
| 651 |
+
{
|
| 652 |
+
"epoch": 0.06440503611601851,
|
| 653 |
+
"grad_norm": 0.6735969185829163,
|
| 654 |
+
"learning_rate": 1.9742731908041865e-05,
|
| 655 |
+
"loss": 3.3582,
|
| 656 |
+
"mean_token_accuracy": 0.40281880721449853,
|
| 657 |
+
"num_tokens": 16154120.0,
|
| 658 |
+
"step": 720
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"epoch": 0.06529955050651877,
|
| 662 |
+
"grad_norm": 0.46152418851852417,
|
| 663 |
+
"learning_rate": 1.9739153770462477e-05,
|
| 664 |
+
"loss": 3.3388,
|
| 665 |
+
"mean_token_accuracy": 0.4018984198570251,
|
| 666 |
+
"num_tokens": 16378150.0,
|
| 667 |
+
"step": 730
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"epoch": 0.06619406489701903,
|
| 671 |
+
"grad_norm": 0.5333797335624695,
|
| 672 |
+
"learning_rate": 1.9735575632883085e-05,
|
| 673 |
+
"loss": 3.358,
|
| 674 |
+
"mean_token_accuracy": 0.40131590217351915,
|
| 675 |
+
"num_tokens": 16602518.0,
|
| 676 |
+
"step": 740
|
| 677 |
+
},
|
| 678 |
+
{
|
| 679 |
+
"epoch": 0.06708857928751928,
|
| 680 |
+
"grad_norm": 0.6620674729347229,
|
| 681 |
+
"learning_rate": 1.9731997495303697e-05,
|
| 682 |
+
"loss": 3.3597,
|
| 683 |
+
"mean_token_accuracy": 0.40183877646923066,
|
| 684 |
+
"num_tokens": 16825308.0,
|
| 685 |
+
"step": 750
|
| 686 |
+
},
|
| 687 |
+
{
|
| 688 |
+
"epoch": 0.06798309367801955,
|
| 689 |
+
"grad_norm": 0.4112262427806854,
|
| 690 |
+
"learning_rate": 1.9728419357724305e-05,
|
| 691 |
+
"loss": 3.3498,
|
| 692 |
+
"mean_token_accuracy": 0.4032081626355648,
|
| 693 |
+
"num_tokens": 17048788.0,
|
| 694 |
+
"step": 760
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"epoch": 0.06887760806851981,
|
| 698 |
+
"grad_norm": 0.49325069785118103,
|
| 699 |
+
"learning_rate": 1.9724841220144917e-05,
|
| 700 |
+
"loss": 3.3523,
|
| 701 |
+
"mean_token_accuracy": 0.4036438427865505,
|
| 702 |
+
"num_tokens": 17273104.0,
|
| 703 |
+
"step": 770
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 0.06977212245902006,
|
| 707 |
+
"grad_norm": 0.5840951204299927,
|
| 708 |
+
"learning_rate": 1.972126308256553e-05,
|
| 709 |
+
"loss": 3.3446,
|
| 710 |
+
"mean_token_accuracy": 0.4040109634399414,
|
| 711 |
+
"num_tokens": 17497961.0,
|
| 712 |
+
"step": 780
|
| 713 |
+
},
|
| 714 |
+
{
|
| 715 |
+
"epoch": 0.07066663684952032,
|
| 716 |
+
"grad_norm": 0.49413686990737915,
|
| 717 |
+
"learning_rate": 1.9717684944986137e-05,
|
| 718 |
+
"loss": 3.3309,
|
| 719 |
+
"mean_token_accuracy": 0.40450835302472116,
|
| 720 |
+
"num_tokens": 17722726.0,
|
| 721 |
+
"step": 790
|
| 722 |
+
},
|
| 723 |
+
{
|
| 724 |
+
"epoch": 0.07156115124002058,
|
| 725 |
+
"grad_norm": 0.6528025269508362,
|
| 726 |
+
"learning_rate": 1.9714106807406745e-05,
|
| 727 |
+
"loss": 3.3427,
|
| 728 |
+
"mean_token_accuracy": 0.4030210435390472,
|
| 729 |
+
"num_tokens": 17946378.0,
|
| 730 |
+
"step": 800
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"epoch": 0.07245566563052083,
|
| 734 |
+
"grad_norm": 0.5769058465957642,
|
| 735 |
+
"learning_rate": 1.9710528669827357e-05,
|
| 736 |
+
"loss": 3.3423,
|
| 737 |
+
"mean_token_accuracy": 0.4021991953253746,
|
| 738 |
+
"num_tokens": 18170591.0,
|
| 739 |
+
"step": 810
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"epoch": 0.07335018002102109,
|
| 743 |
+
"grad_norm": 0.6946350336074829,
|
| 744 |
+
"learning_rate": 1.9706950532247965e-05,
|
| 745 |
+
"loss": 3.322,
|
| 746 |
+
"mean_token_accuracy": 0.40520998015999793,
|
| 747 |
+
"num_tokens": 18395601.0,
|
| 748 |
+
"step": 820
|
| 749 |
+
},
|
| 750 |
+
{
|
| 751 |
+
"epoch": 0.07424469441152134,
|
| 752 |
+
"grad_norm": 0.5611916184425354,
|
| 753 |
+
"learning_rate": 1.9703372394668576e-05,
|
| 754 |
+
"loss": 3.3172,
|
| 755 |
+
"mean_token_accuracy": 0.4065264783799648,
|
| 756 |
+
"num_tokens": 18619829.0,
|
| 757 |
+
"step": 830
|
| 758 |
+
},
|
| 759 |
+
{
|
| 760 |
+
"epoch": 0.0751392088020216,
|
| 761 |
+
"grad_norm": 0.5267366170883179,
|
| 762 |
+
"learning_rate": 1.9699794257089188e-05,
|
| 763 |
+
"loss": 3.3504,
|
| 764 |
+
"mean_token_accuracy": 0.4045166805386543,
|
| 765 |
+
"num_tokens": 18844573.0,
|
| 766 |
+
"step": 840
|
| 767 |
+
},
|
| 768 |
+
{
|
| 769 |
+
"epoch": 0.07603372319252186,
|
| 770 |
+
"grad_norm": 0.5963064432144165,
|
| 771 |
+
"learning_rate": 1.9696216119509796e-05,
|
| 772 |
+
"loss": 3.3026,
|
| 773 |
+
"mean_token_accuracy": 0.4085647910833359,
|
| 774 |
+
"num_tokens": 19071174.0,
|
| 775 |
+
"step": 850
|
| 776 |
+
},
|
| 777 |
+
{
|
| 778 |
+
"epoch": 0.07692823758302211,
|
| 779 |
+
"grad_norm": 0.4585157632827759,
|
| 780 |
+
"learning_rate": 1.9692637981930408e-05,
|
| 781 |
+
"loss": 3.2948,
|
| 782 |
+
"mean_token_accuracy": 0.4082924917340279,
|
| 783 |
+
"num_tokens": 19296052.0,
|
| 784 |
+
"step": 860
|
| 785 |
+
},
|
| 786 |
+
{
|
| 787 |
+
"epoch": 0.07782275197352237,
|
| 788 |
+
"grad_norm": 0.5613287687301636,
|
| 789 |
+
"learning_rate": 1.9689059844351016e-05,
|
| 790 |
+
"loss": 3.3167,
|
| 791 |
+
"mean_token_accuracy": 0.40675563290715216,
|
| 792 |
+
"num_tokens": 19521709.0,
|
| 793 |
+
"step": 870
|
| 794 |
+
},
|
| 795 |
+
{
|
| 796 |
+
"epoch": 0.07871726636402263,
|
| 797 |
+
"grad_norm": 0.4587007761001587,
|
| 798 |
+
"learning_rate": 1.9685481706771628e-05,
|
| 799 |
+
"loss": 3.305,
|
| 800 |
+
"mean_token_accuracy": 0.4078836299479008,
|
| 801 |
+
"num_tokens": 19745334.0,
|
| 802 |
+
"step": 880
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"epoch": 0.07961178075452288,
|
| 806 |
+
"grad_norm": 0.5072513818740845,
|
| 807 |
+
"learning_rate": 1.968190356919224e-05,
|
| 808 |
+
"loss": 3.3259,
|
| 809 |
+
"mean_token_accuracy": 0.40673111006617546,
|
| 810 |
+
"num_tokens": 19969825.0,
|
| 811 |
+
"step": 890
|
| 812 |
+
},
|
| 813 |
+
{
|
| 814 |
+
"epoch": 0.08050629514502315,
|
| 815 |
+
"grad_norm": 0.5777090787887573,
|
| 816 |
+
"learning_rate": 1.9678325431612848e-05,
|
| 817 |
+
"loss": 3.2855,
|
| 818 |
+
"mean_token_accuracy": 0.4096429578959942,
|
| 819 |
+
"num_tokens": 20193404.0,
|
| 820 |
+
"step": 900
|
| 821 |
+
},
|
| 822 |
+
{
|
| 823 |
+
"epoch": 0.08140080953552341,
|
| 824 |
+
"grad_norm": 0.5001935362815857,
|
| 825 |
+
"learning_rate": 1.967474729403346e-05,
|
| 826 |
+
"loss": 3.287,
|
| 827 |
+
"mean_token_accuracy": 0.41084871664643285,
|
| 828 |
+
"num_tokens": 20418412.0,
|
| 829 |
+
"step": 910
|
| 830 |
+
},
|
| 831 |
+
{
|
| 832 |
+
"epoch": 0.08229532392602366,
|
| 833 |
+
"grad_norm": 0.560683012008667,
|
| 834 |
+
"learning_rate": 1.9671169156454067e-05,
|
| 835 |
+
"loss": 3.3084,
|
| 836 |
+
"mean_token_accuracy": 0.4078595593571663,
|
| 837 |
+
"num_tokens": 20642807.0,
|
| 838 |
+
"step": 920
|
| 839 |
+
},
|
| 840 |
+
{
|
| 841 |
+
"epoch": 0.08318983831652392,
|
| 842 |
+
"grad_norm": 0.7433478832244873,
|
| 843 |
+
"learning_rate": 1.9667591018874676e-05,
|
| 844 |
+
"loss": 3.3168,
|
| 845 |
+
"mean_token_accuracy": 0.40866749435663224,
|
| 846 |
+
"num_tokens": 20866434.0,
|
| 847 |
+
"step": 930
|
| 848 |
+
},
|
| 849 |
+
{
|
| 850 |
+
"epoch": 0.08408435270702418,
|
| 851 |
+
"grad_norm": 0.47655490040779114,
|
| 852 |
+
"learning_rate": 1.9664012881295287e-05,
|
| 853 |
+
"loss": 3.2776,
|
| 854 |
+
"mean_token_accuracy": 0.40998933985829356,
|
| 855 |
+
"num_tokens": 21092326.0,
|
| 856 |
+
"step": 940
|
| 857 |
+
},
|
| 858 |
+
{
|
| 859 |
+
"epoch": 0.08497886709752443,
|
| 860 |
+
"grad_norm": 0.6603720784187317,
|
| 861 |
+
"learning_rate": 1.9660434743715895e-05,
|
| 862 |
+
"loss": 3.246,
|
| 863 |
+
"mean_token_accuracy": 0.41296741738915443,
|
| 864 |
+
"num_tokens": 21317008.0,
|
| 865 |
+
"step": 950
|
| 866 |
+
},
|
| 867 |
+
{
|
| 868 |
+
"epoch": 0.08587338148802469,
|
| 869 |
+
"grad_norm": 0.6937066912651062,
|
| 870 |
+
"learning_rate": 1.9656856606136507e-05,
|
| 871 |
+
"loss": 3.2978,
|
| 872 |
+
"mean_token_accuracy": 0.4103521354496479,
|
| 873 |
+
"num_tokens": 21541185.0,
|
| 874 |
+
"step": 960
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"epoch": 0.08676789587852494,
|
| 878 |
+
"grad_norm": 0.5999243259429932,
|
| 879 |
+
"learning_rate": 1.965327846855712e-05,
|
| 880 |
+
"loss": 3.2726,
|
| 881 |
+
"mean_token_accuracy": 0.41270416751503947,
|
| 882 |
+
"num_tokens": 21765805.0,
|
| 883 |
+
"step": 970
|
| 884 |
+
},
|
| 885 |
+
{
|
| 886 |
+
"epoch": 0.0876624102690252,
|
| 887 |
+
"grad_norm": 0.4740482270717621,
|
| 888 |
+
"learning_rate": 1.9649700330977727e-05,
|
| 889 |
+
"loss": 3.2616,
|
| 890 |
+
"mean_token_accuracy": 0.41271830424666406,
|
| 891 |
+
"num_tokens": 21993399.0,
|
| 892 |
+
"step": 980
|
| 893 |
+
},
|
| 894 |
+
{
|
| 895 |
+
"epoch": 0.08855692465952546,
|
| 896 |
+
"grad_norm": 0.4714813530445099,
|
| 897 |
+
"learning_rate": 1.964612219339834e-05,
|
| 898 |
+
"loss": 3.2992,
|
| 899 |
+
"mean_token_accuracy": 0.4101860985159874,
|
| 900 |
+
"num_tokens": 22217604.0,
|
| 901 |
+
"step": 990
|
| 902 |
+
},
|
| 903 |
+
{
|
| 904 |
+
"epoch": 0.08945143905002571,
|
| 905 |
+
"grad_norm": 0.422974556684494,
|
| 906 |
+
"learning_rate": 1.9642544055818947e-05,
|
| 907 |
+
"loss": 3.2634,
|
| 908 |
+
"mean_token_accuracy": 0.41224386021494863,
|
| 909 |
+
"num_tokens": 22442384.0,
|
| 910 |
+
"step": 1000
|
| 911 |
+
}
|
| 912 |
+
],
|
| 913 |
+
"logging_steps": 10,
|
| 914 |
+
"max_steps": 55895,
|
| 915 |
+
"num_input_tokens_seen": 0,
|
| 916 |
+
"num_train_epochs": 5,
|
| 917 |
+
"save_steps": 500,
|
| 918 |
+
"stateful_callbacks": {
|
| 919 |
+
"TrainerControl": {
|
| 920 |
+
"args": {
|
| 921 |
+
"should_epoch_stop": false,
|
| 922 |
+
"should_evaluate": false,
|
| 923 |
+
"should_log": false,
|
| 924 |
+
"should_save": true,
|
| 925 |
+
"should_training_stop": false
|
| 926 |
+
},
|
| 927 |
+
"attributes": {}
|
| 928 |
+
}
|
| 929 |
+
},
|
| 930 |
+
"total_flos": 1.1315610691043328e+16,
|
| 931 |
+
"train_batch_size": 64,
|
| 932 |
+
"trial_name": null,
|
| 933 |
+
"trial_params": null
|
| 934 |
+
}
|
checkpoint-1000/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
|
| 3 |
+
size 5688
|
checkpoint-10000/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 1024,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"mlp_bias": false,
|
| 16 |
+
"model_type": "llama",
|
| 17 |
+
"num_attention_heads": 6,
|
| 18 |
+
"num_hidden_layers": 6,
|
| 19 |
+
"num_key_value_heads": 6,
|
| 20 |
+
"pad_token_id": 0,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"torch_dtype": "float32",
|
| 27 |
+
"transformers_version": "4.51.3",
|
| 28 |
+
"use_cache": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
checkpoint-10000/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.51.3"
|
| 7 |
+
}
|
checkpoint-10000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:458c8eba82e517f5ba85b362bd54b06f115679c56b6b1c4071cb969b5c55915f
|
| 3 |
+
size 309900448
|
checkpoint-10000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ed7277ef8481f73983e89138757736d72118e72dd5e978643a274ade68505a4
|
| 3 |
+
size 619836730
|
checkpoint-10000/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
|
| 3 |
+
size 14244
|
checkpoint-10000/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d5bb0c2380f80076bc24221e137f860d6cf7f973dc1aa82881bce26bcfde343
|
| 3 |
+
size 1064
|
checkpoint-10000/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
checkpoint-10000/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-10000/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
checkpoint-10000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": true,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": true,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": true,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 37 |
+
"pad_token": "</s>",
|
| 38 |
+
"sp_model_kwargs": {},
|
| 39 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 40 |
+
"unk_token": "<unk>",
|
| 41 |
+
"use_default_system_prompt": false
|
| 42 |
+
}
|
checkpoint-10000/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-10000/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
|
| 3 |
+
size 5688
|
checkpoint-10500/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 1024,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"mlp_bias": false,
|
| 16 |
+
"model_type": "llama",
|
| 17 |
+
"num_attention_heads": 6,
|
| 18 |
+
"num_hidden_layers": 6,
|
| 19 |
+
"num_key_value_heads": 6,
|
| 20 |
+
"pad_token_id": 0,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"torch_dtype": "float32",
|
| 27 |
+
"transformers_version": "4.51.3",
|
| 28 |
+
"use_cache": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
checkpoint-10500/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.51.3"
|
| 7 |
+
}
|
checkpoint-10500/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90611e57d216eaabc14ef4dbed35b6d3636be548a9d21e0c7a3d9be4f566bb4e
|
| 3 |
+
size 309900448
|
checkpoint-10500/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b59b975e194b7bb1e98b21294e78c9c86dc0b58cd3892b629c2479b022994f8
|
| 3 |
+
size 619836730
|
checkpoint-10500/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
|
| 3 |
+
size 14244
|
checkpoint-10500/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a2d11ff118d0553d52a97bddfd207f05928501796b8701d3a53e420beb2f5b7
|
| 3 |
+
size 1064
|
checkpoint-10500/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
checkpoint-10500/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-10500/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
checkpoint-10500/tokenizer_config.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": true,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": true,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": true,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 37 |
+
"pad_token": "</s>",
|
| 38 |
+
"sp_model_kwargs": {},
|
| 39 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 40 |
+
"unk_token": "<unk>",
|
| 41 |
+
"use_default_system_prompt": false
|
| 42 |
+
}
|
checkpoint-10500/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-10500/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
|
| 3 |
+
size 5688
|
checkpoint-11000/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 1024,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"mlp_bias": false,
|
| 16 |
+
"model_type": "llama",
|
| 17 |
+
"num_attention_heads": 6,
|
| 18 |
+
"num_hidden_layers": 6,
|
| 19 |
+
"num_key_value_heads": 6,
|
| 20 |
+
"pad_token_id": 0,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"torch_dtype": "float32",
|
| 27 |
+
"transformers_version": "4.51.3",
|
| 28 |
+
"use_cache": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
checkpoint-11000/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.51.3"
|
| 7 |
+
}
|
checkpoint-11000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:933c3626401ae46c5786aecd6efab533e1656993686117b3be8aaf60bb034acb
|
| 3 |
+
size 309900448
|
checkpoint-11000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b75082e4ecdcf5ef7d34918fb783df1e626aaf7dd5e930474d55cb41da9ab754
|
| 3 |
+
size 619836730
|
checkpoint-11000/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
|
| 3 |
+
size 14244
|
checkpoint-11000/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebabd05435cd42dd6dc311e938b4060596334d385dedeffb84634c6f408fa78b
|
| 3 |
+
size 1064
|
checkpoint-11000/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
checkpoint-11000/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-11000/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
checkpoint-11000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": true,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": true,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": true,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 37 |
+
"pad_token": "</s>",
|
| 38 |
+
"sp_model_kwargs": {},
|
| 39 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 40 |
+
"unk_token": "<unk>",
|
| 41 |
+
"use_default_system_prompt": false
|
| 42 |
+
}
|
checkpoint-11000/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-11000/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eec90019e03b3308744cf7428c0bc743b8dd914d925c9059ff38ec49b74a159f
|
| 3 |
+
size 5688
|
checkpoint-11500/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"head_dim": 128,
|
| 10 |
+
"hidden_act": "silu",
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"intermediate_size": 1024,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"mlp_bias": false,
|
| 16 |
+
"model_type": "llama",
|
| 17 |
+
"num_attention_heads": 6,
|
| 18 |
+
"num_hidden_layers": 6,
|
| 19 |
+
"num_key_value_heads": 6,
|
| 20 |
+
"pad_token_id": 0,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"torch_dtype": "float32",
|
| 27 |
+
"transformers_version": "4.51.3",
|
| 28 |
+
"use_cache": true,
|
| 29 |
+
"vocab_size": 32000
|
| 30 |
+
}
|
checkpoint-11500/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.51.3"
|
| 7 |
+
}
|