Hizy committed
Commit e3c59f3 · verified · 1 Parent(s): 923face

Upload folder using huggingface_hub

Files changed (32)
  1. .gitattributes +1 -0
  2. qwen2-gob-plan-115/README.md +73 -0
  3. qwen2-gob-plan-115/added_tokens.json +24 -0
  4. qwen2-gob-plan-115/all_results.json +12 -0
  5. qwen2-gob-plan-115/chat_template.jinja +54 -0
  6. qwen2-gob-plan-115/config.json +58 -0
  7. qwen2-gob-plan-115/eval_results.json +7 -0
  8. qwen2-gob-plan-115/generation_config.json +13 -0
  9. qwen2-gob-plan-115/merges.txt +0 -0
  10. qwen2-gob-plan-115/model-00001-of-00004.safetensors +3 -0
  11. qwen2-gob-plan-115/model-00002-of-00004.safetensors +3 -0
  12. qwen2-gob-plan-115/model-00003-of-00004.safetensors +3 -0
  13. qwen2-gob-plan-115/model-00004-of-00004.safetensors +3 -0
  14. qwen2-gob-plan-115/model.safetensors.index.json +347 -0
  15. qwen2-gob-plan-115/runs/Jan16_22-04-04_hgx001/events.out.tfevents.1768572504.hgx001.402760.0 +3 -0
  16. qwen2-gob-plan-115/runs/Jan16_22-18-02_hgx001/events.out.tfevents.1768573293.hgx001.407010.0 +3 -0
  17. qwen2-gob-plan-115/runs/Jan16_22-33-49_hgx001/events.out.tfevents.1768574249.hgx001.412455.0 +3 -0
  18. qwen2-gob-plan-115/runs/Jan16_23-25-23_hgx001/events.out.tfevents.1768577364.hgx001.425689.0 +3 -0
  19. qwen2-gob-plan-115/runs/Jan16_23-34-59_hgx001/events.out.tfevents.1768577955.hgx001.430957.0 +3 -0
  20. qwen2-gob-plan-115/runs/Jan17_00-14-02_hgx001/events.out.tfevents.1768580289.hgx001.438329.0 +3 -0
  21. qwen2-gob-plan-115/runs/Jan17_00-52-50_hgx001/events.out.tfevents.1768582598.hgx001.445580.0 +3 -0
  22. qwen2-gob-plan-115/runs/Jan17_00-52-50_hgx001/events.out.tfevents.1768722851.hgx001.445580.1 +3 -0
  23. qwen2-gob-plan-115/special_tokens_map.json +31 -0
  24. qwen2-gob-plan-115/tokenizer.json +3 -0
  25. qwen2-gob-plan-115/tokenizer_config.json +208 -0
  26. qwen2-gob-plan-115/train_results.json +8 -0
  27. qwen2-gob-plan-115/trainer_log.jsonl +430 -0
  28. qwen2-gob-plan-115/trainer_state.json +3054 -0
  29. qwen2-gob-plan-115/training_args.bin +3 -0
  30. qwen2-gob-plan-115/training_eval_loss.png +0 -0
  31. qwen2-gob-plan-115/training_loss.png +0 -0
  32. qwen2-gob-plan-115/vocab.json +0 -0
.gitattributes CHANGED
@@ -45,3 +45,4 @@ agenttrek-llama/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  webrl-llama/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  atomic_expert/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  test_admin/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ qwen2-gob-plan-115/tokenizer.json filter=lfs diff=lfs merge=lfs -text
qwen2-gob-plan-115/README.md ADDED
@@ -0,0 +1,73 @@
+ ---
+ library_name: transformers
+ license: other
+ base_model: Qwen/Qwen2.5-7B-Instruct
+ tags:
+ - llama-factory
+ - full
+ - generated_from_trainer
+ model-index:
+ - name: qwen2-gob115-1epoch
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # qwen2-gob115-1epoch
+
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the gob-plan-115 dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.2078
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 8
+ - total_eval_batch_size: 4
+ - optimizer: ADAMW_TORCH with betas=(0.9, 0.999) and epsilon=1e-08 (no additional optimizer arguments)
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 0.8213 | 0.1187 | 500 | 0.7408 |
+ | 0.5742 | 0.2374 | 1000 | 0.6352 |
+ | 0.5326 | 0.3562 | 1500 | 0.5120 |
+ | 0.4165 | 0.4749 | 2000 | 0.4213 |
+ | 0.4129 | 0.5936 | 2500 | 0.3469 |
+ | 0.345 | 0.7123 | 3000 | 0.2756 |
+ | 0.1919 | 0.8311 | 3500 | 0.2285 |
+ | 0.2301 | 0.9498 | 4000 | 0.2088 |
+
+
+ ### Framework versions
+
+ - Transformers 4.57.1
+ - Pytorch 2.7.1+cu126
+ - Datasets 4.0.0
+ - Tokenizers 0.22.2-rc0
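
The checkpoint loads like any other Qwen2 causal LM once the folder is fetched. A minimal inference sketch, assuming a local copy of `qwen2-gob-plan-115` (the prompt is illustrative):

```python
# Hedged sketch: load the uploaded checkpoint and run one chat turn.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "qwen2-gob-plan-115"  # assumed local download of this folder
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto"  # matches the bfloat16 config below
)

messages = [{"role": "user", "content": "Draft a step-by-step plan for the task."}]  # illustrative
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
out = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
```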
qwen2-gob-plan-115/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
qwen2-gob-plan-115/all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.20783543586730957,
+ "eval_runtime": 2007.8977,
+ "eval_samples_per_second": 1.865,
+ "eval_steps_per_second": 0.466,
+ "total_flos": 449297978753024.0,
+ "train_loss": 0.44325452399106674,
+ "train_runtime": 138200.0012,
+ "train_samples_per_second": 0.244,
+ "train_steps_per_second": 0.03
+ }
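
The aggregates are internally consistent; a quick sanity check using only the numbers above and the step count from trainer_log.jsonl:

```python
# 4212 optimizer steps over the reported runtime reproduce the throughput figures.
total_steps = 4212             # from trainer_log.jsonl below
train_runtime = 138200.0012    # seconds, from all_results.json above
print(total_steps / train_runtime)          # ≈ 0.030 -> train_steps_per_second
print(0.244 * train_runtime / total_steps)  # ≈ 8 samples/step = total_train_batch_size
```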
qwen2-gob-plan-115/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] }}
+ {%- else %}
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+ {%- endif %}
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content %}
+ {{- '\n' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
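
This is the stock Qwen2.5 ChatML template with Hermes-style tool calling. A sketch of exercising it through the tokenizer (the weather function is a made-up schema, not part of this repo):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("qwen2-gob-plan-115")  # assumed local copy

tools = [{"type": "function", "function": {          # hypothetical tool for illustration
    "name": "get_weather",
    "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
}}]
messages = [{"role": "user", "content": "Weather in Paris?"}]
text = tokenizer.apply_chat_template(
    messages, tools=tools, add_generation_prompt=True, tokenize=False
)
print(text)  # system block listing the tool inside <tools>…</tools>, then the user turn
```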
qwen2-gob-plan-115/config.json ADDED
@@ -0,0 +1,58 @@
+ {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "dtype": "bfloat16",
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "pad_token_id": 151643,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "transformers_version": "4.57.1",
+ "use_cache": false,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ }
qwen2-gob-plan-115/eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.20783543586730957,
+ "eval_runtime": 2007.8977,
+ "eval_samples_per_second": 1.865,
+ "eval_steps_per_second": 0.466
+ }
qwen2-gob-plan-115/generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05,
+ "temperature": 0.7,
+ "top_k": 20,
+ "top_p": 0.8,
+ "transformers_version": "4.57.1"
+ }
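
These sampling defaults are applied automatically whenever `model.generate` runs without overrides; they can also be inspected directly:

```python
# Read the decoding defaults shipped with the checkpoint (assumed local path).
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("qwen2-gob-plan-115")
print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k, gen_cfg.repetition_penalty)
# 0.7 0.8 20 1.05
```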
qwen2-gob-plan-115/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2-gob-plan-115/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5eaed636e2a362995a2ae1e3d93417296f0817299ea421e2081480ec8a7f5a4d
+ size 4877660776
qwen2-gob-plan-115/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:455e8fedb210c68589f7191fd0f57633c876c80cbd1314a2bf9f9f287ec2e5f6
+ size 4932751008
qwen2-gob-plan-115/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fae00c76c424b973d8fa11db8fd032a3dcf52f1f810262c98c21f136fb2aee37
+ size 4330865200
qwen2-gob-plan-115/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33164791e5562ae2390bbd4b8984dc603d6b5691b9b98483b1047cb1b97c1a27
+ size 1089994880
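
Each shard above is a Git LFS pointer: the binary itself lives in LFS storage and is identified by the sha256 oid. A downloaded shard can be verified against its pointer with a short hash check:

```python
# Stream-hash a shard and compare with the oid in its LFS pointer.
import hashlib

def lfs_sha256(path: str, chunk: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# Expected: 33164791e5562ae2390bbd4b8984dc603d6b5691b9b98483b1047cb1b97c1a27
print(lfs_sha256("qwen2-gob-plan-115/model-00004-of-00004.safetensors"))
```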
qwen2-gob-plan-115/model.safetensors.index.json ADDED
@@ -0,0 +1,347 @@
+ {
+ "metadata": {
+ "total_parameters": 333312,
+ "total_size": 15231233024
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00004-of-00004.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.norm.weight": "model-00003-of-00004.safetensors"
+ }
+ }
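
The weight map routes every tensor name to the shard that stores it, so a loader only has to open the files it needs. A manual lookup sketch (safetensors assumed installed; note layer 18 straddles shards 2 and 3):

```python
import json
from safetensors import safe_open

with open("qwen2-gob-plan-115/model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.18.mlp.gate_proj.weight"
shard = index["weight_map"][name]              # -> model-00002-of-00004.safetensors
with safe_open(f"qwen2-gob-plan-115/{shard}", framework="pt") as st:
    tensor = st.get_tensor(name)
print(shard, tuple(tensor.shape))
```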
qwen2-gob-plan-115/runs/Jan16_22-04-04_hgx001/events.out.tfevents.1768572504.hgx001.402760.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ece68b4cbb87edc64c9572bcf73c9163a912df43e4211608d030ccd63d96dbb1
+ size 6219
qwen2-gob-plan-115/runs/Jan16_22-18-02_hgx001/events.out.tfevents.1768573293.hgx001.407010.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50d602f17273ff234e0e9d044eefbe5a227265fe9a927b3296de5dac47449880
+ size 6426
qwen2-gob-plan-115/runs/Jan16_22-33-49_hgx001/events.out.tfevents.1768574249.hgx001.412455.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:127f87043cd9a855049a272574423d8548a73954ba79861e2bfd89b4bb405090
+ size 7047
qwen2-gob-plan-115/runs/Jan16_23-25-23_hgx001/events.out.tfevents.1768577364.hgx001.425689.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:981855c3e3b230945e5c6aef116945f9e638bdccae695576be6c6c014090b9d3
+ size 6219
qwen2-gob-plan-115/runs/Jan16_23-34-59_hgx001/events.out.tfevents.1768577955.hgx001.430957.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:716a5d1630147fc12ff400ebf5287732d0b16aa3be8f57f77f64e4e4deb08b77
+ size 7047
qwen2-gob-plan-115/runs/Jan17_00-14-02_hgx001/events.out.tfevents.1768580289.hgx001.438329.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9cf17fcae66da4f5b0367dcf8009641e206caaa7a1a3e5c1383785068918e7e
+ size 6840
qwen2-gob-plan-115/runs/Jan17_00-52-50_hgx001/events.out.tfevents.1768582598.hgx001.445580.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f15941577af486535ac0b0f5d76d4a534714f12e5d5ce18ef22835100cad3818
+ size 97524
qwen2-gob-plan-115/runs/Jan17_00-52-50_hgx001/events.out.tfevents.1768722851.hgx001.445580.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42799118eae238125172c4e42c26147a288f17881c3c1c80eeb9fc26237bdb4a
+ size 359
qwen2-gob-plan-115/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
qwen2-gob-plan-115/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
qwen2-gob-plan-115/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "padding_side": "right",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
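
The decoder table pins each added token to a fixed id, so the special-token mapping can be verified after loading:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("qwen2-gob-plan-115")  # assumed local copy
print(tok.convert_tokens_to_ids("<|im_end|>"))     # 151645, the eos token
print(tok.convert_tokens_to_ids("<|endoftext|>"))  # 151643, the pad token
print(tok.eos_token, tok.pad_token, tok.model_max_length)
```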
qwen2-gob-plan-115/train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 449297978753024.0,
+ "train_loss": 0.44325452399106674,
+ "train_runtime": 138200.0012,
+ "train_samples_per_second": 0.244,
+ "train_steps_per_second": 0.03
+ }
qwen2-gob-plan-115/trainer_log.jsonl ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 4212, "loss": 2.1021, "lr": 2.1327014218009483e-06, "epoch": 0.0023744509082274726, "percentage": 0.24, "elapsed_time": "0:05:48", "remaining_time": "1 day, 16:38:56"}
2
+ {"current_steps": 20, "total_steps": 4212, "loss": 1.0168, "lr": 4.502369668246446e-06, "epoch": 0.004748901816454945, "percentage": 0.47, "elapsed_time": "0:11:10", "remaining_time": "1 day, 15:02:26"}
3
+ {"current_steps": 30, "total_steps": 4212, "loss": 0.6508, "lr": 6.8720379146919435e-06, "epoch": 0.007123352724682417, "percentage": 0.71, "elapsed_time": "0:17:06", "remaining_time": "1 day, 15:45:49"}
4
+ {"current_steps": 40, "total_steps": 4212, "loss": 0.5579, "lr": 9.24170616113744e-06, "epoch": 0.00949780363290989, "percentage": 0.95, "elapsed_time": "0:21:30", "remaining_time": "1 day, 13:24:05"}
5
+ {"current_steps": 50, "total_steps": 4212, "loss": 0.5911, "lr": 1.161137440758294e-05, "epoch": 0.011872254541137363, "percentage": 1.19, "elapsed_time": "0:26:59", "remaining_time": "1 day, 13:27:11"}
6
+ {"current_steps": 60, "total_steps": 4212, "loss": 0.5363, "lr": 1.3981042654028437e-05, "epoch": 0.014246705449364834, "percentage": 1.42, "elapsed_time": "0:32:04", "remaining_time": "1 day, 12:59:45"}
7
+ {"current_steps": 70, "total_steps": 4212, "loss": 0.5813, "lr": 1.6350710900473933e-05, "epoch": 0.016621156357592308, "percentage": 1.66, "elapsed_time": "0:36:17", "remaining_time": "1 day, 11:47:42"}
8
+ {"current_steps": 80, "total_steps": 4212, "loss": 0.5928, "lr": 1.872037914691943e-05, "epoch": 0.01899560726581978, "percentage": 1.9, "elapsed_time": "0:41:03", "remaining_time": "1 day, 11:20:45"}
9
+ {"current_steps": 90, "total_steps": 4212, "loss": 0.5576, "lr": 2.109004739336493e-05, "epoch": 0.021370058174047253, "percentage": 2.14, "elapsed_time": "0:45:43", "remaining_time": "1 day, 10:54:16"}
10
+ {"current_steps": 100, "total_steps": 4212, "loss": 0.5891, "lr": 2.345971563981043e-05, "epoch": 0.023744509082274726, "percentage": 2.37, "elapsed_time": "0:49:40", "remaining_time": "1 day, 10:02:52"}
11
+ {"current_steps": 110, "total_steps": 4212, "loss": 0.6125, "lr": 2.5829383886255927e-05, "epoch": 0.026118959990502195, "percentage": 2.61, "elapsed_time": "0:55:00", "remaining_time": "1 day, 10:11:28"}
12
+ {"current_steps": 120, "total_steps": 4212, "loss": 0.5772, "lr": 2.8199052132701424e-05, "epoch": 0.028493410898729667, "percentage": 2.85, "elapsed_time": "0:59:18", "remaining_time": "1 day, 9:42:31"}
13
+ {"current_steps": 130, "total_steps": 4212, "loss": 0.6367, "lr": 3.056872037914692e-05, "epoch": 0.03086786180695714, "percentage": 3.09, "elapsed_time": "1:04:39", "remaining_time": "1 day, 9:50:02"}
14
+ {"current_steps": 140, "total_steps": 4212, "loss": 0.6695, "lr": 3.293838862559242e-05, "epoch": 0.033242312715184616, "percentage": 3.32, "elapsed_time": "1:09:42", "remaining_time": "1 day, 9:47:43"}
15
+ {"current_steps": 150, "total_steps": 4212, "loss": 0.6226, "lr": 3.530805687203792e-05, "epoch": 0.03561676362341209, "percentage": 3.56, "elapsed_time": "1:14:55", "remaining_time": "1 day, 9:48:49"}
16
+ {"current_steps": 160, "total_steps": 4212, "loss": 0.6079, "lr": 3.767772511848342e-05, "epoch": 0.03799121453163956, "percentage": 3.8, "elapsed_time": "1:19:19", "remaining_time": "1 day, 9:28:51"}
17
+ {"current_steps": 170, "total_steps": 4212, "loss": 0.6063, "lr": 4.004739336492891e-05, "epoch": 0.040365665439867034, "percentage": 4.04, "elapsed_time": "1:24:55", "remaining_time": "1 day, 9:39:07"}
18
+ {"current_steps": 180, "total_steps": 4212, "loss": 0.5927, "lr": 4.2417061611374406e-05, "epoch": 0.042740116348094506, "percentage": 4.27, "elapsed_time": "1:29:44", "remaining_time": "1 day, 9:30:18"}
19
+ {"current_steps": 190, "total_steps": 4212, "loss": 0.6349, "lr": 4.478672985781991e-05, "epoch": 0.04511456725632198, "percentage": 4.51, "elapsed_time": "1:34:21", "remaining_time": "1 day, 9:17:28"}
20
+ {"current_steps": 200, "total_steps": 4212, "loss": 0.6307, "lr": 4.71563981042654e-05, "epoch": 0.04748901816454945, "percentage": 4.75, "elapsed_time": "1:38:34", "remaining_time": "1 day, 8:57:22"}
21
+ {"current_steps": 210, "total_steps": 4212, "loss": 0.6583, "lr": 4.95260663507109e-05, "epoch": 0.04986346907277692, "percentage": 4.99, "elapsed_time": "1:43:41", "remaining_time": "1 day, 8:55:56"}
22
+ {"current_steps": 220, "total_steps": 4212, "loss": 0.7049, "lr": 5.1895734597156396e-05, "epoch": 0.05223791998100439, "percentage": 5.22, "elapsed_time": "1:49:31", "remaining_time": "1 day, 9:07:25"}
23
+ {"current_steps": 230, "total_steps": 4212, "loss": 0.6285, "lr": 5.42654028436019e-05, "epoch": 0.05461237088923186, "percentage": 5.46, "elapsed_time": "1:53:55", "remaining_time": "1 day, 8:52:18"}
24
+ {"current_steps": 240, "total_steps": 4212, "loss": 0.6836, "lr": 5.66350710900474e-05, "epoch": 0.056986821797459335, "percentage": 5.7, "elapsed_time": "1:59:01", "remaining_time": "1 day, 8:49:56"}
25
+ {"current_steps": 250, "total_steps": 4212, "loss": 0.641, "lr": 5.90047393364929e-05, "epoch": 0.05936127270568681, "percentage": 5.94, "elapsed_time": "2:03:48", "remaining_time": "1 day, 8:42:00"}
26
+ {"current_steps": 260, "total_steps": 4212, "loss": 0.6517, "lr": 6.137440758293839e-05, "epoch": 0.06173572361391428, "percentage": 6.17, "elapsed_time": "2:08:16", "remaining_time": "1 day, 8:29:45"}
27
+ {"current_steps": 270, "total_steps": 4212, "loss": 0.7061, "lr": 6.374407582938389e-05, "epoch": 0.06411017452214175, "percentage": 6.41, "elapsed_time": "2:13:09", "remaining_time": "1 day, 8:24:03"}
28
+ {"current_steps": 280, "total_steps": 4212, "loss": 0.9615, "lr": 6.611374407582939e-05, "epoch": 0.06648462543036923, "percentage": 6.65, "elapsed_time": "2:17:17", "remaining_time": "1 day, 8:08:02"}
29
+ {"current_steps": 290, "total_steps": 4212, "loss": 0.7798, "lr": 6.848341232227489e-05, "epoch": 0.0688590763385967, "percentage": 6.89, "elapsed_time": "2:22:17", "remaining_time": "1 day, 8:04:17"}
+ {"current_steps": 300, "total_steps": 4212, "loss": 0.6821, "lr": 7.085308056872039e-05, "epoch": 0.07123352724682418, "percentage": 7.12, "elapsed_time": "2:27:03", "remaining_time": "1 day, 7:57:43"}
+ {"current_steps": 310, "total_steps": 4212, "loss": 0.6711, "lr": 7.322274881516588e-05, "epoch": 0.07360797815505164, "percentage": 7.36, "elapsed_time": "2:31:14", "remaining_time": "1 day, 7:43:45"}
+ {"current_steps": 320, "total_steps": 4212, "loss": 0.6888, "lr": 7.559241706161138e-05, "epoch": 0.07598242906327912, "percentage": 7.6, "elapsed_time": "2:35:33", "remaining_time": "1 day, 7:31:53"}
+ {"current_steps": 330, "total_steps": 4212, "loss": 0.8411, "lr": 7.796208530805688e-05, "epoch": 0.07835687997150659, "percentage": 7.83, "elapsed_time": "2:39:48", "remaining_time": "1 day, 7:19:58"}
+ {"current_steps": 340, "total_steps": 4212, "loss": 0.6802, "lr": 8.033175355450238e-05, "epoch": 0.08073133087973407, "percentage": 8.07, "elapsed_time": "2:44:55", "remaining_time": "1 day, 7:18:11"}
+ {"current_steps": 350, "total_steps": 4212, "loss": 0.7479, "lr": 8.270142180094788e-05, "epoch": 0.08310578178796153, "percentage": 8.31, "elapsed_time": "2:49:42", "remaining_time": "1 day, 7:12:31"}
+ {"current_steps": 360, "total_steps": 4212, "loss": 0.7292, "lr": 8.507109004739337e-05, "epoch": 0.08548023269618901, "percentage": 8.55, "elapsed_time": "2:54:32", "remaining_time": "1 day, 7:07:39"}
+ {"current_steps": 370, "total_steps": 4212, "loss": 0.6956, "lr": 8.744075829383887e-05, "epoch": 0.08785468360441648, "percentage": 8.78, "elapsed_time": "2:59:11", "remaining_time": "1 day, 7:00:41"}
+ {"current_steps": 380, "total_steps": 4212, "loss": 0.7527, "lr": 8.981042654028437e-05, "epoch": 0.09022913451264396, "percentage": 9.02, "elapsed_time": "3:04:19", "remaining_time": "1 day, 6:58:47"}
+ {"current_steps": 390, "total_steps": 4212, "loss": 0.7719, "lr": 9.218009478672986e-05, "epoch": 0.09260358542087142, "percentage": 9.26, "elapsed_time": "3:09:45", "remaining_time": "1 day, 6:59:41"}
+ {"current_steps": 400, "total_steps": 4212, "loss": 0.7168, "lr": 9.454976303317536e-05, "epoch": 0.0949780363290989, "percentage": 9.5, "elapsed_time": "3:14:20", "remaining_time": "1 day, 6:52:04"}
+ {"current_steps": 410, "total_steps": 4212, "loss": 0.8366, "lr": 9.691943127962086e-05, "epoch": 0.09735248723732637, "percentage": 9.73, "elapsed_time": "3:19:23", "remaining_time": "1 day, 6:49:03"}
+ {"current_steps": 420, "total_steps": 4212, "loss": 0.7962, "lr": 9.928909952606635e-05, "epoch": 0.09972693814555383, "percentage": 9.97, "elapsed_time": "3:24:39", "remaining_time": "1 day, 6:47:50"}
+ {"current_steps": 430, "total_steps": 4212, "loss": 0.8253, "lr": 9.999915830219296e-05, "epoch": 0.10210138905378131, "percentage": 10.21, "elapsed_time": "3:28:50", "remaining_time": "1 day, 6:36:53"}
+ {"current_steps": 440, "total_steps": 4212, "loss": 0.7228, "lr": 9.999503576890838e-05, "epoch": 0.10447583996200878, "percentage": 10.45, "elapsed_time": "3:33:30", "remaining_time": "1 day, 6:30:24"}
+ {"current_steps": 450, "total_steps": 4212, "loss": 0.8091, "lr": 9.998747808549429e-05, "epoch": 0.10685029087023626, "percentage": 10.68, "elapsed_time": "3:38:11", "remaining_time": "1 day, 6:24:02"}
+ {"current_steps": 460, "total_steps": 4212, "loss": 0.8117, "lr": 9.997648577123782e-05, "epoch": 0.10922474177846372, "percentage": 10.92, "elapsed_time": "3:43:30", "remaining_time": "1 day, 6:23:01"}
+ {"current_steps": 470, "total_steps": 4212, "loss": 0.7577, "lr": 9.996205958141894e-05, "epoch": 0.1115991926866912, "percentage": 11.16, "elapsed_time": "3:48:41", "remaining_time": "1 day, 6:20:46"}
+ {"current_steps": 480, "total_steps": 4212, "loss": 0.8031, "lr": 9.994420050725863e-05, "epoch": 0.11397364359491867, "percentage": 11.4, "elapsed_time": "3:53:58", "remaining_time": "1 day, 6:19:05"}
+ {"current_steps": 490, "total_steps": 4212, "loss": 0.7887, "lr": 9.992290977585072e-05, "epoch": 0.11634809450314615, "percentage": 11.63, "elapsed_time": "3:59:17", "remaining_time": "1 day, 6:17:39"}
+ {"current_steps": 500, "total_steps": 4212, "loss": 0.8213, "lr": 9.989818885007766e-05, "epoch": 0.11872254541137361, "percentage": 11.87, "elapsed_time": "4:04:33", "remaining_time": "1 day, 6:15:36"}
+ {"current_steps": 500, "total_steps": 4212, "eval_loss": 0.7407782077789307, "epoch": 0.11872254541137361, "percentage": 11.87, "elapsed_time": "4:37:58", "remaining_time": "1 day, 10:23:42"}
+ {"current_steps": 510, "total_steps": 4212, "loss": 0.7828, "lr": 9.987003942850989e-05, "epoch": 0.1210969963196011, "percentage": 12.11, "elapsed_time": "4:42:31", "remaining_time": "1 day, 10:10:50"}
+ {"current_steps": 520, "total_steps": 4212, "loss": 0.7454, "lr": 9.983846344528923e-05, "epoch": 0.12347144722782856, "percentage": 12.35, "elapsed_time": "4:47:26", "remaining_time": "1 day, 10:00:53"}
+ {"current_steps": 530, "total_steps": 4212, "loss": 0.6686, "lr": 9.980346306999596e-05, "epoch": 0.12584589813605604, "percentage": 12.58, "elapsed_time": "4:52:47", "remaining_time": "1 day, 9:54:06"}
+ {"current_steps": 540, "total_steps": 4212, "loss": 0.9724, "lr": 9.976504070749969e-05, "epoch": 0.1282203490442835, "percentage": 12.82, "elapsed_time": "4:57:26", "remaining_time": "1 day, 9:42:36"}
+ {"current_steps": 550, "total_steps": 4212, "loss": 0.735, "lr": 9.972319899779422e-05, "epoch": 0.13059479995251097, "percentage": 13.06, "elapsed_time": "5:02:27", "remaining_time": "1 day, 9:33:51"}
+ {"current_steps": 560, "total_steps": 4212, "loss": 0.7613, "lr": 9.967794081581606e-05, "epoch": 0.13296925086073846, "percentage": 13.3, "elapsed_time": "5:07:00", "remaining_time": "1 day, 9:22:06"}
+ {"current_steps": 570, "total_steps": 4212, "loss": 0.7509, "lr": 9.962926927124697e-05, "epoch": 0.13534370176896593, "percentage": 13.53, "elapsed_time": "5:11:36", "remaining_time": "1 day, 9:10:57"}
+ {"current_steps": 580, "total_steps": 4212, "loss": 0.7055, "lr": 9.957718770830022e-05, "epoch": 0.1377181526771934, "percentage": 13.77, "elapsed_time": "5:16:38", "remaining_time": "1 day, 9:02:50"}
+ {"current_steps": 590, "total_steps": 4212, "loss": 0.7494, "lr": 9.952169970549088e-05, "epoch": 0.14009260358542086, "percentage": 14.01, "elapsed_time": "5:21:39", "remaining_time": "1 day, 8:54:41"}
+ {"current_steps": 600, "total_steps": 4212, "loss": 0.7347, "lr": 9.946280907538985e-05, "epoch": 0.14246705449364835, "percentage": 14.25, "elapsed_time": "5:26:10", "remaining_time": "1 day, 8:43:35"}
+ {"current_steps": 610, "total_steps": 4212, "loss": 0.6848, "lr": 9.940051986436198e-05, "epoch": 0.14484150540187582, "percentage": 14.48, "elapsed_time": "5:30:58", "remaining_time": "1 day, 8:34:22"}
+ {"current_steps": 620, "total_steps": 4212, "loss": 0.7168, "lr": 9.933483635228804e-05, "epoch": 0.14721595631010329, "percentage": 14.72, "elapsed_time": "5:35:34", "remaining_time": "1 day, 8:24:11"}
+ {"current_steps": 630, "total_steps": 4212, "loss": 0.7163, "lr": 9.926576305227063e-05, "epoch": 0.14959040721833075, "percentage": 14.96, "elapsed_time": "5:40:34", "remaining_time": "1 day, 8:16:24"}
+ {"current_steps": 640, "total_steps": 4212, "loss": 0.639, "lr": 9.919330471032401e-05, "epoch": 0.15196485812655824, "percentage": 15.19, "elapsed_time": "5:45:52", "remaining_time": "1 day, 8:10:23"}
+ {"current_steps": 650, "total_steps": 4212, "loss": 0.6972, "lr": 9.911746630504818e-05, "epoch": 0.1543393090347857, "percentage": 15.43, "elapsed_time": "5:50:40", "remaining_time": "1 day, 8:01:39"}
+ {"current_steps": 660, "total_steps": 4212, "loss": 0.7853, "lr": 9.903825304728664e-05, "epoch": 0.15671375994301318, "percentage": 15.67, "elapsed_time": "5:55:33", "remaining_time": "1 day, 7:53:31"}
+ {"current_steps": 670, "total_steps": 4212, "loss": 0.6616, "lr": 9.895567037976842e-05, "epoch": 0.15908821085124064, "percentage": 15.91, "elapsed_time": "6:00:26", "remaining_time": "1 day, 7:45:31"}
+ {"current_steps": 680, "total_steps": 4212, "loss": 0.6649, "lr": 9.88697239767341e-05, "epoch": 0.16146266175946813, "percentage": 16.14, "elapsed_time": "6:04:53", "remaining_time": "1 day, 7:35:19"}
+ {"current_steps": 690, "total_steps": 4212, "loss": 0.7677, "lr": 9.878041974354598e-05, "epoch": 0.1638371126676956, "percentage": 16.38, "elapsed_time": "6:09:53", "remaining_time": "1 day, 7:28:04"}
+ {"current_steps": 700, "total_steps": 4212, "loss": 0.7151, "lr": 9.868776381628218e-05, "epoch": 0.16621156357592307, "percentage": 16.62, "elapsed_time": "6:14:34", "remaining_time": "1 day, 7:19:19"}
+ {"current_steps": 710, "total_steps": 4212, "loss": 0.8004, "lr": 9.859176256131522e-05, "epoch": 0.16858601448415053, "percentage": 16.86, "elapsed_time": "6:19:06", "remaining_time": "1 day, 7:09:52"}
+ {"current_steps": 720, "total_steps": 4212, "loss": 0.686, "lr": 9.849242257487447e-05, "epoch": 0.17096046539237802, "percentage": 17.09, "elapsed_time": "6:23:58", "remaining_time": "1 day, 7:02:15"}
+ {"current_steps": 730, "total_steps": 4212, "loss": 0.7076, "lr": 9.838975068259297e-05, "epoch": 0.1733349163006055, "percentage": 17.33, "elapsed_time": "6:29:16", "remaining_time": "1 day, 6:56:46"}
+ {"current_steps": 740, "total_steps": 4212, "loss": 0.7879, "lr": 9.828375393903842e-05, "epoch": 0.17570936720883296, "percentage": 17.57, "elapsed_time": "6:34:13", "remaining_time": "1 day, 6:49:41"}
+ {"current_steps": 750, "total_steps": 4212, "loss": 0.72, "lr": 9.817443962722843e-05, "epoch": 0.17808381811706042, "percentage": 17.81, "elapsed_time": "6:39:23", "remaining_time": "1 day, 6:43:34"}
+ {"current_steps": 760, "total_steps": 4212, "loss": 0.7231, "lr": 9.806181525813019e-05, "epoch": 0.18045826902528792, "percentage": 18.04, "elapsed_time": "6:44:10", "remaining_time": "1 day, 6:35:47"}
+ {"current_steps": 770, "total_steps": 4212, "loss": 0.6952, "lr": 9.79458885701443e-05, "epoch": 0.18283271993351538, "percentage": 18.28, "elapsed_time": "6:48:36", "remaining_time": "1 day, 6:26:31"}
+ {"current_steps": 780, "total_steps": 4212, "loss": 0.6398, "lr": 9.782666752857317e-05, "epoch": 0.18520717084174285, "percentage": 18.52, "elapsed_time": "6:53:37", "remaining_time": "1 day, 6:19:55"}
+ {"current_steps": 790, "total_steps": 4212, "loss": 0.7202, "lr": 9.770416032507361e-05, "epoch": 0.1875816217499703, "percentage": 18.76, "elapsed_time": "6:58:09", "remaining_time": "1 day, 6:11:16"}
+ {"current_steps": 800, "total_steps": 4212, "loss": 0.6733, "lr": 9.757837537709407e-05, "epoch": 0.1899560726581978, "percentage": 18.99, "elapsed_time": "7:03:06", "remaining_time": "1 day, 6:04:33"}
+ {"current_steps": 810, "total_steps": 4212, "loss": 0.7764, "lr": 9.744932132729625e-05, "epoch": 0.19233052356642527, "percentage": 19.23, "elapsed_time": "7:07:30", "remaining_time": "1 day, 5:55:31"}
+ {"current_steps": 820, "total_steps": 4212, "loss": 0.6662, "lr": 9.731700704296126e-05, "epoch": 0.19470497447465274, "percentage": 19.47, "elapsed_time": "7:12:44", "remaining_time": "1 day, 5:50:05"}
+ {"current_steps": 830, "total_steps": 4212, "loss": 0.6228, "lr": 9.71814416153803e-05, "epoch": 0.1970794253828802, "percentage": 19.71, "elapsed_time": "7:18:39", "remaining_time": "1 day, 5:47:23"}
+ {"current_steps": 840, "total_steps": 4212, "loss": 0.6479, "lr": 9.704263435923014e-05, "epoch": 0.19945387629110767, "percentage": 19.94, "elapsed_time": "7:24:38", "remaining_time": "1 day, 5:44:53"}
+ {"current_steps": 850, "total_steps": 4212, "loss": 0.8001, "lr": 9.690059481193295e-05, "epoch": 0.20182832719933516, "percentage": 20.18, "elapsed_time": "7:29:59", "remaining_time": "1 day, 5:39:49"}
+ {"current_steps": 860, "total_steps": 4212, "loss": 0.5831, "lr": 9.675533273300111e-05, "epoch": 0.20420277810756263, "percentage": 20.42, "elapsed_time": "7:35:30", "remaining_time": "1 day, 5:35:23"}
+ {"current_steps": 870, "total_steps": 4212, "loss": 0.6441, "lr": 9.660685810336654e-05, "epoch": 0.2065772290157901, "percentage": 20.66, "elapsed_time": "7:40:51", "remaining_time": "1 day, 5:30:21"}
+ {"current_steps": 880, "total_steps": 4212, "loss": 0.6613, "lr": 9.645518112469498e-05, "epoch": 0.20895167992401756, "percentage": 20.89, "elapsed_time": "7:45:39", "remaining_time": "1 day, 5:23:09"}
+ {"current_steps": 890, "total_steps": 4212, "loss": 0.5891, "lr": 9.630031221868501e-05, "epoch": 0.21132613083224505, "percentage": 21.13, "elapsed_time": "7:50:09", "remaining_time": "1 day, 5:14:54"}
+ {"current_steps": 900, "total_steps": 4212, "loss": 0.6525, "lr": 9.614226202635195e-05, "epoch": 0.21370058174047252, "percentage": 21.37, "elapsed_time": "7:55:29", "remaining_time": "1 day, 5:09:46"}
+ {"current_steps": 910, "total_steps": 4212, "loss": 0.6435, "lr": 9.59810414072968e-05, "epoch": 0.21607503264869998, "percentage": 21.6, "elapsed_time": "8:00:09", "remaining_time": "1 day, 5:02:15"}
+ {"current_steps": 920, "total_steps": 4212, "loss": 0.689, "lr": 9.581666143895994e-05, "epoch": 0.21844948355692745, "percentage": 21.84, "elapsed_time": "8:05:19", "remaining_time": "1 day, 4:56:35"}
+ {"current_steps": 930, "total_steps": 4212, "loss": 0.6445, "lr": 9.564913341586017e-05, "epoch": 0.22082393446515494, "percentage": 22.08, "elapsed_time": "8:11:38", "remaining_time": "1 day, 4:55:01"}
+ {"current_steps": 940, "total_steps": 4212, "loss": 0.6848, "lr": 9.547846884881853e-05, "epoch": 0.2231983853733824, "percentage": 22.32, "elapsed_time": "8:16:24", "remaining_time": "1 day, 4:47:56"}
+ {"current_steps": 950, "total_steps": 4212, "loss": 0.6939, "lr": 9.530467946416745e-05, "epoch": 0.22557283628160987, "percentage": 22.55, "elapsed_time": "8:20:53", "remaining_time": "1 day, 4:39:54"}
+ {"current_steps": 960, "total_steps": 4212, "loss": 0.6167, "lr": 9.512777720294504e-05, "epoch": 0.22794728718983734, "percentage": 22.79, "elapsed_time": "8:25:23", "remaining_time": "1 day, 4:32:01"}
+ {"current_steps": 970, "total_steps": 4212, "loss": 0.6839, "lr": 9.494777422007462e-05, "epoch": 0.23032173809806483, "percentage": 23.03, "elapsed_time": "8:30:13", "remaining_time": "1 day, 4:25:16"}
+ {"current_steps": 980, "total_steps": 4212, "loss": 0.5922, "lr": 9.476468288352951e-05, "epoch": 0.2326961890062923, "percentage": 23.27, "elapsed_time": "8:35:07", "remaining_time": "1 day, 4:18:53"}
+ {"current_steps": 990, "total_steps": 4212, "loss": 0.5699, "lr": 9.457851577348332e-05, "epoch": 0.23507063991451976, "percentage": 23.5, "elapsed_time": "8:39:30", "remaining_time": "1 day, 4:10:45"}
+ {"current_steps": 1000, "total_steps": 4212, "loss": 0.5742, "lr": 9.438928568144547e-05, "epoch": 0.23744509082274723, "percentage": 23.74, "elapsed_time": "8:43:58", "remaining_time": "1 day, 4:02:59"}
+ {"current_steps": 1000, "total_steps": 4212, "eval_loss": 0.6352267861366272, "epoch": 0.23744509082274723, "percentage": 23.74, "elapsed_time": "9:17:18", "remaining_time": "1 day, 5:50:05"}
+ {"current_steps": 1010, "total_steps": 4212, "loss": 0.6413, "lr": 9.41970056093824e-05, "epoch": 0.23981954173097472, "percentage": 23.98, "elapsed_time": "9:22:38", "remaining_time": "1 day, 5:43:43"}
+ {"current_steps": 1020, "total_steps": 4212, "loss": 0.6281, "lr": 9.400168876882408e-05, "epoch": 0.2421939926392022, "percentage": 24.22, "elapsed_time": "9:27:40", "remaining_time": "1 day, 5:36:29"}
+ {"current_steps": 1030, "total_steps": 4212, "loss": 0.6787, "lr": 9.380334857995629e-05, "epoch": 0.24456844354742965, "percentage": 24.45, "elapsed_time": "9:32:27", "remaining_time": "1 day, 5:28:29"}
+ {"current_steps": 1040, "total_steps": 4212, "loss": 0.6043, "lr": 9.360199867069866e-05, "epoch": 0.24694289445565712, "percentage": 24.69, "elapsed_time": "9:36:56", "remaining_time": "1 day, 5:19:41"}
+ {"current_steps": 1050, "total_steps": 4212, "loss": 0.5633, "lr": 9.339765287576803e-05, "epoch": 0.2493173453638846, "percentage": 24.93, "elapsed_time": "9:41:25", "remaining_time": "1 day, 5:10:55"}
+ {"current_steps": 1060, "total_steps": 4212, "loss": 0.7124, "lr": 9.319032523572815e-05, "epoch": 0.2516917962721121, "percentage": 25.17, "elapsed_time": "9:46:09", "remaining_time": "1 day, 5:03:00"}
+ {"current_steps": 1070, "total_steps": 4212, "loss": 0.5931, "lr": 9.298002999602471e-05, "epoch": 0.25406624718033954, "percentage": 25.4, "elapsed_time": "9:50:45", "remaining_time": "1 day, 4:54:42"}
+ {"current_steps": 1080, "total_steps": 4212, "loss": 0.724, "lr": 9.276678160600674e-05, "epoch": 0.256440698088567, "percentage": 25.64, "elapsed_time": "9:56:23", "remaining_time": "1 day, 4:49:31"}
+ {"current_steps": 1090, "total_steps": 4212, "loss": 0.6224, "lr": 9.255059471793369e-05, "epoch": 0.2588151489967945, "percentage": 25.88, "elapsed_time": "10:01:28", "remaining_time": "1 day, 4:42:45"}
+ {"current_steps": 1100, "total_steps": 4212, "loss": 0.5864, "lr": 9.233148418596862e-05, "epoch": 0.26118959990502194, "percentage": 26.12, "elapsed_time": "10:06:53", "remaining_time": "1 day, 4:36:57"}
+ {"current_steps": 1110, "total_steps": 4212, "loss": 0.5933, "lr": 9.210946506515777e-05, "epoch": 0.26356405081324946, "percentage": 26.35, "elapsed_time": "10:12:10", "remaining_time": "1 day, 4:30:47"}
+ {"current_steps": 1120, "total_steps": 4212, "loss": 0.5608, "lr": 9.188455261039592e-05, "epoch": 0.2659385017214769, "percentage": 26.59, "elapsed_time": "10:16:19", "remaining_time": "1 day, 4:21:28"}
+ {"current_steps": 1130, "total_steps": 4212, "loss": 0.5591, "lr": 9.165676227537836e-05, "epoch": 0.2683129526297044, "percentage": 26.83, "elapsed_time": "10:20:54", "remaining_time": "1 day, 4:13:30"}
+ {"current_steps": 1140, "total_steps": 4212, "loss": 0.6269, "lr": 9.1426109711539e-05, "epoch": 0.27068740353793186, "percentage": 27.07, "elapsed_time": "10:26:29", "remaining_time": "1 day, 4:08:13"}
+ {"current_steps": 1150, "total_steps": 4212, "loss": 0.5785, "lr": 9.1192610766975e-05, "epoch": 0.2730618544461593, "percentage": 27.3, "elapsed_time": "10:31:23", "remaining_time": "1 day, 4:01:09"}
+ {"current_steps": 1160, "total_steps": 4212, "loss": 0.7172, "lr": 9.095628148535788e-05, "epoch": 0.2754363053543868, "percentage": 27.54, "elapsed_time": "10:36:19", "remaining_time": "1 day, 3:54:12"}
+ {"current_steps": 1170, "total_steps": 4212, "loss": 0.5275, "lr": 9.071713810483103e-05, "epoch": 0.27781075626261426, "percentage": 27.78, "elapsed_time": "10:40:41", "remaining_time": "1 day, 3:45:47"}
+ {"current_steps": 1180, "total_steps": 4212, "loss": 0.6399, "lr": 9.047519705689418e-05, "epoch": 0.2801852071708417, "percentage": 28.02, "elapsed_time": "10:46:18", "remaining_time": "1 day, 3:40:41"}
+ {"current_steps": 1190, "total_steps": 4212, "loss": 0.6105, "lr": 9.023047496527423e-05, "epoch": 0.28255965807906924, "percentage": 28.25, "elapsed_time": "10:51:00", "remaining_time": "1 day, 3:33:14"}
+ {"current_steps": 1200, "total_steps": 4212, "loss": 0.609, "lr": 8.998298864478314e-05, "epoch": 0.2849341089872967, "percentage": 28.49, "elapsed_time": "10:56:21", "remaining_time": "1 day, 3:27:27"}
+ {"current_steps": 1210, "total_steps": 4212, "loss": 0.5436, "lr": 8.973275510016252e-05, "epoch": 0.2873085598955242, "percentage": 28.73, "elapsed_time": "11:00:32", "remaining_time": "1 day, 3:18:48"}
+ {"current_steps": 1220, "total_steps": 4212, "loss": 0.5398, "lr": 8.947979152491533e-05, "epoch": 0.28968301080375164, "percentage": 28.96, "elapsed_time": "11:05:34", "remaining_time": "1 day, 3:12:17"}
+ {"current_steps": 1230, "total_steps": 4212, "loss": 0.5829, "lr": 8.922411530012433e-05, "epoch": 0.2920574617119791, "percentage": 29.2, "elapsed_time": "11:10:28", "remaining_time": "1 day, 3:05:29"}
+ {"current_steps": 1240, "total_steps": 4212, "loss": 0.5319, "lr": 8.89657439932581e-05, "epoch": 0.29443191262020657, "percentage": 29.44, "elapsed_time": "11:14:29", "remaining_time": "1 day, 2:56:36"}
+ {"current_steps": 1250, "total_steps": 4212, "loss": 0.5556, "lr": 8.870469535696375e-05, "epoch": 0.29680636352843404, "percentage": 29.68, "elapsed_time": "11:19:10", "remaining_time": "1 day, 2:49:21"}
+ {"current_steps": 1260, "total_steps": 4212, "loss": 0.5374, "lr": 8.844098732784723e-05, "epoch": 0.2991808144366615, "percentage": 29.91, "elapsed_time": "11:24:04", "remaining_time": "1 day, 2:42:40"}
+ {"current_steps": 1270, "total_steps": 4212, "loss": 0.5525, "lr": 8.817463802524096e-05, "epoch": 0.30155526534488897, "percentage": 30.15, "elapsed_time": "11:29:21", "remaining_time": "1 day, 2:36:55"}
+ {"current_steps": 1280, "total_steps": 4212, "loss": 0.6688, "lr": 8.79056657499587e-05, "epoch": 0.3039297162531165, "percentage": 30.39, "elapsed_time": "11:36:04", "remaining_time": "1 day, 2:34:26"}
+ {"current_steps": 1290, "total_steps": 4212, "loss": 0.5966, "lr": 8.763408898303829e-05, "epoch": 0.30630416716134395, "percentage": 30.63, "elapsed_time": "11:40:28", "remaining_time": "1 day, 2:26:39"}
+ {"current_steps": 1300, "total_steps": 4212, "loss": 0.606, "lr": 8.73599263844717e-05, "epoch": 0.3086786180695714, "percentage": 30.86, "elapsed_time": "11:46:15", "remaining_time": "1 day, 2:22:01"}
+ {"current_steps": 1310, "total_steps": 4212, "loss": 0.5279, "lr": 8.708319679192293e-05, "epoch": 0.3110530689777989, "percentage": 31.1, "elapsed_time": "11:50:53", "remaining_time": "1 day, 2:14:47"}
+ {"current_steps": 1320, "total_steps": 4212, "loss": 0.568, "lr": 8.680391921943371e-05, "epoch": 0.31342751988602635, "percentage": 31.34, "elapsed_time": "11:55:08", "remaining_time": "1 day, 2:06:47"}
+ {"current_steps": 1330, "total_steps": 4212, "loss": 0.583, "lr": 8.652211285611701e-05, "epoch": 0.3158019707942538, "percentage": 31.58, "elapsed_time": "11:59:49", "remaining_time": "1 day, 1:59:48"}
+ {"current_steps": 1340, "total_steps": 4212, "loss": 0.6056, "lr": 8.623779706483855e-05, "epoch": 0.3181764217024813, "percentage": 31.81, "elapsed_time": "12:04:03", "remaining_time": "1 day, 1:51:50"}
+ {"current_steps": 1350, "total_steps": 4212, "loss": 0.5846, "lr": 8.595099138088644e-05, "epoch": 0.32055087261070875, "percentage": 32.05, "elapsed_time": "12:09:06", "remaining_time": "1 day, 1:45:42"}
+ {"current_steps": 1360, "total_steps": 4212, "loss": 0.5304, "lr": 8.566171551062889e-05, "epoch": 0.32292532351893627, "percentage": 32.29, "elapsed_time": "12:14:47", "remaining_time": "1 day, 1:40:55"}
+ {"current_steps": 1370, "total_steps": 4212, "loss": 0.584, "lr": 8.536998933016014e-05, "epoch": 0.32529977442716373, "percentage": 32.53, "elapsed_time": "12:19:09", "remaining_time": "1 day, 1:33:20"}
+ {"current_steps": 1380, "total_steps": 4212, "loss": 0.6155, "lr": 8.507583288393479e-05, "epoch": 0.3276742253353912, "percentage": 32.76, "elapsed_time": "12:23:50", "remaining_time": "1 day, 1:26:29"}
+ {"current_steps": 1390, "total_steps": 4212, "loss": 0.5219, "lr": 8.477926638339067e-05, "epoch": 0.33004867624361867, "percentage": 33.0, "elapsed_time": "12:29:25", "remaining_time": "1 day, 1:21:30"}
+ {"current_steps": 1400, "total_steps": 4212, "loss": 0.4953, "lr": 8.448031020555993e-05, "epoch": 0.33242312715184613, "percentage": 33.24, "elapsed_time": "12:33:50", "remaining_time": "1 day, 1:14:09"}
+ {"current_steps": 1410, "total_steps": 4212, "loss": 0.5899, "lr": 8.417898489166905e-05, "epoch": 0.3347975780600736, "percentage": 33.48, "elapsed_time": "12:39:11", "remaining_time": "1 day, 1:08:40"}
+ {"current_steps": 1420, "total_steps": 4212, "loss": 0.5992, "lr": 8.387531114572746e-05, "epoch": 0.33717202896830106, "percentage": 33.71, "elapsed_time": "12:44:32", "remaining_time": "1 day, 1:03:15"}
+ {"current_steps": 1430, "total_steps": 4212, "loss": 0.5268, "lr": 8.356930983310493e-05, "epoch": 0.33954647987652853, "percentage": 33.95, "elapsed_time": "12:50:00", "remaining_time": "1 day, 0:58:01"}
+ {"current_steps": 1440, "total_steps": 4212, "loss": 0.5396, "lr": 8.32610019790979e-05, "epoch": 0.34192093078475605, "percentage": 34.19, "elapsed_time": "12:54:24", "remaining_time": "1 day, 0:50:44"}
+ {"current_steps": 1450, "total_steps": 4212, "loss": 0.5868, "lr": 8.295040876748489e-05, "epoch": 0.3442953816929835, "percentage": 34.43, "elapsed_time": "12:58:59", "remaining_time": "1 day, 0:43:50"}
+ {"current_steps": 1460, "total_steps": 4212, "loss": 0.5431, "lr": 8.263755153907095e-05, "epoch": 0.346669832601211, "percentage": 34.66, "elapsed_time": "13:03:22", "remaining_time": "1 day, 0:36:35"}
+ {"current_steps": 1470, "total_steps": 4212, "loss": 0.5288, "lr": 8.23224517902213e-05, "epoch": 0.34904428350943845, "percentage": 34.9, "elapsed_time": "13:08:33", "remaining_time": "1 day, 0:30:53"}
+ {"current_steps": 1480, "total_steps": 4212, "loss": 0.5153, "lr": 8.200513117138435e-05, "epoch": 0.3514187344176659, "percentage": 35.14, "elapsed_time": "13:14:01", "remaining_time": "1 day, 0:25:44"}
+ {"current_steps": 1490, "total_steps": 4212, "loss": 0.5482, "lr": 8.168561148560414e-05, "epoch": 0.3537931853258934, "percentage": 35.38, "elapsed_time": "13:18:22", "remaining_time": "1 day, 0:18:30"}
+ {"current_steps": 1500, "total_steps": 4212, "loss": 0.5326, "lr": 8.136391468702214e-05, "epoch": 0.35616763623412084, "percentage": 35.61, "elapsed_time": "13:23:26", "remaining_time": "1 day, 0:12:36"}
+ {"current_steps": 1500, "total_steps": 4212, "eval_loss": 0.5119722485542297, "epoch": 0.35616763623412084, "percentage": 35.61, "elapsed_time": "13:56:53", "remaining_time": "1 day, 1:13:05"}
+ {"current_steps": 1510, "total_steps": 4212, "loss": 0.6232, "lr": 8.104006287936892e-05, "epoch": 0.3585420871423483, "percentage": 35.85, "elapsed_time": "14:01:01", "remaining_time": "1 day, 1:04:56"}
+ {"current_steps": 1520, "total_steps": 4212, "loss": 0.4939, "lr": 8.07140783144453e-05, "epoch": 0.36091653805057583, "percentage": 36.09, "elapsed_time": "14:05:49", "remaining_time": "1 day, 0:58:00"}
+ {"current_steps": 1530, "total_steps": 4212, "loss": 0.4444, "lr": 8.038598339059351e-05, "epoch": 0.3632909889588033, "percentage": 36.32, "elapsed_time": "14:11:08", "remaining_time": "1 day, 0:51:59"}
+ {"current_steps": 1540, "total_steps": 4212, "loss": 0.4768, "lr": 8.005580065115816e-05, "epoch": 0.36566543986703076, "percentage": 36.56, "elapsed_time": "14:15:04", "remaining_time": "1 day, 0:43:37"}
+ {"current_steps": 1550, "total_steps": 4212, "loss": 0.479, "lr": 7.972355278293733e-05, "epoch": 0.3680398907752582, "percentage": 36.8, "elapsed_time": "14:19:44", "remaining_time": "1 day, 0:36:32"}
+ {"current_steps": 1560, "total_steps": 4212, "loss": 0.4734, "lr": 7.938926261462366e-05, "epoch": 0.3704143416834857, "percentage": 37.04, "elapsed_time": "14:24:23", "remaining_time": "1 day, 0:29:27"}
+ {"current_steps": 1570, "total_steps": 4212, "loss": 0.5425, "lr": 7.905295311523595e-05, "epoch": 0.37278879259171316, "percentage": 37.27, "elapsed_time": "14:29:05", "remaining_time": "1 day, 0:22:31"}
+ {"current_steps": 1580, "total_steps": 4212, "loss": 0.4818, "lr": 7.871464739254084e-05, "epoch": 0.3751632434999406, "percentage": 37.51, "elapsed_time": "14:34:21", "remaining_time": "1 day, 0:16:31"}
+ {"current_steps": 1590, "total_steps": 4212, "loss": 0.496, "lr": 7.837436869146517e-05, "epoch": 0.3775376944081681, "percentage": 37.75, "elapsed_time": "14:38:55", "remaining_time": "1 day, 0:09:24"}
+ {"current_steps": 1600, "total_steps": 4212, "loss": 0.5389, "lr": 7.80321403924987e-05, "epoch": 0.3799121453163956, "percentage": 37.99, "elapsed_time": "14:43:35", "remaining_time": "1 day, 0:02:27"}
+ {"current_steps": 1610, "total_steps": 4212, "loss": 0.489, "lr": 7.768798601008776e-05, "epoch": 0.3822865962246231, "percentage": 38.22, "elapsed_time": "14:47:56", "remaining_time": "23:55:02"}
+ {"current_steps": 1620, "total_steps": 4212, "loss": 0.4599, "lr": 7.734192919101958e-05, "epoch": 0.38466104713285054, "percentage": 38.46, "elapsed_time": "14:52:52", "remaining_time": "23:48:35"}
+ {"current_steps": 1630, "total_steps": 4212, "loss": 0.6238, "lr": 7.69939937127974e-05, "epoch": 0.387035498041078, "percentage": 38.7, "elapsed_time": "14:58:04", "remaining_time": "23:42:36"}
+ {"current_steps": 1640, "total_steps": 4212, "loss": 0.4984, "lr": 7.664420348200689e-05, "epoch": 0.3894099489493055, "percentage": 38.94, "elapsed_time": "15:03:27", "remaining_time": "23:36:53"}
+ {"current_steps": 1650, "total_steps": 4212, "loss": 0.4746, "lr": 7.629258253267332e-05, "epoch": 0.39178439985753294, "percentage": 39.17, "elapsed_time": "15:08:17", "remaining_time": "23:30:19"}
+ {"current_steps": 1660, "total_steps": 4212, "loss": 0.5243, "lr": 7.593915502461042e-05, "epoch": 0.3941588507657604, "percentage": 39.41, "elapsed_time": "15:13:59", "remaining_time": "23:25:07"}
+ {"current_steps": 1670, "total_steps": 4212, "loss": 0.5353, "lr": 7.558394524176023e-05, "epoch": 0.39653330167398787, "percentage": 39.65, "elapsed_time": "15:18:54", "remaining_time": "23:18:43"}
+ {"current_steps": 1680, "total_steps": 4212, "loss": 0.5151, "lr": 7.522697759052451e-05, "epoch": 0.39890775258221534, "percentage": 39.89, "elapsed_time": "15:24:09", "remaining_time": "23:12:50"}
+ {"current_steps": 1690, "total_steps": 4212, "loss": 0.4583, "lr": 7.486827659808796e-05, "epoch": 0.40128220349044286, "percentage": 40.12, "elapsed_time": "15:29:55", "remaining_time": "23:07:44"}
+ {"current_steps": 1700, "total_steps": 4212, "loss": 0.5045, "lr": 7.450786691073274e-05, "epoch": 0.4036566543986703, "percentage": 40.36, "elapsed_time": "15:35:41", "remaining_time": "23:02:37"}
+ {"current_steps": 1710, "total_steps": 4212, "loss": 0.521, "lr": 7.414577329214522e-05, "epoch": 0.4060311053068978, "percentage": 40.6, "elapsed_time": "15:39:45", "remaining_time": "22:55:00"}
+ {"current_steps": 1720, "total_steps": 4212, "loss": 0.4216, "lr": 7.378202062171432e-05, "epoch": 0.40840555621512525, "percentage": 40.84, "elapsed_time": "15:44:11", "remaining_time": "22:47:58"}
+ {"current_steps": 1730, "total_steps": 4212, "loss": 0.439, "lr": 7.341663389282219e-05, "epoch": 0.4107800071233527, "percentage": 41.07, "elapsed_time": "15:48:24", "remaining_time": "22:40:39"}
+ {"current_steps": 1740, "total_steps": 4212, "loss": 0.468, "lr": 7.304963821112681e-05, "epoch": 0.4131544580315802, "percentage": 41.31, "elapsed_time": "15:53:08", "remaining_time": "22:34:07"}
+ {"current_steps": 1750, "total_steps": 4212, "loss": 0.5209, "lr": 7.268105879283703e-05, "epoch": 0.41552890893980765, "percentage": 41.55, "elapsed_time": "15:58:25", "remaining_time": "22:28:21"}
+ {"current_steps": 1760, "total_steps": 4212, "loss": 0.4444, "lr": 7.231092096297995e-05, "epoch": 0.4179033598480351, "percentage": 41.79, "elapsed_time": "16:03:18", "remaining_time": "22:22:04"}
+ {"current_steps": 1770, "total_steps": 4212, "loss": 0.4706, "lr": 7.19392501536609e-05, "epoch": 0.42027781075626264, "percentage": 42.02, "elapsed_time": "16:07:52", "remaining_time": "22:15:20"}
+ {"current_steps": 1780, "total_steps": 4212, "loss": 0.545, "lr": 7.156607190231591e-05, "epoch": 0.4226522616644901, "percentage": 42.26, "elapsed_time": "16:12:46", "remaining_time": "22:09:05"}
+ {"current_steps": 1790, "total_steps": 4212, "loss": 0.4404, "lr": 7.11914118499571e-05, "epoch": 0.42502671257271757, "percentage": 42.5, "elapsed_time": "16:18:09", "remaining_time": "22:03:30"}
+ {"current_steps": 1800, "total_steps": 4212, "loss": 0.4848, "lr": 7.081529573941091e-05, "epoch": 0.42740116348094503, "percentage": 42.74, "elapsed_time": "16:23:29", "remaining_time": "21:57:52"}
+ {"current_steps": 1810, "total_steps": 4212, "loss": 0.5122, "lr": 7.043774941354925e-05, "epoch": 0.4297756143891725, "percentage": 42.97, "elapsed_time": "16:28:28", "remaining_time": "21:51:46"}
+ {"current_steps": 1820, "total_steps": 4212, "loss": 0.4651, "lr": 7.005879881351384e-05, "epoch": 0.43215006529739997, "percentage": 43.21, "elapsed_time": "16:32:56", "remaining_time": "21:45:00"}
+ {"current_steps": 1830, "total_steps": 4212, "loss": 0.4405, "lr": 6.967846997693392e-05, "epoch": 0.43452451620562743, "percentage": 43.45, "elapsed_time": "16:37:10", "remaining_time": "21:37:57"}
+ {"current_steps": 1840, "total_steps": 4212, "loss": 0.4304, "lr": 6.929678903613705e-05, "epoch": 0.4368989671138549, "percentage": 43.68, "elapsed_time": "16:41:26", "remaining_time": "21:30:59"}
+ {"current_steps": 1850, "total_steps": 4212, "loss": 0.4278, "lr": 6.891378221635367e-05, "epoch": 0.4392734180220824, "percentage": 43.92, "elapsed_time": "16:46:50", "remaining_time": "21:25:29"}
+ {"current_steps": 1860, "total_steps": 4212, "loss": 0.4781, "lr": 6.852947583391511e-05, "epoch": 0.4416478689303099, "percentage": 44.16, "elapsed_time": "16:52:57", "remaining_time": "21:20:53"}
+ {"current_steps": 1870, "total_steps": 4212, "loss": 0.4582, "lr": 6.814389629444543e-05, "epoch": 0.44402231983853735, "percentage": 44.4, "elapsed_time": "16:57:40", "remaining_time": "21:14:32"}
+ {"current_steps": 1880, "total_steps": 4212, "loss": 0.4225, "lr": 6.775707009104708e-05, "epoch": 0.4463967707467648, "percentage": 44.63, "elapsed_time": "17:01:40", "remaining_time": "21:07:18"}
+ {"current_steps": 1890, "total_steps": 4212, "loss": 0.4215, "lr": 6.73690238024806e-05, "epoch": 0.4487712216549923, "percentage": 44.87, "elapsed_time": "17:06:25", "remaining_time": "21:01:02"}
+ {"current_steps": 1900, "total_steps": 4212, "loss": 0.518, "lr": 6.697978409133831e-05, "epoch": 0.45114567256321975, "percentage": 45.11, "elapsed_time": "17:10:57", "remaining_time": "20:54:30"}
+ {"current_steps": 1910, "total_steps": 4212, "loss": 0.4217, "lr": 6.658937770221242e-05, "epoch": 0.4535201234714472, "percentage": 45.35, "elapsed_time": "17:15:36", "remaining_time": "20:48:09"}
+ {"current_steps": 1920, "total_steps": 4212, "loss": 0.4848, "lr": 6.619783145985743e-05, "epoch": 0.4558945743796747, "percentage": 45.58, "elapsed_time": "17:21:24", "remaining_time": "20:43:11"}
+ {"current_steps": 1930, "total_steps": 4212, "loss": 0.4036, "lr": 6.580517226734686e-05, "epoch": 0.4582690252879022, "percentage": 45.82, "elapsed_time": "17:26:26", "remaining_time": "20:37:17"}
+ {"current_steps": 1940, "total_steps": 4212, "loss": 0.459, "lr": 6.541142710422489e-05, "epoch": 0.46064347619612966, "percentage": 46.06, "elapsed_time": "17:31:19", "remaining_time": "20:31:15"}
+ {"current_steps": 1950, "total_steps": 4212, "loss": 0.4338, "lr": 6.501662302465254e-05, "epoch": 0.46301792710435713, "percentage": 46.3, "elapsed_time": "17:35:50", "remaining_time": "20:24:46"}
+ {"current_steps": 1960, "total_steps": 4212, "loss": 0.4175, "lr": 6.46207871555488e-05, "epoch": 0.4653923780125846, "percentage": 46.53, "elapsed_time": "17:40:10", "remaining_time": "20:18:07"}
+ {"current_steps": 1970, "total_steps": 4212, "loss": 0.4774, "lr": 6.422394669472676e-05, "epoch": 0.46776682892081206, "percentage": 46.77, "elapsed_time": "17:45:46", "remaining_time": "20:12:55"}
+ {"current_steps": 1980, "total_steps": 4212, "loss": 0.4808, "lr": 6.382612890902478e-05, "epoch": 0.4701412798290395, "percentage": 47.01, "elapsed_time": "17:50:42", "remaining_time": "20:06:59"}
+ {"current_steps": 1990, "total_steps": 4212, "loss": 0.4014, "lr": 6.342736113243305e-05, "epoch": 0.472515730737267, "percentage": 47.25, "elapsed_time": "17:55:21", "remaining_time": "20:00:43"}
+ {"current_steps": 2000, "total_steps": 4212, "loss": 0.4165, "lr": 6.302767076421552e-05, "epoch": 0.47489018164549446, "percentage": 47.48, "elapsed_time": "18:00:17", "remaining_time": "19:54:48"}
+ {"current_steps": 2000, "total_steps": 4212, "eval_loss": 0.4213089346885681, "epoch": 0.47489018164549446, "percentage": 47.48, "elapsed_time": "18:33:40", "remaining_time": "20:31:43"}
+ {"current_steps": 2010, "total_steps": 4212, "loss": 0.4429, "lr": 6.26270852670272e-05, "epoch": 0.477264632553722, "percentage": 47.72, "elapsed_time": "18:38:45", "remaining_time": "20:25:37"}
+ {"current_steps": 2020, "total_steps": 4212, "loss": 0.4951, "lr": 6.222563216502724e-05, "epoch": 0.47963908346194944, "percentage": 47.96, "elapsed_time": "18:44:12", "remaining_time": "20:19:56"}
+ {"current_steps": 2030, "total_steps": 4212, "loss": 0.4165, "lr": 6.182333904198782e-05, "epoch": 0.4820135343701769, "percentage": 48.2, "elapsed_time": "18:48:50", "remaining_time": "20:13:21"}
+ {"current_steps": 2040, "total_steps": 4212, "loss": 0.4133, "lr": 6.14202335393988e-05, "epoch": 0.4843879852784044, "percentage": 48.43, "elapsed_time": "18:54:16", "remaining_time": "20:07:40"}
+ {"current_steps": 2050, "total_steps": 4212, "loss": 0.4254, "lr": 6.1016343354568464e-05, "epoch": 0.48676243618663184, "percentage": 48.67, "elapsed_time": "18:58:29", "remaining_time": "20:00:41"}
+ {"current_steps": 2060, "total_steps": 4212, "loss": 0.4138, "lr": 6.0611696238720485e-05, "epoch": 0.4891368870948593, "percentage": 48.91, "elapsed_time": "19:02:41", "remaining_time": "19:53:43"}
+ {"current_steps": 2070, "total_steps": 4212, "loss": 0.4021, "lr": 6.020631999508717e-05, "epoch": 0.4915113380030868, "percentage": 49.15, "elapsed_time": "19:07:28", "remaining_time": "19:47:23"}
+ {"current_steps": 2080, "total_steps": 4212, "loss": 0.3708, "lr": 5.980024247699903e-05, "epoch": 0.49388578891131424, "percentage": 49.38, "elapsed_time": "19:12:26", "remaining_time": "19:41:14"}
+ {"current_steps": 2090, "total_steps": 4212, "loss": 0.4198, "lr": 5.939349158597102e-05, "epoch": 0.4962602398195417, "percentage": 49.62, "elapsed_time": "19:17:02", "remaining_time": "19:34:45"}
+ {"current_steps": 2100, "total_steps": 4212, "loss": 0.3956, "lr": 5.898609526978547e-05, "epoch": 0.4986346907277692, "percentage": 49.86, "elapsed_time": "19:21:19", "remaining_time": "19:27:57"}
+ {"current_steps": 2110, "total_steps": 4212, "loss": 0.4176, "lr": 5.857808152057173e-05, "epoch": 0.5010091416359966, "percentage": 50.09, "elapsed_time": "19:26:08", "remaining_time": "19:21:43"}
+ {"current_steps": 2120, "total_steps": 4212, "loss": 0.4061, "lr": 5.816947837288285e-05, "epoch": 0.5033835925442242, "percentage": 50.33, "elapsed_time": "19:30:36", "remaining_time": "19:15:09"}
+ {"current_steps": 2130, "total_steps": 4212, "loss": 0.3868, "lr": 5.776031390176938e-05, "epoch": 0.5057580434524516, "percentage": 50.57, "elapsed_time": "19:35:46", "remaining_time": "19:09:16"}
+ {"current_steps": 2140, "total_steps": 4212, "loss": 0.5408, "lr": 5.7350616220850285e-05, "epoch": 0.5081324943606791, "percentage": 50.81, "elapsed_time": "19:40:12", "remaining_time": "19:02:42"}
+ {"current_steps": 2150, "total_steps": 4212, "loss": 0.4183, "lr": 5.694041348038128e-05, "epoch": 0.5105069452689066, "percentage": 51.04, "elapsed_time": "19:45:55", "remaining_time": "18:57:22"}
+ {"current_steps": 2160, "total_steps": 4212, "loss": 0.3751, "lr": 5.652973386532066e-05, "epoch": 0.512881396177134, "percentage": 51.28, "elapsed_time": "19:50:52", "remaining_time": "18:51:19"}
+ {"current_steps": 2170, "total_steps": 4212, "loss": 0.3528, "lr": 5.611860559339265e-05, "epoch": 0.5152558470853615, "percentage": 51.52, "elapsed_time": "19:55:08", "remaining_time": "18:44:38"}
+ {"current_steps": 2180, "total_steps": 4212, "loss": 0.4023, "lr": 5.5707056913148626e-05, "epoch": 0.517630297993589, "percentage": 51.76, "elapsed_time": "19:59:49", "remaining_time": "18:38:22"}
+ {"current_steps": 2190, "total_steps": 4212, "loss": 0.3637, "lr": 5.529511610202616e-05, "epoch": 0.5200047489018165, "percentage": 51.99, "elapsed_time": "20:03:41", "remaining_time": "18:31:21"}
+ {"current_steps": 2200, "total_steps": 4212, "loss": 0.3705, "lr": 5.4882811464406026e-05, "epoch": 0.5223791998100439, "percentage": 52.23, "elapsed_time": "20:07:54", "remaining_time": "18:24:40"}
+ {"current_steps": 2210, "total_steps": 4212, "loss": 0.4017, "lr": 5.4470171329667506e-05, "epoch": 0.5247536507182714, "percentage": 52.47, "elapsed_time": "20:13:12", "remaining_time": "18:19:01"}
+ {"current_steps": 2220, "total_steps": 4212, "loss": 0.4057, "lr": 5.405722405024183e-05, "epoch": 0.5271281016264989, "percentage": 52.71, "elapsed_time": "20:17:53", "remaining_time": "18:12:48"}
+ {"current_steps": 2230, "total_steps": 4212, "loss": 0.3652, "lr": 5.364399799966402e-05, "epoch": 0.5295025525347263, "percentage": 52.94, "elapsed_time": "20:22:31", "remaining_time": "18:06:33"}
+ {"current_steps": 2240, "total_steps": 4212, "loss": 0.3943, "lr": 5.323052157062346e-05, "epoch": 0.5318770034429539, "percentage": 53.18, "elapsed_time": "20:27:29", "remaining_time": "18:00:38"}
+ {"current_steps": 2250, "total_steps": 4212, "loss": 0.3657, "lr": 5.281682317301302e-05, "epoch": 0.5342514543511813, "percentage": 53.42, "elapsed_time": "20:32:14", "remaining_time": "17:54:31"}
+ {"current_steps": 2260, "total_steps": 4212, "loss": 0.3839, "lr": 5.240293123197694e-05, "epoch": 0.5366259052594088, "percentage": 53.66, "elapsed_time": "20:36:44", "remaining_time": "17:48:11"}
+ {"current_steps": 2270, "total_steps": 4212, "loss": 0.3579, "lr": 5.198887418595779e-05, "epoch": 0.5390003561676362, "percentage": 53.89, "elapsed_time": "20:41:01", "remaining_time": "17:41:42"}
+ {"current_steps": 2280, "total_steps": 4212, "loss": 0.3627, "lr": 5.157468048474257e-05, "epoch": 0.5413748070758637, "percentage": 54.13, "elapsed_time": "20:45:51", "remaining_time": "17:35:42"}
+ {"current_steps": 2290, "total_steps": 4212, "loss": 0.3827, "lr": 5.1160378587507716e-05, "epoch": 0.5437492579840911, "percentage": 54.37, "elapsed_time": "20:50:20", "remaining_time": "17:29:24"}
+ {"current_steps": 2300, "total_steps": 4212, "loss": 0.3421, "lr": 5.074599696086384e-05, "epoch": 0.5461237088923186, "percentage": 54.61, "elapsed_time": "20:55:45", "remaining_time": "17:23:55"}
+ {"current_steps": 2310, "total_steps": 4212, "loss": 0.384, "lr": 5.033156407689978e-05, "epoch": 0.5484981598005462, "percentage": 54.84, "elapsed_time": "21:01:00", "remaining_time": "17:18:16"}
+ {"current_steps": 2320, "total_steps": 4212, "loss": 0.3977, "lr": 4.991710841122623e-05, "epoch": 0.5508726107087736, "percentage": 55.08, "elapsed_time": "21:05:50", "remaining_time": "17:12:18"}
+ {"current_steps": 2330, "total_steps": 4212, "loss": 0.3861, "lr": 4.950265844101915e-05, "epoch": 0.5532470616170011, "percentage": 55.32, "elapsed_time": "21:10:24", "remaining_time": "17:06:08"}
+ {"current_steps": 2340, "total_steps": 4212, "loss": 0.366, "lr": 4.9088242643063304e-05, "epoch": 0.5556215125252285, "percentage": 55.56, "elapsed_time": "21:14:34", "remaining_time": "16:59:39"}
+ {"current_steps": 2350, "total_steps": 4212, "loss": 0.4019, "lr": 4.8673889491795344e-05, "epoch": 0.557995963433456, "percentage": 55.79, "elapsed_time": "21:19:17", "remaining_time": "16:53:38"}
+ {"current_steps": 2360, "total_steps": 4212, "loss": 0.39, "lr": 4.8259627457347554e-05, "epoch": 0.5603704143416834, "percentage": 56.03, "elapsed_time": "21:24:21", "remaining_time": "16:47:53"}
+ {"current_steps": 2370, "total_steps": 4212, "loss": 0.4089, "lr": 4.784548500359162e-05, "epoch": 0.562744865249911, "percentage": 56.27, "elapsed_time": "21:28:47", "remaining_time": "16:41:40"}
+ {"current_steps": 2380, "total_steps": 4212, "loss": 0.3207, "lr": 4.743149058618278e-05, "epoch": 0.5651193161581385, "percentage": 56.51, "elapsed_time": "21:34:10", "remaining_time": "16:36:11"}
+ {"current_steps": 2390, "total_steps": 4212, "loss": 0.3822, "lr": 4.7017672650604766e-05, "epoch": 0.5674937670663659, "percentage": 56.74, "elapsed_time": "21:38:41", "remaining_time": "16:30:02"}
+ {"current_steps": 2400, "total_steps": 4212, "loss": 0.3417, "lr": 4.6604059630215326e-05, "epoch": 0.5698682179745934, "percentage": 56.98, "elapsed_time": "21:43:22", "remaining_time": "16:24:02"}
+ {"current_steps": 2410, "total_steps": 4212, "loss": 0.404, "lr": 4.6190679944292395e-05, "epoch": 0.5722426688828208, "percentage": 57.22, "elapsed_time": "21:48:13", "remaining_time": "16:18:11"}
+ {"current_steps": 2420, "total_steps": 4212, "loss": 0.3609, "lr": 4.5777561996081656e-05, "epoch": 0.5746171197910483, "percentage": 57.45, "elapsed_time": "21:52:57", "remaining_time": "16:12:14"}
+ {"current_steps": 2430, "total_steps": 4212, "loss": 0.3217, "lr": 4.5364734170844807e-05, "epoch": 0.5769915706992758, "percentage": 57.69, "elapsed_time": "21:57:10", "remaining_time": "16:05:55"}
+ {"current_steps": 2440, "total_steps": 4212, "loss": 0.3611, "lr": 4.4952224833909194e-05, "epoch": 0.5793660216075033, "percentage": 57.93, "elapsed_time": "22:01:59", "remaining_time": "16:00:04"}
+ {"current_steps": 2450, "total_steps": 4212, "loss": 0.3434, "lr": 4.4540062328718945e-05, "epoch": 0.5817404725157307, "percentage": 58.17, "elapsed_time": "22:06:48", "remaining_time": "15:54:13"}
+ {"current_steps": 2460, "total_steps": 4212, "loss": 0.3619, "lr": 4.412827497488744e-05, "epoch": 0.5841149234239582, "percentage": 58.4, "elapsed_time": "22:11:16", "remaining_time": "15:48:07"}
+ {"current_steps": 2470, "total_steps": 4212, "loss": 0.3387, "lr": 4.371689106625143e-05, "epoch": 0.5864893743321857, "percentage": 58.64, "elapsed_time": "22:16:26", "remaining_time": "15:42:32"}
+ {"current_steps": 2480, "total_steps": 4212, "loss": 0.3145, "lr": 4.330593886892707e-05, "epoch": 0.5888638252404131, "percentage": 58.88, "elapsed_time": "22:21:11", "remaining_time": "15:36:40"}
+ {"current_steps": 2490, "total_steps": 4212, "loss": 0.4051, "lr": 4.2895446619367684e-05, "epoch": 0.5912382761486407, "percentage": 59.12, "elapsed_time": "22:25:53", "remaining_time": "15:30:46"}
+ {"current_steps": 2500, "total_steps": 4212, "loss": 0.4129, "lr": 4.2485442522423636e-05, "epoch": 0.5936127270568681, "percentage": 59.35, "elapsed_time": "22:30:58", "remaining_time": "15:25:08"}
+ {"current_steps": 2500, "total_steps": 4212, "eval_loss": 0.3469230532646179, "epoch": 0.5936127270568681, "percentage": 59.35, "elapsed_time": "23:04:21", "remaining_time": "15:48:00"}
+ {"current_steps": 2510, "total_steps": 4212, "loss": 0.3652, "lr": 4.207595474940446e-05, "epoch": 0.5959871779650956, "percentage": 59.59, "elapsed_time": "23:09:16", "remaining_time": "15:42:03"}
+ {"current_steps": 2520, "total_steps": 4212, "loss": 0.2876, "lr": 4.166701143614315e-05, "epoch": 0.598361628873323, "percentage": 59.83, "elapsed_time": "23:13:41", "remaining_time": "15:35:45"}
+ {"current_steps": 2530, "total_steps": 4212, "loss": 0.3285, "lr": 4.1258640681062934e-05, "epoch": 0.6007360797815505, "percentage": 60.07, "elapsed_time": "23:18:16", "remaining_time": "15:29:36"}
+ {"current_steps": 2540, "total_steps": 4212, "loss": 0.3517, "lr": 4.08508705432467e-05, "epoch": 0.6031105306897779, "percentage": 60.3, "elapsed_time": "23:22:37", "remaining_time": "15:23:18"}
+ {"current_steps": 2550, "total_steps": 4212, "loss": 0.3232, "lr": 4.0443729040509045e-05, "epoch": 0.6054849815980055, "percentage": 60.54, "elapsed_time": "23:27:07", "remaining_time": "15:17:07"}
+ {"current_steps": 2560, "total_steps": 4212, "loss": 0.3429, "lr": 4.00372441474711e-05, "epoch": 0.607859432506233, "percentage": 60.78, "elapsed_time": "23:31:46", "remaining_time": "15:11:02"}
+ {"current_steps": 2570, "total_steps": 4212, "loss": 0.4089, "lr": 3.96314437936385e-05, "epoch": 0.6102338834144604, "percentage": 61.02, "elapsed_time": "23:36:09", "remaining_time": "15:04:47"}
+ {"current_steps": 2580, "total_steps": 4212, "loss": 0.3063, "lr": 3.922635586148234e-05, "epoch": 0.6126083343226879, "percentage": 61.25, "elapsed_time": "23:41:38", "remaining_time": "14:59:16"}
+ {"current_steps": 2590, "total_steps": 4212, "loss": 0.3233, "lr": 3.8822008184523265e-05, "epoch": 0.6149827852309153, "percentage": 61.49, "elapsed_time": "23:45:40", "remaining_time": "14:52:49"}
+ {"current_steps": 2600, "total_steps": 4212, "loss": 0.3372, "lr": 3.841842854541919e-05, "epoch": 0.6173572361391428, "percentage": 61.73, "elapsed_time": "23:50:09", "remaining_time": "14:46:41"}
+ {"current_steps": 2610, "total_steps": 4212, "loss": 0.3275, "lr": 3.8015644674056266e-05, "epoch": 0.6197316870473702, "percentage": 61.97, "elapsed_time": "23:55:44", "remaining_time": "14:41:14"}
+ {"current_steps": 2620, "total_steps": 4212, "loss": 0.3352, "lr": 3.7613684245643544e-05, "epoch": 0.6221061379555978, "percentage": 62.2, "elapsed_time": "1 day, 0:00:25", "remaining_time": "14:35:15"}
+ {"current_steps": 2630, "total_steps": 4212, "loss": 0.3131, "lr": 3.7212574878811495e-05, "epoch": 0.6244805888638253, "percentage": 62.44, "elapsed_time": "1 day, 0:05:15", "remaining_time": "14:29:21"}
+ {"current_steps": 2640, "total_steps": 4212, "loss": 0.3497, "lr": 3.68123441337143e-05, "epoch": 0.6268550397720527, "percentage": 62.68, "elapsed_time": "1 day, 0:09:10", "remaining_time": "14:22:55"}
+ {"current_steps": 2650, "total_steps": 4212, "loss": 0.3628, "lr": 3.641301951013617e-05, "epoch": 0.6292294906802802, "percentage": 62.92, "elapsed_time": "1 day, 0:12:54", "remaining_time": "14:16:23"}
+ {"current_steps": 2660, "total_steps": 4212, "loss": 0.2979, "lr": 3.601462844560187e-05, "epoch": 0.6316039415885076, "percentage": 63.15, "elapsed_time": "1 day, 0:17:10", "remaining_time": "14:10:12"}
+ {"current_steps": 2670, "total_steps": 4212, "loss": 0.3169, "lr": 3.561719831349153e-05, "epoch": 0.6339783924967352, "percentage": 63.39, "elapsed_time": "1 day, 0:22:35", "remaining_time": "14:04:41"}
+ {"current_steps": 2680, "total_steps": 4212, "loss": 0.3234, "lr": 3.5220756421159696e-05, "epoch": 0.6363528434049626, "percentage": 63.63, "elapsed_time": "1 day, 0:27:43", "remaining_time": "13:59:00"}
+ {"current_steps": 2690, "total_steps": 4212, "loss": 0.3372, "lr": 3.482533000805921e-05, "epoch": 0.6387272943131901, "percentage": 63.87, "elapsed_time": "1 day, 0:32:27", "remaining_time": "13:53:07"}
+ {"current_steps": 2700, "total_steps": 4212, "loss": 0.2902, "lr": 3.443094624386949e-05, "epoch": 0.6411017452214175, "percentage": 64.1, "elapsed_time": "1 day, 0:36:31", "remaining_time": "13:46:51"}
+ {"current_steps": 2710, "total_steps": 4212, "loss": 0.293, "lr": 3.4037632226629704e-05, "epoch": 0.643476196129645, "percentage": 64.34, "elapsed_time": "1 day, 0:40:59", "remaining_time": "13:40:50"}
+ {"current_steps": 2720, "total_steps": 4212, "loss": 0.3087, "lr": 3.3645414980876946e-05, "epoch": 0.6458506470378725, "percentage": 64.58, "elapsed_time": "1 day, 0:46:23", "remaining_time": "13:35:19"}
+ {"current_steps": 2730, "total_steps": 4212, "loss": 0.3082, "lr": 3.32543214557893e-05, "epoch": 0.6482250979461, "percentage": 64.81, "elapsed_time": "1 day, 0:50:48", "remaining_time": "13:29:17"}
+ {"current_steps": 2740, "total_steps": 4212, "loss": 0.2858, "lr": 3.286437852333418e-05, "epoch": 0.6505995488543275, "percentage": 65.05, "elapsed_time": "1 day, 0:55:48", "remaining_time": "13:23:35"}
+ {"current_steps": 2750, "total_steps": 4212, "loss": 0.339, "lr": 3.247561297642203e-05, "epoch": 0.6529739997625549, "percentage": 65.29, "elapsed_time": "1 day, 1:00:08", "remaining_time": "13:17:32"}
+ {"current_steps": 2760, "total_steps": 4212, "loss": 0.2766, "lr": 3.208805152706533e-05, "epoch": 0.6553484506707824, "percentage": 65.53, "elapsed_time": "1 day, 1:04:58", "remaining_time": "13:11:44"}
+ {"current_steps": 2770, "total_steps": 4212, "loss": 0.3518, "lr": 3.170172080454319e-05, "epoch": 0.6577229015790098, "percentage": 65.76, "elapsed_time": "1 day, 1:09:15", "remaining_time": "13:05:40"}
+ {"current_steps": 2780, "total_steps": 4212, "loss": 0.2908, "lr": 3.131664735357174e-05, "epoch": 0.6600973524872373, "percentage": 66.0, "elapsed_time": "1 day, 1:13:44", "remaining_time": "12:59:44"}
+ {"current_steps": 2790, "total_steps": 4212, "loss": 0.3426, "lr": 3.0932857632480185e-05, "epoch": 0.6624718033954649, "percentage": 66.24, "elapsed_time": "1 day, 1:18:45", "remaining_time": "12:54:04"}
+ {"current_steps": 2800, "total_steps": 4212, "loss": 0.2876, "lr": 3.055037801139286e-05, "epoch": 0.6648462543036923, "percentage": 66.48, "elapsed_time": "1 day, 1:23:12", "remaining_time": "12:48:08"}
+ {"current_steps": 2810, "total_steps": 4212, "loss": 0.2819, "lr": 3.0169234770417376e-05, "epoch": 0.6672207052119198, "percentage": 66.71, "elapsed_time": "1 day, 1:28:08", "remaining_time": "12:42:26"}
+ {"current_steps": 2820, "total_steps": 4212, "loss": 0.3145, "lr": 2.978945409783892e-05, "epoch": 0.6695951561201472, "percentage": 66.95, "elapsed_time": "1 day, 1:32:50", "remaining_time": "12:36:38"}
+ {"current_steps": 2830, "total_steps": 4212, "loss": 0.2872, "lr": 2.94110620883208e-05, "epoch": 0.6719696070283747, "percentage": 67.19, "elapsed_time": "1 day, 1:37:19", "remaining_time": "12:30:44"}
+ {"current_steps": 2840, "total_steps": 4212, "loss": 0.2798, "lr": 2.9034084741111555e-05, "epoch": 0.6743440579366021, "percentage": 67.43, "elapsed_time": "1 day, 1:41:23", "remaining_time": "12:24:38"}
+ {"current_steps": 2850, "total_steps": 4212, "loss": 0.2699, "lr": 2.8658547958258543e-05, "epoch": 0.6767185088448296, "percentage": 67.66, "elapsed_time": "1 day, 1:46:42", "remaining_time": "12:19:09"}
+ {"current_steps": 2860, "total_steps": 4212, "loss": 0.3362, "lr": 2.8284477542828153e-05, "epoch": 0.6790929597530571, "percentage": 67.9, "elapsed_time": "1 day, 1:51:51", "remaining_time": "12:13:36"}
+ {"current_steps": 2870, "total_steps": 4212, "loss": 0.2896, "lr": 2.791189919713294e-05, "epoch": 0.6814674106612846, "percentage": 68.14, "elapsed_time": "1 day, 1:56:56", "remaining_time": "12:08:00"}
+ {"current_steps": 2880, "total_steps": 4212, "loss": 0.3138, "lr": 2.7540838520965672e-05, "epoch": 0.6838418615695121, "percentage": 68.38, "elapsed_time": "1 day, 2:01:45", "remaining_time": "12:02:18"}
+ {"current_steps": 2890, "total_steps": 4212, "loss": 0.268, "lr": 2.7171321009840178e-05, "epoch": 0.6862163124777395, "percentage": 68.61, "elapsed_time": "1 day, 2:06:39", "remaining_time": "11:56:38"}
+ {"current_steps": 2900, "total_steps": 4212, "loss": 0.2844, "lr": 2.6803372053239834e-05, "epoch": 0.688590763385967, "percentage": 68.85, "elapsed_time": "1 day, 2:12:16", "remaining_time": "11:51:19"}
+ {"current_steps": 2910, "total_steps": 4212, "loss": 0.2931, "lr": 2.6437016932872816e-05, "epoch": 0.6909652142941944, "percentage": 69.09, "elapsed_time": "1 day, 2:16:58", "remaining_time": "11:45:34"}
+ {"current_steps": 2920, "total_steps": 4212, "loss": 0.2818, "lr": 2.6072280820935103e-05, "epoch": 0.693339665202422, "percentage": 69.33, "elapsed_time": "1 day, 2:21:08", "remaining_time": "11:39:36"}
+ {"current_steps": 2930, "total_steps": 4212, "loss": 0.2741, "lr": 2.5709188778380942e-05, "epoch": 0.6957141161106494, "percentage": 69.56, "elapsed_time": "1 day, 2:26:14", "remaining_time": "11:34:02"}
+ {"current_steps": 2940, "total_steps": 4212, "loss": 0.3142, "lr": 2.5347765753200808e-05, "epoch": 0.6980885670188769, "percentage": 69.8, "elapsed_time": "1 day, 2:31:14", "remaining_time": "11:28:27"}
+ {"current_steps": 2950, "total_steps": 4212, "loss": 0.2499, "lr": 2.4988036578707303e-05, "epoch": 0.7004630179271043, "percentage": 70.04, "elapsed_time": "1 day, 2:35:42", "remaining_time": "11:22:38"}
+ {"current_steps": 2960, "total_steps": 4212, "loss": 0.2565, "lr": 2.463002597182882e-05, "epoch": 0.7028374688353318, "percentage": 70.28, "elapsed_time": "1 day, 2:39:49", "remaining_time": "11:16:41"}
+ {"current_steps": 2970, "total_steps": 4212, "loss": 0.2848, "lr": 2.427375853141134e-05, "epoch": 0.7052119197435593, "percentage": 70.51, "elapsed_time": "1 day, 2:44:02", "remaining_time": "11:10:47"}
+ {"current_steps": 2980, "total_steps": 4212, "loss": 0.2536, "lr": 2.3919258736528123e-05, "epoch": 0.7075863706517868, "percentage": 70.75, "elapsed_time": "1 day, 2:48:24", "remaining_time": "11:04:57"}
+ {"current_steps": 2990, "total_steps": 4212, "loss": 0.2693, "lr": 2.3566550944797804e-05, "epoch": 0.7099608215600143, "percentage": 70.99, "elapsed_time": "1 day, 2:53:28", "remaining_time": "10:59:25"}
305
+ {"current_steps": 3000, "total_steps": 4212, "loss": 0.345, "lr": 2.321565939071089e-05, "epoch": 0.7123352724682417, "percentage": 71.23, "elapsed_time": "1 day, 2:58:37", "remaining_time": "10:53:55"}
306
+ {"current_steps": 3000, "total_steps": 4212, "eval_loss": 0.2755714952945709, "epoch": 0.7123352724682417, "percentage": 71.23, "elapsed_time": "1 day, 3:31:59", "remaining_time": "11:07:24"}
307
+ {"current_steps": 3010, "total_steps": 4212, "loss": 0.3807, "lr": 2.2866608183964376e-05, "epoch": 0.7147097233764692, "percentage": 71.46, "elapsed_time": "1 day, 3:36:49", "remaining_time": "11:01:37"}
308
+ {"current_steps": 3020, "total_steps": 4212, "loss": 0.2892, "lr": 2.2519421307805445e-05, "epoch": 0.7170841742846966, "percentage": 71.7, "elapsed_time": "1 day, 3:41:29", "remaining_time": "10:55:47"}
309
+ {"current_steps": 3030, "total_steps": 4212, "loss": 0.2531, "lr": 2.217412261738338e-05, "epoch": 0.7194586251929241, "percentage": 71.94, "elapsed_time": "1 day, 3:46:48", "remaining_time": "10:50:13"}
310
+ {"current_steps": 3040, "total_steps": 4212, "loss": 0.2458, "lr": 2.183073583811055e-05, "epoch": 0.7218330761011517, "percentage": 72.17, "elapsed_time": "1 day, 3:51:05", "remaining_time": "10:44:15"}
311
+ {"current_steps": 3050, "total_steps": 4212, "loss": 0.2413, "lr": 2.1489284564032308e-05, "epoch": 0.7242075270093791, "percentage": 72.41, "elapsed_time": "1 day, 3:55:50", "remaining_time": "10:38:28"}
312
+ {"current_steps": 3060, "total_steps": 4212, "loss": 0.2575, "lr": 2.1149792256205725e-05, "epoch": 0.7265819779176066, "percentage": 72.65, "elapsed_time": "1 day, 4:00:21", "remaining_time": "10:32:36"}
313
+ {"current_steps": 3070, "total_steps": 4212, "loss": 0.2503, "lr": 2.0812282241087662e-05, "epoch": 0.728956428825834, "percentage": 72.89, "elapsed_time": "1 day, 4:05:20", "remaining_time": "10:26:55"}
314
+ {"current_steps": 3080, "total_steps": 4212, "loss": 0.277, "lr": 2.0476777708931978e-05, "epoch": 0.7313308797340615, "percentage": 73.12, "elapsed_time": "1 day, 4:10:40", "remaining_time": "10:21:22"}
315
+ {"current_steps": 3090, "total_steps": 4212, "loss": 0.2969, "lr": 2.01433017121962e-05, "epoch": 0.7337053306422889, "percentage": 73.36, "elapsed_time": "1 day, 4:15:15", "remaining_time": "10:15:33"}
316
+ {"current_steps": 3100, "total_steps": 4212, "loss": 0.2344, "lr": 1.981187716395751e-05, "epoch": 0.7360797815505165, "percentage": 73.6, "elapsed_time": "1 day, 4:19:44", "remaining_time": "10:09:42"}
317
+ {"current_steps": 3110, "total_steps": 4212, "loss": 0.2428, "lr": 1.9482526836338387e-05, "epoch": 0.7384542324587439, "percentage": 73.84, "elapsed_time": "1 day, 4:23:42", "remaining_time": "10:03:41"}
318
+ {"current_steps": 3120, "total_steps": 4212, "loss": 0.2683, "lr": 1.915527335894209e-05, "epoch": 0.7408286833669714, "percentage": 74.07, "elapsed_time": "1 day, 4:28:10", "remaining_time": "9:57:51"}
319
+ {"current_steps": 3130, "total_steps": 4212, "loss": 0.2508, "lr": 1.8830139217297498e-05, "epoch": 0.7432031342751989, "percentage": 74.31, "elapsed_time": "1 day, 4:32:51", "remaining_time": "9:52:06"}
320
+ {"current_steps": 3140, "total_steps": 4212, "loss": 0.2353, "lr": 1.8507146751314464e-05, "epoch": 0.7455775851834263, "percentage": 74.55, "elapsed_time": "1 day, 4:37:20", "remaining_time": "9:46:18"}
321
+ {"current_steps": 3150, "total_steps": 4212, "loss": 0.2623, "lr": 1.8186318153748587e-05, "epoch": 0.7479520360916538, "percentage": 74.79, "elapsed_time": "1 day, 4:42:22", "remaining_time": "9:40:41"}
322
+ {"current_steps": 3160, "total_steps": 4212, "loss": 0.2305, "lr": 1.786767546867647e-05, "epoch": 0.7503264869998812, "percentage": 75.02, "elapsed_time": "1 day, 4:47:06", "remaining_time": "9:34:58"}
323
+ {"current_steps": 3170, "total_steps": 4212, "loss": 0.2508, "lr": 1.755124058998108e-05, "epoch": 0.7527009379081088, "percentage": 75.26, "elapsed_time": "1 day, 4:51:23", "remaining_time": "9:29:07"}
324
+ {"current_steps": 3180, "total_steps": 4212, "loss": 0.242, "lr": 1.723703525984735e-05, "epoch": 0.7550753888163362, "percentage": 75.5, "elapsed_time": "1 day, 4:55:51", "remaining_time": "9:23:20"}
325
+ {"current_steps": 3190, "total_steps": 4212, "loss": 0.2466, "lr": 1.692508106726836e-05, "epoch": 0.7574498397245637, "percentage": 75.74, "elapsed_time": "1 day, 5:00:46", "remaining_time": "9:17:42"}
326
+ {"current_steps": 3200, "total_steps": 4212, "loss": 0.2407, "lr": 1.6615399446561886e-05, "epoch": 0.7598242906327912, "percentage": 75.97, "elapsed_time": "1 day, 5:05:07", "remaining_time": "9:11:53"}
327
+ {"current_steps": 3210, "total_steps": 4212, "loss": 0.2504, "lr": 1.630801167589774e-05, "epoch": 0.7621987415410186, "percentage": 76.21, "elapsed_time": "1 day, 5:09:47", "remaining_time": "9:06:11"}
328
+ {"current_steps": 3220, "total_steps": 4212, "loss": 0.2792, "lr": 1.6002938875835665e-05, "epoch": 0.7645731924492462, "percentage": 76.45, "elapsed_time": "1 day, 5:15:09", "remaining_time": "9:00:43"}
329
+ {"current_steps": 3230, "total_steps": 4212, "loss": 0.2673, "lr": 1.5700202007874165e-05, "epoch": 0.7669476433574736, "percentage": 76.69, "elapsed_time": "1 day, 5:20:32", "remaining_time": "8:55:14"}
330
+ {"current_steps": 3240, "total_steps": 4212, "loss": 0.2371, "lr": 1.5399821873010335e-05, "epoch": 0.7693220942657011, "percentage": 76.92, "elapsed_time": "1 day, 5:25:11", "remaining_time": "8:49:33"}
331
+ {"current_steps": 3250, "total_steps": 4212, "loss": 0.2323, "lr": 1.5101819110310433e-05, "epoch": 0.7716965451739285, "percentage": 77.16, "elapsed_time": "1 day, 5:29:33", "remaining_time": "8:43:47"}
332
+ {"current_steps": 3260, "total_steps": 4212, "loss": 0.2566, "lr": 1.4806214195492008e-05, "epoch": 0.774070996082156, "percentage": 77.4, "elapsed_time": "1 day, 5:33:54", "remaining_time": "8:38:01"}
333
+ {"current_steps": 3270, "total_steps": 4212, "loss": 0.2677, "lr": 1.4513027439516847e-05, "epoch": 0.7764454469903834, "percentage": 77.64, "elapsed_time": "1 day, 5:39:12", "remaining_time": "8:32:32"}
334
+ {"current_steps": 3280, "total_steps": 4212, "loss": 0.253, "lr": 1.4222278987195447e-05, "epoch": 0.778819897898611, "percentage": 77.87, "elapsed_time": "1 day, 5:44:04", "remaining_time": "8:26:56"}
335
+ {"current_steps": 3290, "total_steps": 4212, "loss": 0.2366, "lr": 1.3933988815802962e-05, "epoch": 0.7811943488068385, "percentage": 78.11, "elapsed_time": "1 day, 5:49:00", "remaining_time": "8:21:21"}
336
+ {"current_steps": 3300, "total_steps": 4212, "loss": 0.2658, "lr": 1.3648176733706419e-05, "epoch": 0.7835687997150659, "percentage": 78.35, "elapsed_time": "1 day, 5:55:36", "remaining_time": "8:16:14"}
337
+ {"current_steps": 3310, "total_steps": 4212, "loss": 0.2921, "lr": 1.3364862379003812e-05, "epoch": 0.7859432506232934, "percentage": 78.58, "elapsed_time": "1 day, 6:00:36", "remaining_time": "8:10:40"}
338
+ {"current_steps": 3320, "total_steps": 4212, "loss": 0.2456, "lr": 1.3084065218174679e-05, "epoch": 0.7883177015315208, "percentage": 78.82, "elapsed_time": "1 day, 6:05:00", "remaining_time": "8:04:57"}
339
+ {"current_steps": 3330, "total_steps": 4212, "loss": 0.2153, "lr": 1.2805804544742672e-05, "epoch": 0.7906921524397483, "percentage": 79.06, "elapsed_time": "1 day, 6:10:19", "remaining_time": "7:59:29"}
340
+ {"current_steps": 3340, "total_steps": 4212, "loss": 0.254, "lr": 1.2530099477949792e-05, "epoch": 0.7930666033479757, "percentage": 79.3, "elapsed_time": "1 day, 6:14:57", "remaining_time": "7:53:50"}
341
+ {"current_steps": 3350, "total_steps": 4212, "loss": 0.2346, "lr": 1.2256968961442755e-05, "epoch": 0.7954410542562033, "percentage": 79.53, "elapsed_time": "1 day, 6:19:22", "remaining_time": "7:48:08"}
342
+ {"current_steps": 3360, "total_steps": 4212, "loss": 0.2527, "lr": 1.198643176197144e-05, "epoch": 0.7978155051644307, "percentage": 79.77, "elapsed_time": "1 day, 6:24:19", "remaining_time": "7:42:35"}
343
+ {"current_steps": 3370, "total_steps": 4212, "loss": 0.2582, "lr": 1.1718506468099254e-05, "epoch": 0.8001899560726582, "percentage": 80.01, "elapsed_time": "1 day, 6:28:35", "remaining_time": "7:36:52"}
344
+ {"current_steps": 3380, "total_steps": 4212, "loss": 0.242, "lr": 1.1453211488926153e-05, "epoch": 0.8025644069808857, "percentage": 80.25, "elapsed_time": "1 day, 6:33:21", "remaining_time": "7:31:17"}
345
+ {"current_steps": 3390, "total_steps": 4212, "loss": 0.2509, "lr": 1.1190565052823548e-05, "epoch": 0.8049388578891131, "percentage": 80.48, "elapsed_time": "1 day, 6:37:26", "remaining_time": "7:25:32"}
346
+ {"current_steps": 3400, "total_steps": 4212, "loss": 0.2187, "lr": 1.0930585206181942e-05, "epoch": 0.8073133087973406, "percentage": 80.72, "elapsed_time": "1 day, 6:41:59", "remaining_time": "7:19:54"}
347
+ {"current_steps": 3410, "total_steps": 4212, "loss": 0.2268, "lr": 1.0673289812170972e-05, "epoch": 0.809687759705568, "percentage": 80.96, "elapsed_time": "1 day, 6:47:25", "remaining_time": "7:14:29"}
348
+ {"current_steps": 3420, "total_steps": 4212, "loss": 0.2317, "lr": 1.041869654951198e-05, "epoch": 0.8120622106137956, "percentage": 81.2, "elapsed_time": "1 day, 6:51:41", "remaining_time": "7:08:48"}
349
+ {"current_steps": 3430, "total_steps": 4212, "loss": 0.2534, "lr": 1.016682291126333e-05, "epoch": 0.814436661522023, "percentage": 81.43, "elapsed_time": "1 day, 6:56:03", "remaining_time": "7:03:09"}
350
+ {"current_steps": 3440, "total_steps": 4212, "loss": 0.2129, "lr": 9.917686203618475e-06, "epoch": 0.8168111124302505, "percentage": 81.67, "elapsed_time": "1 day, 7:00:30", "remaining_time": "6:57:31"}
351
+ {"current_steps": 3450, "total_steps": 4212, "loss": 0.2118, "lr": 9.671303544716875e-06, "epoch": 0.819185563338478, "percentage": 81.91, "elapsed_time": "1 day, 7:05:11", "remaining_time": "6:51:57"}
352
+ {"current_steps": 3460, "total_steps": 4212, "loss": 0.2315, "lr": 9.427691863467758e-06, "epoch": 0.8215600142467054, "percentage": 82.15, "elapsed_time": "1 day, 7:10:18", "remaining_time": "6:46:29"}
353
+ {"current_steps": 3470, "total_steps": 4212, "loss": 0.2312, "lr": 9.186867898386952e-06, "epoch": 0.823934465154933, "percentage": 82.38, "elapsed_time": "1 day, 7:14:49", "remaining_time": "6:40:53"}
354
+ {"current_steps": 3480, "total_steps": 4212, "loss": 0.2311, "lr": 8.948848196446852e-06, "epoch": 0.8263089160631604, "percentage": 82.62, "elapsed_time": "1 day, 7:20:04", "remaining_time": "6:35:27"}
355
+ {"current_steps": 3490, "total_steps": 4212, "loss": 0.2519, "lr": 8.713649111939332e-06, "epoch": 0.8286833669713879, "percentage": 82.86, "elapsed_time": "1 day, 7:24:35", "remaining_time": "6:29:52"}
356
+ {"current_steps": 3500, "total_steps": 4212, "loss": 0.1919, "lr": 8.481286805352234e-06, "epoch": 0.8310578178796153, "percentage": 83.1, "elapsed_time": "1 day, 7:29:11", "remaining_time": "6:24:18"}
357
+ {"current_steps": 3500, "total_steps": 4212, "eval_loss": 0.2284688502550125, "epoch": 0.8310578178796153, "percentage": 83.1, "elapsed_time": "1 day, 8:02:37", "remaining_time": "6:31:06"}
358
+ {"current_steps": 3510, "total_steps": 4212, "loss": 0.2235, "lr": 8.251777242258834e-06, "epoch": 0.8334322687878428, "percentage": 83.33, "elapsed_time": "1 day, 8:07:37", "remaining_time": "6:25:31"}
359
+ {"current_steps": 3520, "total_steps": 4212, "loss": 0.1998, "lr": 8.025136192220894e-06, "epoch": 0.8358067196960702, "percentage": 83.57, "elapsed_time": "1 day, 8:12:32", "remaining_time": "6:19:55"}
360
+ {"current_steps": 3530, "total_steps": 4212, "loss": 0.2251, "lr": 7.801379227705203e-06, "epoch": 0.8381811706042978, "percentage": 83.81, "elapsed_time": "1 day, 8:17:12", "remaining_time": "6:14:16"}
361
+ {"current_steps": 3540, "total_steps": 4212, "loss": 0.1954, "lr": 7.58052172301349e-06, "epoch": 0.8405556215125253, "percentage": 84.05, "elapsed_time": "1 day, 8:22:15", "remaining_time": "6:08:42"}
362
+ {"current_steps": 3550, "total_steps": 4212, "loss": 0.2014, "lr": 7.362578853226121e-06, "epoch": 0.8429300724207527, "percentage": 84.28, "elapsed_time": "1 day, 8:27:39", "remaining_time": "6:03:11"}
363
+ {"current_steps": 3560, "total_steps": 4212, "loss": 0.2346, "lr": 7.1475655931594e-06, "epoch": 0.8453045233289802, "percentage": 84.52, "elapsed_time": "1 day, 8:32:39", "remaining_time": "5:57:37"}
364
+ {"current_steps": 3570, "total_steps": 4212, "loss": 0.1944, "lr": 6.9354967163367035e-06, "epoch": 0.8476789742372076, "percentage": 84.76, "elapsed_time": "1 day, 8:37:38", "remaining_time": "5:52:02"}
365
+ {"current_steps": 3580, "total_steps": 4212, "loss": 0.2036, "lr": 6.726386793973305e-06, "epoch": 0.8500534251454351, "percentage": 85.0, "elapsed_time": "1 day, 8:42:35", "remaining_time": "5:46:28"}
366
+ {"current_steps": 3590, "total_steps": 4212, "loss": 0.2438, "lr": 6.520250193975242e-06, "epoch": 0.8524278760536625, "percentage": 85.23, "elapsed_time": "1 day, 8:47:27", "remaining_time": "5:40:52"}
367
+ {"current_steps": 3600, "total_steps": 4212, "loss": 0.2258, "lr": 6.317101079952148e-06, "epoch": 0.8548023269618901, "percentage": 85.47, "elapsed_time": "1 day, 8:53:14", "remaining_time": "5:35:27"}
368
+ {"current_steps": 3610, "total_steps": 4212, "loss": 0.1954, "lr": 6.116953410243925e-06, "epoch": 0.8571767778701176, "percentage": 85.71, "elapsed_time": "1 day, 8:57:55", "remaining_time": "5:29:50"}
369
+ {"current_steps": 3620, "total_steps": 4212, "loss": 0.2192, "lr": 5.919820936961856e-06, "epoch": 0.859551228778345, "percentage": 85.94, "elapsed_time": "1 day, 9:02:50", "remaining_time": "5:24:15"}
370
+ {"current_steps": 3630, "total_steps": 4212, "loss": 0.1896, "lr": 5.725717205043552e-06, "epoch": 0.8619256796865725, "percentage": 86.18, "elapsed_time": "1 day, 9:07:14", "remaining_time": "5:18:36"}
371
+ {"current_steps": 3640, "total_steps": 4212, "loss": 0.2318, "lr": 5.5346555513223485e-06, "epoch": 0.8643001305947999, "percentage": 86.42, "elapsed_time": "1 day, 9:12:27", "remaining_time": "5:13:05"}
372
+ {"current_steps": 3650, "total_steps": 4212, "loss": 0.2081, "lr": 5.34664910361094e-06, "epoch": 0.8666745815030275, "percentage": 86.66, "elapsed_time": "1 day, 9:17:40", "remaining_time": "5:07:35"}
373
+ {"current_steps": 3660, "total_steps": 4212, "loss": 0.2156, "lr": 5.161710779799328e-06, "epoch": 0.8690490324112549, "percentage": 86.89, "elapsed_time": "1 day, 9:22:59", "remaining_time": "5:02:05"}
374
+ {"current_steps": 3670, "total_steps": 4212, "loss": 0.1889, "lr": 4.979853286967273e-06, "epoch": 0.8714234833194824, "percentage": 87.13, "elapsed_time": "1 day, 9:27:03", "remaining_time": "4:56:24"}
375
+ {"current_steps": 3680, "total_steps": 4212, "loss": 0.2193, "lr": 4.801089120511165e-06, "epoch": 0.8737979342277098, "percentage": 87.37, "elapsed_time": "1 day, 9:31:58", "remaining_time": "4:50:51"}
376
+ {"current_steps": 3690, "total_steps": 4212, "loss": 0.2315, "lr": 4.625430563285515e-06, "epoch": 0.8761723851359373, "percentage": 87.61, "elapsed_time": "1 day, 9:36:53", "remaining_time": "4:45:18"}
377
+ {"current_steps": 3700, "total_steps": 4212, "loss": 0.2573, "lr": 4.452889684758938e-06, "epoch": 0.8785468360441648, "percentage": 87.84, "elapsed_time": "1 day, 9:41:29", "remaining_time": "4:39:43"}
378
+ {"current_steps": 3710, "total_steps": 4212, "loss": 0.2112, "lr": 4.283478340184893e-06, "epoch": 0.8809212869523922, "percentage": 88.08, "elapsed_time": "1 day, 9:46:43", "remaining_time": "4:34:14"}
379
+ {"current_steps": 3720, "total_steps": 4212, "loss": 0.2091, "lr": 4.11720816978714e-06, "epoch": 0.8832957378606198, "percentage": 88.32, "elapsed_time": "1 day, 9:51:59", "remaining_time": "4:28:44"}
380
+ {"current_steps": 3730, "total_steps": 4212, "loss": 0.2038, "lr": 3.95409059795987e-06, "epoch": 0.8856701887688472, "percentage": 88.56, "elapsed_time": "1 day, 9:56:00", "remaining_time": "4:23:05"}
381
+ {"current_steps": 3740, "total_steps": 4212, "loss": 0.2478, "lr": 3.7941368324828253e-06, "epoch": 0.8880446396770747, "percentage": 88.79, "elapsed_time": "1 day, 10:00:17", "remaining_time": "4:17:29"}
382
+ {"current_steps": 3750, "total_steps": 4212, "loss": 0.2081, "lr": 3.6373578637511283e-06, "epoch": 0.8904190905853021, "percentage": 89.03, "elapsed_time": "1 day, 10:05:04", "remaining_time": "4:11:57"}
383
+ {"current_steps": 3760, "total_steps": 4212, "loss": 0.224, "lr": 3.4837644640202003e-06, "epoch": 0.8927935414935296, "percentage": 89.27, "elapsed_time": "1 day, 10:09:12", "remaining_time": "4:06:20"}
384
+ {"current_steps": 3770, "total_steps": 4212, "loss": 0.2222, "lr": 3.333367186665576e-06, "epoch": 0.895167992401757, "percentage": 89.51, "elapsed_time": "1 day, 10:14:12", "remaining_time": "4:00:50"}
385
+ {"current_steps": 3780, "total_steps": 4212, "loss": 0.2212, "lr": 3.186176365457766e-06, "epoch": 0.8975424433099846, "percentage": 89.74, "elapsed_time": "1 day, 10:19:27", "remaining_time": "3:55:22"}
386
+ {"current_steps": 3790, "total_steps": 4212, "loss": 0.1949, "lr": 3.042202113852255e-06, "epoch": 0.8999168942182121, "percentage": 89.98, "elapsed_time": "1 day, 10:23:50", "remaining_time": "3:49:48"}
387
+ {"current_steps": 3800, "total_steps": 4212, "loss": 0.2108, "lr": 2.9014543242945837e-06, "epoch": 0.9022913451264395, "percentage": 90.22, "elapsed_time": "1 day, 10:29:25", "remaining_time": "3:44:22"}
388
+ {"current_steps": 3810, "total_steps": 4212, "loss": 0.2566, "lr": 2.7639426675406753e-06, "epoch": 0.904665796034667, "percentage": 90.46, "elapsed_time": "1 day, 10:34:22", "remaining_time": "3:38:52"}
389
+ {"current_steps": 3820, "total_steps": 4212, "loss": 0.2098, "lr": 2.629676591992314e-06, "epoch": 0.9070402469428944, "percentage": 90.69, "elapsed_time": "1 day, 10:38:36", "remaining_time": "3:33:18"}
390
+ {"current_steps": 3830, "total_steps": 4212, "loss": 0.2456, "lr": 2.498665323047966e-06, "epoch": 0.909414697851122, "percentage": 90.93, "elapsed_time": "1 day, 10:43:47", "remaining_time": "3:27:50"}
391
+ {"current_steps": 3840, "total_steps": 4212, "loss": 0.203, "lr": 2.370917862468941e-06, "epoch": 0.9117891487593494, "percentage": 91.17, "elapsed_time": "1 day, 10:49:09", "remaining_time": "3:22:23"}
392
+ {"current_steps": 3850, "total_steps": 4212, "loss": 0.2151, "lr": 2.2464429877607995e-06, "epoch": 0.9141635996675769, "percentage": 91.41, "elapsed_time": "1 day, 10:53:31", "remaining_time": "3:16:50"}
393
+ {"current_steps": 3860, "total_steps": 4212, "loss": 0.2254, "lr": 2.1252492515703382e-06, "epoch": 0.9165380505758044, "percentage": 91.64, "elapsed_time": "1 day, 10:59:12", "remaining_time": "3:11:25"}
394
+ {"current_steps": 3870, "total_steps": 4212, "loss": 0.1823, "lr": 2.0073449810978974e-06, "epoch": 0.9189125014840318, "percentage": 91.88, "elapsed_time": "1 day, 11:04:44", "remaining_time": "3:06:00"}
395
+ {"current_steps": 3880, "total_steps": 4212, "loss": 0.2369, "lr": 1.8927382775251856e-06, "epoch": 0.9212869523922593, "percentage": 92.12, "elapsed_time": "1 day, 11:10:33", "remaining_time": "3:00:35"}
396
+ {"current_steps": 3890, "total_steps": 4212, "loss": 0.1982, "lr": 1.781437015458698e-06, "epoch": 0.9236614033004867, "percentage": 92.36, "elapsed_time": "1 day, 11:15:35", "remaining_time": "2:55:07"}
397
+ {"current_steps": 3900, "total_steps": 4212, "loss": 0.205, "lr": 1.673448842388603e-06, "epoch": 0.9260358542087143, "percentage": 92.59, "elapsed_time": "1 day, 11:20:04", "remaining_time": "2:49:36"}
398
+ {"current_steps": 3910, "total_steps": 4212, "loss": 0.2055, "lr": 1.5687811781633033e-06, "epoch": 0.9284103051169417, "percentage": 92.83, "elapsed_time": "1 day, 11:24:22", "remaining_time": "2:44:04"}
399
+ {"current_steps": 3920, "total_steps": 4212, "loss": 0.1943, "lr": 1.4674412144796368e-06, "epoch": 0.9307847560251692, "percentage": 93.07, "elapsed_time": "1 day, 11:29:07", "remaining_time": "2:38:35"}
400
+ {"current_steps": 3930, "total_steps": 4212, "loss": 0.1821, "lr": 1.3694359143887225e-06, "epoch": 0.9331592069333966, "percentage": 93.3, "elapsed_time": "1 day, 11:34:11", "remaining_time": "2:33:08"}
401
+ {"current_steps": 3940, "total_steps": 4212, "loss": 0.1794, "lr": 1.2747720118175099e-06, "epoch": 0.9355336578416241, "percentage": 93.54, "elapsed_time": "1 day, 11:38:57", "remaining_time": "2:27:39"}
402
+ {"current_steps": 3950, "total_steps": 4212, "loss": 0.1955, "lr": 1.1834560111061211e-06, "epoch": 0.9379081087498516, "percentage": 93.78, "elapsed_time": "1 day, 11:43:48", "remaining_time": "2:22:11"}
403
+ {"current_steps": 3960, "total_steps": 4212, "loss": 0.2071, "lr": 1.095494186560947e-06, "epoch": 0.940282559658079, "percentage": 94.02, "elapsed_time": "1 day, 11:48:12", "remaining_time": "2:16:42"}
404
+ {"current_steps": 3970, "total_steps": 4212, "loss": 0.2151, "lr": 1.0108925820234926e-06, "epoch": 0.9426570105663066, "percentage": 94.25, "elapsed_time": "1 day, 11:52:08", "remaining_time": "2:11:11"}
405
+ {"current_steps": 3980, "total_steps": 4212, "loss": 0.1961, "lr": 9.29657010455165e-07, "epoch": 0.945031461474534, "percentage": 94.49, "elapsed_time": "1 day, 11:56:23", "remaining_time": "2:05:41"}
406
+ {"current_steps": 3990, "total_steps": 4212, "loss": 0.2276, "lr": 8.517930535378083e-07, "epoch": 0.9474059123827615, "percentage": 94.73, "elapsed_time": "1 day, 12:01:32", "remaining_time": "2:00:15"}
407
+ {"current_steps": 4000, "total_steps": 4212, "loss": 0.2301, "lr": 7.773060612902395e-07, "epoch": 0.9497803632909889, "percentage": 94.97, "elapsed_time": "1 day, 12:05:34", "remaining_time": "1:54:46"}
408
+ {"current_steps": 4000, "total_steps": 4212, "eval_loss": 0.20879201591014862, "epoch": 0.9497803632909889, "percentage": 94.97, "elapsed_time": "1 day, 12:38:52", "remaining_time": "1:56:32"}
409
+ {"current_steps": 4010, "total_steps": 4212, "loss": 0.1919, "lr": 7.062011517006139e-07, "epoch": 0.9521548141992164, "percentage": 95.2, "elapsed_time": "1 day, 12:44:23", "remaining_time": "1:51:02"}
410
+ {"current_steps": 4020, "total_steps": 4212, "loss": 0.2081, "lr": 6.384832103747907e-07, "epoch": 0.954529265107444, "percentage": 95.44, "elapsed_time": "1 day, 12:48:58", "remaining_time": "1:45:30"}
411
+ {"current_steps": 4030, "total_steps": 4212, "loss": 0.1962, "lr": 5.741568902006277e-07, "epoch": 0.9569037160156714, "percentage": 95.68, "elapsed_time": "1 day, 12:53:27", "remaining_time": "1:39:57"}
412
+ {"current_steps": 4040, "total_steps": 4212, "loss": 0.1665, "lr": 5.132266110282835e-07, "epoch": 0.9592781669238989, "percentage": 95.92, "elapsed_time": "1 day, 12:58:38", "remaining_time": "1:34:27"}
413
+ {"current_steps": 4050, "total_steps": 4212, "loss": 0.2127, "lr": 4.5569655936654186e-07, "epoch": 0.9616526178321263, "percentage": 96.15, "elapsed_time": "1 day, 13:04:03", "remaining_time": "1:28:57"}
414
+ {"current_steps": 4060, "total_steps": 4212, "loss": 0.2103, "lr": 4.0157068809515417e-07, "epoch": 0.9640270687403538, "percentage": 96.39, "elapsed_time": "1 day, 13:09:18", "remaining_time": "1:23:27"}
415
+ {"current_steps": 4070, "total_steps": 4212, "loss": 0.2309, "lr": 3.50852716193234e-07, "epoch": 0.9664015196485812, "percentage": 96.63, "elapsed_time": "1 day, 13:14:36", "remaining_time": "1:17:57"}
416
+ {"current_steps": 4080, "total_steps": 4212, "loss": 0.2033, "lr": 3.0354612848372265e-07, "epoch": 0.9687759705568088, "percentage": 96.87, "elapsed_time": "1 day, 13:19:54", "remaining_time": "1:12:28"}
417
+ {"current_steps": 4090, "total_steps": 4212, "loss": 0.1892, "lr": 2.59654175393953e-07, "epoch": 0.9711504214650362, "percentage": 97.1, "elapsed_time": "1 day, 13:25:31", "remaining_time": "1:06:58"}
418
+ {"current_steps": 4100, "total_steps": 4212, "loss": 0.2659, "lr": 2.1917987273232245e-07, "epoch": 0.9735248723732637, "percentage": 97.34, "elapsed_time": "1 day, 13:30:52", "remaining_time": "1:01:29"}
419
+ {"current_steps": 4110, "total_steps": 4212, "loss": 0.1669, "lr": 1.8212600148105884e-07, "epoch": 0.9758993232814912, "percentage": 97.58, "elapsed_time": "1 day, 13:35:27", "remaining_time": "0:55:58"}
420
+ {"current_steps": 4120, "total_steps": 4212, "loss": 0.2005, "lr": 1.4849510760513995e-07, "epoch": 0.9782737741897186, "percentage": 97.82, "elapsed_time": "1 day, 13:40:04", "remaining_time": "0:50:28"}
421
+ {"current_steps": 4130, "total_steps": 4212, "loss": 0.1955, "lr": 1.182895018773944e-07, "epoch": 0.9806482250979461, "percentage": 98.05, "elapsed_time": "1 day, 13:44:24", "remaining_time": "0:44:57"}
422
+ {"current_steps": 4140, "total_steps": 4212, "loss": 0.1946, "lr": 9.151125971967878e-08, "epoch": 0.9830226760061735, "percentage": 98.29, "elapsed_time": "1 day, 13:49:12", "remaining_time": "0:39:27"}
423
+ {"current_steps": 4150, "total_steps": 4212, "loss": 0.2244, "lr": 6.816222106030834e-08, "epoch": 0.9853971269144011, "percentage": 98.53, "elapsed_time": "1 day, 13:53:46", "remaining_time": "0:33:58"}
424
+ {"current_steps": 4160, "total_steps": 4212, "loss": 0.2032, "lr": 4.824399020763593e-08, "epoch": 0.9877715778226285, "percentage": 98.77, "elapsed_time": "1 day, 13:59:03", "remaining_time": "0:28:29"}
425
+ {"current_steps": 4170, "total_steps": 4212, "loss": 0.2069, "lr": 3.175793573980124e-08, "epoch": 0.990146028730856, "percentage": 99.0, "elapsed_time": "1 day, 14:03:19", "remaining_time": "0:22:59"}
426
+ {"current_steps": 4180, "total_steps": 4212, "loss": 0.2685, "lr": 1.8705190410717166e-08, "epoch": 0.9925204796390834, "percentage": 99.24, "elapsed_time": "1 day, 14:07:56", "remaining_time": "0:17:30"}
427
+ {"current_steps": 4190, "total_steps": 4212, "loss": 0.1907, "lr": 9.086651072215402e-09, "epoch": 0.9948949305473109, "percentage": 99.48, "elapsed_time": "1 day, 14:12:38", "remaining_time": "0:12:02"}
428
+ {"current_steps": 4200, "total_steps": 4212, "loss": 0.211, "lr": 2.902978612456808e-09, "epoch": 0.9972693814555385, "percentage": 99.72, "elapsed_time": "1 day, 14:16:54", "remaining_time": "0:06:33"}
429
+ {"current_steps": 4210, "total_steps": 4212, "loss": 0.2539, "lr": 1.5459791047889305e-10, "epoch": 0.9996438323637659, "percentage": 99.95, "elapsed_time": "1 day, 14:22:08", "remaining_time": "0:01:05"}
430
+ {"current_steps": 4212, "total_steps": 4212, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "1 day, 14:23:20", "remaining_time": "0:00:00"}
qwen2-gob-plan-115/trainer_state.json ADDED
@@ -0,0 +1,3054 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4212,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0023744509082274726,
14
+ "grad_norm": 17.46586799621582,
15
+ "learning_rate": 2.1327014218009483e-06,
16
+ "loss": 2.1021,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.004748901816454945,
21
+ "grad_norm": 8.901350021362305,
22
+ "learning_rate": 4.502369668246446e-06,
23
+ "loss": 1.0168,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.007123352724682417,
28
+ "grad_norm": 6.82363224029541,
29
+ "learning_rate": 6.8720379146919435e-06,
30
+ "loss": 0.6508,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.00949780363290989,
35
+ "grad_norm": 5.696808338165283,
36
+ "learning_rate": 9.24170616113744e-06,
37
+ "loss": 0.5579,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.011872254541137363,
42
+ "grad_norm": 7.2419819831848145,
43
+ "learning_rate": 1.161137440758294e-05,
44
+ "loss": 0.5911,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.014246705449364834,
49
+ "grad_norm": 5.160392761230469,
50
+ "learning_rate": 1.3981042654028437e-05,
51
+ "loss": 0.5363,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.016621156357592308,
56
+ "grad_norm": 5.5930962562561035,
57
+ "learning_rate": 1.6350710900473933e-05,
58
+ "loss": 0.5813,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.01899560726581978,
63
+ "grad_norm": 5.244218349456787,
64
+ "learning_rate": 1.872037914691943e-05,
65
+ "loss": 0.5928,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.021370058174047253,
70
+ "grad_norm": 5.168963432312012,
71
+ "learning_rate": 2.109004739336493e-05,
72
+ "loss": 0.5576,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.023744509082274726,
77
+ "grad_norm": 5.0075836181640625,
78
+ "learning_rate": 2.345971563981043e-05,
79
+ "loss": 0.5891,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.026118959990502195,
84
+ "grad_norm": 4.455913066864014,
85
+ "learning_rate": 2.5829383886255927e-05,
86
+ "loss": 0.6125,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.028493410898729667,
91
+ "grad_norm": 3.4008946418762207,
92
+ "learning_rate": 2.8199052132701424e-05,
93
+ "loss": 0.5772,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.03086786180695714,
98
+ "grad_norm": 4.0339884757995605,
99
+ "learning_rate": 3.056872037914692e-05,
100
+ "loss": 0.6367,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.033242312715184616,
105
+ "grad_norm": 3.819330930709839,
106
+ "learning_rate": 3.293838862559242e-05,
107
+ "loss": 0.6695,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.03561676362341209,
112
+ "grad_norm": 3.6872074604034424,
113
+ "learning_rate": 3.530805687203792e-05,
114
+ "loss": 0.6226,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.03799121453163956,
119
+ "grad_norm": 3.7728962898254395,
120
+ "learning_rate": 3.767772511848342e-05,
121
+ "loss": 0.6079,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.040365665439867034,
126
+ "grad_norm": 3.776675224304199,
127
+ "learning_rate": 4.004739336492891e-05,
128
+ "loss": 0.6063,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.042740116348094506,
133
+ "grad_norm": 3.165210485458374,
134
+ "learning_rate": 4.2417061611374406e-05,
135
+ "loss": 0.5927,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.04511456725632198,
140
+ "grad_norm": 2.95751690864563,
141
+ "learning_rate": 4.478672985781991e-05,
142
+ "loss": 0.6349,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.04748901816454945,
147
+ "grad_norm": 3.4776411056518555,
148
+ "learning_rate": 4.71563981042654e-05,
149
+ "loss": 0.6307,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.04986346907277692,
154
+ "grad_norm": 4.28296422958374,
155
+ "learning_rate": 4.95260663507109e-05,
156
+ "loss": 0.6583,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.05223791998100439,
161
+ "grad_norm": 2.8013813495635986,
162
+ "learning_rate": 5.1895734597156396e-05,
163
+ "loss": 0.7049,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.05461237088923186,
168
+ "grad_norm": 2.8767004013061523,
169
+ "learning_rate": 5.42654028436019e-05,
170
+ "loss": 0.6285,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.056986821797459335,
175
+ "grad_norm": 3.0225541591644287,
176
+ "learning_rate": 5.66350710900474e-05,
177
+ "loss": 0.6836,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.05936127270568681,
182
+ "grad_norm": 2.8183484077453613,
183
+ "learning_rate": 5.90047393364929e-05,
184
+ "loss": 0.641,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.06173572361391428,
189
+ "grad_norm": 3.104975700378418,
190
+ "learning_rate": 6.137440758293839e-05,
191
+ "loss": 0.6517,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.06411017452214175,
196
+ "grad_norm": 3.1265125274658203,
197
+ "learning_rate": 6.374407582938389e-05,
198
+ "loss": 0.7061,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.06648462543036923,
203
+ "grad_norm": 149.14334106445312,
204
+ "learning_rate": 6.611374407582939e-05,
205
+ "loss": 0.9615,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.0688590763385967,
210
+ "grad_norm": 3.6724066734313965,
211
+ "learning_rate": 6.848341232227489e-05,
212
+ "loss": 0.7798,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.07123352724682418,
217
+ "grad_norm": 4.057666778564453,
218
+ "learning_rate": 7.085308056872039e-05,
219
+ "loss": 0.6821,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.07360797815505164,
224
+ "grad_norm": 2.4661478996276855,
225
+ "learning_rate": 7.322274881516588e-05,
226
+ "loss": 0.6711,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.07598242906327912,
231
+ "grad_norm": 3.435774326324463,
232
+ "learning_rate": 7.559241706161138e-05,
233
+ "loss": 0.6888,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.07835687997150659,
238
+ "grad_norm": 2.5769989490509033,
239
+ "learning_rate": 7.796208530805688e-05,
240
+ "loss": 0.8411,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.08073133087973407,
245
+ "grad_norm": 2.518162250518799,
246
+ "learning_rate": 8.033175355450238e-05,
247
+ "loss": 0.6802,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.08310578178796153,
252
+ "grad_norm": 6.829289436340332,
253
+ "learning_rate": 8.270142180094788e-05,
254
+ "loss": 0.7479,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.08548023269618901,
259
+ "grad_norm": 2.6685585975646973,
260
+ "learning_rate": 8.507109004739337e-05,
261
+ "loss": 0.7292,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.08785468360441648,
266
+ "grad_norm": 2.4462921619415283,
267
+ "learning_rate": 8.744075829383887e-05,
268
+ "loss": 0.6956,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.09022913451264396,
273
+ "grad_norm": 2.393085479736328,
274
+ "learning_rate": 8.981042654028437e-05,
275
+ "loss": 0.7527,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.09260358542087142,
280
+ "grad_norm": 2.7507741451263428,
281
+ "learning_rate": 9.218009478672986e-05,
282
+ "loss": 0.7719,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.0949780363290989,
287
+ "grad_norm": 2.751133441925049,
288
+ "learning_rate": 9.454976303317536e-05,
289
+ "loss": 0.7168,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.09735248723732637,
294
+ "grad_norm": 2.7662034034729004,
295
+ "learning_rate": 9.691943127962086e-05,
296
+ "loss": 0.8366,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.09972693814555383,
301
+ "grad_norm": 3.0649664402008057,
302
+ "learning_rate": 9.928909952606635e-05,
303
+ "loss": 0.7962,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.10210138905378131,
308
+ "grad_norm": 2.3150815963745117,
309
+ "learning_rate": 9.999915830219296e-05,
310
+ "loss": 0.8253,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.10447583996200878,
315
+ "grad_norm": 2.705899238586426,
316
+ "learning_rate": 9.999503576890838e-05,
317
+ "loss": 0.7228,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.10685029087023626,
322
+ "grad_norm": 4.077371597290039,
323
+ "learning_rate": 9.998747808549429e-05,
324
+ "loss": 0.8091,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.10922474177846372,
329
+ "grad_norm": 2.475609302520752,
330
+ "learning_rate": 9.997648577123782e-05,
331
+ "loss": 0.8117,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.1115991926866912,
336
+ "grad_norm": 4.027471542358398,
337
+ "learning_rate": 9.996205958141894e-05,
338
+ "loss": 0.7577,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.11397364359491867,
343
+ "grad_norm": 2.330831289291382,
344
+ "learning_rate": 9.994420050725863e-05,
345
+ "loss": 0.8031,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.11634809450314615,
350
+ "grad_norm": 4.374856948852539,
351
+ "learning_rate": 9.992290977585072e-05,
352
+ "loss": 0.7887,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.11872254541137361,
357
+ "grad_norm": 2.5677125453948975,
358
+ "learning_rate": 9.989818885007766e-05,
359
+ "loss": 0.8213,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.11872254541137361,
364
+ "eval_loss": 0.7407782077789307,
365
+ "eval_runtime": 2005.0626,
366
+ "eval_samples_per_second": 1.867,
367
+ "eval_steps_per_second": 0.467,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 0.1210969963196011,
372
+ "grad_norm": 2.1170384883880615,
373
+ "learning_rate": 9.987003942850989e-05,
374
+ "loss": 0.7828,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 0.12347144722782856,
379
+ "grad_norm": 3.637371301651001,
380
+ "learning_rate": 9.983846344528923e-05,
381
+ "loss": 0.7454,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 0.12584589813605604,
386
+ "grad_norm": 3.4971210956573486,
387
+ "learning_rate": 9.980346306999596e-05,
388
+ "loss": 0.6686,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 0.1282203490442835,
393
+ "grad_norm": 2.935371160507202,
394
+ "learning_rate": 9.976504070749969e-05,
395
+ "loss": 0.9724,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 0.13059479995251097,
400
+ "grad_norm": 1.999279260635376,
401
+ "learning_rate": 9.972319899779422e-05,
402
+ "loss": 0.735,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 0.13296925086073846,
407
+ "grad_norm": 6.134634017944336,
408
+ "learning_rate": 9.967794081581606e-05,
409
+ "loss": 0.7613,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 0.13534370176896593,
414
+ "grad_norm": 2.5179545879364014,
415
+ "learning_rate": 9.962926927124697e-05,
416
+ "loss": 0.7509,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 0.1377181526771934,
421
+ "grad_norm": 2.3336329460144043,
422
+ "learning_rate": 9.957718770830022e-05,
423
+ "loss": 0.7055,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 0.14009260358542086,
428
+ "grad_norm": 2.2505533695220947,
429
+ "learning_rate": 9.952169970549088e-05,
430
+ "loss": 0.7494,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 0.14246705449364835,
435
+ "grad_norm": 1.9913499355316162,
436
+ "learning_rate": 9.946280907538985e-05,
437
+ "loss": 0.7347,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 0.14484150540187582,
442
+ "grad_norm": 3.043692111968994,
443
+ "learning_rate": 9.940051986436198e-05,
444
+ "loss": 0.6848,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 0.14721595631010329,
449
+ "grad_norm": 2.2284398078918457,
450
+ "learning_rate": 9.933483635228804e-05,
451
+ "loss": 0.7168,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 0.14959040721833075,
456
+ "grad_norm": 2.5508503913879395,
457
+ "learning_rate": 9.926576305227063e-05,
458
+ "loss": 0.7163,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 0.15196485812655824,
463
+ "grad_norm": 2.114255428314209,
464
+ "learning_rate": 9.919330471032401e-05,
465
+ "loss": 0.639,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 0.1543393090347857,
470
+ "grad_norm": 2.1416802406311035,
471
+ "learning_rate": 9.911746630504818e-05,
472
+ "loss": 0.6972,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 0.15671375994301318,
477
+ "grad_norm": 2.205293893814087,
478
+ "learning_rate": 9.903825304728664e-05,
479
+ "loss": 0.7853,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 0.15908821085124064,
484
+ "grad_norm": 2.643953800201416,
485
+ "learning_rate": 9.895567037976842e-05,
486
+ "loss": 0.6616,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 0.16146266175946813,
491
+ "grad_norm": 1.8454670906066895,
492
+ "learning_rate": 9.88697239767341e-05,
493
+ "loss": 0.6649,
494
+ "step": 680
495
+ },
496
+ {
497
+ "epoch": 0.1638371126676956,
498
+ "grad_norm": 2.498075008392334,
499
+ "learning_rate": 9.878041974354598e-05,
500
+ "loss": 0.7677,
501
+ "step": 690
502
+ },
503
+ {
504
+ "epoch": 0.16621156357592307,
505
+ "grad_norm": 2.1788713932037354,
506
+ "learning_rate": 9.868776381628218e-05,
507
+ "loss": 0.7151,
508
+ "step": 700
509
+ },
510
+ {
511
+ "epoch": 0.16858601448415053,
512
+ "grad_norm": 1.8394912481307983,
513
+ "learning_rate": 9.859176256131522e-05,
514
+ "loss": 0.8004,
515
+ "step": 710
516
+ },
517
+ {
518
+ "epoch": 0.17096046539237802,
519
+ "grad_norm": 1.8285107612609863,
520
+ "learning_rate": 9.849242257487447e-05,
521
+ "loss": 0.686,
522
+ "step": 720
523
+ },
524
+ {
525
+ "epoch": 0.1733349163006055,
526
+ "grad_norm": 2.7288475036621094,
527
+ "learning_rate": 9.838975068259297e-05,
528
+ "loss": 0.7076,
529
+ "step": 730
530
+ },
531
+ {
532
+ "epoch": 0.17570936720883296,
533
+ "grad_norm": 3.241076707839966,
534
+ "learning_rate": 9.828375393903842e-05,
535
+ "loss": 0.7879,
536
+ "step": 740
537
+ },
538
+ {
539
+ "epoch": 0.17808381811706042,
540
+ "grad_norm": 1.8407208919525146,
541
+ "learning_rate": 9.817443962722843e-05,
542
+ "loss": 0.72,
543
+ "step": 750
544
+ },
545
+ {
546
+ "epoch": 0.18045826902528792,
547
+ "grad_norm": 2.23694109916687,
548
+ "learning_rate": 9.806181525813019e-05,
549
+ "loss": 0.7231,
550
+ "step": 760
551
+ },
552
+ {
553
+ "epoch": 0.18283271993351538,
554
+ "grad_norm": 2.168240785598755,
555
+ "learning_rate": 9.79458885701443e-05,
556
+ "loss": 0.6952,
557
+ "step": 770
558
+ },
559
+ {
560
+ "epoch": 0.18520717084174285,
561
+ "grad_norm": 1.7602572441101074,
562
+ "learning_rate": 9.782666752857317e-05,
563
+ "loss": 0.6398,
564
+ "step": 780
565
+ },
566
+ {
567
+ "epoch": 0.1875816217499703,
568
+ "grad_norm": 2.375122547149658,
569
+ "learning_rate": 9.770416032507361e-05,
570
+ "loss": 0.7202,
571
+ "step": 790
572
+ },
573
+ {
574
+ "epoch": 0.1899560726581978,
575
+ "grad_norm": 1.909433126449585,
576
+ "learning_rate": 9.757837537709407e-05,
577
+ "loss": 0.6733,
578
+ "step": 800
579
+ },
580
+ {
581
+ "epoch": 0.19233052356642527,
582
+ "grad_norm": 2.0807507038116455,
583
+ "learning_rate": 9.744932132729625e-05,
584
+ "loss": 0.7764,
585
+ "step": 810
586
+ },
587
+ {
588
+ "epoch": 0.19470497447465274,
589
+ "grad_norm": 2.5525600910186768,
590
+ "learning_rate": 9.731700704296126e-05,
591
+ "loss": 0.6662,
592
+ "step": 820
593
+ },
594
+ {
595
+ "epoch": 0.1970794253828802,
596
+ "grad_norm": 1.9601514339447021,
597
+ "learning_rate": 9.71814416153803e-05,
598
+ "loss": 0.6228,
599
+ "step": 830
600
+ },
601
+ {
602
+ "epoch": 0.19945387629110767,
603
+ "grad_norm": 1.6027888059616089,
604
+ "learning_rate": 9.704263435923014e-05,
605
+ "loss": 0.6479,
606
+ "step": 840
607
+ },
608
+ {
609
+ "epoch": 0.20182832719933516,
610
+ "grad_norm": 2.980756998062134,
611
+ "learning_rate": 9.690059481193295e-05,
612
+ "loss": 0.8001,
613
+ "step": 850
614
+ },
615
+ {
616
+ "epoch": 0.20420277810756263,
617
+ "grad_norm": 1.9024136066436768,
618
+ "learning_rate": 9.675533273300111e-05,
619
+ "loss": 0.5831,
620
+ "step": 860
621
+ },
622
+ {
623
+ "epoch": 0.2065772290157901,
624
+ "grad_norm": 1.647428274154663,
625
+ "learning_rate": 9.660685810336654e-05,
626
+ "loss": 0.6441,
627
+ "step": 870
628
+ },
629
+ {
630
+ "epoch": 0.20895167992401756,
631
+ "grad_norm": 1.460141897201538,
632
+ "learning_rate": 9.645518112469498e-05,
633
+ "loss": 0.6613,
634
+ "step": 880
635
+ },
636
+ {
637
+ "epoch": 0.21132613083224505,
638
+ "grad_norm": 1.7688757181167603,
639
+ "learning_rate": 9.630031221868501e-05,
640
+ "loss": 0.5891,
641
+ "step": 890
642
+ },
643
+ {
644
+ "epoch": 0.21370058174047252,
645
+ "grad_norm": 2.1308674812316895,
646
+ "learning_rate": 9.614226202635195e-05,
647
+ "loss": 0.6525,
648
+ "step": 900
649
+ },
650
+ {
651
+ "epoch": 0.21607503264869998,
652
+ "grad_norm": 1.9658663272857666,
653
+ "learning_rate": 9.59810414072968e-05,
654
+ "loss": 0.6435,
655
+ "step": 910
656
+ },
657
+ {
658
+ "epoch": 0.21844948355692745,
659
+ "grad_norm": 1.6313807964324951,
660
+ "learning_rate": 9.581666143895994e-05,
661
+ "loss": 0.689,
662
+ "step": 920
663
+ },
664
+ {
665
+ "epoch": 0.22082393446515494,
666
+ "grad_norm": 1.8163197040557861,
667
+ "learning_rate": 9.564913341586017e-05,
668
+ "loss": 0.6445,
669
+ "step": 930
670
+ },
671
+ {
672
+ "epoch": 0.2231983853733824,
673
+ "grad_norm": 2.0528953075408936,
674
+ "learning_rate": 9.547846884881853e-05,
675
+ "loss": 0.6848,
676
+ "step": 940
677
+ },
678
+ {
679
+ "epoch": 0.22557283628160987,
680
+ "grad_norm": 1.8023221492767334,
681
+ "learning_rate": 9.530467946416745e-05,
682
+ "loss": 0.6939,
683
+ "step": 950
684
+ },
685
+ {
686
+ "epoch": 0.22794728718983734,
687
+ "grad_norm": 1.956764578819275,
688
+ "learning_rate": 9.512777720294504e-05,
689
+ "loss": 0.6167,
690
+ "step": 960
691
+ },
692
+ {
693
+ "epoch": 0.23032173809806483,
694
+ "grad_norm": 2.1776814460754395,
695
+ "learning_rate": 9.494777422007462e-05,
696
+ "loss": 0.6839,
697
+ "step": 970
698
+ },
699
+ {
700
+ "epoch": 0.2326961890062923,
701
+ "grad_norm": 1.680591106414795,
702
+ "learning_rate": 9.476468288352951e-05,
703
+ "loss": 0.5922,
704
+ "step": 980
705
+ },
706
+ {
707
+ "epoch": 0.23507063991451976,
708
+ "grad_norm": 1.850630760192871,
709
+ "learning_rate": 9.457851577348332e-05,
710
+ "loss": 0.5699,
711
+ "step": 990
712
+ },
713
+ {
714
+ "epoch": 0.23744509082274723,
715
+ "grad_norm": 1.4332187175750732,
716
+ "learning_rate": 9.438928568144547e-05,
717
+ "loss": 0.5742,
718
+ "step": 1000
719
+ },
720
+ {
721
+ "epoch": 0.23744509082274723,
722
+ "eval_loss": 0.6352267861366272,
723
+ "eval_runtime": 2000.6672,
724
+ "eval_samples_per_second": 1.871,
725
+ "eval_steps_per_second": 0.468,
726
+ "step": 1000
727
+ },
728
+ {
729
+ "epoch": 0.23981954173097472,
730
+ "grad_norm": 1.6159008741378784,
731
+ "learning_rate": 9.41970056093824e-05,
732
+ "loss": 0.6413,
733
+ "step": 1010
734
+ },
735
+ {
736
+ "epoch": 0.2421939926392022,
737
+ "grad_norm": 1.7297577857971191,
738
+ "learning_rate": 9.400168876882408e-05,
739
+ "loss": 0.6281,
740
+ "step": 1020
741
+ },
742
+ {
743
+ "epoch": 0.24456844354742965,
744
+ "grad_norm": 1.577554702758789,
745
+ "learning_rate": 9.380334857995629e-05,
746
+ "loss": 0.6787,
747
+ "step": 1030
748
+ },
749
+ {
750
+ "epoch": 0.24694289445565712,
751
+ "grad_norm": 1.6764177083969116,
752
+ "learning_rate": 9.360199867069866e-05,
753
+ "loss": 0.6043,
754
+ "step": 1040
755
+ },
756
+ {
757
+ "epoch": 0.2493173453638846,
758
+ "grad_norm": 1.7127679586410522,
759
+ "learning_rate": 9.339765287576803e-05,
760
+ "loss": 0.5633,
761
+ "step": 1050
762
+ },
763
+ {
764
+ "epoch": 0.2516917962721121,
765
+ "grad_norm": 1.5897778272628784,
766
+ "learning_rate": 9.319032523572815e-05,
767
+ "loss": 0.7124,
768
+ "step": 1060
769
+ },
770
+ {
771
+ "epoch": 0.25406624718033954,
772
+ "grad_norm": 1.6537786722183228,
773
+ "learning_rate": 9.298002999602471e-05,
774
+ "loss": 0.5931,
775
+ "step": 1070
776
+ },
777
+ {
778
+ "epoch": 0.256440698088567,
779
+ "grad_norm": 1.4399652481079102,
780
+ "learning_rate": 9.276678160600674e-05,
781
+ "loss": 0.724,
782
+ "step": 1080
783
+ },
784
+ {
785
+ "epoch": 0.2588151489967945,
786
+ "grad_norm": 1.9762135744094849,
787
+ "learning_rate": 9.255059471793369e-05,
788
+ "loss": 0.6224,
789
+ "step": 1090
790
+ },
791
+ {
792
+ "epoch": 0.26118959990502194,
793
+ "grad_norm": 1.9962340593338013,
794
+ "learning_rate": 9.233148418596862e-05,
795
+ "loss": 0.5864,
796
+ "step": 1100
797
+ },
798
+ {
799
+ "epoch": 0.26356405081324946,
800
+ "grad_norm": 1.3939169645309448,
801
+ "learning_rate": 9.210946506515777e-05,
802
+ "loss": 0.5933,
803
+ "step": 1110
804
+ },
805
+ {
806
+ "epoch": 0.2659385017214769,
807
+ "grad_norm": 1.665360689163208,
808
+ "learning_rate": 9.188455261039592e-05,
809
+ "loss": 0.5608,
810
+ "step": 1120
811
+ },
812
+ {
813
+ "epoch": 0.2683129526297044,
814
+ "grad_norm": 1.626900315284729,
815
+ "learning_rate": 9.165676227537836e-05,
816
+ "loss": 0.5591,
817
+ "step": 1130
818
+ },
819
+ {
820
+ "epoch": 0.27068740353793186,
821
+ "grad_norm": 1.8547568321228027,
822
+ "learning_rate": 9.1426109711539e-05,
823
+ "loss": 0.6269,
824
+ "step": 1140
825
+ },
826
+ {
827
+ "epoch": 0.2730618544461593,
828
+ "grad_norm": 1.4865916967391968,
829
+ "learning_rate": 9.1192610766975e-05,
830
+ "loss": 0.5785,
831
+ "step": 1150
832
+ },
833
+ {
834
+ "epoch": 0.2754363053543868,
835
+ "grad_norm": 1.7755070924758911,
836
+ "learning_rate": 9.095628148535788e-05,
837
+ "loss": 0.7172,
838
+ "step": 1160
839
+ },
840
+ {
841
+ "epoch": 0.27781075626261426,
842
+ "grad_norm": 1.3893502950668335,
843
+ "learning_rate": 9.071713810483103e-05,
844
+ "loss": 0.5275,
845
+ "step": 1170
846
+ },
847
+ {
848
+ "epoch": 0.2801852071708417,
849
+ "grad_norm": 1.7527644634246826,
850
+ "learning_rate": 9.047519705689418e-05,
851
+ "loss": 0.6399,
852
+ "step": 1180
853
+ },
854
+ {
855
+ "epoch": 0.28255965807906924,
856
+ "grad_norm": 1.5130287408828735,
857
+ "learning_rate": 9.023047496527423e-05,
858
+ "loss": 0.6105,
859
+ "step": 1190
860
+ },
861
+ {
862
+ "epoch": 0.2849341089872967,
863
+ "grad_norm": 1.838929533958435,
864
+ "learning_rate": 8.998298864478314e-05,
865
+ "loss": 0.609,
866
+ "step": 1200
867
+ },
868
+ {
869
+ "epoch": 0.2873085598955242,
870
+ "grad_norm": 1.4919387102127075,
871
+ "learning_rate": 8.973275510016252e-05,
872
+ "loss": 0.5436,
873
+ "step": 1210
874
+ },
875
+ {
876
+ "epoch": 0.28968301080375164,
877
+ "grad_norm": 1.4375627040863037,
878
+ "learning_rate": 8.947979152491533e-05,
879
+ "loss": 0.5398,
880
+ "step": 1220
881
+ },
882
+ {
883
+ "epoch": 0.2920574617119791,
884
+ "grad_norm": 1.4347418546676636,
885
+ "learning_rate": 8.922411530012433e-05,
886
+ "loss": 0.5829,
887
+ "step": 1230
888
+ },
889
+ {
890
+ "epoch": 0.29443191262020657,
891
+ "grad_norm": 1.4648419618606567,
892
+ "learning_rate": 8.89657439932581e-05,
893
+ "loss": 0.5319,
894
+ "step": 1240
895
+ },
896
+ {
897
+ "epoch": 0.29680636352843404,
898
+ "grad_norm": 1.460462212562561,
899
+ "learning_rate": 8.870469535696375e-05,
900
+ "loss": 0.5556,
901
+ "step": 1250
902
+ },
903
+ {
904
+ "epoch": 0.2991808144366615,
905
+ "grad_norm": 1.4240905046463013,
906
+ "learning_rate": 8.844098732784723e-05,
907
+ "loss": 0.5374,
908
+ "step": 1260
909
+ },
910
+ {
911
+ "epoch": 0.30155526534488897,
912
+ "grad_norm": 2.142026662826538,
913
+ "learning_rate": 8.817463802524096e-05,
914
+ "loss": 0.5525,
915
+ "step": 1270
916
+ },
917
+ {
918
+ "epoch": 0.3039297162531165,
919
+ "grad_norm": 3.275667190551758,
920
+ "learning_rate": 8.79056657499587e-05,
921
+ "loss": 0.6688,
922
+ "step": 1280
923
+ },
924
+ {
925
+ "epoch": 0.30630416716134395,
926
+ "grad_norm": 1.68355393409729,
927
+ "learning_rate": 8.763408898303829e-05,
928
+ "loss": 0.5966,
929
+ "step": 1290
930
+ },
931
+ {
932
+ "epoch": 0.3086786180695714,
933
+ "grad_norm": 18.15928840637207,
934
+ "learning_rate": 8.73599263844717e-05,
935
+ "loss": 0.606,
936
+ "step": 1300
937
+ },
938
+ {
939
+ "epoch": 0.3110530689777989,
940
+ "grad_norm": 1.5004063844680786,
941
+ "learning_rate": 8.708319679192293e-05,
942
+ "loss": 0.5279,
943
+ "step": 1310
944
+ },
945
+ {
946
+ "epoch": 0.31342751988602635,
947
+ "grad_norm": 1.3138238191604614,
948
+ "learning_rate": 8.680391921943371e-05,
949
+ "loss": 0.568,
950
+ "step": 1320
951
+ },
952
+ {
953
+ "epoch": 0.3158019707942538,
954
+ "grad_norm": 1.4695392847061157,
955
+ "learning_rate": 8.652211285611701e-05,
956
+ "loss": 0.583,
957
+ "step": 1330
958
+ },
959
+ {
960
+ "epoch": 0.3181764217024813,
961
+ "grad_norm": 1.6598325967788696,
962
+ "learning_rate": 8.623779706483855e-05,
963
+ "loss": 0.6056,
964
+ "step": 1340
965
+ },
966
+ {
967
+ "epoch": 0.32055087261070875,
968
+ "grad_norm": 1.5287786722183228,
969
+ "learning_rate": 8.595099138088644e-05,
970
+ "loss": 0.5846,
971
+ "step": 1350
972
+ },
973
+ {
974
+ "epoch": 0.32292532351893627,
975
+ "grad_norm": 1.881782054901123,
976
+ "learning_rate": 8.566171551062889e-05,
977
+ "loss": 0.5304,
978
+ "step": 1360
979
+ },
980
+ {
981
+ "epoch": 0.32529977442716373,
982
+ "grad_norm": 1.6649096012115479,
983
+ "learning_rate": 8.536998933016014e-05,
984
+ "loss": 0.584,
985
+ "step": 1370
986
+ },
987
+ {
988
+ "epoch": 0.3276742253353912,
989
+ "grad_norm": 1.4391852617263794,
990
+ "learning_rate": 8.507583288393479e-05,
991
+ "loss": 0.6155,
992
+ "step": 1380
993
+ },
994
+ {
995
+ "epoch": 0.33004867624361867,
996
+ "grad_norm": 1.3186627626419067,
997
+ "learning_rate": 8.477926638339067e-05,
998
+ "loss": 0.5219,
999
+ "step": 1390
1000
+ },
1001
+ {
1002
+ "epoch": 0.33242312715184613,
1003
+ "grad_norm": 1.169506311416626,
1004
+ "learning_rate": 8.448031020555993e-05,
1005
+ "loss": 0.4953,
1006
+ "step": 1400
1007
+ },
1008
+ {
1009
+ "epoch": 0.3347975780600736,
1010
+ "grad_norm": 1.8767844438552856,
1011
+ "learning_rate": 8.417898489166905e-05,
1012
+ "loss": 0.5899,
1013
+ "step": 1410
1014
+ },
1015
+ {
1016
+ "epoch": 0.33717202896830106,
1017
+ "grad_norm": 5.4630255699157715,
1018
+ "learning_rate": 8.387531114572746e-05,
1019
+ "loss": 0.5992,
1020
+ "step": 1420
1021
+ },
1022
+ {
1023
+ "epoch": 0.33954647987652853,
1024
+ "grad_norm": 1.2826589345932007,
1025
+ "learning_rate": 8.356930983310493e-05,
1026
+ "loss": 0.5268,
1027
+ "step": 1430
1028
+ },
1029
+ {
1030
+ "epoch": 0.34192093078475605,
1031
+ "grad_norm": 1.3490769863128662,
1032
+ "learning_rate": 8.32610019790979e-05,
1033
+ "loss": 0.5396,
1034
+ "step": 1440
1035
+ },
1036
+ {
1037
+ "epoch": 0.3442953816929835,
1038
+ "grad_norm": 1.3992174863815308,
1039
+ "learning_rate": 8.295040876748489e-05,
1040
+ "loss": 0.5868,
1041
+ "step": 1450
1042
+ },
1043
+ {
1044
+ "epoch": 0.346669832601211,
1045
+ "grad_norm": 1.3962913751602173,
1046
+ "learning_rate": 8.263755153907095e-05,
1047
+ "loss": 0.5431,
1048
+ "step": 1460
1049
+ },
1050
+ {
1051
+ "epoch": 0.34904428350943845,
1052
+ "grad_norm": 1.4923028945922852,
1053
+ "learning_rate": 8.23224517902213e-05,
1054
+ "loss": 0.5288,
1055
+ "step": 1470
1056
+ },
1057
+ {
1058
+ "epoch": 0.3514187344176659,
1059
+ "grad_norm": 1.333264946937561,
1060
+ "learning_rate": 8.200513117138435e-05,
1061
+ "loss": 0.5153,
1062
+ "step": 1480
1063
+ },
1064
+ {
1065
+ "epoch": 0.3537931853258934,
1066
+ "grad_norm": 1.5983505249023438,
1067
+ "learning_rate": 8.168561148560414e-05,
1068
+ "loss": 0.5482,
1069
+ "step": 1490
1070
+ },
1071
+ {
1072
+ "epoch": 0.35616763623412084,
1073
+ "grad_norm": 1.445067048072815,
1074
+ "learning_rate": 8.136391468702214e-05,
1075
+ "loss": 0.5326,
1076
+ "step": 1500
1077
+ },
1078
+ {
1079
+ "epoch": 0.35616763623412084,
1080
+ "eval_loss": 0.5119722485542297,
1081
+ "eval_runtime": 2007.1248,
1082
+ "eval_samples_per_second": 1.865,
1083
+ "eval_steps_per_second": 0.466,
1084
+ "step": 1500
1085
+ },
1086
+ {
1087
+ "epoch": 0.3585420871423483,
1088
+ "grad_norm": 3.041621685028076,
1089
+ "learning_rate": 8.104006287936892e-05,
1090
+ "loss": 0.6232,
1091
+ "step": 1510
1092
+ },
1093
+ {
1094
+ "epoch": 0.36091653805057583,
1095
+ "grad_norm": 1.2769168615341187,
1096
+ "learning_rate": 8.07140783144453e-05,
1097
+ "loss": 0.4939,
1098
+ "step": 1520
1099
+ },
1100
+ {
1101
+ "epoch": 0.3632909889588033,
1102
+ "grad_norm": 1.4550813436508179,
1103
+ "learning_rate": 8.038598339059351e-05,
1104
+ "loss": 0.4444,
1105
+ "step": 1530
1106
+ },
1107
+ {
1108
+ "epoch": 0.36566543986703076,
1109
+ "grad_norm": 1.3510379791259766,
1110
+ "learning_rate": 8.005580065115816e-05,
1111
+ "loss": 0.4768,
1112
+ "step": 1540
1113
+ },
1114
+ {
1115
+ "epoch": 0.3680398907752582,
1116
+ "grad_norm": 1.391461968421936,
1117
+ "learning_rate": 7.972355278293733e-05,
1118
+ "loss": 0.479,
1119
+ "step": 1550
1120
+ },
1121
+ {
1122
+ "epoch": 0.3704143416834857,
1123
+ "grad_norm": 1.3295865058898926,
1124
+ "learning_rate": 7.938926261462366e-05,
1125
+ "loss": 0.4734,
1126
+ "step": 1560
1127
+ },
1128
+ {
1129
+ "epoch": 0.37278879259171316,
1130
+ "grad_norm": 1.7632473707199097,
1131
+ "learning_rate": 7.905295311523595e-05,
1132
+ "loss": 0.5425,
1133
+ "step": 1570
1134
+ },
1135
+ {
1136
+ "epoch": 0.3751632434999406,
1137
+ "grad_norm": 1.3326750993728638,
1138
+ "learning_rate": 7.871464739254084e-05,
1139
+ "loss": 0.4818,
1140
+ "step": 1580
1141
+ },
1142
+ {
1143
+ "epoch": 0.3775376944081681,
1144
+ "grad_norm": 1.4247772693634033,
1145
+ "learning_rate": 7.837436869146517e-05,
1146
+ "loss": 0.496,
1147
+ "step": 1590
1148
+ },
1149
+ {
1150
+ "epoch": 0.3799121453163956,
1151
+ "grad_norm": 1.2185418605804443,
1152
+ "learning_rate": 7.80321403924987e-05,
1153
+ "loss": 0.5389,
1154
+ "step": 1600
1155
+ },
1156
+ {
1157
+ "epoch": 0.3822865962246231,
1158
+ "grad_norm": 1.6560026407241821,
1159
+ "learning_rate": 7.768798601008776e-05,
1160
+ "loss": 0.489,
1161
+ "step": 1610
1162
+ },
1163
+ {
1164
+ "epoch": 0.38466104713285054,
1165
+ "grad_norm": 1.5144054889678955,
1166
+ "learning_rate": 7.734192919101958e-05,
1167
+ "loss": 0.4599,
1168
+ "step": 1620
1169
+ },
1170
+ {
1171
+ "epoch": 0.387035498041078,
1172
+ "grad_norm": 1.2287232875823975,
1173
+ "learning_rate": 7.69939937127974e-05,
1174
+ "loss": 0.6238,
1175
+ "step": 1630
1176
+ },
1177
+ {
1178
+ "epoch": 0.3894099489493055,
1179
+ "grad_norm": 1.2839241027832031,
1180
+ "learning_rate": 7.664420348200689e-05,
1181
+ "loss": 0.4984,
1182
+ "step": 1640
1183
+ },
1184
+ {
1185
+ "epoch": 0.39178439985753294,
1186
+ "grad_norm": 1.3231412172317505,
1187
+ "learning_rate": 7.629258253267332e-05,
1188
+ "loss": 0.4746,
1189
+ "step": 1650
1190
+ },
1191
+ {
1192
+ "epoch": 0.3941588507657604,
1193
+ "grad_norm": 1.5302526950836182,
1194
+ "learning_rate": 7.593915502461042e-05,
1195
+ "loss": 0.5243,
1196
+ "step": 1660
1197
+ },
1198
+ {
1199
+ "epoch": 0.39653330167398787,
1200
+ "grad_norm": 1.2323942184448242,
1201
+ "learning_rate": 7.558394524176023e-05,
1202
+ "loss": 0.5353,
1203
+ "step": 1670
1204
+ },
1205
+ {
1206
+ "epoch": 0.39890775258221534,
1207
+ "grad_norm": 1.5699905157089233,
1208
+ "learning_rate": 7.522697759052451e-05,
1209
+ "loss": 0.5151,
1210
+ "step": 1680
1211
+ },
1212
+ {
1213
+ "epoch": 0.40128220349044286,
1214
+ "grad_norm": 1.3586243391036987,
1215
+ "learning_rate": 7.486827659808796e-05,
1216
+ "loss": 0.4583,
1217
+ "step": 1690
1218
+ },
1219
+ {
1220
+ "epoch": 0.4036566543986703,
1221
+ "grad_norm": 1.3707525730133057,
1222
+ "learning_rate": 7.450786691073274e-05,
1223
+ "loss": 0.5045,
1224
+ "step": 1700
1225
+ },
1226
+ {
1227
+ "epoch": 0.4060311053068978,
1228
+ "grad_norm": 1.6775321960449219,
1229
+ "learning_rate": 7.414577329214522e-05,
1230
+ "loss": 0.521,
1231
+ "step": 1710
1232
+ },
1233
+ {
1234
+ "epoch": 0.40840555621512525,
1235
+ "grad_norm": 1.200310468673706,
1236
+ "learning_rate": 7.378202062171432e-05,
1237
+ "loss": 0.4216,
1238
+ "step": 1720
1239
+ },
1240
+ {
1241
+ "epoch": 0.4107800071233527,
1242
+ "grad_norm": 1.3154892921447754,
1243
+ "learning_rate": 7.341663389282219e-05,
1244
+ "loss": 0.439,
1245
+ "step": 1730
1246
+ },
1247
+ {
1248
+ "epoch": 0.4131544580315802,
1249
+ "grad_norm": 1.3154064416885376,
1250
+ "learning_rate": 7.304963821112681e-05,
1251
+ "loss": 0.468,
1252
+ "step": 1740
1253
+ },
1254
+ {
1255
+ "epoch": 0.41552890893980765,
1256
+ "grad_norm": 1.8923077583312988,
1257
+ "learning_rate": 7.268105879283703e-05,
1258
+ "loss": 0.5209,
1259
+ "step": 1750
1260
+ },
1261
+ {
1262
+ "epoch": 0.4179033598480351,
1263
+ "grad_norm": 1.6501123905181885,
1264
+ "learning_rate": 7.231092096297995e-05,
1265
+ "loss": 0.4444,
1266
+ "step": 1760
1267
+ },
1268
+ {
1269
+ "epoch": 0.42027781075626264,
1270
+ "grad_norm": 1.5123274326324463,
1271
+ "learning_rate": 7.19392501536609e-05,
1272
+ "loss": 0.4706,
1273
+ "step": 1770
1274
+ },
1275
+ {
1276
+ "epoch": 0.4226522616644901,
1277
+ "grad_norm": 1.410674810409546,
1278
+ "learning_rate": 7.156607190231591e-05,
1279
+ "loss": 0.545,
1280
+ "step": 1780
1281
+ },
1282
+ {
1283
+ "epoch": 0.42502671257271757,
1284
+ "grad_norm": 1.343226671218872,
1285
+ "learning_rate": 7.11914118499571e-05,
1286
+ "loss": 0.4404,
1287
+ "step": 1790
1288
+ },
1289
+ {
1290
+ "epoch": 0.42740116348094503,
1291
+ "grad_norm": 1.652727723121643,
1292
+ "learning_rate": 7.081529573941091e-05,
1293
+ "loss": 0.4848,
1294
+ "step": 1800
1295
+ },
1296
+ {
1297
+ "epoch": 0.4297756143891725,
1298
+ "grad_norm": 1.6431710720062256,
1299
+ "learning_rate": 7.043774941354925e-05,
1300
+ "loss": 0.5122,
1301
+ "step": 1810
1302
+ },
1303
+ {
1304
+ "epoch": 0.43215006529739997,
1305
+ "grad_norm": 1.0853626728057861,
1306
+ "learning_rate": 7.005879881351384e-05,
1307
+ "loss": 0.4651,
1308
+ "step": 1820
1309
+ },
1310
+ {
1311
+ "epoch": 0.43452451620562743,
1312
+ "grad_norm": 1.4375888109207153,
1313
+ "learning_rate": 6.967846997693392e-05,
1314
+ "loss": 0.4405,
1315
+ "step": 1830
1316
+ },
1317
+ {
1318
+ "epoch": 0.4368989671138549,
1319
+ "grad_norm": 1.214448094367981,
1320
+ "learning_rate": 6.929678903613705e-05,
1321
+ "loss": 0.4304,
1322
+ "step": 1840
1323
+ },
1324
+ {
1325
+ "epoch": 0.4392734180220824,
1326
+ "grad_norm": 1.5465502738952637,
1327
+ "learning_rate": 6.891378221635367e-05,
1328
+ "loss": 0.4278,
1329
+ "step": 1850
1330
+ },
1331
+ {
1332
+ "epoch": 0.4416478689303099,
1333
+ "grad_norm": 1.4311429262161255,
1334
+ "learning_rate": 6.852947583391511e-05,
1335
+ "loss": 0.4781,
1336
+ "step": 1860
1337
+ },
1338
+ {
1339
+ "epoch": 0.44402231983853735,
1340
+ "grad_norm": 1.4049911499023438,
1341
+ "learning_rate": 6.814389629444543e-05,
1342
+ "loss": 0.4582,
1343
+ "step": 1870
1344
+ },
1345
+ {
1346
+ "epoch": 0.4463967707467648,
1347
+ "grad_norm": 1.1797651052474976,
1348
+ "learning_rate": 6.775707009104708e-05,
1349
+ "loss": 0.4225,
1350
+ "step": 1880
1351
+ },
1352
+ {
1353
+ "epoch": 0.4487712216549923,
1354
+ "grad_norm": 1.1640324592590332,
1355
+ "learning_rate": 6.73690238024806e-05,
1356
+ "loss": 0.4215,
1357
+ "step": 1890
1358
+ },
1359
+ {
1360
+ "epoch": 0.45114567256321975,
1361
+ "grad_norm": 1.2648041248321533,
1362
+ "learning_rate": 6.697978409133831e-05,
1363
+ "loss": 0.518,
1364
+ "step": 1900
1365
+ },
1366
+ {
1367
+ "epoch": 0.4535201234714472,
1368
+ "grad_norm": 1.2650415897369385,
1369
+ "learning_rate": 6.658937770221242e-05,
1370
+ "loss": 0.4217,
1371
+ "step": 1910
1372
+ },
1373
+ {
1374
+ "epoch": 0.4558945743796747,
1375
+ "grad_norm": 1.7061676979064941,
1376
+ "learning_rate": 6.619783145985743e-05,
1377
+ "loss": 0.4848,
1378
+ "step": 1920
1379
+ },
1380
+ {
1381
+ "epoch": 0.4582690252879022,
1382
+ "grad_norm": 1.4012316465377808,
1383
+ "learning_rate": 6.580517226734686e-05,
1384
+ "loss": 0.4036,
1385
+ "step": 1930
1386
+ },
1387
+ {
1388
+ "epoch": 0.46064347619612966,
1389
+ "grad_norm": 1.1461294889450073,
1390
+ "learning_rate": 6.541142710422489e-05,
1391
+ "loss": 0.459,
1392
+ "step": 1940
1393
+ },
1394
+ {
1395
+ "epoch": 0.46301792710435713,
1396
+ "grad_norm": 1.4154229164123535,
1397
+ "learning_rate": 6.501662302465254e-05,
1398
+ "loss": 0.4338,
1399
+ "step": 1950
1400
+ },
1401
+ {
1402
+ "epoch": 0.4653923780125846,
1403
+ "grad_norm": 1.0013957023620605,
1404
+ "learning_rate": 6.46207871555488e-05,
1405
+ "loss": 0.4175,
1406
+ "step": 1960
1407
+ },
1408
+ {
1409
+ "epoch": 0.46776682892081206,
1410
+ "grad_norm": 1.1550451517105103,
1411
+ "learning_rate": 6.422394669472676e-05,
1412
+ "loss": 0.4774,
1413
+ "step": 1970
1414
+ },
1415
+ {
1416
+ "epoch": 0.4701412798290395,
1417
+ "grad_norm": 2.9167330265045166,
1418
+ "learning_rate": 6.382612890902478e-05,
1419
+ "loss": 0.4808,
1420
+ "step": 1980
1421
+ },
1422
+ {
1423
+ "epoch": 0.472515730737267,
1424
+ "grad_norm": 1.2706438302993774,
1425
+ "learning_rate": 6.342736113243305e-05,
1426
+ "loss": 0.4014,
1427
+ "step": 1990
1428
+ },
1429
+ {
1430
+ "epoch": 0.47489018164549446,
1431
+ "grad_norm": 1.3059141635894775,
1432
+ "learning_rate": 6.302767076421552e-05,
1433
+ "loss": 0.4165,
1434
+ "step": 2000
1435
+ },
1436
+ {
1437
+ "epoch": 0.47489018164549446,
1438
+ "eval_loss": 0.4213089346885681,
1439
+ "eval_runtime": 2002.2295,
1440
+ "eval_samples_per_second": 1.87,
1441
+ "eval_steps_per_second": 0.467,
1442
+ "step": 2000
1443
+ },
1444
+ {
1445
+ "epoch": 0.477264632553722,
1446
+ "grad_norm": 1.2650591135025024,
1447
+ "learning_rate": 6.26270852670272e-05,
1448
+ "loss": 0.4429,
1449
+ "step": 2010
1450
+ },
1451
+ {
1452
+ "epoch": 0.47963908346194944,
1453
+ "grad_norm": 1.1537431478500366,
1454
+ "learning_rate": 6.222563216502724e-05,
1455
+ "loss": 0.4951,
1456
+ "step": 2020
1457
+ },
1458
+ {
1459
+ "epoch": 0.4820135343701769,
1460
+ "grad_norm": 1.0984758138656616,
1461
+ "learning_rate": 6.182333904198782e-05,
1462
+ "loss": 0.4165,
1463
+ "step": 2030
1464
+ },
1465
+ {
1466
+ "epoch": 0.4843879852784044,
1467
+ "grad_norm": 1.335157036781311,
1468
+ "learning_rate": 6.14202335393988e-05,
1469
+ "loss": 0.4133,
1470
+ "step": 2040
1471
+ },
1472
+ {
1473
+ "epoch": 0.48676243618663184,
1474
+ "grad_norm": 1.3875082731246948,
1475
+ "learning_rate": 6.1016343354568464e-05,
1476
+ "loss": 0.4254,
1477
+ "step": 2050
1478
+ },
1479
+ {
1480
+ "epoch": 0.4891368870948593,
1481
+ "grad_norm": 1.3098845481872559,
1482
+ "learning_rate": 6.0611696238720485e-05,
1483
+ "loss": 0.4138,
1484
+ "step": 2060
1485
+ },
1486
+ {
1487
+ "epoch": 0.4915113380030868,
1488
+ "grad_norm": 1.2245336771011353,
1489
+ "learning_rate": 6.020631999508717e-05,
1490
+ "loss": 0.4021,
1491
+ "step": 2070
1492
+ },
1493
+ {
1494
+ "epoch": 0.49388578891131424,
1495
+ "grad_norm": 1.128445029258728,
1496
+ "learning_rate": 5.980024247699903e-05,
1497
+ "loss": 0.3708,
1498
+ "step": 2080
1499
+ },
1500
+ {
1501
+ "epoch": 0.4962602398195417,
1502
+ "grad_norm": 1.207760214805603,
1503
+ "learning_rate": 5.939349158597102e-05,
1504
+ "loss": 0.4198,
1505
+ "step": 2090
1506
+ },
1507
+ {
1508
+ "epoch": 0.4986346907277692,
1509
+ "grad_norm": 1.347845435142517,
1510
+ "learning_rate": 5.898609526978547e-05,
1511
+ "loss": 0.3956,
1512
+ "step": 2100
1513
+ },
1514
+ {
1515
+ "epoch": 0.5010091416359966,
1516
+ "grad_norm": 1.4427859783172607,
1517
+ "learning_rate": 5.857808152057173e-05,
1518
+ "loss": 0.4176,
1519
+ "step": 2110
1520
+ },
1521
+ {
1522
+ "epoch": 0.5033835925442242,
1523
+ "grad_norm": 1.3605539798736572,
1524
+ "learning_rate": 5.816947837288285e-05,
1525
+ "loss": 0.4061,
1526
+ "step": 2120
1527
+ },
1528
+ {
1529
+ "epoch": 0.5057580434524516,
1530
+ "grad_norm": 1.6183706521987915,
1531
+ "learning_rate": 5.776031390176938e-05,
1532
+ "loss": 0.3868,
1533
+ "step": 2130
1534
+ },
1535
+ {
1536
+ "epoch": 0.5081324943606791,
1537
+ "grad_norm": 1.385736346244812,
1538
+ "learning_rate": 5.7350616220850285e-05,
1539
+ "loss": 0.5408,
1540
+ "step": 2140
1541
+ },
1542
+ {
1543
+ "epoch": 0.5105069452689066,
1544
+ "grad_norm": 1.7778092622756958,
1545
+ "learning_rate": 5.694041348038128e-05,
1546
+ "loss": 0.4183,
1547
+ "step": 2150
1548
+ },
1549
+ {
1550
+ "epoch": 0.512881396177134,
1551
+ "grad_norm": 1.1978100538253784,
1552
+ "learning_rate": 5.652973386532066e-05,
1553
+ "loss": 0.3751,
1554
+ "step": 2160
1555
+ },
1556
+ {
1557
+ "epoch": 0.5152558470853615,
1558
+ "grad_norm": 1.3059000968933105,
1559
+ "learning_rate": 5.611860559339265e-05,
1560
+ "loss": 0.3528,
1561
+ "step": 2170
1562
+ },
1563
+ {
1564
+ "epoch": 0.517630297993589,
1565
+ "grad_norm": 1.505707025527954,
1566
+ "learning_rate": 5.5707056913148626e-05,
1567
+ "loss": 0.4023,
1568
+ "step": 2180
1569
+ },
1570
+ {
1571
+ "epoch": 0.5200047489018165,
1572
+ "grad_norm": 1.1287879943847656,
1573
+ "learning_rate": 5.529511610202616e-05,
1574
+ "loss": 0.3637,
1575
+ "step": 2190
1576
+ },
1577
+ {
1578
+ "epoch": 0.5223791998100439,
1579
+ "grad_norm": 1.0342644453048706,
1580
+ "learning_rate": 5.4882811464406026e-05,
1581
+ "loss": 0.3705,
1582
+ "step": 2200
1583
+ },
1584
+ {
1585
+ "epoch": 0.5247536507182714,
1586
+ "grad_norm": 1.2189958095550537,
1587
+ "learning_rate": 5.4470171329667506e-05,
1588
+ "loss": 0.4017,
1589
+ "step": 2210
1590
+ },
1591
+ {
1592
+ "epoch": 0.5271281016264989,
1593
+ "grad_norm": 2.2192893028259277,
1594
+ "learning_rate": 5.405722405024183e-05,
1595
+ "loss": 0.4057,
1596
+ "step": 2220
1597
+ },
1598
+ {
1599
+ "epoch": 0.5295025525347263,
1600
+ "grad_norm": 1.441884994506836,
1601
+ "learning_rate": 5.364399799966402e-05,
1602
+ "loss": 0.3652,
1603
+ "step": 2230
1604
+ },
1605
+ {
1606
+ "epoch": 0.5318770034429539,
1607
+ "grad_norm": 1.0196775197982788,
1608
+ "learning_rate": 5.323052157062346e-05,
1609
+ "loss": 0.3943,
1610
+ "step": 2240
1611
+ },
1612
+ {
1613
+ "epoch": 0.5342514543511813,
1614
+ "grad_norm": 1.2071340084075928,
1615
+ "learning_rate": 5.281682317301302e-05,
1616
+ "loss": 0.3657,
1617
+ "step": 2250
1618
+ },
1619
+ {
1620
+ "epoch": 0.5366259052594088,
1621
+ "grad_norm": 1.2292804718017578,
1622
+ "learning_rate": 5.240293123197694e-05,
1623
+ "loss": 0.3839,
1624
+ "step": 2260
1625
+ },
1626
+ {
1627
+ "epoch": 0.5390003561676362,
1628
+ "grad_norm": 1.7216770648956299,
1629
+ "learning_rate": 5.198887418595779e-05,
1630
+ "loss": 0.3579,
1631
+ "step": 2270
1632
+ },
1633
+ {
1634
+ "epoch": 0.5413748070758637,
1635
+ "grad_norm": 1.202574372291565,
1636
+ "learning_rate": 5.157468048474257e-05,
1637
+ "loss": 0.3627,
1638
+ "step": 2280
1639
+ },
1640
+ {
1641
+ "epoch": 0.5437492579840911,
1642
+ "grad_norm": 1.6190992593765259,
1643
+ "learning_rate": 5.1160378587507716e-05,
1644
+ "loss": 0.3827,
1645
+ "step": 2290
1646
+ },
1647
+ {
1648
+ "epoch": 0.5461237088923186,
1649
+ "grad_norm": 1.0596972703933716,
1650
+ "learning_rate": 5.074599696086384e-05,
1651
+ "loss": 0.3421,
1652
+ "step": 2300
1653
+ },
1654
+ {
1655
+ "epoch": 0.5484981598005462,
1656
+ "grad_norm": 1.2202777862548828,
1657
+ "learning_rate": 5.033156407689978e-05,
1658
+ "loss": 0.384,
1659
+ "step": 2310
1660
+ },
1661
+ {
1662
+ "epoch": 0.5508726107087736,
1663
+ "grad_norm": 1.0164202451705933,
1664
+ "learning_rate": 4.991710841122623e-05,
1665
+ "loss": 0.3977,
1666
+ "step": 2320
1667
+ },
1668
+ {
1669
+ "epoch": 0.5532470616170011,
1670
+ "grad_norm": 1.3084182739257812,
1671
+ "learning_rate": 4.950265844101915e-05,
1672
+ "loss": 0.3861,
1673
+ "step": 2330
1674
+ },
1675
+ {
1676
+ "epoch": 0.5556215125252285,
1677
+ "grad_norm": 1.02189302444458,
1678
+ "learning_rate": 4.9088242643063304e-05,
1679
+ "loss": 0.366,
1680
+ "step": 2340
1681
+ },
1682
+ {
1683
+ "epoch": 0.557995963433456,
1684
+ "grad_norm": 1.5467653274536133,
1685
+ "learning_rate": 4.8673889491795344e-05,
1686
+ "loss": 0.4019,
1687
+ "step": 2350
1688
+ },
1689
+ {
1690
+ "epoch": 0.5603704143416834,
1691
+ "grad_norm": 1.1603262424468994,
1692
+ "learning_rate": 4.8259627457347554e-05,
1693
+ "loss": 0.39,
1694
+ "step": 2360
1695
+ },
1696
+ {
1697
+ "epoch": 0.562744865249911,
1698
+ "grad_norm": 1.525814414024353,
1699
+ "learning_rate": 4.784548500359162e-05,
1700
+ "loss": 0.4089,
1701
+ "step": 2370
1702
+ },
1703
+ {
1704
+ "epoch": 0.5651193161581385,
1705
+ "grad_norm": 1.124791145324707,
1706
+ "learning_rate": 4.743149058618278e-05,
1707
+ "loss": 0.3207,
1708
+ "step": 2380
1709
+ },
1710
+ {
1711
+ "epoch": 0.5674937670663659,
1712
+ "grad_norm": 1.044631004333496,
1713
+ "learning_rate": 4.7017672650604766e-05,
1714
+ "loss": 0.3822,
1715
+ "step": 2390
1716
+ },
1717
+ {
1718
+ "epoch": 0.5698682179745934,
1719
+ "grad_norm": 1.312994360923767,
1720
+ "learning_rate": 4.6604059630215326e-05,
1721
+ "loss": 0.3417,
1722
+ "step": 2400
1723
+ },
1724
+ {
1725
+ "epoch": 0.5722426688828208,
1726
+ "grad_norm": 0.9241451025009155,
1727
+ "learning_rate": 4.6190679944292395e-05,
1728
+ "loss": 0.404,
1729
+ "step": 2410
1730
+ },
1731
+ {
1732
+ "epoch": 0.5746171197910483,
1733
+ "grad_norm": 1.1292483806610107,
1734
+ "learning_rate": 4.5777561996081656e-05,
1735
+ "loss": 0.3609,
1736
+ "step": 2420
1737
+ },
1738
+ {
1739
+ "epoch": 0.5769915706992758,
1740
+ "grad_norm": 1.0960733890533447,
1741
+ "learning_rate": 4.5364734170844807e-05,
1742
+ "loss": 0.3217,
1743
+ "step": 2430
1744
+ },
1745
+ {
1746
+ "epoch": 0.5793660216075033,
1747
+ "grad_norm": 1.073885440826416,
1748
+ "learning_rate": 4.4952224833909194e-05,
1749
+ "loss": 0.3611,
1750
+ "step": 2440
1751
+ },
1752
+ {
1753
+ "epoch": 0.5817404725157307,
1754
+ "grad_norm": 1.343590259552002,
1755
+ "learning_rate": 4.4540062328718945e-05,
1756
+ "loss": 0.3434,
1757
+ "step": 2450
1758
+ },
1759
+ {
1760
+ "epoch": 0.5841149234239582,
1761
+ "grad_norm": 1.1478984355926514,
1762
+ "learning_rate": 4.412827497488744e-05,
1763
+ "loss": 0.3619,
1764
+ "step": 2460
1765
+ },
1766
+ {
1767
+ "epoch": 0.5864893743321857,
1768
+ "grad_norm": 1.0082939863204956,
1769
+ "learning_rate": 4.371689106625143e-05,
1770
+ "loss": 0.3387,
1771
+ "step": 2470
1772
+ },
1773
+ {
1774
+ "epoch": 0.5888638252404131,
1775
+ "grad_norm": 0.9021357893943787,
1776
+ "learning_rate": 4.330593886892707e-05,
1777
+ "loss": 0.3145,
1778
+ "step": 2480
1779
+ },
1780
+ {
1781
+ "epoch": 0.5912382761486407,
1782
+ "grad_norm": 3.701632261276245,
1783
+ "learning_rate": 4.2895446619367684e-05,
1784
+ "loss": 0.4051,
1785
+ "step": 2490
1786
+ },
1787
+ {
1788
+ "epoch": 0.5936127270568681,
1789
+ "grad_norm": 1.09543776512146,
1790
+ "learning_rate": 4.2485442522423636e-05,
1791
+ "loss": 0.4129,
1792
+ "step": 2500
1793
+ },
1794
+ {
1795
+ "epoch": 0.5936127270568681,
1796
+ "eval_loss": 0.3469230532646179,
1797
+ "eval_runtime": 2002.8775,
1798
+ "eval_samples_per_second": 1.869,
1799
+ "eval_steps_per_second": 0.467,
1800
+ "step": 2500
1801
+ },
1802
+ {
1803
+ "epoch": 0.5959871779650956,
1804
+ "grad_norm": 1.1468104124069214,
1805
+ "learning_rate": 4.207595474940446e-05,
1806
+ "loss": 0.3652,
1807
+ "step": 2510
1808
+ },
1809
+ {
1810
+ "epoch": 0.598361628873323,
1811
+ "grad_norm": 1.0615928173065186,
1812
+ "learning_rate": 4.166701143614315e-05,
1813
+ "loss": 0.2876,
1814
+ "step": 2520
1815
+ },
1816
+ {
1817
+ "epoch": 0.6007360797815505,
1818
+ "grad_norm": 1.3340593576431274,
1819
+ "learning_rate": 4.1258640681062934e-05,
1820
+ "loss": 0.3285,
1821
+ "step": 2530
1822
+ },
1823
+ {
1824
+ "epoch": 0.6031105306897779,
1825
+ "grad_norm": 1.2769051790237427,
1826
+ "learning_rate": 4.08508705432467e-05,
1827
+ "loss": 0.3517,
1828
+ "step": 2540
1829
+ },
1830
+ {
1831
+ "epoch": 0.6054849815980055,
1832
+ "grad_norm": 1.1702165603637695,
1833
+ "learning_rate": 4.0443729040509045e-05,
1834
+ "loss": 0.3232,
1835
+ "step": 2550
1836
+ },
1837
+ {
1838
+ "epoch": 0.607859432506233,
1839
+ "grad_norm": 1.4789706468582153,
1840
+ "learning_rate": 4.00372441474711e-05,
1841
+ "loss": 0.3429,
1842
+ "step": 2560
1843
+ },
1844
+ {
1845
+ "epoch": 0.6102338834144604,
1846
+ "grad_norm": 0.880572497844696,
1847
+ "learning_rate": 3.96314437936385e-05,
1848
+ "loss": 0.4089,
1849
+ "step": 2570
1850
+ },
1851
+ {
1852
+ "epoch": 0.6126083343226879,
1853
+ "grad_norm": 1.0814920663833618,
1854
+ "learning_rate": 3.922635586148234e-05,
1855
+ "loss": 0.3063,
1856
+ "step": 2580
1857
+ },
1858
+ {
1859
+ "epoch": 0.6149827852309153,
1860
+ "grad_norm": 1.1467609405517578,
1861
+ "learning_rate": 3.8822008184523265e-05,
1862
+ "loss": 0.3233,
1863
+ "step": 2590
1864
+ },
1865
+ {
1866
+ "epoch": 0.6173572361391428,
1867
+ "grad_norm": 0.8752092123031616,
1868
+ "learning_rate": 3.841842854541919e-05,
1869
+ "loss": 0.3372,
1870
+ "step": 2600
1871
+ },
1872
+ {
1873
+ "epoch": 0.6197316870473702,
1874
+ "grad_norm": 1.2454406023025513,
1875
+ "learning_rate": 3.8015644674056266e-05,
1876
+ "loss": 0.3275,
1877
+ "step": 2610
1878
+ },
1879
+ {
1880
+ "epoch": 0.6221061379555978,
1881
+ "grad_norm": 0.9372221827507019,
1882
+ "learning_rate": 3.7613684245643544e-05,
1883
+ "loss": 0.3352,
1884
+ "step": 2620
1885
+ },
1886
+ {
1887
+ "epoch": 0.6244805888638253,
1888
+ "grad_norm": 1.0082669258117676,
1889
+ "learning_rate": 3.7212574878811495e-05,
1890
+ "loss": 0.3131,
1891
+ "step": 2630
1892
+ },
1893
+ {
1894
+ "epoch": 0.6268550397720527,
1895
+ "grad_norm": 1.0585817098617554,
1896
+ "learning_rate": 3.68123441337143e-05,
1897
+ "loss": 0.3497,
1898
+ "step": 2640
1899
+ },
1900
+ {
1901
+ "epoch": 0.6292294906802802,
1902
+ "grad_norm": 1.1361665725708008,
1903
+ "learning_rate": 3.641301951013617e-05,
1904
+ "loss": 0.3628,
1905
+ "step": 2650
1906
+ },
1907
+ {
1908
+ "epoch": 0.6316039415885076,
1909
+ "grad_norm": 0.9877005815505981,
1910
+ "learning_rate": 3.601462844560187e-05,
1911
+ "loss": 0.2979,
1912
+ "step": 2660
1913
+ },
1914
+ {
1915
+ "epoch": 0.6339783924967352,
1916
+ "grad_norm": 1.5143479108810425,
1917
+ "learning_rate": 3.561719831349153e-05,
1918
+ "loss": 0.3169,
1919
+ "step": 2670
1920
+ },
1921
+ {
1922
+ "epoch": 0.6363528434049626,
1923
+ "grad_norm": 1.29010009765625,
1924
+ "learning_rate": 3.5220756421159696e-05,
1925
+ "loss": 0.3234,
1926
+ "step": 2680
1927
+ },
1928
+ {
1929
+ "epoch": 0.6387272943131901,
1930
+ "grad_norm": 1.5375481843948364,
1931
+ "learning_rate": 3.482533000805921e-05,
1932
+ "loss": 0.3372,
1933
+ "step": 2690
1934
+ },
1935
+ {
1936
+ "epoch": 0.6411017452214175,
1937
+ "grad_norm": 1.360487461090088,
1938
+ "learning_rate": 3.443094624386949e-05,
1939
+ "loss": 0.2902,
1940
+ "step": 2700
1941
+ },
1942
+ {
1943
+ "epoch": 0.643476196129645,
1944
+ "grad_norm": 1.3578060865402222,
1945
+ "learning_rate": 3.4037632226629704e-05,
1946
+ "loss": 0.293,
1947
+ "step": 2710
1948
+ },
1949
+ {
1950
+ "epoch": 0.6458506470378725,
1951
+ "grad_norm": 1.2596126794815063,
1952
+ "learning_rate": 3.3645414980876946e-05,
1953
+ "loss": 0.3087,
1954
+ "step": 2720
1955
+ },
1956
+ {
1957
+ "epoch": 0.6482250979461,
1958
+ "grad_norm": 0.9766151309013367,
1959
+ "learning_rate": 3.32543214557893e-05,
1960
+ "loss": 0.3082,
1961
+ "step": 2730
1962
+ },
1963
+ {
1964
+ "epoch": 0.6505995488543275,
1965
+ "grad_norm": 1.2312662601470947,
1966
+ "learning_rate": 3.286437852333418e-05,
1967
+ "loss": 0.2858,
1968
+ "step": 2740
1969
+ },
1970
+ {
1971
+ "epoch": 0.6529739997625549,
1972
+ "grad_norm": 1.3974473476409912,
1973
+ "learning_rate": 3.247561297642203e-05,
1974
+ "loss": 0.339,
1975
+ "step": 2750
1976
+ },
1977
+ {
1978
+ "epoch": 0.6553484506707824,
1979
+ "grad_norm": 1.2069544792175293,
1980
+ "learning_rate": 3.208805152706533e-05,
1981
+ "loss": 0.2766,
1982
+ "step": 2760
1983
+ },
1984
+ {
1985
+ "epoch": 0.6577229015790098,
1986
+ "grad_norm": 2.0875706672668457,
1987
+ "learning_rate": 3.170172080454319e-05,
1988
+ "loss": 0.3518,
1989
+ "step": 2770
1990
+ },
1991
+ {
1992
+ "epoch": 0.6600973524872373,
1993
+ "grad_norm": 1.344187617301941,
1994
+ "learning_rate": 3.131664735357174e-05,
1995
+ "loss": 0.2908,
1996
+ "step": 2780
1997
+ },
1998
+ {
1999
+ "epoch": 0.6624718033954649,
2000
+ "grad_norm": 1.1575170755386353,
2001
+ "learning_rate": 3.0932857632480185e-05,
2002
+ "loss": 0.3426,
2003
+ "step": 2790
2004
+ },
2005
+ {
2006
+ "epoch": 0.6648462543036923,
2007
+ "grad_norm": 1.8178951740264893,
2008
+ "learning_rate": 3.055037801139286e-05,
2009
+ "loss": 0.2876,
2010
+ "step": 2800
2011
+ },
2012
+ {
2013
+ "epoch": 0.6672207052119198,
2014
+ "grad_norm": 1.0035322904586792,
2015
+ "learning_rate": 3.0169234770417376e-05,
2016
+ "loss": 0.2819,
2017
+ "step": 2810
2018
+ },
2019
+ {
2020
+ "epoch": 0.6695951561201472,
2021
+ "grad_norm": 1.2262848615646362,
2022
+ "learning_rate": 2.978945409783892e-05,
2023
+ "loss": 0.3145,
2024
+ "step": 2820
2025
+ },
2026
+ {
2027
+ "epoch": 0.6719696070283747,
2028
+ "grad_norm": 1.0249428749084473,
2029
+ "learning_rate": 2.94110620883208e-05,
2030
+ "loss": 0.2872,
2031
+ "step": 2830
2032
+ },
2033
+ {
2034
+ "epoch": 0.6743440579366021,
2035
+ "grad_norm": 1.0349829196929932,
2036
+ "learning_rate": 2.9034084741111555e-05,
2037
+ "loss": 0.2798,
2038
+ "step": 2840
2039
+ },
2040
+ {
2041
+ "epoch": 0.6767185088448296,
2042
+ "grad_norm": 1.034436583518982,
2043
+ "learning_rate": 2.8658547958258543e-05,
2044
+ "loss": 0.2699,
2045
+ "step": 2850
2046
+ },
2047
+ {
2048
+ "epoch": 0.6790929597530571,
2049
+ "grad_norm": 1.9253498315811157,
2050
+ "learning_rate": 2.8284477542828153e-05,
2051
+ "loss": 0.3362,
2052
+ "step": 2860
2053
+ },
2054
+ {
2055
+ "epoch": 0.6814674106612846,
2056
+ "grad_norm": 1.1226086616516113,
2057
+ "learning_rate": 2.791189919713294e-05,
2058
+ "loss": 0.2896,
2059
+ "step": 2870
2060
+ },
2061
+ {
2062
+ "epoch": 0.6838418615695121,
2063
+ "grad_norm": 0.920665442943573,
2064
+ "learning_rate": 2.7540838520965672e-05,
2065
+ "loss": 0.3138,
2066
+ "step": 2880
2067
+ },
2068
+ {
2069
+ "epoch": 0.6862163124777395,
2070
+ "grad_norm": 1.1101292371749878,
2071
+ "learning_rate": 2.7171321009840178e-05,
2072
+ "loss": 0.268,
2073
+ "step": 2890
2074
+ },
2075
+ {
2076
+ "epoch": 0.688590763385967,
2077
+ "grad_norm": 1.0533770322799683,
2078
+ "learning_rate": 2.6803372053239834e-05,
2079
+ "loss": 0.2844,
2080
+ "step": 2900
2081
+ },
2082
+ {
2083
+ "epoch": 0.6909652142941944,
2084
+ "grad_norm": 1.2257792949676514,
2085
+ "learning_rate": 2.6437016932872816e-05,
2086
+ "loss": 0.2931,
2087
+ "step": 2910
2088
+ },
2089
+ {
2090
+ "epoch": 0.693339665202422,
2091
+ "grad_norm": 1.1098326444625854,
2092
+ "learning_rate": 2.6072280820935103e-05,
2093
+ "loss": 0.2818,
2094
+ "step": 2920
2095
+ },
2096
+ {
2097
+ "epoch": 0.6957141161106494,
2098
+ "grad_norm": 1.048757553100586,
2099
+ "learning_rate": 2.5709188778380942e-05,
2100
+ "loss": 0.2741,
2101
+ "step": 2930
2102
+ },
2103
+ {
2104
+ "epoch": 0.6980885670188769,
2105
+ "grad_norm": 1.3457996845245361,
2106
+ "learning_rate": 2.5347765753200808e-05,
2107
+ "loss": 0.3142,
2108
+ "step": 2940
2109
+ },
2110
+ {
2111
+ "epoch": 0.7004630179271043,
2112
+ "grad_norm": 1.0679852962493896,
2113
+ "learning_rate": 2.4988036578707303e-05,
2114
+ "loss": 0.2499,
2115
+ "step": 2950
2116
+ },
2117
+ {
2118
+ "epoch": 0.7028374688353318,
2119
+ "grad_norm": 1.3276768922805786,
2120
+ "learning_rate": 2.463002597182882e-05,
2121
+ "loss": 0.2565,
2122
+ "step": 2960
2123
+ },
2124
+ {
2125
+ "epoch": 0.7052119197435593,
2126
+ "grad_norm": 1.2980198860168457,
2127
+ "learning_rate": 2.427375853141134e-05,
2128
+ "loss": 0.2848,
2129
+ "step": 2970
2130
+ },
2131
+ {
2132
+ "epoch": 0.7075863706517868,
2133
+ "grad_norm": 1.0313228368759155,
2134
+ "learning_rate": 2.3919258736528123e-05,
2135
+ "loss": 0.2536,
2136
+ "step": 2980
2137
+ },
2138
+ {
2139
+ "epoch": 0.7099608215600143,
2140
+ "grad_norm": 0.9275791049003601,
2141
+ "learning_rate": 2.3566550944797804e-05,
2142
+ "loss": 0.2693,
2143
+ "step": 2990
2144
+ },
2145
+ {
2146
+ "epoch": 0.7123352724682417,
2147
+ "grad_norm": 1.083311915397644,
2148
+ "learning_rate": 2.321565939071089e-05,
2149
+ "loss": 0.345,
2150
+ "step": 3000
2151
+ },
2152
+ {
2153
+ "epoch": 0.7123352724682417,
2154
+ "eval_loss": 0.2755714952945709,
2155
+ "eval_runtime": 2001.6463,
2156
+ "eval_samples_per_second": 1.87,
2157
+ "eval_steps_per_second": 0.468,
2158
+ "step": 3000
2159
+ },
2160
+ {
2161
+ "epoch": 0.7147097233764692,
2162
+ "grad_norm": 1.2642430067062378,
2163
+ "learning_rate": 2.2866608183964376e-05,
2164
+ "loss": 0.3807,
2165
+ "step": 3010
2166
+ },
2167
+ {
2168
+ "epoch": 0.7170841742846966,
2169
+ "grad_norm": 1.761056900024414,
2170
+ "learning_rate": 2.2519421307805445e-05,
2171
+ "loss": 0.2892,
2172
+ "step": 3020
2173
+ },
2174
+ {
2175
+ "epoch": 0.7194586251929241,
2176
+ "grad_norm": 1.1523739099502563,
2177
+ "learning_rate": 2.217412261738338e-05,
2178
+ "loss": 0.2531,
2179
+ "step": 3030
2180
+ },
2181
+ {
2182
+ "epoch": 0.7218330761011517,
2183
+ "grad_norm": 0.9026095271110535,
2184
+ "learning_rate": 2.183073583811055e-05,
2185
+ "loss": 0.2458,
2186
+ "step": 3040
2187
+ },
2188
+ {
2189
+ "epoch": 0.7242075270093791,
2190
+ "grad_norm": 1.0815373659133911,
2191
+ "learning_rate": 2.1489284564032308e-05,
2192
+ "loss": 0.2413,
2193
+ "step": 3050
2194
+ },
2195
+ {
2196
+ "epoch": 0.7265819779176066,
2197
+ "grad_norm": 0.9200854897499084,
2198
+ "learning_rate": 2.1149792256205725e-05,
2199
+ "loss": 0.2575,
2200
+ "step": 3060
2201
+ },
2202
+ {
2203
+ "epoch": 0.728956428825834,
2204
+ "grad_norm": 0.9628139734268188,
2205
+ "learning_rate": 2.0812282241087662e-05,
2206
+ "loss": 0.2503,
2207
+ "step": 3070
2208
+ },
2209
+ {
2210
+ "epoch": 0.7313308797340615,
2211
+ "grad_norm": 1.0112043619155884,
2212
+ "learning_rate": 2.0476777708931978e-05,
2213
+ "loss": 0.277,
2214
+ "step": 3080
2215
+ },
2216
+ {
2217
+ "epoch": 0.7337053306422889,
2218
+ "grad_norm": 1.2384965419769287,
2219
+ "learning_rate": 2.01433017121962e-05,
2220
+ "loss": 0.2969,
2221
+ "step": 3090
2222
+ },
2223
+ {
2224
+ "epoch": 0.7360797815505165,
2225
+ "grad_norm": 1.1001275777816772,
2226
+ "learning_rate": 1.981187716395751e-05,
2227
+ "loss": 0.2344,
2228
+ "step": 3100
2229
+ },
2230
+ {
2231
+ "epoch": 0.7384542324587439,
2232
+ "grad_norm": 0.9843474626541138,
2233
+ "learning_rate": 1.9482526836338387e-05,
2234
+ "loss": 0.2428,
2235
+ "step": 3110
2236
+ },
2237
+ {
2238
+ "epoch": 0.7408286833669714,
2239
+ "grad_norm": 1.2588458061218262,
2240
+ "learning_rate": 1.915527335894209e-05,
2241
+ "loss": 0.2683,
2242
+ "step": 3120
2243
+ },
2244
+ {
2245
+ "epoch": 0.7432031342751989,
2246
+ "grad_norm": 0.9996175765991211,
2247
+ "learning_rate": 1.8830139217297498e-05,
2248
+ "loss": 0.2508,
2249
+ "step": 3130
2250
+ },
2251
+ {
2252
+ "epoch": 0.7455775851834263,
2253
+ "grad_norm": 0.9209540486335754,
2254
+ "learning_rate": 1.8507146751314464e-05,
2255
+ "loss": 0.2353,
2256
+ "step": 3140
2257
+ },
2258
+ {
2259
+ "epoch": 0.7479520360916538,
2260
+ "grad_norm": 0.901054859161377,
2261
+ "learning_rate": 1.8186318153748587e-05,
2262
+ "loss": 0.2623,
2263
+ "step": 3150
2264
+ },
2265
+ {
2266
+ "epoch": 0.7503264869998812,
2267
+ "grad_norm": 0.9320164918899536,
2268
+ "learning_rate": 1.786767546867647e-05,
2269
+ "loss": 0.2305,
2270
+ "step": 3160
2271
+ },
2272
+ {
2273
+ "epoch": 0.7527009379081088,
2274
+ "grad_norm": 0.8801546692848206,
2275
+ "learning_rate": 1.755124058998108e-05,
2276
+ "loss": 0.2508,
2277
+ "step": 3170
2278
+ },
2279
+ {
2280
+ "epoch": 0.7550753888163362,
2281
+ "grad_norm": 1.0162204504013062,
2282
+ "learning_rate": 1.723703525984735e-05,
2283
+ "loss": 0.242,
2284
+ "step": 3180
2285
+ },
2286
+ {
2287
+ "epoch": 0.7574498397245637,
2288
+ "grad_norm": 1.0432860851287842,
2289
+ "learning_rate": 1.692508106726836e-05,
2290
+ "loss": 0.2466,
2291
+ "step": 3190
2292
+ },
2293
+ {
2294
+ "epoch": 0.7598242906327912,
2295
+ "grad_norm": 0.8475198745727539,
2296
+ "learning_rate": 1.6615399446561886e-05,
2297
+ "loss": 0.2407,
2298
+ "step": 3200
2299
+ },
2300
+ {
2301
+ "epoch": 0.7621987415410186,
2302
+ "grad_norm": 1.0046483278274536,
2303
+ "learning_rate": 1.630801167589774e-05,
2304
+ "loss": 0.2504,
2305
+ "step": 3210
2306
+ },
2307
+ {
2308
+ "epoch": 0.7645731924492462,
2309
+ "grad_norm": 1.0153136253356934,
2310
+ "learning_rate": 1.6002938875835665e-05,
2311
+ "loss": 0.2792,
2312
+ "step": 3220
2313
+ },
2314
+ {
2315
+ "epoch": 0.7669476433574736,
2316
+ "grad_norm": 1.3016788959503174,
2317
+ "learning_rate": 1.5700202007874165e-05,
2318
+ "loss": 0.2673,
2319
+ "step": 3230
2320
+ },
2321
+ {
2322
+ "epoch": 0.7693220942657011,
2323
+ "grad_norm": 0.8398091197013855,
2324
+ "learning_rate": 1.5399821873010335e-05,
2325
+ "loss": 0.2371,
2326
+ "step": 3240
2327
+ },
2328
+ {
2329
+ "epoch": 0.7716965451739285,
2330
+ "grad_norm": 0.9159530997276306,
2331
+ "learning_rate": 1.5101819110310433e-05,
2332
+ "loss": 0.2323,
2333
+ "step": 3250
2334
+ },
2335
+ {
2336
+ "epoch": 0.774070996082156,
2337
+ "grad_norm": 1.0264992713928223,
2338
+ "learning_rate": 1.4806214195492008e-05,
2339
+ "loss": 0.2566,
2340
+ "step": 3260
2341
+ },
2342
+ {
2343
+ "epoch": 0.7764454469903834,
2344
+ "grad_norm": 1.2823314666748047,
2345
+ "learning_rate": 1.4513027439516847e-05,
2346
+ "loss": 0.2677,
2347
+ "step": 3270
2348
+ },
2349
+ {
2350
+ "epoch": 0.778819897898611,
2351
+ "grad_norm": 1.0543042421340942,
2352
+ "learning_rate": 1.4222278987195447e-05,
2353
+ "loss": 0.253,
2354
+ "step": 3280
2355
+ },
2356
+ {
2357
+ "epoch": 0.7811943488068385,
2358
+ "grad_norm": 1.279941201210022,
2359
+ "learning_rate": 1.3933988815802962e-05,
2360
+ "loss": 0.2366,
2361
+ "step": 3290
2362
+ },
2363
+ {
2364
+ "epoch": 0.7835687997150659,
2365
+ "grad_norm": 1.0144097805023193,
2366
+ "learning_rate": 1.3648176733706419e-05,
2367
+ "loss": 0.2658,
2368
+ "step": 3300
2369
+ },
2370
+ {
2371
+ "epoch": 0.7859432506232934,
2372
+ "grad_norm": 1.0627365112304688,
2373
+ "learning_rate": 1.3364862379003812e-05,
2374
+ "loss": 0.2921,
2375
+ "step": 3310
2376
+ },
2377
+ {
2378
+ "epoch": 0.7883177015315208,
2379
+ "grad_norm": 1.0910556316375732,
2380
+ "learning_rate": 1.3084065218174679e-05,
2381
+ "loss": 0.2456,
2382
+ "step": 3320
2383
+ },
2384
+ {
2385
+ "epoch": 0.7906921524397483,
2386
+ "grad_norm": 1.017910122871399,
2387
+ "learning_rate": 1.2805804544742672e-05,
2388
+ "loss": 0.2153,
2389
+ "step": 3330
2390
+ },
2391
+ {
2392
+ "epoch": 0.7930666033479757,
2393
+ "grad_norm": 1.1485787630081177,
2394
+ "learning_rate": 1.2530099477949792e-05,
2395
+ "loss": 0.254,
2396
+ "step": 3340
2397
+ },
2398
+ {
2399
+ "epoch": 0.7954410542562033,
2400
+ "grad_norm": 1.189522385597229,
2401
+ "learning_rate": 1.2256968961442755e-05,
2402
+ "loss": 0.2346,
2403
+ "step": 3350
2404
+ },
2405
+ {
2406
+ "epoch": 0.7978155051644307,
2407
+ "grad_norm": 1.3077176809310913,
2408
+ "learning_rate": 1.198643176197144e-05,
2409
+ "loss": 0.2527,
2410
+ "step": 3360
2411
+ },
2412
+ {
2413
+ "epoch": 0.8001899560726582,
2414
+ "grad_norm": 1.1327699422836304,
2415
+ "learning_rate": 1.1718506468099254e-05,
2416
+ "loss": 0.2582,
2417
+ "step": 3370
2418
+ },
2419
+ {
2420
+ "epoch": 0.8025644069808857,
2421
+ "grad_norm": 1.7842756509780884,
2422
+ "learning_rate": 1.1453211488926153e-05,
2423
+ "loss": 0.242,
2424
+ "step": 3380
2425
+ },
2426
+ {
2427
+ "epoch": 0.8049388578891131,
2428
+ "grad_norm": 1.4039660692214966,
2429
+ "learning_rate": 1.1190565052823548e-05,
2430
+ "loss": 0.2509,
2431
+ "step": 3390
2432
+ },
2433
+ {
2434
+ "epoch": 0.8073133087973406,
2435
+ "grad_norm": 0.8914623260498047,
2436
+ "learning_rate": 1.0930585206181942e-05,
2437
+ "loss": 0.2187,
2438
+ "step": 3400
2439
+ },
2440
+ {
2441
+ "epoch": 0.809687759705568,
2442
+ "grad_norm": 0.8296724557876587,
2443
+ "learning_rate": 1.0673289812170972e-05,
2444
+ "loss": 0.2268,
2445
+ "step": 3410
2446
+ },
2447
+ {
2448
+ "epoch": 0.8120622106137956,
2449
+ "grad_norm": 1.0560425519943237,
2450
+ "learning_rate": 1.041869654951198e-05,
2451
+ "loss": 0.2317,
2452
+ "step": 3420
2453
+ },
2454
+ {
2455
+ "epoch": 0.814436661522023,
2456
+ "grad_norm": 0.8370431661605835,
2457
+ "learning_rate": 1.016682291126333e-05,
2458
+ "loss": 0.2534,
2459
+ "step": 3430
2460
+ },
2461
+ {
2462
+ "epoch": 0.8168111124302505,
2463
+ "grad_norm": 0.798896074295044,
2464
+ "learning_rate": 9.917686203618475e-06,
2465
+ "loss": 0.2129,
2466
+ "step": 3440
2467
+ },
2468
+ {
2469
+ "epoch": 0.819185563338478,
2470
+ "grad_norm": 1.008288860321045,
2471
+ "learning_rate": 9.671303544716875e-06,
2472
+ "loss": 0.2118,
2473
+ "step": 3450
2474
+ },
2475
+ {
2476
+ "epoch": 0.8215600142467054,
2477
+ "grad_norm": 1.2545337677001953,
2478
+ "learning_rate": 9.427691863467758e-06,
2479
+ "loss": 0.2315,
2480
+ "step": 3460
2481
+ },
2482
+ {
2483
+ "epoch": 0.823934465154933,
2484
+ "grad_norm": 0.8400135040283203,
2485
+ "learning_rate": 9.186867898386952e-06,
2486
+ "loss": 0.2312,
2487
+ "step": 3470
2488
+ },
2489
+ {
2490
+ "epoch": 0.8263089160631604,
2491
+ "grad_norm": 0.6971375942230225,
2492
+ "learning_rate": 8.948848196446852e-06,
2493
+ "loss": 0.2311,
2494
+ "step": 3480
2495
+ },
2496
+ {
2497
+ "epoch": 0.8286833669713879,
2498
+ "grad_norm": 0.9910257458686829,
2499
+ "learning_rate": 8.713649111939332e-06,
2500
+ "loss": 0.2519,
2501
+ "step": 3490
2502
+ },
2503
+ {
2504
+ "epoch": 0.8310578178796153,
2505
+ "grad_norm": 0.8067383766174316,
2506
+ "learning_rate": 8.481286805352234e-06,
2507
+ "loss": 0.1919,
2508
+ "step": 3500
2509
+ },
2510
+ {
2511
+ "epoch": 0.8310578178796153,
2512
+ "eval_loss": 0.2284688502550125,
2513
+ "eval_runtime": 2005.8031,
2514
+ "eval_samples_per_second": 1.867,
2515
+ "eval_steps_per_second": 0.467,
2516
+ "step": 3500
2517
+ },
2518
+ {
2519
+ "epoch": 0.8334322687878428,
2520
+ "grad_norm": 1.251442313194275,
2521
+ "learning_rate": 8.251777242258834e-06,
2522
+ "loss": 0.2235,
2523
+ "step": 3510
2524
+ },
2525
+ {
2526
+ "epoch": 0.8358067196960702,
2527
+ "grad_norm": 0.8034676313400269,
2528
+ "learning_rate": 8.025136192220894e-06,
2529
+ "loss": 0.1998,
2530
+ "step": 3520
2531
+ },
2532
+ {
2533
+ "epoch": 0.8381811706042978,
2534
+ "grad_norm": 0.8723871111869812,
2535
+ "learning_rate": 7.801379227705203e-06,
2536
+ "loss": 0.2251,
2537
+ "step": 3530
2538
+ },
2539
+ {
2540
+ "epoch": 0.8405556215125253,
2541
+ "grad_norm": 0.9363040328025818,
2542
+ "learning_rate": 7.58052172301349e-06,
2543
+ "loss": 0.1954,
2544
+ "step": 3540
2545
+ },
2546
+ {
2547
+ "epoch": 0.8429300724207527,
2548
+ "grad_norm": 0.8821825385093689,
2549
+ "learning_rate": 7.362578853226121e-06,
2550
+ "loss": 0.2014,
2551
+ "step": 3550
2552
+ },
2553
+ {
2554
+ "epoch": 0.8453045233289802,
2555
+ "grad_norm": 0.8672744035720825,
2556
+ "learning_rate": 7.1475655931594e-06,
2557
+ "loss": 0.2346,
2558
+ "step": 3560
2559
+ },
2560
+ {
2561
+ "epoch": 0.8476789742372076,
2562
+ "grad_norm": 0.8038856387138367,
2563
+ "learning_rate": 6.9354967163367035e-06,
2564
+ "loss": 0.1944,
2565
+ "step": 3570
2566
+ },
2567
+ {
2568
+ "epoch": 0.8500534251454351,
2569
+ "grad_norm": 0.9127787351608276,
2570
+ "learning_rate": 6.726386793973305e-06,
2571
+ "loss": 0.2036,
2572
+ "step": 3580
2573
+ },
2574
+ {
2575
+ "epoch": 0.8524278760536625,
2576
+ "grad_norm": 1.6385974884033203,
2577
+ "learning_rate": 6.520250193975242e-06,
2578
+ "loss": 0.2438,
2579
+ "step": 3590
2580
+ },
2581
+ {
2582
+ "epoch": 0.8548023269618901,
2583
+ "grad_norm": 0.8287534713745117,
2584
+ "learning_rate": 6.317101079952148e-06,
2585
+ "loss": 0.2258,
2586
+ "step": 3600
2587
+ },
2588
+ {
2589
+ "epoch": 0.8571767778701176,
2590
+ "grad_norm": 0.7688109874725342,
2591
+ "learning_rate": 6.116953410243925e-06,
2592
+ "loss": 0.1954,
2593
+ "step": 3610
2594
+ },
2595
+ {
2596
+ "epoch": 0.859551228778345,
2597
+ "grad_norm": 1.0905680656433105,
2598
+ "learning_rate": 5.919820936961856e-06,
2599
+ "loss": 0.2192,
2600
+ "step": 3620
2601
+ },
2602
+ {
2603
+ "epoch": 0.8619256796865725,
2604
+ "grad_norm": 0.8777235746383667,
2605
+ "learning_rate": 5.725717205043552e-06,
2606
+ "loss": 0.1896,
2607
+ "step": 3630
2608
+ },
2609
+ {
2610
+ "epoch": 0.8643001305947999,
2611
+ "grad_norm": 1.0974262952804565,
2612
+ "learning_rate": 5.5346555513223485e-06,
2613
+ "loss": 0.2318,
2614
+ "step": 3640
2615
+ },
2616
+ {
2617
+ "epoch": 0.8666745815030275,
2618
+ "grad_norm": 0.9711960554122925,
2619
+ "learning_rate": 5.34664910361094e-06,
2620
+ "loss": 0.2081,
2621
+ "step": 3650
2622
+ },
2623
+ {
2624
+ "epoch": 0.8690490324112549,
2625
+ "grad_norm": 1.0660927295684814,
2626
+ "learning_rate": 5.161710779799328e-06,
2627
+ "loss": 0.2156,
2628
+ "step": 3660
2629
+ },
2630
+ {
2631
+ "epoch": 0.8714234833194824,
2632
+ "grad_norm": 1.1410367488861084,
2633
+ "learning_rate": 4.979853286967273e-06,
2634
+ "loss": 0.1889,
2635
+ "step": 3670
2636
+ },
2637
+ {
2638
+ "epoch": 0.8737979342277098,
2639
+ "grad_norm": 1.3238369226455688,
2640
+ "learning_rate": 4.801089120511165e-06,
2641
+ "loss": 0.2193,
2642
+ "step": 3680
2643
+ },
2644
+ {
2645
+ "epoch": 0.8761723851359373,
2646
+ "grad_norm": 0.7377246618270874,
2647
+ "learning_rate": 4.625430563285515e-06,
2648
+ "loss": 0.2315,
2649
+ "step": 3690
2650
+ },
2651
+ {
2652
+ "epoch": 0.8785468360441648,
2653
+ "grad_norm": 1.1461482048034668,
2654
+ "learning_rate": 4.452889684758938e-06,
2655
+ "loss": 0.2573,
2656
+ "step": 3700
2657
+ },
2658
+ {
2659
+ "epoch": 0.8809212869523922,
2660
+ "grad_norm": 0.8280972242355347,
2661
+ "learning_rate": 4.283478340184893e-06,
2662
+ "loss": 0.2112,
2663
+ "step": 3710
2664
+ },
2665
+ {
2666
+ "epoch": 0.8832957378606198,
2667
+ "grad_norm": 1.114720344543457,
2668
+ "learning_rate": 4.11720816978714e-06,
2669
+ "loss": 0.2091,
2670
+ "step": 3720
2671
+ },
2672
+ {
2673
+ "epoch": 0.8856701887688472,
2674
+ "grad_norm": 0.9124228358268738,
2675
+ "learning_rate": 3.95409059795987e-06,
2676
+ "loss": 0.2038,
2677
+ "step": 3730
2678
+ },
2679
+ {
2680
+ "epoch": 0.8880446396770747,
2681
+ "grad_norm": 1.368277668952942,
2682
+ "learning_rate": 3.7941368324828253e-06,
2683
+ "loss": 0.2478,
2684
+ "step": 3740
2685
+ },
2686
+ {
2687
+ "epoch": 0.8904190905853021,
2688
+ "grad_norm": 0.9257198572158813,
2689
+ "learning_rate": 3.6373578637511283e-06,
2690
+ "loss": 0.2081,
2691
+ "step": 3750
2692
+ },
2693
+ {
2694
+ "epoch": 0.8927935414935296,
2695
+ "grad_norm": 0.911368191242218,
2696
+ "learning_rate": 3.4837644640202003e-06,
2697
+ "loss": 0.224,
2698
+ "step": 3760
2699
+ },
2700
+ {
2701
+ "epoch": 0.895167992401757,
2702
+ "grad_norm": 0.8841039538383484,
2703
+ "learning_rate": 3.333367186665576e-06,
2704
+ "loss": 0.2222,
2705
+ "step": 3770
2706
+ },
2707
+ {
2708
+ "epoch": 0.8975424433099846,
2709
+ "grad_norm": 0.9717174172401428,
2710
+ "learning_rate": 3.186176365457766e-06,
2711
+ "loss": 0.2212,
2712
+ "step": 3780
2713
+ },
2714
+ {
2715
+ "epoch": 0.8999168942182121,
2716
+ "grad_norm": 1.0966217517852783,
2717
+ "learning_rate": 3.042202113852255e-06,
2718
+ "loss": 0.1949,
2719
+ "step": 3790
2720
+ },
2721
+ {
2722
+ "epoch": 0.9022913451264395,
2723
+ "grad_norm": 0.7802509665489197,
2724
+ "learning_rate": 2.9014543242945837e-06,
2725
+ "loss": 0.2108,
2726
+ "step": 3800
2727
+ },
2728
+ {
2729
+ "epoch": 0.904665796034667,
2730
+ "grad_norm": 1.031802773475647,
2731
+ "learning_rate": 2.7639426675406753e-06,
2732
+ "loss": 0.2566,
2733
+ "step": 3810
2734
+ },
2735
+ {
2736
+ "epoch": 0.9070402469428944,
2737
+ "grad_norm": 0.9489237070083618,
2738
+ "learning_rate": 2.629676591992314e-06,
2739
+ "loss": 0.2098,
2740
+ "step": 3820
2741
+ },
2742
+ {
2743
+ "epoch": 0.909414697851122,
2744
+ "grad_norm": 0.6206024885177612,
2745
+ "learning_rate": 2.498665323047966e-06,
2746
+ "loss": 0.2456,
2747
+ "step": 3830
2748
+ },
2749
+ {
2750
+ "epoch": 0.9117891487593494,
2751
+ "grad_norm": 1.0588915348052979,
2752
+ "learning_rate": 2.370917862468941e-06,
2753
+ "loss": 0.203,
2754
+ "step": 3840
2755
+ },
2756
+ {
2757
+ "epoch": 0.9141635996675769,
2758
+ "grad_norm": 0.879034161567688,
2759
+ "learning_rate": 2.2464429877607995e-06,
2760
+ "loss": 0.2151,
2761
+ "step": 3850
2762
+ },
2763
+ {
2764
+ "epoch": 0.9165380505758044,
2765
+ "grad_norm": 1.5556215047836304,
2766
+ "learning_rate": 2.1252492515703382e-06,
2767
+ "loss": 0.2254,
2768
+ "step": 3860
2769
+ },
2770
+ {
2771
+ "epoch": 0.9189125014840318,
2772
+ "grad_norm": 0.8274180889129639,
2773
+ "learning_rate": 2.0073449810978974e-06,
2774
+ "loss": 0.1823,
2775
+ "step": 3870
2776
+ },
2777
+ {
2778
+ "epoch": 0.9212869523922593,
2779
+ "grad_norm": 1.0189625024795532,
2780
+ "learning_rate": 1.8927382775251856e-06,
2781
+ "loss": 0.2369,
2782
+ "step": 3880
2783
+ },
2784
+ {
2785
+ "epoch": 0.9236614033004867,
2786
+ "grad_norm": 0.7381725907325745,
2787
+ "learning_rate": 1.781437015458698e-06,
2788
+ "loss": 0.1982,
2789
+ "step": 3890
2790
+ },
2791
+ {
2792
+ "epoch": 0.9260358542087143,
2793
+ "grad_norm": 0.8868210315704346,
2794
+ "learning_rate": 1.673448842388603e-06,
2795
+ "loss": 0.205,
2796
+ "step": 3900
2797
+ },
2798
+ {
2799
+ "epoch": 0.9284103051169417,
2800
+ "grad_norm": 0.7141517400741577,
2801
+ "learning_rate": 1.5687811781633033e-06,
2802
+ "loss": 0.2055,
2803
+ "step": 3910
2804
+ },
2805
+ {
2806
+ "epoch": 0.9307847560251692,
2807
+ "grad_norm": 0.9352526068687439,
2808
+ "learning_rate": 1.4674412144796368e-06,
2809
+ "loss": 0.1943,
2810
+ "step": 3920
2811
+ },
2812
+ {
2813
+ "epoch": 0.9331592069333966,
2814
+ "grad_norm": 0.9011178612709045,
2815
+ "learning_rate": 1.3694359143887225e-06,
2816
+ "loss": 0.1821,
2817
+ "step": 3930
2818
+ },
2819
+ {
2820
+ "epoch": 0.9355336578416241,
2821
+ "grad_norm": 0.927413284778595,
2822
+ "learning_rate": 1.2747720118175099e-06,
2823
+ "loss": 0.1794,
2824
+ "step": 3940
2825
+ },
2826
+ {
2827
+ "epoch": 0.9379081087498516,
2828
+ "grad_norm": 1.1069186925888062,
2829
+ "learning_rate": 1.1834560111061211e-06,
2830
+ "loss": 0.1955,
2831
+ "step": 3950
2832
+ },
2833
+ {
2834
+ "epoch": 0.940282559658079,
2835
+ "grad_norm": 0.8640979528427124,
2836
+ "learning_rate": 1.095494186560947e-06,
2837
+ "loss": 0.2071,
2838
+ "step": 3960
2839
+ },
2840
+ {
2841
+ "epoch": 0.9426570105663066,
2842
+ "grad_norm": 0.9992754459381104,
2843
+ "learning_rate": 1.0108925820234926e-06,
2844
+ "loss": 0.2151,
2845
+ "step": 3970
2846
+ },
2847
+ {
2848
+ "epoch": 0.945031461474534,
2849
+ "grad_norm": 1.3405293226242065,
2850
+ "learning_rate": 9.29657010455165e-07,
2851
+ "loss": 0.1961,
2852
+ "step": 3980
2853
+ },
2854
+ {
2855
+ "epoch": 0.9474059123827615,
2856
+ "grad_norm": 0.887992262840271,
2857
+ "learning_rate": 8.517930535378083e-07,
2858
+ "loss": 0.2276,
2859
+ "step": 3990
2860
+ },
2861
+ {
2862
+ "epoch": 0.9497803632909889,
2863
+ "grad_norm": 1.3939125537872314,
2864
+ "learning_rate": 7.773060612902395e-07,
2865
+ "loss": 0.2301,
2866
+ "step": 4000
2867
+ },
2868
+ {
2869
+ "epoch": 0.9497803632909889,
2870
+ "eval_loss": 0.20879201591014862,
2871
+ "eval_runtime": 1997.7765,
2872
+ "eval_samples_per_second": 1.874,
2873
+ "eval_steps_per_second": 0.469,
2874
+ "step": 4000
2875
+ },
2876
+ {
2877
+ "epoch": 0.9521548141992164,
2878
+ "grad_norm": 0.8135402202606201,
2879
+ "learning_rate": 7.062011517006139e-07,
2880
+ "loss": 0.1919,
2881
+ "step": 4010
2882
+ },
2883
+ {
2884
+ "epoch": 0.954529265107444,
2885
+ "grad_norm": 1.2012323141098022,
2886
+ "learning_rate": 6.384832103747907e-07,
2887
+ "loss": 0.2081,
2888
+ "step": 4020
2889
+ },
2890
+ {
2891
+ "epoch": 0.9569037160156714,
2892
+ "grad_norm": 0.8670451641082764,
2893
+ "learning_rate": 5.741568902006277e-07,
2894
+ "loss": 0.1962,
2895
+ "step": 4030
2896
+ },
2897
+ {
2898
+ "epoch": 0.9592781669238989,
2899
+ "grad_norm": 0.910858154296875,
2900
+ "learning_rate": 5.132266110282835e-07,
2901
+ "loss": 0.1665,
2902
+ "step": 4040
2903
+ },
2904
+ {
2905
+ "epoch": 0.9616526178321263,
2906
+ "grad_norm": 1.337033987045288,
2907
+ "learning_rate": 4.5569655936654186e-07,
2908
+ "loss": 0.2127,
2909
+ "step": 4050
2910
+ },
2911
+ {
2912
+ "epoch": 0.9640270687403538,
2913
+ "grad_norm": 0.8553494811058044,
2914
+ "learning_rate": 4.0157068809515417e-07,
2915
+ "loss": 0.2103,
2916
+ "step": 4060
2917
+ },
2918
+ {
2919
+ "epoch": 0.9664015196485812,
2920
+ "grad_norm": 0.9513109922409058,
2921
+ "learning_rate": 3.50852716193234e-07,
2922
+ "loss": 0.2309,
2923
+ "step": 4070
2924
+ },
2925
+ {
2926
+ "epoch": 0.9687759705568088,
2927
+ "grad_norm": 0.9447882771492004,
2928
+ "learning_rate": 3.0354612848372265e-07,
2929
+ "loss": 0.2033,
2930
+ "step": 4080
2931
+ },
2932
+ {
2933
+ "epoch": 0.9711504214650362,
2934
+ "grad_norm": 0.9029374122619629,
2935
+ "learning_rate": 2.59654175393953e-07,
2936
+ "loss": 0.1892,
2937
+ "step": 4090
2938
+ },
2939
+ {
2940
+ "epoch": 0.9735248723732637,
2941
+ "grad_norm": 1.0304471254348755,
2942
+ "learning_rate": 2.1917987273232245e-07,
2943
+ "loss": 0.2659,
2944
+ "step": 4100
2945
+ },
2946
+ {
2947
+ "epoch": 0.9758993232814912,
2948
+ "grad_norm": 0.9072940349578857,
2949
+ "learning_rate": 1.8212600148105884e-07,
2950
+ "loss": 0.1669,
2951
+ "step": 4110
2952
+ },
2953
+ {
2954
+ "epoch": 0.9782737741897186,
2955
+ "grad_norm": 0.9835550785064697,
2956
+ "learning_rate": 1.4849510760513995e-07,
2957
+ "loss": 0.2005,
2958
+ "step": 4120
2959
+ },
2960
+ {
2961
+ "epoch": 0.9806482250979461,
2962
+ "grad_norm": 0.9774510264396667,
2963
+ "learning_rate": 1.182895018773944e-07,
2964
+ "loss": 0.1955,
2965
+ "step": 4130
2966
+ },
2967
+ {
2968
+ "epoch": 0.9830226760061735,
2969
+ "grad_norm": 0.9649205207824707,
2970
+ "learning_rate": 9.151125971967878e-08,
2971
+ "loss": 0.1946,
2972
+ "step": 4140
2973
+ },
2974
+ {
2975
+ "epoch": 0.9853971269144011,
2976
+ "grad_norm": 0.8067905306816101,
2977
+ "learning_rate": 6.816222106030834e-08,
2978
+ "loss": 0.2244,
2979
+ "step": 4150
2980
+ },
2981
+ {
2982
+ "epoch": 0.9877715778226285,
2983
+ "grad_norm": 0.8424655795097351,
2984
+ "learning_rate": 4.824399020763593e-08,
2985
+ "loss": 0.2032,
2986
+ "step": 4160
2987
+ },
2988
+ {
2989
+ "epoch": 0.990146028730856,
2990
+ "grad_norm": 1.2165002822875977,
2991
+ "learning_rate": 3.175793573980124e-08,
2992
+ "loss": 0.2069,
2993
+ "step": 4170
2994
+ },
2995
+ {
2996
+ "epoch": 0.9925204796390834,
2997
+ "grad_norm": 0.9676713347434998,
2998
+ "learning_rate": 1.8705190410717166e-08,
2999
+ "loss": 0.2685,
3000
+ "step": 4180
3001
+ },
3002
+ {
3003
+ "epoch": 0.9948949305473109,
3004
+ "grad_norm": 0.9836387634277344,
3005
+ "learning_rate": 9.086651072215402e-09,
3006
+ "loss": 0.1907,
3007
+ "step": 4190
3008
+ },
3009
+ {
3010
+ "epoch": 0.9972693814555385,
3011
+ "grad_norm": 1.0036119222640991,
3012
+ "learning_rate": 2.902978612456808e-09,
3013
+ "loss": 0.211,
3014
+ "step": 4200
3015
+ },
3016
+ {
3017
+ "epoch": 0.9996438323637659,
3018
+ "grad_norm": 1.1510422229766846,
3019
+ "learning_rate": 1.5459791047889305e-10,
3020
+ "loss": 0.2539,
3021
+ "step": 4210
3022
+ },
3023
+ {
3024
+ "epoch": 1.0,
3025
+ "step": 4212,
3026
+ "total_flos": 449297978753024.0,
3027
+ "train_loss": 0.44325452399106674,
3028
+ "train_runtime": 138200.0012,
3029
+ "train_samples_per_second": 0.244,
3030
+ "train_steps_per_second": 0.03
3031
+ }
3032
+ ],
3033
+ "logging_steps": 10,
3034
+ "max_steps": 4212,
3035
+ "num_input_tokens_seen": 0,
3036
+ "num_train_epochs": 1,
3037
+ "save_steps": 1000,
3038
+ "stateful_callbacks": {
3039
+ "TrainerControl": {
3040
+ "args": {
3041
+ "should_epoch_stop": false,
3042
+ "should_evaluate": false,
3043
+ "should_log": false,
3044
+ "should_save": true,
3045
+ "should_training_stop": true
3046
+ },
3047
+ "attributes": {}
3048
+ }
3049
+ },
3050
+ "total_flos": 449297978753024.0,
3051
+ "train_batch_size": 1,
3052
+ "trial_name": null,
3053
+ "trial_params": null
3054
+ }
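Note: the array above is the log_history that the Hugging Face Trainer writes into trainer_state.json, with one entry every 10 optimizer steps ("logging_steps": 10) carrying the running loss, gradient norm, and decaying learning rate, plus an eval_loss entry every 500 steps; the two PNGs added further down (training_loss.png and training_eval_loss.png) plot exactly this data. A minimal sketch of how such a file can be re-plotted, assuming the checkpoint folder has been downloaded locally and matplotlib is installed (the output filename is illustrative):

import json
import matplotlib.pyplot as plt

# Path assumes the folder layout of this commit.
with open("qwen2-gob-plan-115/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry a "loss" key; eval entries carry "eval_loss" instead,
# so the two kinds of record can be separated by key presence.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")  # illustrative output name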
qwen2-gob-plan-115/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:366195804c523e80245a1400ecce5ecd8a158fab29557b0bd8da51a1676fd5c9
+ size 8337
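Only the Git LFS pointer (oid and size) appears in this diff; the underlying ~8.3 kB binary is typically the Trainer's pickled TrainingArguments object. A hedged sketch for inspecting it, assuming the repo has been cloned, the real file pulled with git lfs, and torch installed (recent PyTorch defaults to weights_only=True, so unpickling a non-tensor object needs the explicit flag):

import torch

# Illustrative only: requires the actual binary, not the LFS pointer file.
args = torch.load("qwen2-gob-plan-115/training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)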
qwen2-gob-plan-115/training_eval_loss.png ADDED
qwen2-gob-plan-115/training_loss.png ADDED
qwen2-gob-plan-115/vocab.json ADDED
The diff for this file is too large to render. See raw diff