hf-reset committed on
Commit
0867df2
·
0 Parent(s):

Reset repository without checkpoint directories

Browse files
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: Qwen/Qwen3-8B
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: nl2bash-3k-traces-restore-hp
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # nl2bash-3k-traces-restore-hp
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the DCAgent/nl2bash-3k-traces dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 4e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 16
44
+ - total_train_batch_size: 16
45
+ - total_eval_batch_size: 128
46
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_ratio: 0.1
49
+ - num_epochs: 6.0
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - Transformers 4.56.0
58
+ - Pytorch 2.7.0+cu128
59
+ - Datasets 3.6.0
60
+ - Tokenizers 0.22.1
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "achieved_tflops_per_gpu": 0.001951399015158965,
3
+ "achieved_tflops_per_gpu_theoretical": 1016.3713688469504,
4
+ "epoch": 6.0,
5
+ "loss_nan_ranks": 0,
6
+ "loss_rank_avg": 0.133074551820755,
7
+ "mfu_percent": 0.00013790805760840742,
8
+ "mfu_percent_theoretical": 71.82836528953713,
9
+ "total_flos": 123574112944128.0,
10
+ "train_loss": 0.2030692549064906,
11
+ "train_runtime": 3957.8692,
12
+ "train_samples_per_second": 4.79,
13
+ "train_steps_per_second": 0.3,
14
+ "valid_targets_mean": 1319.8,
15
+ "valid_targets_min": 875
16
+ }
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 12288,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention"
51
+ ],
52
+ "max_position_embeddings": 40960,
53
+ "max_window_layers": 36,
54
+ "model_type": "qwen3",
55
+ "num_attention_heads": 32,
56
+ "num_hidden_layers": 36,
57
+ "num_key_value_heads": 8,
58
+ "pad_token_id": 151643,
59
+ "rms_norm_eps": 1e-06,
60
+ "rope_scaling": null,
61
+ "rope_theta": 1000000,
62
+ "sliding_window": null,
63
+ "tie_word_embeddings": false,
64
+ "transformers_version": "4.56.0",
65
+ "use_cache": false,
66
+ "use_sliding_window": false,
67
+ "vocab_size": 151936
68
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.6,
9
+ "top_k": 20,
10
+ "top_p": 0.95,
11
+ "transformers_version": "4.56.0"
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2ff25930b01ffcdd98c5bb83d6b103fe5f9184cbb01bc173e4d9c111636e9b1
3
+ size 4902257696
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:719862a4f1ad6cfca37428d712a8657ea7a38721cd06b00911d367935eec5ce9
3
+ size 4915960368
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b57d1078ace2df52e246fa9b24e18e68cde285ef0edead701901b187a500cdf1
3
+ size 4983068496
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c28b5eef7400bfb294d2d3965ed853b73dfb2de1a4a5e44e1de3a01091da3e1
3
+ size 1580230264
model.safetensors.index.json ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 308224,
4
+ "total_size": 16381470720
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
142
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
143
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
144
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
145
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
146
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
147
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
148
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
149
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
150
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
151
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
152
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
153
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
155
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
157
+ "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
160
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
162
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
163
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
165
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
167
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
168
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
169
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
170
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
171
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
173
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
174
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
181
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
311
+ "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
315
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
318
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
320
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
321
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
323
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
325
+ "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
329
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
331
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
332
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
333
+ "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
334
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
335
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
336
+ "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
337
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
339
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
340
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
341
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
342
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
343
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
344
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
345
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
347
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
348
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
349
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
350
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
351
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
353
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
354
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
355
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
356
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
358
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
359
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
361
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
362
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
363
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
364
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
365
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
366
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
367
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
368
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
370
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
371
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
372
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
373
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
374
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
375
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
376
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
377
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
378
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
379
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
380
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
381
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
382
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
383
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
384
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
385
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
386
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
387
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
388
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
389
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
390
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
391
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
392
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
393
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
394
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
395
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
396
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
397
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
398
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
399
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
400
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
401
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
402
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
403
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
404
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
405
+ "model.norm.weight": "model-00004-of-00004.safetensors"
406
+ }
407
+ }
run_summary.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "agent_name": null,
3
+ "training_start": null,
4
+ "training_end": null,
5
+ "created_by": "DCAgent",
6
+ "base_model_name": "Qwen/Qwen3-8B",
7
+ "dataset_name": "DCAgent/nl2bash-3k-traces",
8
+ "training_type": "SFT",
9
+ "training_parameters": "https://huggingface.co/penfever/nl2bash-3k-traces-restore-hp/blob/main/config.json",
10
+ "wandb_link": "https://wandb.ai/dogml/dc-agent/runs/nl2bash-3k-traces_hub-model-id_nl2bash-3k-traces-restore-hp_Qwen3-8B",
11
+ "traces_location_s3": null
12
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 32768,
235
+ "pad_token": "<|endoftext|>",
236
+ "padding_side": "right",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
train_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "achieved_tflops_per_gpu": 0.001951399015158965,
3
+ "achieved_tflops_per_gpu_theoretical": 1016.3713688469504,
4
+ "epoch": 6.0,
5
+ "loss_nan_ranks": 0,
6
+ "loss_rank_avg": 0.133074551820755,
7
+ "mfu_percent": 0.00013790805760840742,
8
+ "mfu_percent_theoretical": 71.82836528953713,
9
+ "total_flos": 123574112944128.0,
10
+ "train_loss": 0.2030692549064906,
11
+ "train_runtime": 3957.8692,
12
+ "train_samples_per_second": 4.79,
13
+ "train_steps_per_second": 0.3,
14
+ "valid_targets_mean": 1319.8,
15
+ "valid_targets_min": 875
16
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 1188, "loss": 1.0237, "lr": 1.3445378151260504e-06, "epoch": 0.025252525252525252, "percentage": 0.42, "elapsed_time": "0:00:26", "remaining_time": "1:43:47"}
2
+ {"current_steps": 10, "total_steps": 1188, "loss": 0.9775, "lr": 3.0252100840336137e-06, "epoch": 0.050505050505050504, "percentage": 0.84, "elapsed_time": "0:00:39", "remaining_time": "1:18:09"}
3
+ {"current_steps": 15, "total_steps": 1188, "loss": 0.8599, "lr": 4.705882352941177e-06, "epoch": 0.07575757575757576, "percentage": 1.26, "elapsed_time": "0:00:53", "remaining_time": "1:10:17"}
4
+ {"current_steps": 20, "total_steps": 1188, "loss": 0.7379, "lr": 6.386554621848739e-06, "epoch": 0.10101010101010101, "percentage": 1.68, "elapsed_time": "0:01:08", "remaining_time": "1:06:40"}
5
+ {"current_steps": 25, "total_steps": 1188, "loss": 0.6167, "lr": 8.067226890756303e-06, "epoch": 0.12626262626262627, "percentage": 2.1, "elapsed_time": "0:01:22", "remaining_time": "1:04:19"}
6
+ {"current_steps": 30, "total_steps": 1188, "loss": 0.5299, "lr": 9.747899159663867e-06, "epoch": 0.15151515151515152, "percentage": 2.53, "elapsed_time": "0:01:37", "remaining_time": "1:02:27"}
7
+ {"current_steps": 35, "total_steps": 1188, "loss": 0.4981, "lr": 1.1428571428571429e-05, "epoch": 0.17676767676767677, "percentage": 2.95, "elapsed_time": "0:01:51", "remaining_time": "1:01:12"}
8
+ {"current_steps": 40, "total_steps": 1188, "loss": 0.4407, "lr": 1.3109243697478993e-05, "epoch": 0.20202020202020202, "percentage": 3.37, "elapsed_time": "0:02:05", "remaining_time": "1:00:08"}
9
+ {"current_steps": 45, "total_steps": 1188, "loss": 0.414, "lr": 1.4789915966386557e-05, "epoch": 0.22727272727272727, "percentage": 3.79, "elapsed_time": "0:02:20", "remaining_time": "0:59:33"}
10
+ {"current_steps": 50, "total_steps": 1188, "loss": 0.3676, "lr": 1.647058823529412e-05, "epoch": 0.25252525252525254, "percentage": 4.21, "elapsed_time": "0:02:35", "remaining_time": "0:58:58"}
11
+ {"current_steps": 55, "total_steps": 1188, "loss": 0.3464, "lr": 1.8151260504201682e-05, "epoch": 0.2777777777777778, "percentage": 4.63, "elapsed_time": "0:02:49", "remaining_time": "0:58:04"}
12
+ {"current_steps": 60, "total_steps": 1188, "loss": 0.338, "lr": 1.9831932773109244e-05, "epoch": 0.30303030303030304, "percentage": 5.05, "elapsed_time": "0:03:03", "remaining_time": "0:57:37"}
13
+ {"current_steps": 65, "total_steps": 1188, "loss": 0.3275, "lr": 2.1512605042016807e-05, "epoch": 0.3282828282828283, "percentage": 5.47, "elapsed_time": "0:03:18", "remaining_time": "0:57:06"}
14
+ {"current_steps": 70, "total_steps": 1188, "loss": 0.2979, "lr": 2.3193277310924373e-05, "epoch": 0.35353535353535354, "percentage": 5.89, "elapsed_time": "0:03:32", "remaining_time": "0:56:28"}
15
+ {"current_steps": 75, "total_steps": 1188, "loss": 0.2957, "lr": 2.4873949579831935e-05, "epoch": 0.3787878787878788, "percentage": 6.31, "elapsed_time": "0:03:46", "remaining_time": "0:55:55"}
16
+ {"current_steps": 80, "total_steps": 1188, "loss": 0.3063, "lr": 2.6554621848739497e-05, "epoch": 0.40404040404040403, "percentage": 6.73, "elapsed_time": "0:04:00", "remaining_time": "0:55:27"}
17
+ {"current_steps": 85, "total_steps": 1188, "loss": 0.2973, "lr": 2.8235294117647063e-05, "epoch": 0.4292929292929293, "percentage": 7.15, "elapsed_time": "0:04:14", "remaining_time": "0:55:01"}
18
+ {"current_steps": 90, "total_steps": 1188, "loss": 0.2854, "lr": 2.9915966386554626e-05, "epoch": 0.45454545454545453, "percentage": 7.58, "elapsed_time": "0:04:28", "remaining_time": "0:54:41"}
19
+ {"current_steps": 95, "total_steps": 1188, "loss": 0.2698, "lr": 3.159663865546219e-05, "epoch": 0.4797979797979798, "percentage": 8.0, "elapsed_time": "0:04:43", "remaining_time": "0:54:22"}
20
+ {"current_steps": 100, "total_steps": 1188, "loss": 0.2831, "lr": 3.3277310924369754e-05, "epoch": 0.5050505050505051, "percentage": 8.42, "elapsed_time": "0:04:58", "remaining_time": "0:54:02"}
21
+ {"current_steps": 105, "total_steps": 1188, "loss": 0.2821, "lr": 3.495798319327731e-05, "epoch": 0.5303030303030303, "percentage": 8.84, "elapsed_time": "0:06:15", "remaining_time": "1:04:37"}
22
+ {"current_steps": 110, "total_steps": 1188, "loss": 0.2799, "lr": 3.663865546218488e-05, "epoch": 0.5555555555555556, "percentage": 9.26, "elapsed_time": "0:06:30", "remaining_time": "1:03:44"}
23
+ {"current_steps": 115, "total_steps": 1188, "loss": 0.279, "lr": 3.8319327731092444e-05, "epoch": 0.5808080808080808, "percentage": 9.68, "elapsed_time": "0:06:44", "remaining_time": "1:02:54"}
24
+ {"current_steps": 120, "total_steps": 1188, "loss": 0.2694, "lr": 4e-05, "epoch": 0.6060606060606061, "percentage": 10.1, "elapsed_time": "0:06:58", "remaining_time": "1:02:03"}
25
+ {"current_steps": 125, "total_steps": 1188, "loss": 0.2793, "lr": 3.999784088124753e-05, "epoch": 0.6313131313131313, "percentage": 10.52, "elapsed_time": "0:07:12", "remaining_time": "1:01:20"}
26
+ {"current_steps": 130, "total_steps": 1188, "loss": 0.2597, "lr": 3.99913639911695e-05, "epoch": 0.6565656565656566, "percentage": 10.94, "elapsed_time": "0:07:26", "remaining_time": "1:00:34"}
27
+ {"current_steps": 135, "total_steps": 1188, "loss": 0.2705, "lr": 3.998057072820338e-05, "epoch": 0.6818181818181818, "percentage": 11.36, "elapsed_time": "0:07:40", "remaining_time": "0:59:49"}
28
+ {"current_steps": 140, "total_steps": 1188, "loss": 0.2753, "lr": 3.996546342274282e-05, "epoch": 0.7070707070707071, "percentage": 11.78, "elapsed_time": "0:07:54", "remaining_time": "0:59:13"}
29
+ {"current_steps": 145, "total_steps": 1188, "loss": 0.2598, "lr": 3.9946045336634485e-05, "epoch": 0.7323232323232324, "percentage": 12.21, "elapsed_time": "0:08:09", "remaining_time": "0:58:37"}
30
+ {"current_steps": 150, "total_steps": 1188, "loss": 0.2704, "lr": 3.9922320662473755e-05, "epoch": 0.7575757575757576, "percentage": 12.63, "elapsed_time": "0:08:23", "remaining_time": "0:58:02"}
31
+ {"current_steps": 155, "total_steps": 1188, "loss": 0.2545, "lr": 3.989429452269951e-05, "epoch": 0.7828282828282829, "percentage": 13.05, "elapsed_time": "0:08:37", "remaining_time": "0:57:30"}
32
+ {"current_steps": 160, "total_steps": 1188, "loss": 0.2619, "lr": 3.986197296848816e-05, "epoch": 0.8080808080808081, "percentage": 13.47, "elapsed_time": "0:08:51", "remaining_time": "0:56:54"}
33
+ {"current_steps": 165, "total_steps": 1188, "loss": 0.2628, "lr": 3.982536297844706e-05, "epoch": 0.8333333333333334, "percentage": 13.89, "elapsed_time": "0:09:04", "remaining_time": "0:56:18"}
34
+ {"current_steps": 170, "total_steps": 1188, "loss": 0.2588, "lr": 3.978447245710784e-05, "epoch": 0.8585858585858586, "percentage": 14.31, "elapsed_time": "0:09:18", "remaining_time": "0:55:47"}
35
+ {"current_steps": 175, "total_steps": 1188, "loss": 0.2538, "lr": 3.973931023321962e-05, "epoch": 0.8838383838383839, "percentage": 14.73, "elapsed_time": "0:09:32", "remaining_time": "0:55:16"}
36
+ {"current_steps": 180, "total_steps": 1188, "loss": 0.2409, "lr": 3.9689886057842866e-05, "epoch": 0.9090909090909091, "percentage": 15.15, "elapsed_time": "0:09:47", "remaining_time": "0:54:52"}
37
+ {"current_steps": 185, "total_steps": 1188, "loss": 0.2545, "lr": 3.963621060224396e-05, "epoch": 0.9343434343434344, "percentage": 15.57, "elapsed_time": "0:10:01", "remaining_time": "0:54:21"}
38
+ {"current_steps": 190, "total_steps": 1188, "loss": 0.2553, "lr": 3.957829545559118e-05, "epoch": 0.9595959595959596, "percentage": 15.99, "elapsed_time": "0:10:14", "remaining_time": "0:53:48"}
39
+ {"current_steps": 195, "total_steps": 1188, "loss": 0.2546, "lr": 3.951615312245243e-05, "epoch": 0.9848484848484849, "percentage": 16.41, "elapsed_time": "0:10:29", "remaining_time": "0:53:23"}
40
+ {"current_steps": 200, "total_steps": 1188, "loss": 0.2535, "lr": 3.9449797020095404e-05, "epoch": 1.0101010101010102, "percentage": 16.84, "elapsed_time": "0:10:43", "remaining_time": "0:52:58"}
41
+ {"current_steps": 205, "total_steps": 1188, "loss": 0.2277, "lr": 3.937924147559059e-05, "epoch": 1.0353535353535352, "percentage": 17.26, "elapsed_time": "0:12:00", "remaining_time": "0:57:36"}
42
+ {"current_steps": 210, "total_steps": 1188, "loss": 0.2358, "lr": 3.9304501722717925e-05, "epoch": 1.0606060606060606, "percentage": 17.68, "elapsed_time": "0:12:14", "remaining_time": "0:57:00"}
43
+ {"current_steps": 215, "total_steps": 1188, "loss": 0.2333, "lr": 3.922559389867758e-05, "epoch": 1.0858585858585859, "percentage": 18.1, "elapsed_time": "0:12:28", "remaining_time": "0:56:25"}
44
+ {"current_steps": 220, "total_steps": 1188, "loss": 0.2289, "lr": 3.914253504060585e-05, "epoch": 1.1111111111111112, "percentage": 18.52, "elapsed_time": "0:12:42", "remaining_time": "0:55:53"}
45
+ {"current_steps": 225, "total_steps": 1188, "loss": 0.225, "lr": 3.90553430818965e-05, "epoch": 1.1363636363636362, "percentage": 18.94, "elapsed_time": "0:12:55", "remaining_time": "0:55:19"}
46
+ {"current_steps": 230, "total_steps": 1188, "loss": 0.23, "lr": 3.896403684832887e-05, "epoch": 1.1616161616161615, "percentage": 19.36, "elapsed_time": "0:13:08", "remaining_time": "0:54:44"}
47
+ {"current_steps": 235, "total_steps": 1188, "loss": 0.2314, "lr": 3.886863605400306e-05, "epoch": 1.1868686868686869, "percentage": 19.78, "elapsed_time": "0:13:22", "remaining_time": "0:54:13"}
48
+ {"current_steps": 240, "total_steps": 1188, "loss": 0.2231, "lr": 3.876916129708347e-05, "epoch": 1.2121212121212122, "percentage": 20.2, "elapsed_time": "0:13:36", "remaining_time": "0:53:45"}
49
+ {"current_steps": 245, "total_steps": 1188, "loss": 0.2363, "lr": 3.866563405535142e-05, "epoch": 1.2373737373737375, "percentage": 20.62, "elapsed_time": "0:13:50", "remaining_time": "0:53:17"}
50
+ {"current_steps": 250, "total_steps": 1188, "loss": 0.2187, "lr": 3.855807668156779e-05, "epoch": 1.2626262626262625, "percentage": 21.04, "elapsed_time": "0:14:04", "remaining_time": "0:52:48"}
51
+ {"current_steps": 255, "total_steps": 1188, "loss": 0.2273, "lr": 3.844651239864687e-05, "epoch": 1.2878787878787878, "percentage": 21.46, "elapsed_time": "0:14:18", "remaining_time": "0:52:21"}
52
+ {"current_steps": 260, "total_steps": 1188, "loss": 0.227, "lr": 3.8330965294642186e-05, "epoch": 1.3131313131313131, "percentage": 21.89, "elapsed_time": "0:14:32", "remaining_time": "0:51:52"}
53
+ {"current_steps": 265, "total_steps": 1188, "loss": 0.2331, "lr": 3.821146031754565e-05, "epoch": 1.3383838383838385, "percentage": 22.31, "elapsed_time": "0:14:46", "remaining_time": "0:51:26"}
54
+ {"current_steps": 270, "total_steps": 1188, "loss": 0.2198, "lr": 3.808802326990096e-05, "epoch": 1.3636363636363638, "percentage": 22.73, "elapsed_time": "0:15:00", "remaining_time": "0:51:01"}
55
+ {"current_steps": 275, "total_steps": 1188, "loss": 0.2212, "lr": 3.7960680803232544e-05, "epoch": 1.3888888888888888, "percentage": 23.15, "elapsed_time": "0:15:14", "remaining_time": "0:50:35"}
56
+ {"current_steps": 280, "total_steps": 1188, "loss": 0.2239, "lr": 3.782946041229119e-05, "epoch": 1.4141414141414141, "percentage": 23.57, "elapsed_time": "0:15:27", "remaining_time": "0:50:07"}
57
+ {"current_steps": 285, "total_steps": 1188, "loss": 0.234, "lr": 3.769439042911758e-05, "epoch": 1.4393939393939394, "percentage": 23.99, "elapsed_time": "0:15:40", "remaining_time": "0:49:39"}
58
+ {"current_steps": 290, "total_steps": 1188, "loss": 0.2173, "lr": 3.755550001692506e-05, "epoch": 1.4646464646464645, "percentage": 24.41, "elapsed_time": "0:15:54", "remaining_time": "0:49:15"}
59
+ {"current_steps": 295, "total_steps": 1188, "loss": 0.2105, "lr": 3.7412819163802986e-05, "epoch": 1.4898989898989898, "percentage": 24.83, "elapsed_time": "0:16:07", "remaining_time": "0:48:49"}
60
+ {"current_steps": 300, "total_steps": 1188, "loss": 0.2359, "lr": 3.726637867624191e-05, "epoch": 1.5151515151515151, "percentage": 25.25, "elapsed_time": "0:16:21", "remaining_time": "0:48:25"}
61
+ {"current_steps": 305, "total_steps": 1188, "loss": 0.2152, "lr": 3.711621017248213e-05, "epoch": 1.5404040404040404, "percentage": 25.67, "elapsed_time": "0:17:38", "remaining_time": "0:51:05"}
62
+ {"current_steps": 310, "total_steps": 1188, "loss": 0.2259, "lr": 3.6962346075686876e-05, "epoch": 1.5656565656565657, "percentage": 26.09, "elapsed_time": "0:17:54", "remaining_time": "0:50:41"}
63
+ {"current_steps": 315, "total_steps": 1188, "loss": 0.2276, "lr": 3.680481960694183e-05, "epoch": 1.5909090909090908, "percentage": 26.52, "elapsed_time": "0:18:07", "remaining_time": "0:50:14"}
64
+ {"current_steps": 320, "total_steps": 1188, "loss": 0.2275, "lr": 3.6643664778082254e-05, "epoch": 1.6161616161616161, "percentage": 26.94, "elapsed_time": "0:18:21", "remaining_time": "0:49:47"}
65
+ {"current_steps": 325, "total_steps": 1188, "loss": 0.2193, "lr": 3.6478916384349465e-05, "epoch": 1.6414141414141414, "percentage": 27.36, "elapsed_time": "0:18:34", "remaining_time": "0:49:20"}
66
+ {"current_steps": 330, "total_steps": 1188, "loss": 0.2243, "lr": 3.631060999687809e-05, "epoch": 1.6666666666666665, "percentage": 27.78, "elapsed_time": "0:18:48", "remaining_time": "0:48:54"}
67
+ {"current_steps": 335, "total_steps": 1188, "loss": 0.2132, "lr": 3.613878195501586e-05, "epoch": 1.691919191919192, "percentage": 28.2, "elapsed_time": "0:19:02", "remaining_time": "0:48:28"}
68
+ {"current_steps": 340, "total_steps": 1188, "loss": 0.2148, "lr": 3.596346935847752e-05, "epoch": 1.7171717171717171, "percentage": 28.62, "elapsed_time": "0:19:16", "remaining_time": "0:48:03"}
69
+ {"current_steps": 345, "total_steps": 1188, "loss": 0.225, "lr": 3.578471005933454e-05, "epoch": 1.7424242424242424, "percentage": 29.04, "elapsed_time": "0:19:30", "remaining_time": "0:47:40"}
70
+ {"current_steps": 350, "total_steps": 1188, "loss": 0.2148, "lr": 3.5602542653842416e-05, "epoch": 1.7676767676767677, "percentage": 29.46, "elapsed_time": "0:19:43", "remaining_time": "0:47:14"}
71
+ {"current_steps": 355, "total_steps": 1188, "loss": 0.2203, "lr": 3.541700647410728e-05, "epoch": 1.7929292929292928, "percentage": 29.88, "elapsed_time": "0:19:57", "remaining_time": "0:46:49"}
72
+ {"current_steps": 360, "total_steps": 1188, "loss": 0.2307, "lr": 3.522814157959362e-05, "epoch": 1.8181818181818183, "percentage": 30.3, "elapsed_time": "0:20:11", "remaining_time": "0:46:26"}
73
+ {"current_steps": 365, "total_steps": 1188, "loss": 0.23, "lr": 3.5035988748474974e-05, "epoch": 1.8434343434343434, "percentage": 30.72, "elapsed_time": "0:20:25", "remaining_time": "0:46:02"}
74
+ {"current_steps": 370, "total_steps": 1188, "loss": 0.2128, "lr": 3.484058946882946e-05, "epoch": 1.8686868686868687, "percentage": 31.14, "elapsed_time": "0:20:38", "remaining_time": "0:45:38"}
75
+ {"current_steps": 375, "total_steps": 1188, "loss": 0.221, "lr": 3.4641985929681954e-05, "epoch": 1.893939393939394, "percentage": 31.57, "elapsed_time": "0:20:52", "remaining_time": "0:45:15"}
76
+ {"current_steps": 380, "total_steps": 1188, "loss": 0.2149, "lr": 3.4440221011895026e-05, "epoch": 1.9191919191919191, "percentage": 31.99, "elapsed_time": "0:21:06", "remaining_time": "0:44:52"}
77
+ {"current_steps": 385, "total_steps": 1188, "loss": 0.2226, "lr": 3.423533827891044e-05, "epoch": 1.9444444444444444, "percentage": 32.41, "elapsed_time": "0:21:18", "remaining_time": "0:44:27"}
78
+ {"current_steps": 390, "total_steps": 1188, "loss": 0.2269, "lr": 3.402738196734327e-05, "epoch": 1.9696969696969697, "percentage": 32.83, "elapsed_time": "0:21:32", "remaining_time": "0:44:04"}
79
+ {"current_steps": 395, "total_steps": 1188, "loss": 0.2196, "lr": 3.381639697743073e-05, "epoch": 1.9949494949494948, "percentage": 33.25, "elapsed_time": "0:21:46", "remaining_time": "0:43:43"}
80
+ {"current_steps": 400, "total_steps": 1188, "loss": 0.1951, "lr": 3.3602428863337625e-05, "epoch": 2.0202020202020203, "percentage": 33.67, "elapsed_time": "0:22:00", "remaining_time": "0:43:21"}
81
+ {"current_steps": 405, "total_steps": 1188, "loss": 0.1904, "lr": 3.338552382332073e-05, "epoch": 2.0454545454545454, "percentage": 34.09, "elapsed_time": "0:23:22", "remaining_time": "0:45:12"}
82
+ {"current_steps": 410, "total_steps": 1188, "loss": 0.1952, "lr": 3.3165728689753976e-05, "epoch": 2.0707070707070705, "percentage": 34.51, "elapsed_time": "0:23:36", "remaining_time": "0:44:48"}
83
+ {"current_steps": 415, "total_steps": 1188, "loss": 0.1938, "lr": 3.2943090919016815e-05, "epoch": 2.095959595959596, "percentage": 34.93, "elapsed_time": "0:23:50", "remaining_time": "0:44:24"}
84
+ {"current_steps": 420, "total_steps": 1188, "loss": 0.1834, "lr": 3.2717658581247844e-05, "epoch": 2.121212121212121, "percentage": 35.35, "elapsed_time": "0:24:05", "remaining_time": "0:44:03"}
85
+ {"current_steps": 425, "total_steps": 1188, "loss": 0.1995, "lr": 3.248948034996583e-05, "epoch": 2.1464646464646466, "percentage": 35.77, "elapsed_time": "0:24:18", "remaining_time": "0:43:39"}
86
+ {"current_steps": 430, "total_steps": 1188, "loss": 0.184, "lr": 3.2258605491560606e-05, "epoch": 2.1717171717171717, "percentage": 36.2, "elapsed_time": "0:24:32", "remaining_time": "0:43:15"}
87
+ {"current_steps": 435, "total_steps": 1188, "loss": 0.1889, "lr": 3.2025083854655776e-05, "epoch": 2.196969696969697, "percentage": 36.62, "elapsed_time": "0:24:45", "remaining_time": "0:42:51"}
88
+ {"current_steps": 440, "total_steps": 1188, "loss": 0.1922, "lr": 3.178896585934588e-05, "epoch": 2.2222222222222223, "percentage": 37.04, "elapsed_time": "0:24:58", "remaining_time": "0:42:28"}
89
+ {"current_steps": 445, "total_steps": 1188, "loss": 0.181, "lr": 3.1550302486310076e-05, "epoch": 2.2474747474747474, "percentage": 37.46, "elapsed_time": "0:25:11", "remaining_time": "0:42:04"}
90
+ {"current_steps": 450, "total_steps": 1188, "loss": 0.1932, "lr": 3.130914526580478e-05, "epoch": 2.2727272727272725, "percentage": 37.88, "elapsed_time": "0:25:24", "remaining_time": "0:41:40"}
91
+ {"current_steps": 455, "total_steps": 1188, "loss": 0.1943, "lr": 3.10655462665377e-05, "epoch": 2.297979797979798, "percentage": 38.3, "elapsed_time": "0:25:38", "remaining_time": "0:41:18"}
92
+ {"current_steps": 460, "total_steps": 1188, "loss": 0.1858, "lr": 3.0819558084425574e-05, "epoch": 2.323232323232323, "percentage": 38.72, "elapsed_time": "0:25:52", "remaining_time": "0:40:56"}
93
+ {"current_steps": 465, "total_steps": 1188, "loss": 0.1912, "lr": 3.0571233831238093e-05, "epoch": 2.3484848484848486, "percentage": 39.14, "elapsed_time": "0:26:06", "remaining_time": "0:40:35"}
94
+ {"current_steps": 470, "total_steps": 1188, "loss": 0.1907, "lr": 3.032062712313044e-05, "epoch": 2.3737373737373737, "percentage": 39.56, "elapsed_time": "0:26:20", "remaining_time": "0:40:13"}
95
+ {"current_steps": 475, "total_steps": 1188, "loss": 0.1963, "lr": 3.0067792069066902e-05, "epoch": 2.398989898989899, "percentage": 39.98, "elapsed_time": "0:26:34", "remaining_time": "0:39:53"}
96
+ {"current_steps": 480, "total_steps": 1188, "loss": 0.2008, "lr": 2.9812783259138133e-05, "epoch": 2.4242424242424243, "percentage": 40.4, "elapsed_time": "0:26:48", "remaining_time": "0:39:32"}
97
+ {"current_steps": 485, "total_steps": 1188, "loss": 0.1868, "lr": 2.955565575277449e-05, "epoch": 2.4494949494949494, "percentage": 40.82, "elapsed_time": "0:27:01", "remaining_time": "0:39:11"}
98
+ {"current_steps": 490, "total_steps": 1188, "loss": 0.1896, "lr": 2.929646506685805e-05, "epoch": 2.474747474747475, "percentage": 41.25, "elapsed_time": "0:27:14", "remaining_time": "0:38:48"}
99
+ {"current_steps": 495, "total_steps": 1188, "loss": 0.1853, "lr": 2.9035267163735856e-05, "epoch": 2.5, "percentage": 41.67, "elapsed_time": "0:27:27", "remaining_time": "0:38:26"}
100
+ {"current_steps": 500, "total_steps": 1188, "loss": 0.1917, "lr": 2.8772118439136972e-05, "epoch": 2.525252525252525, "percentage": 42.09, "elapsed_time": "0:27:41", "remaining_time": "0:38:06"}
101
+ {"current_steps": 505, "total_steps": 1188, "loss": 0.2, "lr": 2.8507075709996015e-05, "epoch": 2.5505050505050506, "percentage": 42.51, "elapsed_time": "0:29:05", "remaining_time": "0:39:20"}
102
+ {"current_steps": 510, "total_steps": 1188, "loss": 0.1841, "lr": 2.8240196202185636e-05, "epoch": 2.5757575757575757, "percentage": 42.93, "elapsed_time": "0:29:18", "remaining_time": "0:38:57"}
103
+ {"current_steps": 515, "total_steps": 1188, "loss": 0.2015, "lr": 2.797153753816084e-05, "epoch": 2.601010101010101, "percentage": 43.35, "elapsed_time": "0:29:31", "remaining_time": "0:38:35"}
104
+ {"current_steps": 520, "total_steps": 1188, "loss": 0.1847, "lr": 2.770115772451758e-05, "epoch": 2.6262626262626263, "percentage": 43.77, "elapsed_time": "0:29:45", "remaining_time": "0:38:13"}
105
+ {"current_steps": 525, "total_steps": 1188, "loss": 0.1868, "lr": 2.7429115139468443e-05, "epoch": 2.6515151515151514, "percentage": 44.19, "elapsed_time": "0:29:58", "remaining_time": "0:37:51"}
106
+ {"current_steps": 530, "total_steps": 1188, "loss": 0.1955, "lr": 2.7155468520238116e-05, "epoch": 2.676767676767677, "percentage": 44.61, "elapsed_time": "0:30:13", "remaining_time": "0:37:30"}
107
+ {"current_steps": 535, "total_steps": 1188, "loss": 0.1881, "lr": 2.6880276950381316e-05, "epoch": 2.702020202020202, "percentage": 45.03, "elapsed_time": "0:30:27", "remaining_time": "0:37:10"}
108
+ {"current_steps": 540, "total_steps": 1188, "loss": 0.1805, "lr": 2.6603599847025935e-05, "epoch": 2.7272727272727275, "percentage": 45.45, "elapsed_time": "0:30:39", "remaining_time": "0:36:47"}
109
+ {"current_steps": 545, "total_steps": 1188, "loss": 0.191, "lr": 2.63254969480442e-05, "epoch": 2.7525252525252526, "percentage": 45.88, "elapsed_time": "0:30:53", "remaining_time": "0:36:26"}
110
+ {"current_steps": 550, "total_steps": 1188, "loss": 0.1875, "lr": 2.6046028299154545e-05, "epoch": 2.7777777777777777, "percentage": 46.3, "elapsed_time": "0:31:08", "remaining_time": "0:36:07"}
111
+ {"current_steps": 555, "total_steps": 1188, "loss": 0.1886, "lr": 2.5765254240957024e-05, "epoch": 2.8030303030303028, "percentage": 46.72, "elapsed_time": "0:31:21", "remaining_time": "0:35:46"}
112
+ {"current_steps": 560, "total_steps": 1188, "loss": 0.1867, "lr": 2.5483235395905056e-05, "epoch": 2.8282828282828283, "percentage": 47.14, "elapsed_time": "0:31:34", "remaining_time": "0:35:24"}
113
+ {"current_steps": 565, "total_steps": 1188, "loss": 0.1894, "lr": 2.5200032655216343e-05, "epoch": 2.8535353535353534, "percentage": 47.56, "elapsed_time": "0:31:47", "remaining_time": "0:35:03"}
114
+ {"current_steps": 570, "total_steps": 1188, "loss": 0.1858, "lr": 2.4915707165725694e-05, "epoch": 2.878787878787879, "percentage": 47.98, "elapsed_time": "0:32:00", "remaining_time": "0:34:42"}
115
+ {"current_steps": 575, "total_steps": 1188, "loss": 0.1847, "lr": 2.4630320316682724e-05, "epoch": 2.904040404040404, "percentage": 48.4, "elapsed_time": "0:32:13", "remaining_time": "0:34:21"}
116
+ {"current_steps": 580, "total_steps": 1188, "loss": 0.1856, "lr": 2.4343933726497183e-05, "epoch": 2.929292929292929, "percentage": 48.82, "elapsed_time": "0:32:26", "remaining_time": "0:34:00"}
117
+ {"current_steps": 585, "total_steps": 1188, "loss": 0.1887, "lr": 2.4056609229434812e-05, "epoch": 2.9545454545454546, "percentage": 49.24, "elapsed_time": "0:32:39", "remaining_time": "0:33:39"}
118
+ {"current_steps": 590, "total_steps": 1188, "loss": 0.1867, "lr": 2.376840886226656e-05, "epoch": 2.9797979797979797, "percentage": 49.66, "elapsed_time": "0:32:51", "remaining_time": "0:33:18"}
119
+ {"current_steps": 595, "total_steps": 1188, "loss": 0.1831, "lr": 2.347939485087416e-05, "epoch": 3.005050505050505, "percentage": 50.08, "elapsed_time": "0:33:04", "remaining_time": "0:32:58"}
120
+ {"current_steps": 600, "total_steps": 1188, "loss": 0.157, "lr": 2.3189629596814788e-05, "epoch": 3.0303030303030303, "percentage": 50.51, "elapsed_time": "0:33:18", "remaining_time": "0:32:39"}
121
+ {"current_steps": 605, "total_steps": 1188, "loss": 0.1605, "lr": 2.2899175663847823e-05, "epoch": 3.0555555555555554, "percentage": 50.93, "elapsed_time": "0:34:39", "remaining_time": "0:33:23"}
122
+ {"current_steps": 610, "total_steps": 1188, "loss": 0.1609, "lr": 2.2608095764426602e-05, "epoch": 3.080808080808081, "percentage": 51.35, "elapsed_time": "0:34:52", "remaining_time": "0:33:02"}
123
+ {"current_steps": 615, "total_steps": 1188, "loss": 0.1649, "lr": 2.2316452746158063e-05, "epoch": 3.106060606060606, "percentage": 51.77, "elapsed_time": "0:35:06", "remaining_time": "0:32:42"}
124
+ {"current_steps": 620, "total_steps": 1188, "loss": 0.161, "lr": 2.2024309578233174e-05, "epoch": 3.1313131313131315, "percentage": 52.19, "elapsed_time": "0:35:20", "remaining_time": "0:32:22"}
125
+ {"current_steps": 625, "total_steps": 1188, "loss": 0.1574, "lr": 2.1731729337831173e-05, "epoch": 3.1565656565656566, "percentage": 52.61, "elapsed_time": "0:35:33", "remaining_time": "0:32:02"}
126
+ {"current_steps": 630, "total_steps": 1188, "loss": 0.165, "lr": 2.143877519650042e-05, "epoch": 3.1818181818181817, "percentage": 53.03, "elapsed_time": "0:35:45", "remaining_time": "0:31:40"}
127
+ {"current_steps": 635, "total_steps": 1188, "loss": 0.1654, "lr": 2.1145510406518928e-05, "epoch": 3.207070707070707, "percentage": 53.45, "elapsed_time": "0:36:00", "remaining_time": "0:31:21"}
128
+ {"current_steps": 640, "total_steps": 1188, "loss": 0.1603, "lr": 2.0851998287237452e-05, "epoch": 3.2323232323232323, "percentage": 53.87, "elapsed_time": "0:36:12", "remaining_time": "0:31:00"}
129
+ {"current_steps": 645, "total_steps": 1188, "loss": 0.1568, "lr": 2.0558302211408075e-05, "epoch": 3.257575757575758, "percentage": 54.29, "elapsed_time": "0:36:27", "remaining_time": "0:30:41"}
130
+ {"current_steps": 650, "total_steps": 1188, "loss": 0.1657, "lr": 2.0264485591501272e-05, "epoch": 3.282828282828283, "percentage": 54.71, "elapsed_time": "0:36:41", "remaining_time": "0:30:21"}
131
+ {"current_steps": 655, "total_steps": 1188, "loss": 0.1626, "lr": 1.9970611866014432e-05, "epoch": 3.308080808080808, "percentage": 55.13, "elapsed_time": "0:36:53", "remaining_time": "0:30:01"}
132
+ {"current_steps": 660, "total_steps": 1188, "loss": 0.1581, "lr": 1.967674448577471e-05, "epoch": 3.3333333333333335, "percentage": 55.56, "elapsed_time": "0:37:06", "remaining_time": "0:29:41"}
133
+ {"current_steps": 665, "total_steps": 1188, "loss": 0.1586, "lr": 1.9382946900239247e-05, "epoch": 3.3585858585858586, "percentage": 55.98, "elapsed_time": "0:37:18", "remaining_time": "0:29:20"}
134
+ {"current_steps": 670, "total_steps": 1188, "loss": 0.171, "lr": 1.9089282543795692e-05, "epoch": 3.3838383838383836, "percentage": 56.4, "elapsed_time": "0:37:32", "remaining_time": "0:29:01"}
135
+ {"current_steps": 675, "total_steps": 1188, "loss": 0.1549, "lr": 1.879581482206592e-05, "epoch": 3.409090909090909, "percentage": 56.82, "elapsed_time": "0:37:45", "remaining_time": "0:28:41"}
136
+ {"current_steps": 680, "total_steps": 1188, "loss": 0.1645, "lr": 1.8502607098216056e-05, "epoch": 3.4343434343434343, "percentage": 57.24, "elapsed_time": "0:37:59", "remaining_time": "0:28:22"}
137
+ {"current_steps": 685, "total_steps": 1188, "loss": 0.1636, "lr": 1.8209722679275602e-05, "epoch": 3.45959595959596, "percentage": 57.66, "elapsed_time": "0:38:13", "remaining_time": "0:28:03"}
138
+ {"current_steps": 690, "total_steps": 1188, "loss": 0.1615, "lr": 1.791722480246868e-05, "epoch": 3.484848484848485, "percentage": 58.08, "elapsed_time": "0:38:25", "remaining_time": "0:27:44"}
139
+ {"current_steps": 695, "total_steps": 1188, "loss": 0.1592, "lr": 1.762517662156037e-05, "epoch": 3.51010101010101, "percentage": 58.5, "elapsed_time": "0:38:38", "remaining_time": "0:27:24"}
140
+ {"current_steps": 700, "total_steps": 1188, "loss": 0.1648, "lr": 1.733364119322109e-05, "epoch": 3.5353535353535355, "percentage": 58.92, "elapsed_time": "0:38:51", "remaining_time": "0:27:05"}
141
+ {"current_steps": 705, "total_steps": 1188, "loss": 0.1683, "lr": 1.704268146341185e-05, "epoch": 3.5606060606060606, "percentage": 59.34, "elapsed_time": "0:40:07", "remaining_time": "0:27:29"}
142
+ {"current_steps": 710, "total_steps": 1188, "loss": 0.1626, "lr": 1.675236025379355e-05, "epoch": 3.5858585858585856, "percentage": 59.76, "elapsed_time": "0:40:21", "remaining_time": "0:27:10"}
143
+ {"current_steps": 715, "total_steps": 1188, "loss": 0.1628, "lr": 1.6462740248162988e-05, "epoch": 3.611111111111111, "percentage": 60.19, "elapsed_time": "0:40:34", "remaining_time": "0:26:50"}
144
+ {"current_steps": 720, "total_steps": 1188, "loss": 0.1579, "lr": 1.6173883978918682e-05, "epoch": 3.6363636363636362, "percentage": 60.61, "elapsed_time": "0:40:46", "remaining_time": "0:26:30"}
145
+ {"current_steps": 725, "total_steps": 1188, "loss": 0.1606, "lr": 1.5885853813559392e-05, "epoch": 3.6616161616161618, "percentage": 61.03, "elapsed_time": "0:41:00", "remaining_time": "0:26:11"}
146
+ {"current_steps": 730, "total_steps": 1188, "loss": 0.1639, "lr": 1.5598711941218265e-05, "epoch": 3.686868686868687, "percentage": 61.45, "elapsed_time": "0:41:14", "remaining_time": "0:25:52"}
147
+ {"current_steps": 735, "total_steps": 1188, "loss": 0.1686, "lr": 1.531252035923541e-05, "epoch": 3.712121212121212, "percentage": 61.87, "elapsed_time": "0:41:28", "remaining_time": "0:25:34"}
148
+ {"current_steps": 740, "total_steps": 1188, "loss": 0.1652, "lr": 1.5027340859771972e-05, "epoch": 3.7373737373737375, "percentage": 62.29, "elapsed_time": "0:41:42", "remaining_time": "0:25:15"}
149
+ {"current_steps": 745, "total_steps": 1188, "loss": 0.1628, "lr": 1.4743235016468474e-05, "epoch": 3.7626262626262625, "percentage": 62.71, "elapsed_time": "0:41:56", "remaining_time": "0:24:56"}
150
+ {"current_steps": 750, "total_steps": 1188, "loss": 0.1614, "lr": 1.4460264171150296e-05, "epoch": 3.787878787878788, "percentage": 63.13, "elapsed_time": "0:42:09", "remaining_time": "0:24:37"}
151
+ {"current_steps": 755, "total_steps": 1188, "loss": 0.1659, "lr": 1.4178489420583297e-05, "epoch": 3.813131313131313, "percentage": 63.55, "elapsed_time": "0:42:23", "remaining_time": "0:24:18"}
152
+ {"current_steps": 760, "total_steps": 1188, "loss": 0.1595, "lr": 1.3897971603282278e-05, "epoch": 3.8383838383838382, "percentage": 63.97, "elapsed_time": "0:42:35", "remaining_time": "0:23:59"}
153
+ {"current_steps": 765, "total_steps": 1188, "loss": 0.166, "lr": 1.36187712863752e-05, "epoch": 3.8636363636363638, "percentage": 64.39, "elapsed_time": "0:42:49", "remaining_time": "0:23:40"}
154
+ {"current_steps": 770, "total_steps": 1188, "loss": 0.1599, "lr": 1.3340948752526069e-05, "epoch": 3.888888888888889, "percentage": 64.81, "elapsed_time": "0:43:02", "remaining_time": "0:23:22"}
155
+ {"current_steps": 775, "total_steps": 1188, "loss": 0.1596, "lr": 1.3064563986919142e-05, "epoch": 3.9141414141414144, "percentage": 65.24, "elapsed_time": "0:43:15", "remaining_time": "0:23:03"}
156
+ {"current_steps": 780, "total_steps": 1188, "loss": 0.1628, "lr": 1.278967666430745e-05, "epoch": 3.9393939393939394, "percentage": 65.66, "elapsed_time": "0:43:29", "remaining_time": "0:22:45"}
157
+ {"current_steps": 785, "total_steps": 1188, "loss": 0.1641, "lr": 1.2516346136128318e-05, "epoch": 3.9646464646464645, "percentage": 66.08, "elapsed_time": "0:43:43", "remaining_time": "0:22:27"}
158
+ {"current_steps": 790, "total_steps": 1188, "loss": 0.1634, "lr": 1.2244631417688632e-05, "epoch": 3.98989898989899, "percentage": 66.5, "elapsed_time": "0:43:58", "remaining_time": "0:22:09"}
159
+ {"current_steps": 795, "total_steps": 1188, "loss": 0.1509, "lr": 1.197459117542278e-05, "epoch": 4.015151515151516, "percentage": 66.92, "elapsed_time": "0:44:12", "remaining_time": "0:21:51"}
160
+ {"current_steps": 800, "total_steps": 1188, "loss": 0.1365, "lr": 1.170628371422587e-05, "epoch": 4.040404040404041, "percentage": 67.34, "elapsed_time": "0:44:25", "remaining_time": "0:21:32"}
161
+ {"current_steps": 805, "total_steps": 1188, "loss": 0.1454, "lr": 1.1439766964864995e-05, "epoch": 4.065656565656566, "percentage": 67.76, "elapsed_time": "0:45:55", "remaining_time": "0:21:51"}
162
+ {"current_steps": 810, "total_steps": 1188, "loss": 0.1405, "lr": 1.117509847147128e-05, "epoch": 4.090909090909091, "percentage": 68.18, "elapsed_time": "0:46:08", "remaining_time": "0:21:32"}
163
+ {"current_steps": 815, "total_steps": 1188, "loss": 0.138, "lr": 1.0912335379115469e-05, "epoch": 4.116161616161616, "percentage": 68.6, "elapsed_time": "0:46:22", "remaining_time": "0:21:13"}
164
+ {"current_steps": 820, "total_steps": 1188, "loss": 0.1441, "lr": 1.0651534421469569e-05, "epoch": 4.141414141414141, "percentage": 69.02, "elapsed_time": "0:46:33", "remaining_time": "0:20:53"}
165
+ {"current_steps": 825, "total_steps": 1188, "loss": 0.1423, "lr": 1.0392751908557406e-05, "epoch": 4.166666666666667, "percentage": 69.44, "elapsed_time": "0:46:47", "remaining_time": "0:20:35"}
166
+ {"current_steps": 830, "total_steps": 1188, "loss": 0.1385, "lr": 1.013604371459663e-05, "epoch": 4.191919191919192, "percentage": 69.87, "elapsed_time": "0:47:01", "remaining_time": "0:20:16"}
167
+ {"current_steps": 835, "total_steps": 1188, "loss": 0.1434, "lr": 9.881465265934802e-06, "epoch": 4.217171717171717, "percentage": 70.29, "elapsed_time": "0:47:14", "remaining_time": "0:19:58"}
168
+ {"current_steps": 840, "total_steps": 1188, "loss": 0.1395, "lr": 9.62907152908215e-06, "epoch": 4.242424242424242, "percentage": 70.71, "elapsed_time": "0:47:27", "remaining_time": "0:19:39"}
169
+ {"current_steps": 845, "total_steps": 1188, "loss": 0.145, "lr": 9.378916998843716e-06, "epoch": 4.267676767676767, "percentage": 71.13, "elapsed_time": "0:47:39", "remaining_time": "0:19:20"}
170
+ {"current_steps": 850, "total_steps": 1188, "loss": 0.1407, "lr": 9.13105568655322e-06, "epoch": 4.292929292929293, "percentage": 71.55, "elapsed_time": "0:47:53", "remaining_time": "0:19:02"}
171
+ {"current_steps": 855, "total_steps": 1188, "loss": 0.1433, "lr": 8.885541108411386e-06, "epoch": 4.318181818181818, "percentage": 71.97, "elapsed_time": "0:48:05", "remaining_time": "0:18:43"}
172
+ {"current_steps": 860, "total_steps": 1188, "loss": 0.146, "lr": 8.642426273931202e-06, "epoch": 4.343434343434343, "percentage": 72.39, "elapsed_time": "0:48:19", "remaining_time": "0:18:25"}
173
+ {"current_steps": 865, "total_steps": 1188, "loss": 0.1386, "lr": 8.40176367449247e-06, "epoch": 4.3686868686868685, "percentage": 72.81, "elapsed_time": "0:48:32", "remaining_time": "0:18:07"}
174
+ {"current_steps": 870, "total_steps": 1188, "loss": 0.1461, "lr": 8.16360527200833e-06, "epoch": 4.393939393939394, "percentage": 73.23, "elapsed_time": "0:48:45", "remaining_time": "0:17:49"}
175
+ {"current_steps": 875, "total_steps": 1188, "loss": 0.1456, "lr": 7.928002487706077e-06, "epoch": 4.41919191919192, "percentage": 73.65, "elapsed_time": "0:48:57", "remaining_time": "0:17:30"}
176
+ {"current_steps": 880, "total_steps": 1188, "loss": 0.1423, "lr": 7.69500619102469e-06, "epoch": 4.444444444444445, "percentage": 74.07, "elapsed_time": "0:49:10", "remaining_time": "0:17:12"}
177
+ {"current_steps": 885, "total_steps": 1188, "loss": 0.1453, "lr": 7.464666688631497e-06, "epoch": 4.46969696969697, "percentage": 74.49, "elapsed_time": "0:49:23", "remaining_time": "0:16:54"}
178
+ {"current_steps": 890, "total_steps": 1188, "loss": 0.1366, "lr": 7.237033713560415e-06, "epoch": 4.494949494949495, "percentage": 74.92, "elapsed_time": "0:49:36", "remaining_time": "0:16:36"}
179
+ {"current_steps": 895, "total_steps": 1188, "loss": 0.1416, "lr": 7.01215641447395e-06, "epoch": 4.52020202020202, "percentage": 75.34, "elapsed_time": "0:49:48", "remaining_time": "0:16:18"}
180
+ {"current_steps": 900, "total_steps": 1188, "loss": 0.144, "lr": 6.790083345051457e-06, "epoch": 4.545454545454545, "percentage": 75.76, "elapsed_time": "0:50:00", "remaining_time": "0:16:00"}
181
+ {"current_steps": 905, "total_steps": 1188, "loss": 0.142, "lr": 6.570862453505793e-06, "epoch": 4.570707070707071, "percentage": 76.18, "elapsed_time": "0:50:53", "remaining_time": "0:15:54"}
182
+ {"current_steps": 910, "total_steps": 1188, "loss": 0.1398, "lr": 6.35454107223074e-06, "epoch": 4.595959595959596, "percentage": 76.6, "elapsed_time": "0:51:07", "remaining_time": "0:15:37"}
183
+ {"current_steps": 915, "total_steps": 1188, "loss": 0.1419, "lr": 6.141165907581395e-06, "epoch": 4.621212121212121, "percentage": 77.02, "elapsed_time": "0:51:21", "remaining_time": "0:15:19"}
184
+ {"current_steps": 920, "total_steps": 1188, "loss": 0.138, "lr": 5.9307830297896755e-06, "epoch": 4.646464646464646, "percentage": 77.44, "elapsed_time": "0:51:34", "remaining_time": "0:15:01"}
185
+ {"current_steps": 925, "total_steps": 1188, "loss": 0.1378, "lr": 5.723437863017256e-06, "epoch": 4.671717171717171, "percentage": 77.86, "elapsed_time": "0:51:48", "remaining_time": "0:14:43"}
186
+ {"current_steps": 930, "total_steps": 1188, "loss": 0.138, "lr": 5.519175175547919e-06, "epoch": 4.696969696969697, "percentage": 78.28, "elapsed_time": "0:52:00", "remaining_time": "0:14:25"}
187
+ {"current_steps": 935, "total_steps": 1188, "loss": 0.1466, "lr": 5.318039070121557e-06, "epoch": 4.722222222222222, "percentage": 78.7, "elapsed_time": "0:52:14", "remaining_time": "0:14:08"}
188
+ {"current_steps": 940, "total_steps": 1188, "loss": 0.1371, "lr": 5.120072974411863e-06, "epoch": 4.747474747474747, "percentage": 79.12, "elapsed_time": "0:52:27", "remaining_time": "0:13:50"}
189
+ {"current_steps": 945, "total_steps": 1188, "loss": 0.1408, "lr": 4.92531963164981e-06, "epoch": 4.7727272727272725, "percentage": 79.55, "elapsed_time": "0:52:41", "remaining_time": "0:13:33"}
190
+ {"current_steps": 950, "total_steps": 1188, "loss": 0.1377, "lr": 4.733821091394841e-06, "epoch": 4.797979797979798, "percentage": 79.97, "elapsed_time": "0:52:54", "remaining_time": "0:13:15"}
191
+ {"current_steps": 955, "total_steps": 1188, "loss": 0.1341, "lr": 4.5456187004558806e-06, "epoch": 4.8232323232323235, "percentage": 80.39, "elapsed_time": "0:53:08", "remaining_time": "0:12:57"}
192
+ {"current_steps": 960, "total_steps": 1188, "loss": 0.1415, "lr": 4.360753093964094e-06, "epoch": 4.848484848484849, "percentage": 80.81, "elapsed_time": "0:53:22", "remaining_time": "0:12:40"}
193
+ {"current_steps": 965, "total_steps": 1188, "loss": 0.1474, "lr": 4.179264186599239e-06, "epoch": 4.873737373737374, "percentage": 81.23, "elapsed_time": "0:53:36", "remaining_time": "0:12:23"}
194
+ {"current_steps": 970, "total_steps": 1188, "loss": 0.1362, "lr": 4.001191163971645e-06, "epoch": 4.898989898989899, "percentage": 81.65, "elapsed_time": "0:53:49", "remaining_time": "0:12:05"}
195
+ {"current_steps": 975, "total_steps": 1188, "loss": 0.1436, "lr": 3.826572474161565e-06, "epoch": 4.924242424242424, "percentage": 82.07, "elapsed_time": "0:54:02", "remaining_time": "0:11:48"}
196
+ {"current_steps": 980, "total_steps": 1188, "loss": 0.1378, "lr": 3.65544581941776e-06, "epoch": 4.94949494949495, "percentage": 82.49, "elapsed_time": "0:54:14", "remaining_time": "0:11:30"}
197
+ {"current_steps": 985, "total_steps": 1188, "loss": 0.1428, "lr": 3.487848148017161e-06, "epoch": 4.974747474747475, "percentage": 82.91, "elapsed_time": "0:54:28", "remaining_time": "0:11:13"}
198
+ {"current_steps": 990, "total_steps": 1188, "loss": 0.1371, "lr": 3.3238156462872937e-06, "epoch": 5.0, "percentage": 83.33, "elapsed_time": "0:54:41", "remaining_time": "0:10:56"}
199
+ {"current_steps": 995, "total_steps": 1188, "loss": 0.1309, "lr": 3.1633837307932037e-06, "epoch": 5.025252525252525, "percentage": 83.75, "elapsed_time": "0:54:54", "remaining_time": "0:10:39"}
200
+ {"current_steps": 1000, "total_steps": 1188, "loss": 0.1292, "lr": 3.0065870406906094e-06, "epoch": 5.05050505050505, "percentage": 84.18, "elapsed_time": "0:55:08", "remaining_time": "0:10:22"}
201
+ {"current_steps": 1005, "total_steps": 1188, "loss": 0.1253, "lr": 2.8534594302469142e-06, "epoch": 5.075757575757576, "percentage": 84.6, "elapsed_time": "0:56:20", "remaining_time": "0:10:15"}
202
+ {"current_steps": 1010, "total_steps": 1188, "loss": 0.1276, "lr": 2.7040339615316315e-06, "epoch": 5.101010101010101, "percentage": 85.02, "elapsed_time": "0:56:33", "remaining_time": "0:09:58"}
203
+ {"current_steps": 1015, "total_steps": 1188, "loss": 0.1323, "lr": 2.5583428972779236e-06, "epoch": 5.126262626262626, "percentage": 85.44, "elapsed_time": "0:56:47", "remaining_time": "0:09:40"}
204
+ {"current_steps": 1020, "total_steps": 1188, "loss": 0.1323, "lr": 2.4164176939166883e-06, "epoch": 5.151515151515151, "percentage": 85.86, "elapsed_time": "0:57:00", "remaining_time": "0:09:23"}
205
+ {"current_steps": 1025, "total_steps": 1188, "loss": 0.1353, "lr": 2.278288994784723e-06, "epoch": 5.1767676767676765, "percentage": 86.28, "elapsed_time": "0:57:14", "remaining_time": "0:09:06"}
206
+ {"current_steps": 1030, "total_steps": 1188, "loss": 0.1335, "lr": 2.143986623508478e-06, "epoch": 5.202020202020202, "percentage": 86.7, "elapsed_time": "0:57:26", "remaining_time": "0:08:48"}
207
+ {"current_steps": 1035, "total_steps": 1188, "loss": 0.1302, "lr": 2.0135395775647916e-06, "epoch": 5.2272727272727275, "percentage": 87.12, "elapsed_time": "0:57:41", "remaining_time": "0:08:31"}
208
+ {"current_steps": 1040, "total_steps": 1188, "loss": 0.1292, "lr": 1.8869760220199707e-06, "epoch": 5.252525252525253, "percentage": 87.54, "elapsed_time": "0:57:53", "remaining_time": "0:08:14"}
209
+ {"current_steps": 1045, "total_steps": 1188, "loss": 0.1331, "lr": 1.7643232834486347e-06, "epoch": 5.277777777777778, "percentage": 87.96, "elapsed_time": "0:58:06", "remaining_time": "0:07:57"}
210
+ {"current_steps": 1050, "total_steps": 1188, "loss": 0.1284, "lr": 1.6456078440335699e-06, "epoch": 5.303030303030303, "percentage": 88.38, "elapsed_time": "0:58:19", "remaining_time": "0:07:39"}
211
+ {"current_steps": 1055, "total_steps": 1188, "loss": 0.1252, "lr": 1.530855335847916e-06, "epoch": 5.328282828282829, "percentage": 88.8, "elapsed_time": "0:58:31", "remaining_time": "0:07:22"}
212
+ {"current_steps": 1060, "total_steps": 1188, "loss": 0.1281, "lr": 1.4200905353209127e-06, "epoch": 5.353535353535354, "percentage": 89.23, "elapsed_time": "0:58:44", "remaining_time": "0:07:05"}
213
+ {"current_steps": 1065, "total_steps": 1188, "loss": 0.1294, "lr": 1.3133373578883557e-06, "epoch": 5.378787878787879, "percentage": 89.65, "elapsed_time": "0:58:56", "remaining_time": "0:06:48"}
214
+ {"current_steps": 1070, "total_steps": 1188, "loss": 0.1299, "lr": 1.210618852828962e-06, "epoch": 5.404040404040404, "percentage": 90.07, "elapsed_time": "0:59:10", "remaining_time": "0:06:31"}
215
+ {"current_steps": 1075, "total_steps": 1188, "loss": 0.1277, "lr": 1.111957198287792e-06, "epoch": 5.429292929292929, "percentage": 90.49, "elapsed_time": "0:59:23", "remaining_time": "0:06:14"}
216
+ {"current_steps": 1080, "total_steps": 1188, "loss": 0.1263, "lr": 1.0173736964876867e-06, "epoch": 5.454545454545454, "percentage": 90.91, "elapsed_time": "0:59:36", "remaining_time": "0:05:57"}
217
+ {"current_steps": 1085, "total_steps": 1188, "loss": 0.1284, "lr": 9.268887691298878e-07, "epoch": 5.47979797979798, "percentage": 91.33, "elapsed_time": "0:59:50", "remaining_time": "0:05:40"}
218
+ {"current_steps": 1090, "total_steps": 1188, "loss": 0.1277, "lr": 8.405219529847453e-07, "epoch": 5.505050505050505, "percentage": 91.75, "elapsed_time": "1:00:03", "remaining_time": "0:05:23"}
219
+ {"current_steps": 1095, "total_steps": 1188, "loss": 0.1345, "lr": 7.582918956734909e-07, "epoch": 5.53030303030303, "percentage": 92.17, "elapsed_time": "1:00:16", "remaining_time": "0:05:07"}
220
+ {"current_steps": 1100, "total_steps": 1188, "loss": 0.1265, "lr": 6.802163516419979e-07, "epoch": 5.555555555555555, "percentage": 92.59, "elapsed_time": "1:00:30", "remaining_time": "0:04:50"}
221
+ {"current_steps": 1105, "total_steps": 1188, "loss": 0.1307, "lr": 6.063121783273907e-07, "epoch": 5.58080808080808, "percentage": 93.01, "elapsed_time": "1:01:44", "remaining_time": "0:04:38"}
222
+ {"current_steps": 1110, "total_steps": 1188, "loss": 0.1277, "lr": 5.365953325183216e-07, "epoch": 5.606060606060606, "percentage": 93.43, "elapsed_time": "1:01:57", "remaining_time": "0:04:21"}
223
+ {"current_steps": 1115, "total_steps": 1188, "loss": 0.1323, "lr": 4.7108086690970113e-07, "epoch": 5.6313131313131315, "percentage": 93.86, "elapsed_time": "1:02:11", "remaining_time": "0:04:04"}
224
+ {"current_steps": 1120, "total_steps": 1188, "loss": 0.1235, "lr": 4.0978292685265765e-07, "epoch": 5.656565656565657, "percentage": 94.28, "elapsed_time": "1:02:24", "remaining_time": "0:03:47"}
225
+ {"current_steps": 1125, "total_steps": 1188, "loss": 0.1302, "lr": 3.5271474730037557e-07, "epoch": 5.681818181818182, "percentage": 94.7, "elapsed_time": "1:02:37", "remaining_time": "0:03:30"}
226
+ {"current_steps": 1130, "total_steps": 1188, "loss": 0.127, "lr": 2.998886499505171e-07, "epoch": 5.707070707070707, "percentage": 95.12, "elapsed_time": "1:02:51", "remaining_time": "0:03:13"}
227
+ {"current_steps": 1135, "total_steps": 1188, "loss": 0.1345, "lr": 2.513160405848303e-07, "epoch": 5.732323232323233, "percentage": 95.54, "elapsed_time": "1:03:04", "remaining_time": "0:02:56"}
228
+ {"current_steps": 1140, "total_steps": 1188, "loss": 0.1328, "lr": 2.0700740660648312e-07, "epoch": 5.757575757575758, "percentage": 95.96, "elapsed_time": "1:03:16", "remaining_time": "0:02:39"}
229
+ {"current_steps": 1145, "total_steps": 1188, "loss": 0.1284, "lr": 1.669723147757263e-07, "epoch": 5.782828282828283, "percentage": 96.38, "elapsed_time": "1:03:29", "remaining_time": "0:02:23"}
230
+ {"current_steps": 1150, "total_steps": 1188, "loss": 0.1331, "lr": 1.312194091443142e-07, "epoch": 5.808080808080808, "percentage": 96.8, "elapsed_time": "1:03:42", "remaining_time": "0:02:06"}
231
+ {"current_steps": 1155, "total_steps": 1188, "loss": 0.1306, "lr": 9.975640918915119e-08, "epoch": 5.833333333333333, "percentage": 97.22, "elapsed_time": "1:03:55", "remaining_time": "0:01:49"}
232
+ {"current_steps": 1160, "total_steps": 1188, "loss": 0.1304, "lr": 7.259010814555378e-08, "epoch": 5.858585858585858, "percentage": 97.64, "elapsed_time": "1:04:08", "remaining_time": "0:01:32"}
233
+ {"current_steps": 1165, "total_steps": 1188, "loss": 0.1292, "lr": 4.972637154052606e-08, "epoch": 5.883838383838384, "percentage": 98.06, "elapsed_time": "1:04:21", "remaining_time": "0:01:16"}
234
+ {"current_steps": 1170, "total_steps": 1188, "loss": 0.1341, "lr": 3.117013592631501e-08, "epoch": 5.909090909090909, "percentage": 98.48, "elapsed_time": "1:04:34", "remaining_time": "0:00:59"}
235
+ {"current_steps": 1175, "total_steps": 1188, "loss": 0.1267, "lr": 1.6925407814545325e-08, "epoch": 5.934343434343434, "percentage": 98.91, "elapsed_time": "1:04:47", "remaining_time": "0:00:43"}
236
+ {"current_steps": 1180, "total_steps": 1188, "loss": 0.1297, "lr": 6.995262811178016e-09, "epoch": 5.959595959595959, "percentage": 99.33, "elapsed_time": "1:05:01", "remaining_time": "0:00:26"}
237
+ {"current_steps": 1185, "total_steps": 1188, "loss": 0.1304, "lr": 1.3818449524416467e-09, "epoch": 5.984848484848484, "percentage": 99.75, "elapsed_time": "1:05:14", "remaining_time": "0:00:09"}
238
+ {"current_steps": 1188, "total_steps": 1188, "epoch": 6.0, "percentage": 100.0, "elapsed_time": "1:05:54", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,2654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 6.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1188,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.025252525252525252,
14
+ "grad_norm": 20.169370087659352,
15
+ "learning_rate": 1.3445378151260504e-06,
16
+ "loss": 1.0237,
17
+ "loss_nan_ranks": 0,
18
+ "loss_rank_avg": 1.0157525539398193,
19
+ "step": 5,
20
+ "valid_targets_mean": 1187.8,
21
+ "valid_targets_min": 751
22
+ },
23
+ {
24
+ "epoch": 0.050505050505050504,
25
+ "grad_norm": 16.675857749316165,
26
+ "learning_rate": 3.0252100840336137e-06,
27
+ "loss": 0.9775,
28
+ "loss_nan_ranks": 0,
29
+ "loss_rank_avg": 0.9797632694244385,
30
+ "step": 10,
31
+ "valid_targets_mean": 1131.0,
32
+ "valid_targets_min": 704
33
+ },
34
+ {
35
+ "epoch": 0.07575757575757576,
36
+ "grad_norm": 8.704541263565613,
37
+ "learning_rate": 4.705882352941177e-06,
38
+ "loss": 0.8599,
39
+ "loss_nan_ranks": 0,
40
+ "loss_rank_avg": 0.7952851057052612,
41
+ "step": 15,
42
+ "valid_targets_mean": 1290.6,
43
+ "valid_targets_min": 687
44
+ },
45
+ {
46
+ "epoch": 0.10101010101010101,
47
+ "grad_norm": 5.301964048196616,
48
+ "learning_rate": 6.386554621848739e-06,
49
+ "loss": 0.7379,
50
+ "loss_nan_ranks": 0,
51
+ "loss_rank_avg": 0.6894418001174927,
52
+ "step": 20,
53
+ "valid_targets_mean": 1500.8,
54
+ "valid_targets_min": 659
55
+ },
56
+ {
57
+ "epoch": 0.12626262626262627,
58
+ "grad_norm": 2.8651616903258805,
59
+ "learning_rate": 8.067226890756303e-06,
60
+ "loss": 0.6167,
61
+ "loss_nan_ranks": 0,
62
+ "loss_rank_avg": 0.5881441831588745,
63
+ "step": 25,
64
+ "valid_targets_mean": 1226.6,
65
+ "valid_targets_min": 693
66
+ },
67
+ {
68
+ "epoch": 0.15151515151515152,
69
+ "grad_norm": 1.9308802697891598,
70
+ "learning_rate": 9.747899159663867e-06,
71
+ "loss": 0.5299,
72
+ "loss_nan_ranks": 0,
73
+ "loss_rank_avg": 0.5140570998191833,
74
+ "step": 30,
75
+ "valid_targets_mean": 1314.4,
76
+ "valid_targets_min": 532
77
+ },
78
+ {
79
+ "epoch": 0.17676767676767677,
80
+ "grad_norm": 1.6911984939158233,
81
+ "learning_rate": 1.1428571428571429e-05,
82
+ "loss": 0.4981,
83
+ "loss_nan_ranks": 0,
84
+ "loss_rank_avg": 0.49357980489730835,
85
+ "step": 35,
86
+ "valid_targets_mean": 1118.2,
87
+ "valid_targets_min": 680
88
+ },
89
+ {
90
+ "epoch": 0.20202020202020202,
91
+ "grad_norm": 1.492452782606785,
92
+ "learning_rate": 1.3109243697478993e-05,
93
+ "loss": 0.4407,
94
+ "loss_nan_ranks": 0,
95
+ "loss_rank_avg": 0.4345753788948059,
96
+ "step": 40,
97
+ "valid_targets_mean": 1273.3,
98
+ "valid_targets_min": 629
99
+ },
100
+ {
101
+ "epoch": 0.22727272727272727,
102
+ "grad_norm": 1.500112384532108,
103
+ "learning_rate": 1.4789915966386557e-05,
104
+ "loss": 0.414,
105
+ "loss_nan_ranks": 0,
106
+ "loss_rank_avg": 0.3712472915649414,
107
+ "step": 45,
108
+ "valid_targets_mean": 1263.1,
109
+ "valid_targets_min": 620
110
+ },
111
+ {
112
+ "epoch": 0.25252525252525254,
113
+ "grad_norm": 1.0754446138300695,
114
+ "learning_rate": 1.647058823529412e-05,
115
+ "loss": 0.3676,
116
+ "loss_nan_ranks": 0,
117
+ "loss_rank_avg": 0.35347896814346313,
118
+ "step": 50,
119
+ "valid_targets_mean": 1464.4,
120
+ "valid_targets_min": 659
121
+ },
122
+ {
123
+ "epoch": 0.2777777777777778,
124
+ "grad_norm": 1.1618002004795733,
125
+ "learning_rate": 1.8151260504201682e-05,
126
+ "loss": 0.3464,
127
+ "loss_nan_ranks": 0,
128
+ "loss_rank_avg": 0.3541208803653717,
129
+ "step": 55,
130
+ "valid_targets_mean": 1315.9,
131
+ "valid_targets_min": 589
132
+ },
133
+ {
134
+ "epoch": 0.30303030303030304,
135
+ "grad_norm": 1.0883564166727384,
136
+ "learning_rate": 1.9831932773109244e-05,
137
+ "loss": 0.338,
138
+ "loss_nan_ranks": 0,
139
+ "loss_rank_avg": 0.3574886620044708,
140
+ "step": 60,
141
+ "valid_targets_mean": 1405.6,
142
+ "valid_targets_min": 758
143
+ },
144
+ {
145
+ "epoch": 0.3282828282828283,
146
+ "grad_norm": 1.0137280896162002,
147
+ "learning_rate": 2.1512605042016807e-05,
148
+ "loss": 0.3275,
149
+ "loss_nan_ranks": 0,
150
+ "loss_rank_avg": 0.3133365511894226,
151
+ "step": 65,
152
+ "valid_targets_mean": 1416.2,
153
+ "valid_targets_min": 753
154
+ },
155
+ {
156
+ "epoch": 0.35353535353535354,
157
+ "grad_norm": 1.0379235855718125,
158
+ "learning_rate": 2.3193277310924373e-05,
159
+ "loss": 0.2979,
160
+ "loss_nan_ranks": 0,
161
+ "loss_rank_avg": 0.3172857463359833,
162
+ "step": 70,
163
+ "valid_targets_mean": 1387.2,
164
+ "valid_targets_min": 615
165
+ },
166
+ {
167
+ "epoch": 0.3787878787878788,
168
+ "grad_norm": 1.0676260962261874,
169
+ "learning_rate": 2.4873949579831935e-05,
170
+ "loss": 0.2957,
171
+ "loss_nan_ranks": 0,
172
+ "loss_rank_avg": 0.2801944613456726,
173
+ "step": 75,
174
+ "valid_targets_mean": 1275.1,
175
+ "valid_targets_min": 831
176
+ },
177
+ {
178
+ "epoch": 0.40404040404040403,
179
+ "grad_norm": 1.0045547818037883,
180
+ "learning_rate": 2.6554621848739497e-05,
181
+ "loss": 0.3063,
182
+ "loss_nan_ranks": 0,
183
+ "loss_rank_avg": 0.3241528272628784,
184
+ "step": 80,
185
+ "valid_targets_mean": 1368.9,
186
+ "valid_targets_min": 846
187
+ },
188
+ {
189
+ "epoch": 0.4292929292929293,
190
+ "grad_norm": 0.9923280106185243,
191
+ "learning_rate": 2.8235294117647063e-05,
192
+ "loss": 0.2973,
193
+ "loss_nan_ranks": 0,
194
+ "loss_rank_avg": 0.2852829396724701,
195
+ "step": 85,
196
+ "valid_targets_mean": 1345.0,
197
+ "valid_targets_min": 735
198
+ },
199
+ {
200
+ "epoch": 0.45454545454545453,
201
+ "grad_norm": 0.9646459041819732,
202
+ "learning_rate": 2.9915966386554626e-05,
203
+ "loss": 0.2854,
204
+ "loss_nan_ranks": 0,
205
+ "loss_rank_avg": 0.2792156934738159,
206
+ "step": 90,
207
+ "valid_targets_mean": 1343.4,
208
+ "valid_targets_min": 738
209
+ },
210
+ {
211
+ "epoch": 0.4797979797979798,
212
+ "grad_norm": 1.052289746435847,
213
+ "learning_rate": 3.159663865546219e-05,
214
+ "loss": 0.2698,
215
+ "loss_nan_ranks": 0,
216
+ "loss_rank_avg": 0.26299500465393066,
217
+ "step": 95,
218
+ "valid_targets_mean": 1293.9,
219
+ "valid_targets_min": 773
220
+ },
221
+ {
222
+ "epoch": 0.5050505050505051,
223
+ "grad_norm": 1.1622301238597719,
224
+ "learning_rate": 3.3277310924369754e-05,
225
+ "loss": 0.2831,
226
+ "loss_nan_ranks": 0,
227
+ "loss_rank_avg": 0.311882883310318,
228
+ "step": 100,
229
+ "valid_targets_mean": 1408.7,
230
+ "valid_targets_min": 738
231
+ },
232
+ {
233
+ "epoch": 0.5303030303030303,
234
+ "grad_norm": 1.1114150871374988,
235
+ "learning_rate": 3.495798319327731e-05,
236
+ "loss": 0.2821,
237
+ "loss_nan_ranks": 0,
238
+ "loss_rank_avg": 0.275522381067276,
239
+ "step": 105,
240
+ "valid_targets_mean": 1212.6,
241
+ "valid_targets_min": 618
242
+ },
243
+ {
244
+ "epoch": 0.5555555555555556,
245
+ "grad_norm": 0.9706083743298367,
246
+ "learning_rate": 3.663865546218488e-05,
247
+ "loss": 0.2799,
248
+ "loss_nan_ranks": 0,
249
+ "loss_rank_avg": 0.2872806489467621,
250
+ "step": 110,
251
+ "valid_targets_mean": 1481.1,
252
+ "valid_targets_min": 918
253
+ },
254
+ {
255
+ "epoch": 0.5808080808080808,
256
+ "grad_norm": 1.1271397120654052,
257
+ "learning_rate": 3.8319327731092444e-05,
258
+ "loss": 0.279,
259
+ "loss_nan_ranks": 0,
260
+ "loss_rank_avg": 0.2863199710845947,
261
+ "step": 115,
262
+ "valid_targets_mean": 1283.5,
263
+ "valid_targets_min": 574
264
+ },
265
+ {
266
+ "epoch": 0.6060606060606061,
267
+ "grad_norm": 1.0471857507926354,
268
+ "learning_rate": 4e-05,
269
+ "loss": 0.2694,
270
+ "loss_nan_ranks": 0,
271
+ "loss_rank_avg": 0.29653966426849365,
272
+ "step": 120,
273
+ "valid_targets_mean": 1340.8,
274
+ "valid_targets_min": 756
275
+ },
276
+ {
277
+ "epoch": 0.6313131313131313,
278
+ "grad_norm": 1.1072262420802004,
279
+ "learning_rate": 3.999784088124753e-05,
280
+ "loss": 0.2793,
281
+ "loss_nan_ranks": 0,
282
+ "loss_rank_avg": 0.28706759214401245,
283
+ "step": 125,
284
+ "valid_targets_mean": 1297.0,
285
+ "valid_targets_min": 784
286
+ },
287
+ {
288
+ "epoch": 0.6565656565656566,
289
+ "grad_norm": 0.9900053971903253,
290
+ "learning_rate": 3.99913639911695e-05,
291
+ "loss": 0.2597,
292
+ "loss_nan_ranks": 0,
293
+ "loss_rank_avg": 0.2584039866924286,
294
+ "step": 130,
295
+ "valid_targets_mean": 1323.2,
296
+ "valid_targets_min": 618
297
+ },
298
+ {
299
+ "epoch": 0.6818181818181818,
300
+ "grad_norm": 0.9452591379939352,
301
+ "learning_rate": 3.998057072820338e-05,
302
+ "loss": 0.2705,
303
+ "loss_nan_ranks": 0,
304
+ "loss_rank_avg": 0.2791949212551117,
305
+ "step": 135,
306
+ "valid_targets_mean": 1479.4,
307
+ "valid_targets_min": 582
308
+ },
309
+ {
310
+ "epoch": 0.7070707070707071,
311
+ "grad_norm": 1.1345903226168657,
312
+ "learning_rate": 3.996546342274282e-05,
313
+ "loss": 0.2753,
314
+ "loss_nan_ranks": 0,
315
+ "loss_rank_avg": 0.270517498254776,
316
+ "step": 140,
317
+ "valid_targets_mean": 1323.2,
318
+ "valid_targets_min": 797
319
+ },
320
+ {
321
+ "epoch": 0.7323232323232324,
322
+ "grad_norm": 1.0964303784318785,
323
+ "learning_rate": 3.9946045336634485e-05,
324
+ "loss": 0.2598,
325
+ "loss_nan_ranks": 0,
326
+ "loss_rank_avg": 0.25797197222709656,
327
+ "step": 145,
328
+ "valid_targets_mean": 1143.2,
329
+ "valid_targets_min": 632
330
+ },
331
+ {
332
+ "epoch": 0.7575757575757576,
333
+ "grad_norm": 1.0480327343570168,
334
+ "learning_rate": 3.9922320662473755e-05,
335
+ "loss": 0.2704,
336
+ "loss_nan_ranks": 0,
337
+ "loss_rank_avg": 0.2590729296207428,
338
+ "step": 150,
339
+ "valid_targets_mean": 1109.0,
340
+ "valid_targets_min": 555
341
+ },
342
+ {
343
+ "epoch": 0.7828282828282829,
344
+ "grad_norm": 1.2063540633588965,
345
+ "learning_rate": 3.989429452269951e-05,
346
+ "loss": 0.2545,
347
+ "loss_nan_ranks": 0,
348
+ "loss_rank_avg": 0.24084429442882538,
349
+ "step": 155,
350
+ "valid_targets_mean": 1144.1,
351
+ "valid_targets_min": 685
352
+ },
353
+ {
354
+ "epoch": 0.8080808080808081,
355
+ "grad_norm": 1.1381600500975266,
356
+ "learning_rate": 3.986197296848816e-05,
357
+ "loss": 0.2619,
358
+ "loss_nan_ranks": 0,
359
+ "loss_rank_avg": 0.26328036189079285,
360
+ "step": 160,
361
+ "valid_targets_mean": 1215.6,
362
+ "valid_targets_min": 717
363
+ },
364
+ {
365
+ "epoch": 0.8333333333333334,
366
+ "grad_norm": 0.9369305409921032,
367
+ "learning_rate": 3.982536297844706e-05,
368
+ "loss": 0.2628,
369
+ "loss_nan_ranks": 0,
370
+ "loss_rank_avg": 0.25279849767684937,
371
+ "step": 165,
372
+ "valid_targets_mean": 1300.6,
373
+ "valid_targets_min": 679
374
+ },
375
+ {
376
+ "epoch": 0.8585858585858586,
377
+ "grad_norm": 0.8800799927020014,
378
+ "learning_rate": 3.978447245710784e-05,
379
+ "loss": 0.2588,
380
+ "loss_nan_ranks": 0,
381
+ "loss_rank_avg": 0.25856029987335205,
382
+ "step": 170,
383
+ "valid_targets_mean": 1579.3,
384
+ "valid_targets_min": 821
385
+ },
386
+ {
387
+ "epoch": 0.8838383838383839,
388
+ "grad_norm": 0.9268097877772618,
389
+ "learning_rate": 3.973931023321962e-05,
390
+ "loss": 0.2538,
391
+ "loss_nan_ranks": 0,
392
+ "loss_rank_avg": 0.24270369112491608,
393
+ "step": 175,
394
+ "valid_targets_mean": 1203.4,
395
+ "valid_targets_min": 684
396
+ },
397
+ {
398
+ "epoch": 0.9090909090909091,
399
+ "grad_norm": 0.9474619522436488,
400
+ "learning_rate": 3.9689886057842866e-05,
401
+ "loss": 0.2409,
402
+ "loss_nan_ranks": 0,
403
+ "loss_rank_avg": 0.25069937109947205,
404
+ "step": 180,
405
+ "valid_targets_mean": 1398.1,
406
+ "valid_targets_min": 711
407
+ },
408
+ {
409
+ "epoch": 0.9343434343434344,
410
+ "grad_norm": 1.034637445880305,
411
+ "learning_rate": 3.963621060224396e-05,
412
+ "loss": 0.2545,
413
+ "loss_nan_ranks": 0,
414
+ "loss_rank_avg": 0.24697241187095642,
415
+ "step": 185,
416
+ "valid_targets_mean": 1112.4,
417
+ "valid_targets_min": 582
418
+ },
419
+ {
420
+ "epoch": 0.9595959595959596,
421
+ "grad_norm": 0.9298977899777117,
422
+ "learning_rate": 3.957829545559118e-05,
423
+ "loss": 0.2553,
424
+ "loss_nan_ranks": 0,
425
+ "loss_rank_avg": 0.2632555365562439,
426
+ "step": 190,
427
+ "valid_targets_mean": 1315.6,
428
+ "valid_targets_min": 742
429
+ },
430
+ {
431
+ "epoch": 0.9848484848484849,
432
+ "grad_norm": 0.8977159403395447,
433
+ "learning_rate": 3.951615312245243e-05,
434
+ "loss": 0.2546,
435
+ "loss_nan_ranks": 0,
436
+ "loss_rank_avg": 0.2525337338447571,
437
+ "step": 195,
438
+ "valid_targets_mean": 1346.2,
439
+ "valid_targets_min": 774
440
+ },
441
+ {
442
+ "epoch": 1.0101010101010102,
443
+ "grad_norm": 1.021475627213122,
444
+ "learning_rate": 3.9449797020095404e-05,
445
+ "loss": 0.2535,
446
+ "loss_nan_ranks": 0,
447
+ "loss_rank_avg": 0.2486366480588913,
448
+ "step": 200,
449
+ "valid_targets_mean": 1331.2,
450
+ "valid_targets_min": 680
451
+ },
452
+ {
453
+ "epoch": 1.0353535353535352,
454
+ "grad_norm": 1.0071426224742983,
455
+ "learning_rate": 3.937924147559059e-05,
456
+ "loss": 0.2277,
457
+ "loss_nan_ranks": 0,
458
+ "loss_rank_avg": 0.2366667240858078,
459
+ "step": 205,
460
+ "valid_targets_mean": 1303.9,
461
+ "valid_targets_min": 648
462
+ },
463
+ {
464
+ "epoch": 1.0606060606060606,
465
+ "grad_norm": 0.863916533755164,
466
+ "learning_rate": 3.9304501722717925e-05,
467
+ "loss": 0.2358,
468
+ "loss_nan_ranks": 0,
469
+ "loss_rank_avg": 0.22564607858657837,
470
+ "step": 210,
471
+ "valid_targets_mean": 1418.2,
472
+ "valid_targets_min": 1067
473
+ },
474
+ {
475
+ "epoch": 1.0858585858585859,
476
+ "grad_norm": 0.932084360690738,
477
+ "learning_rate": 3.922559389867758e-05,
478
+ "loss": 0.2333,
479
+ "loss_nan_ranks": 0,
480
+ "loss_rank_avg": 0.24001938104629517,
481
+ "step": 215,
482
+ "valid_targets_mean": 1553.2,
483
+ "valid_targets_min": 1094
484
+ },
485
+ {
486
+ "epoch": 1.1111111111111112,
487
+ "grad_norm": 0.9418003911519435,
488
+ "learning_rate": 3.914253504060585e-05,
489
+ "loss": 0.2289,
490
+ "loss_nan_ranks": 0,
491
+ "loss_rank_avg": 0.2039659321308136,
492
+ "step": 220,
493
+ "valid_targets_mean": 1241.6,
494
+ "valid_targets_min": 594
495
+ },
496
+ {
497
+ "epoch": 1.1363636363636362,
498
+ "grad_norm": 0.9424926403942966,
499
+ "learning_rate": 3.90553430818965e-05,
500
+ "loss": 0.225,
501
+ "loss_nan_ranks": 0,
502
+ "loss_rank_avg": 0.22819814085960388,
503
+ "step": 225,
504
+ "valid_targets_mean": 1457.8,
505
+ "valid_targets_min": 837
506
+ },
507
+ {
508
+ "epoch": 1.1616161616161615,
509
+ "grad_norm": 0.8372692329391297,
510
+ "learning_rate": 3.896403684832887e-05,
511
+ "loss": 0.23,
512
+ "loss_nan_ranks": 0,
513
+ "loss_rank_avg": 0.22280089557170868,
514
+ "step": 230,
515
+ "valid_targets_mean": 1298.8,
516
+ "valid_targets_min": 791
517
+ },
518
+ {
519
+ "epoch": 1.1868686868686869,
520
+ "grad_norm": 0.9634864270568145,
521
+ "learning_rate": 3.886863605400306e-05,
522
+ "loss": 0.2314,
523
+ "loss_nan_ranks": 0,
524
+ "loss_rank_avg": 0.24971917271614075,
525
+ "step": 235,
526
+ "valid_targets_mean": 1264.4,
527
+ "valid_targets_min": 700
528
+ },
529
+ {
530
+ "epoch": 1.2121212121212122,
531
+ "grad_norm": 0.9062647387121051,
532
+ "learning_rate": 3.876916129708347e-05,
533
+ "loss": 0.2231,
534
+ "loss_nan_ranks": 0,
535
+ "loss_rank_avg": 0.22650766372680664,
536
+ "step": 240,
537
+ "valid_targets_mean": 1429.9,
538
+ "valid_targets_min": 854
539
+ },
540
+ {
541
+ "epoch": 1.2373737373737375,
542
+ "grad_norm": 0.9679426116681048,
543
+ "learning_rate": 3.866563405535142e-05,
544
+ "loss": 0.2363,
545
+ "loss_nan_ranks": 0,
546
+ "loss_rank_avg": 0.22411300241947174,
547
+ "step": 245,
548
+ "valid_targets_mean": 1249.2,
549
+ "valid_targets_min": 550
550
+ },
551
+ {
552
+ "epoch": 1.2626262626262625,
553
+ "grad_norm": 1.5210230333522776,
554
+ "learning_rate": 3.855807668156779e-05,
555
+ "loss": 0.2187,
556
+ "loss_nan_ranks": 0,
557
+ "loss_rank_avg": 0.23495987057685852,
558
+ "step": 250,
559
+ "valid_targets_mean": 1417.0,
560
+ "valid_targets_min": 720
561
+ },
562
+ {
563
+ "epoch": 1.2878787878787878,
564
+ "grad_norm": 1.0154197633890338,
565
+ "learning_rate": 3.844651239864687e-05,
566
+ "loss": 0.2273,
567
+ "loss_nan_ranks": 0,
568
+ "loss_rank_avg": 0.22711053490638733,
569
+ "step": 255,
570
+ "valid_targets_mean": 1173.8,
571
+ "valid_targets_min": 776
572
+ },
573
+ {
574
+ "epoch": 1.3131313131313131,
575
+ "grad_norm": 0.9259864099946552,
576
+ "learning_rate": 3.8330965294642186e-05,
577
+ "loss": 0.227,
578
+ "loss_nan_ranks": 0,
579
+ "loss_rank_avg": 0.2507520318031311,
580
+ "step": 260,
581
+ "valid_targets_mean": 1453.4,
582
+ "valid_targets_min": 711
583
+ },
584
+ {
585
+ "epoch": 1.3383838383838385,
586
+ "grad_norm": 0.8413676808910939,
587
+ "learning_rate": 3.821146031754565e-05,
588
+ "loss": 0.2331,
589
+ "loss_nan_ranks": 0,
590
+ "loss_rank_avg": 0.235780268907547,
591
+ "step": 265,
592
+ "valid_targets_mean": 1401.7,
593
+ "valid_targets_min": 681
594
+ },
595
+ {
596
+ "epoch": 1.3636363636363638,
597
+ "grad_norm": 0.9876932162935549,
598
+ "learning_rate": 3.808802326990096e-05,
599
+ "loss": 0.2198,
600
+ "loss_nan_ranks": 0,
601
+ "loss_rank_avg": 0.21561799943447113,
602
+ "step": 270,
603
+ "valid_targets_mean": 1162.8,
604
+ "valid_targets_min": 605
605
+ },
606
+ {
607
+ "epoch": 1.3888888888888888,
608
+ "grad_norm": 1.1186277755619236,
609
+ "learning_rate": 3.7960680803232544e-05,
610
+ "loss": 0.2212,
611
+ "loss_nan_ranks": 0,
612
+ "loss_rank_avg": 0.2476837933063507,
613
+ "step": 275,
614
+ "valid_targets_mean": 1249.6,
615
+ "valid_targets_min": 644
616
+ },
617
+ {
618
+ "epoch": 1.4141414141414141,
619
+ "grad_norm": 0.9272391389819834,
620
+ "learning_rate": 3.782946041229119e-05,
621
+ "loss": 0.2239,
622
+ "loss_nan_ranks": 0,
623
+ "loss_rank_avg": 0.22536993026733398,
624
+ "step": 280,
625
+ "valid_targets_mean": 1286.6,
626
+ "valid_targets_min": 671
627
+ },
628
+ {
629
+ "epoch": 1.4393939393939394,
630
+ "grad_norm": 0.9507970993940866,
631
+ "learning_rate": 3.769439042911758e-05,
632
+ "loss": 0.234,
633
+ "loss_nan_ranks": 0,
634
+ "loss_rank_avg": 0.24206951260566711,
635
+ "step": 285,
636
+ "valid_targets_mean": 1180.0,
637
+ "valid_targets_min": 711
638
+ },
639
+ {
640
+ "epoch": 1.4646464646464645,
641
+ "grad_norm": 0.9667877163253019,
642
+ "learning_rate": 3.755550001692506e-05,
643
+ "loss": 0.2173,
644
+ "loss_nan_ranks": 0,
645
+ "loss_rank_avg": 0.21246126294136047,
646
+ "step": 290,
647
+ "valid_targets_mean": 1220.2,
648
+ "valid_targets_min": 770
649
+ },
650
+ {
651
+ "epoch": 1.4898989898989898,
652
+ "grad_norm": 0.9116365111809664,
653
+ "learning_rate": 3.7412819163802986e-05,
654
+ "loss": 0.2105,
655
+ "loss_nan_ranks": 0,
656
+ "loss_rank_avg": 0.21166947484016418,
657
+ "step": 295,
658
+ "valid_targets_mean": 1263.8,
659
+ "valid_targets_min": 762
660
+ },
661
+ {
662
+ "epoch": 1.5151515151515151,
663
+ "grad_norm": 0.8811979745066287,
664
+ "learning_rate": 3.726637867624191e-05,
665
+ "loss": 0.2359,
666
+ "loss_nan_ranks": 0,
667
+ "loss_rank_avg": 0.20912611484527588,
668
+ "step": 300,
669
+ "valid_targets_mean": 1320.4,
670
+ "valid_targets_min": 683
671
+ },
672
+ {
673
+ "epoch": 1.5404040404040404,
674
+ "grad_norm": 0.9596648525111655,
675
+ "learning_rate": 3.711621017248213e-05,
676
+ "loss": 0.2152,
677
+ "loss_nan_ranks": 0,
678
+ "loss_rank_avg": 0.20625783503055573,
679
+ "step": 305,
680
+ "valid_targets_mean": 1253.4,
681
+ "valid_targets_min": 618
682
+ },
683
+ {
684
+ "epoch": 1.5656565656565657,
685
+ "grad_norm": 0.9622701229690029,
686
+ "learning_rate": 3.6962346075686876e-05,
687
+ "loss": 0.2259,
688
+ "loss_nan_ranks": 0,
689
+ "loss_rank_avg": 0.2124948799610138,
690
+ "step": 310,
691
+ "valid_targets_mean": 1200.3,
692
+ "valid_targets_min": 655
693
+ },
694
+ {
695
+ "epoch": 1.5909090909090908,
696
+ "grad_norm": 1.016874652742493,
697
+ "learning_rate": 3.680481960694183e-05,
698
+ "loss": 0.2276,
699
+ "loss_nan_ranks": 0,
700
+ "loss_rank_avg": 0.21283963322639465,
701
+ "step": 315,
702
+ "valid_targets_mean": 1326.4,
703
+ "valid_targets_min": 721
704
+ },
705
+ {
706
+ "epoch": 1.6161616161616161,
707
+ "grad_norm": 0.8226357783546479,
708
+ "learning_rate": 3.6643664778082254e-05,
709
+ "loss": 0.2275,
710
+ "loss_nan_ranks": 0,
711
+ "loss_rank_avg": 0.20702260732650757,
712
+ "step": 320,
713
+ "valid_targets_mean": 1401.2,
714
+ "valid_targets_min": 868
715
+ },
716
+ {
717
+ "epoch": 1.6414141414141414,
718
+ "grad_norm": 0.8672807532299297,
719
+ "learning_rate": 3.6478916384349465e-05,
720
+ "loss": 0.2193,
721
+ "loss_nan_ranks": 0,
722
+ "loss_rank_avg": 0.23550966382026672,
723
+ "step": 325,
724
+ "valid_targets_mean": 1560.4,
725
+ "valid_targets_min": 825
726
+ },
727
+ {
728
+ "epoch": 1.6666666666666665,
729
+ "grad_norm": 0.9712639328964777,
730
+ "learning_rate": 3.631060999687809e-05,
731
+ "loss": 0.2243,
732
+ "loss_nan_ranks": 0,
733
+ "loss_rank_avg": 0.22655721008777618,
734
+ "step": 330,
735
+ "valid_targets_mean": 1239.7,
736
+ "valid_targets_min": 704
737
+ },
738
+ {
739
+ "epoch": 1.691919191919192,
740
+ "grad_norm": 0.9069646079111703,
741
+ "learning_rate": 3.613878195501586e-05,
742
+ "loss": 0.2132,
743
+ "loss_nan_ranks": 0,
744
+ "loss_rank_avg": 0.20803983509540558,
745
+ "step": 335,
746
+ "valid_targets_mean": 1168.1,
747
+ "valid_targets_min": 670
748
+ },
749
+ {
750
+ "epoch": 1.7171717171717171,
751
+ "grad_norm": 0.9455557791381117,
752
+ "learning_rate": 3.596346935847752e-05,
753
+ "loss": 0.2148,
754
+ "loss_nan_ranks": 0,
755
+ "loss_rank_avg": 0.21128031611442566,
756
+ "step": 340,
757
+ "valid_targets_mean": 1368.7,
758
+ "valid_targets_min": 753
759
+ },
760
+ {
761
+ "epoch": 1.7424242424242424,
762
+ "grad_norm": 1.1078047614048518,
763
+ "learning_rate": 3.578471005933454e-05,
764
+ "loss": 0.225,
765
+ "loss_nan_ranks": 0,
766
+ "loss_rank_avg": 0.2151433229446411,
767
+ "step": 345,
768
+ "valid_targets_mean": 1172.9,
769
+ "valid_targets_min": 764
770
+ },
771
+ {
772
+ "epoch": 1.7676767676767677,
773
+ "grad_norm": 1.025649374731909,
774
+ "learning_rate": 3.5602542653842416e-05,
775
+ "loss": 0.2148,
776
+ "loss_nan_ranks": 0,
777
+ "loss_rank_avg": 0.21121200919151306,
778
+ "step": 350,
779
+ "valid_targets_mean": 1404.9,
780
+ "valid_targets_min": 802
781
+ },
782
+ {
783
+ "epoch": 1.7929292929292928,
784
+ "grad_norm": 0.9227312319452774,
785
+ "learning_rate": 3.541700647410728e-05,
786
+ "loss": 0.2203,
787
+ "loss_nan_ranks": 0,
788
+ "loss_rank_avg": 0.20654287934303284,
789
+ "step": 355,
790
+ "valid_targets_mean": 1270.9,
791
+ "valid_targets_min": 694
792
+ },
793
+ {
794
+ "epoch": 1.8181818181818183,
795
+ "grad_norm": 0.8839026808611836,
796
+ "learning_rate": 3.522814157959362e-05,
797
+ "loss": 0.2307,
798
+ "loss_nan_ranks": 0,
799
+ "loss_rank_avg": 0.21429893374443054,
800
+ "step": 360,
801
+ "valid_targets_mean": 1328.3,
802
+ "valid_targets_min": 774
803
+ },
804
+ {
805
+ "epoch": 1.8434343434343434,
806
+ "grad_norm": 0.8418961695574284,
807
+ "learning_rate": 3.5035988748474974e-05,
808
+ "loss": 0.23,
809
+ "loss_nan_ranks": 0,
810
+ "loss_rank_avg": 0.2303229570388794,
811
+ "step": 365,
812
+ "valid_targets_mean": 1438.8,
813
+ "valid_targets_min": 801
814
+ },
815
+ {
816
+ "epoch": 1.8686868686868687,
817
+ "grad_norm": 0.8660276681232518,
818
+ "learning_rate": 3.484058946882946e-05,
819
+ "loss": 0.2128,
820
+ "loss_nan_ranks": 0,
821
+ "loss_rank_avg": 0.20196302235126495,
822
+ "step": 370,
823
+ "valid_targets_mean": 1269.9,
824
+ "valid_targets_min": 765
825
+ },
826
+ {
827
+ "epoch": 1.893939393939394,
828
+ "grad_norm": 0.8556571315590435,
829
+ "learning_rate": 3.4641985929681954e-05,
830
+ "loss": 0.221,
831
+ "loss_nan_ranks": 0,
832
+ "loss_rank_avg": 0.2154424786567688,
833
+ "step": 375,
834
+ "valid_targets_mean": 1228.6,
835
+ "valid_targets_min": 738
836
+ },
837
+ {
838
+ "epoch": 1.9191919191919191,
839
+ "grad_norm": 0.9057324926130037,
840
+ "learning_rate": 3.4440221011895026e-05,
841
+ "loss": 0.2149,
842
+ "loss_nan_ranks": 0,
843
+ "loss_rank_avg": 0.22812026739120483,
844
+ "step": 380,
845
+ "valid_targets_mean": 1510.7,
846
+ "valid_targets_min": 648
847
+ },
848
+ {
849
+ "epoch": 1.9444444444444444,
850
+ "grad_norm": 1.22905062391175,
851
+ "learning_rate": 3.423533827891044e-05,
852
+ "loss": 0.2226,
853
+ "loss_nan_ranks": 0,
854
+ "loss_rank_avg": 0.22331665456295013,
855
+ "step": 385,
856
+ "valid_targets_mean": 1256.8,
857
+ "valid_targets_min": 796
858
+ },
859
+ {
860
+ "epoch": 1.9696969696969697,
861
+ "grad_norm": 0.8864960651317679,
862
+ "learning_rate": 3.402738196734327e-05,
863
+ "loss": 0.2269,
864
+ "loss_nan_ranks": 0,
865
+ "loss_rank_avg": 0.2474365234375,
866
+ "step": 390,
867
+ "valid_targets_mean": 1468.1,
868
+ "valid_targets_min": 808
869
+ },
870
+ {
871
+ "epoch": 1.9949494949494948,
872
+ "grad_norm": 0.8116444633238107,
873
+ "learning_rate": 3.381639697743073e-05,
874
+ "loss": 0.2196,
875
+ "loss_nan_ranks": 0,
876
+ "loss_rank_avg": 0.21921899914741516,
877
+ "step": 395,
878
+ "valid_targets_mean": 1372.4,
879
+ "valid_targets_min": 696
880
+ },
881
+ {
882
+ "epoch": 2.0202020202020203,
883
+ "grad_norm": 0.8089562588991992,
884
+ "learning_rate": 3.3602428863337625e-05,
885
+ "loss": 0.1951,
886
+ "loss_nan_ranks": 0,
887
+ "loss_rank_avg": 0.18117868900299072,
888
+ "step": 400,
889
+ "valid_targets_mean": 1378.9,
890
+ "valid_targets_min": 830
891
+ },
892
+ {
893
+ "epoch": 2.0454545454545454,
894
+ "grad_norm": 0.9722779083533737,
895
+ "learning_rate": 3.338552382332073e-05,
896
+ "loss": 0.1904,
897
+ "loss_nan_ranks": 0,
898
+ "loss_rank_avg": 0.20024555921554565,
899
+ "step": 405,
900
+ "valid_targets_mean": 1251.8,
901
+ "valid_targets_min": 644
902
+ },
903
+ {
904
+ "epoch": 2.0707070707070705,
905
+ "grad_norm": 0.9539651111813244,
906
+ "learning_rate": 3.3165728689753976e-05,
907
+ "loss": 0.1952,
908
+ "loss_nan_ranks": 0,
909
+ "loss_rank_avg": 0.1934088170528412,
910
+ "step": 410,
911
+ "valid_targets_mean": 1190.8,
912
+ "valid_targets_min": 655
913
+ },
914
+ {
915
+ "epoch": 2.095959595959596,
916
+ "grad_norm": 0.8845502621948085,
917
+ "learning_rate": 3.2943090919016815e-05,
918
+ "loss": 0.1938,
919
+ "loss_nan_ranks": 0,
920
+ "loss_rank_avg": 0.21735027432441711,
921
+ "step": 415,
922
+ "valid_targets_mean": 1383.0,
923
+ "valid_targets_min": 651
924
+ },
925
+ {
926
+ "epoch": 2.121212121212121,
927
+ "grad_norm": 0.8702714930026663,
928
+ "learning_rate": 3.2717658581247844e-05,
929
+ "loss": 0.1834,
930
+ "loss_nan_ranks": 0,
931
+ "loss_rank_avg": 0.17540085315704346,
932
+ "step": 420,
933
+ "valid_targets_mean": 1534.4,
934
+ "valid_targets_min": 686
935
+ },
936
+ {
937
+ "epoch": 2.1464646464646466,
938
+ "grad_norm": 0.8478046646870934,
939
+ "learning_rate": 3.248948034996583e-05,
940
+ "loss": 0.1995,
941
+ "loss_nan_ranks": 0,
942
+ "loss_rank_avg": 0.19789089262485504,
943
+ "step": 425,
944
+ "valid_targets_mean": 1489.8,
945
+ "valid_targets_min": 578
946
+ },
947
+ {
948
+ "epoch": 2.1717171717171717,
949
+ "grad_norm": 0.8469812735159056,
950
+ "learning_rate": 3.2258605491560606e-05,
951
+ "loss": 0.184,
952
+ "loss_nan_ranks": 0,
953
+ "loss_rank_avg": 0.17873477935791016,
954
+ "step": 430,
955
+ "valid_targets_mean": 1449.3,
956
+ "valid_targets_min": 964
957
+ },
958
+ {
959
+ "epoch": 2.196969696969697,
960
+ "grad_norm": 1.0152996119033395,
961
+ "learning_rate": 3.2025083854655776e-05,
962
+ "loss": 0.1889,
963
+ "loss_nan_ranks": 0,
964
+ "loss_rank_avg": 0.19385069608688354,
965
+ "step": 435,
966
+ "valid_targets_mean": 1232.6,
967
+ "valid_targets_min": 532
968
+ },
969
+ {
970
+ "epoch": 2.2222222222222223,
971
+ "grad_norm": 0.9854965513455352,
972
+ "learning_rate": 3.178896585934588e-05,
973
+ "loss": 0.1922,
974
+ "loss_nan_ranks": 0,
975
+ "loss_rank_avg": 0.20682978630065918,
976
+ "step": 440,
977
+ "valid_targets_mean": 1247.1,
978
+ "valid_targets_min": 708
979
+ },
980
+ {
981
+ "epoch": 2.2474747474747474,
982
+ "grad_norm": 0.8783905199567699,
983
+ "learning_rate": 3.1550302486310076e-05,
984
+ "loss": 0.181,
985
+ "loss_nan_ranks": 0,
986
+ "loss_rank_avg": 0.17547810077667236,
987
+ "step": 445,
988
+ "valid_targets_mean": 1395.0,
989
+ "valid_targets_min": 986
990
+ },
991
+ {
992
+ "epoch": 2.2727272727272725,
993
+ "grad_norm": 0.7915351995988669,
994
+ "learning_rate": 3.130914526580478e-05,
995
+ "loss": 0.1932,
996
+ "loss_nan_ranks": 0,
997
+ "loss_rank_avg": 0.19163087010383606,
998
+ "step": 450,
999
+ "valid_targets_mean": 1419.6,
1000
+ "valid_targets_min": 550
1001
+ },
1002
+ {
1003
+ "epoch": 2.297979797979798,
1004
+ "grad_norm": 0.8603558279215641,
1005
+ "learning_rate": 3.10655462665377e-05,
1006
+ "loss": 0.1943,
1007
+ "loss_nan_ranks": 0,
1008
+ "loss_rank_avg": 0.18317505717277527,
1009
+ "step": 455,
1010
+ "valid_targets_mean": 1478.6,
1011
+ "valid_targets_min": 796
1012
+ },
1013
+ {
1014
+ "epoch": 2.323232323232323,
1015
+ "grad_norm": 0.9291992362997976,
1016
+ "learning_rate": 3.0819558084425574e-05,
1017
+ "loss": 0.1858,
1018
+ "loss_nan_ranks": 0,
1019
+ "loss_rank_avg": 0.1851797103881836,
1020
+ "step": 460,
1021
+ "valid_targets_mean": 1252.2,
1022
+ "valid_targets_min": 862
1023
+ },
1024
+ {
1025
+ "epoch": 2.3484848484848486,
1026
+ "grad_norm": 0.9994111902225407,
1027
+ "learning_rate": 3.0571233831238093e-05,
1028
+ "loss": 0.1912,
1029
+ "loss_nan_ranks": 0,
1030
+ "loss_rank_avg": 0.1833767294883728,
1031
+ "step": 465,
1032
+ "valid_targets_mean": 1134.1,
1033
+ "valid_targets_min": 619
1034
+ },
1035
+ {
1036
+ "epoch": 2.3737373737373737,
1037
+ "grad_norm": 0.9400802045489894,
1038
+ "learning_rate": 3.032062712313044e-05,
1039
+ "loss": 0.1907,
1040
+ "loss_nan_ranks": 0,
1041
+ "loss_rank_avg": 0.19591230154037476,
1042
+ "step": 470,
1043
+ "valid_targets_mean": 1326.4,
1044
+ "valid_targets_min": 801
1045
+ },
1046
+ {
1047
+ "epoch": 2.398989898989899,
1048
+ "grad_norm": 0.8710505639917704,
1049
+ "learning_rate": 3.0067792069066902e-05,
1050
+ "loss": 0.1963,
1051
+ "loss_nan_ranks": 0,
1052
+ "loss_rank_avg": 0.21165084838867188,
1053
+ "step": 475,
1054
+ "valid_targets_mean": 1496.6,
1055
+ "valid_targets_min": 774
1056
+ },
1057
+ {
1058
+ "epoch": 2.4242424242424243,
1059
+ "grad_norm": 0.8977896309267136,
1060
+ "learning_rate": 2.9812783259138133e-05,
1061
+ "loss": 0.2008,
1062
+ "loss_nan_ranks": 0,
1063
+ "loss_rank_avg": 0.20881733298301697,
1064
+ "step": 480,
1065
+ "valid_targets_mean": 1376.1,
1066
+ "valid_targets_min": 600
1067
+ },
1068
+ {
1069
+ "epoch": 2.4494949494949494,
1070
+ "grad_norm": 0.8422125060200054,
1071
+ "learning_rate": 2.955565575277449e-05,
1072
+ "loss": 0.1868,
1073
+ "loss_nan_ranks": 0,
1074
+ "loss_rank_avg": 0.18488860130310059,
1075
+ "step": 485,
1076
+ "valid_targets_mean": 1432.2,
1077
+ "valid_targets_min": 585
1078
+ },
1079
+ {
1080
+ "epoch": 2.474747474747475,
1081
+ "grad_norm": 0.841379053094665,
1082
+ "learning_rate": 2.929646506685805e-05,
1083
+ "loss": 0.1896,
1084
+ "loss_nan_ranks": 0,
1085
+ "loss_rank_avg": 0.19385570287704468,
1086
+ "step": 490,
1087
+ "valid_targets_mean": 1452.9,
1088
+ "valid_targets_min": 811
1089
+ },
1090
+ {
1091
+ "epoch": 2.5,
1092
+ "grad_norm": 0.8681195861479708,
1093
+ "learning_rate": 2.9035267163735856e-05,
1094
+ "loss": 0.1853,
1095
+ "loss_nan_ranks": 0,
1096
+ "loss_rank_avg": 0.18989858031272888,
1097
+ "step": 495,
1098
+ "valid_targets_mean": 1264.6,
1099
+ "valid_targets_min": 674
1100
+ },
1101
+ {
1102
+ "epoch": 2.525252525252525,
1103
+ "grad_norm": 0.9486462794838508,
1104
+ "learning_rate": 2.8772118439136972e-05,
1105
+ "loss": 0.1917,
1106
+ "loss_nan_ranks": 0,
1107
+ "loss_rank_avg": 0.20061013102531433,
1108
+ "step": 500,
1109
+ "valid_targets_mean": 1308.9,
1110
+ "valid_targets_min": 854
1111
+ },
1112
+ {
1113
+ "epoch": 2.5505050505050506,
1114
+ "grad_norm": 1.0120219125127008,
1115
+ "learning_rate": 2.8507075709996015e-05,
1116
+ "loss": 0.2,
1117
+ "loss_nan_ranks": 0,
1118
+ "loss_rank_avg": 0.19788292050361633,
1119
+ "step": 505,
1120
+ "valid_targets_mean": 1336.9,
1121
+ "valid_targets_min": 615
1122
+ },
1123
+ {
1124
+ "epoch": 2.5757575757575757,
1125
+ "grad_norm": 0.8552983942324871,
1126
+ "learning_rate": 2.8240196202185636e-05,
1127
+ "loss": 0.1841,
1128
+ "loss_nan_ranks": 0,
1129
+ "loss_rank_avg": 0.18201503157615662,
1130
+ "step": 510,
1131
+ "valid_targets_mean": 1183.4,
1132
+ "valid_targets_min": 648
1133
+ },
1134
+ {
1135
+ "epoch": 2.601010101010101,
1136
+ "grad_norm": 1.0101472372139235,
1137
+ "learning_rate": 2.797153753816084e-05,
1138
+ "loss": 0.2015,
1139
+ "loss_nan_ranks": 0,
1140
+ "loss_rank_avg": 0.18643571436405182,
1141
+ "step": 515,
1142
+ "valid_targets_mean": 1155.6,
1143
+ "valid_targets_min": 538
1144
+ },
1145
+ {
1146
+ "epoch": 2.6262626262626263,
1147
+ "grad_norm": 0.8259754112301633,
1148
+ "learning_rate": 2.770115772451758e-05,
1149
+ "loss": 0.1847,
1150
+ "loss_nan_ranks": 0,
1151
+ "loss_rank_avg": 0.20032909512519836,
1152
+ "step": 520,
1153
+ "valid_targets_mean": 1452.1,
1154
+ "valid_targets_min": 640
1155
+ },
1156
+ {
1157
+ "epoch": 2.6515151515151514,
1158
+ "grad_norm": 1.0080765247154564,
1159
+ "learning_rate": 2.7429115139468443e-05,
1160
+ "loss": 0.1868,
1161
+ "loss_nan_ranks": 0,
1162
+ "loss_rank_avg": 0.2050439417362213,
1163
+ "step": 525,
1164
+ "valid_targets_mean": 1231.2,
1165
+ "valid_targets_min": 678
1166
+ },
1167
+ {
1168
+ "epoch": 2.676767676767677,
1169
+ "grad_norm": 0.9228666377959548,
1170
+ "learning_rate": 2.7155468520238116e-05,
1171
+ "loss": 0.1955,
1172
+ "loss_nan_ranks": 0,
1173
+ "loss_rank_avg": 0.20735567808151245,
1174
+ "step": 530,
1175
+ "valid_targets_mean": 1336.9,
1176
+ "valid_targets_min": 587
1177
+ },
1178
+ {
1179
+ "epoch": 2.702020202020202,
1180
+ "grad_norm": 0.997304280142903,
1181
+ "learning_rate": 2.6880276950381316e-05,
1182
+ "loss": 0.1881,
1183
+ "loss_nan_ranks": 0,
1184
+ "loss_rank_avg": 0.1983107328414917,
1185
+ "step": 535,
1186
+ "valid_targets_mean": 1249.5,
1187
+ "valid_targets_min": 521
1188
+ },
1189
+ {
1190
+ "epoch": 2.7272727272727275,
1191
+ "grad_norm": 0.9193727742835162,
1192
+ "learning_rate": 2.6603599847025935e-05,
1193
+ "loss": 0.1805,
1194
+ "loss_nan_ranks": 0,
1195
+ "loss_rank_avg": 0.18839475512504578,
1196
+ "step": 540,
1197
+ "valid_targets_mean": 1276.8,
1198
+ "valid_targets_min": 776
1199
+ },
1200
+ {
1201
+ "epoch": 2.7525252525252526,
1202
+ "grad_norm": 0.9459373491774038,
1203
+ "learning_rate": 2.63254969480442e-05,
1204
+ "loss": 0.191,
1205
+ "loss_nan_ranks": 0,
1206
+ "loss_rank_avg": 0.19836702942848206,
1207
+ "step": 545,
1208
+ "valid_targets_mean": 1273.7,
1209
+ "valid_targets_min": 702
1210
+ },
1211
+ {
1212
+ "epoch": 2.7777777777777777,
1213
+ "grad_norm": 0.8854618406463927,
1214
+ "learning_rate": 2.6046028299154545e-05,
1215
+ "loss": 0.1875,
1216
+ "loss_nan_ranks": 0,
1217
+ "loss_rank_avg": 0.18734903633594513,
1218
+ "step": 550,
1219
+ "valid_targets_mean": 1388.6,
1220
+ "valid_targets_min": 546
1221
+ },
1222
+ {
1223
+ "epoch": 2.8030303030303028,
1224
+ "grad_norm": 0.7860179708066867,
1225
+ "learning_rate": 2.5765254240957024e-05,
1226
+ "loss": 0.1886,
1227
+ "loss_nan_ranks": 0,
1228
+ "loss_rank_avg": 0.18938644230365753,
1229
+ "step": 555,
1230
+ "valid_targets_mean": 1631.9,
1231
+ "valid_targets_min": 1077
1232
+ },
1233
+ {
1234
+ "epoch": 2.8282828282828283,
1235
+ "grad_norm": 0.8367854200810775,
1236
+ "learning_rate": 2.5483235395905056e-05,
1237
+ "loss": 0.1867,
1238
+ "loss_nan_ranks": 0,
1239
+ "loss_rank_avg": 0.20430481433868408,
1240
+ "step": 560,
1241
+ "valid_targets_mean": 1437.8,
1242
+ "valid_targets_min": 690
1243
+ },
1244
+ {
1245
+ "epoch": 2.8535353535353534,
1246
+ "grad_norm": 0.9180703166963079,
1247
+ "learning_rate": 2.5200032655216343e-05,
1248
+ "loss": 0.1894,
1249
+ "loss_nan_ranks": 0,
1250
+ "loss_rank_avg": 0.18853136897087097,
1251
+ "step": 565,
1252
+ "valid_targets_mean": 1259.7,
1253
+ "valid_targets_min": 684
1254
+ },
1255
+ {
1256
+ "epoch": 2.878787878787879,
1257
+ "grad_norm": 0.8164719253098022,
1258
+ "learning_rate": 2.4915707165725694e-05,
1259
+ "loss": 0.1858,
1260
+ "loss_nan_ranks": 0,
1261
+ "loss_rank_avg": 0.18391425907611847,
1262
+ "step": 570,
1263
+ "valid_targets_mean": 1376.6,
1264
+ "valid_targets_min": 802
1265
+ },
1266
+ {
1267
+ "epoch": 2.904040404040404,
1268
+ "grad_norm": 0.9943903226603121,
1269
+ "learning_rate": 2.4630320316682724e-05,
1270
+ "loss": 0.1847,
1271
+ "loss_nan_ranks": 0,
1272
+ "loss_rank_avg": 0.19583845138549805,
1273
+ "step": 575,
1274
+ "valid_targets_mean": 1316.8,
1275
+ "valid_targets_min": 685
1276
+ },
1277
+ {
1278
+ "epoch": 2.929292929292929,
1279
+ "grad_norm": 0.9662414433511091,
1280
+ "learning_rate": 2.4343933726497183e-05,
1281
+ "loss": 0.1856,
1282
+ "loss_nan_ranks": 0,
1283
+ "loss_rank_avg": 0.17909899353981018,
1284
+ "step": 580,
1285
+ "valid_targets_mean": 1219.6,
1286
+ "valid_targets_min": 603
1287
+ },
1288
+ {
1289
+ "epoch": 2.9545454545454546,
1290
+ "grad_norm": 0.8264667060320464,
1291
+ "learning_rate": 2.4056609229434812e-05,
1292
+ "loss": 0.1887,
1293
+ "loss_nan_ranks": 0,
1294
+ "loss_rank_avg": 0.1930309534072876,
1295
+ "step": 585,
1296
+ "valid_targets_mean": 1353.8,
1297
+ "valid_targets_min": 751
1298
+ },
1299
+ {
1300
+ "epoch": 2.9797979797979797,
1301
+ "grad_norm": 0.8672579031747749,
1302
+ "learning_rate": 2.376840886226656e-05,
1303
+ "loss": 0.1867,
1304
+ "loss_nan_ranks": 0,
1305
+ "loss_rank_avg": 0.1810949146747589,
1306
+ "step": 590,
1307
+ "valid_targets_mean": 1360.6,
1308
+ "valid_targets_min": 681
1309
+ },
1310
+ {
1311
+ "epoch": 3.005050505050505,
1312
+ "grad_norm": 0.8370979896650649,
1313
+ "learning_rate": 2.347939485087416e-05,
1314
+ "loss": 0.1831,
1315
+ "loss_nan_ranks": 0,
1316
+ "loss_rank_avg": 0.16158387064933777,
1317
+ "step": 595,
1318
+ "valid_targets_mean": 1132.1,
1319
+ "valid_targets_min": 475
1320
+ },
1321
+ {
1322
+ "epoch": 3.0303030303030303,
1323
+ "grad_norm": 0.8281295886895214,
1324
+ "learning_rate": 2.3189629596814788e-05,
1325
+ "loss": 0.157,
1326
+ "loss_nan_ranks": 0,
1327
+ "loss_rank_avg": 0.15015463531017303,
1328
+ "step": 600,
1329
+ "valid_targets_mean": 1357.4,
1330
+ "valid_targets_min": 629
1331
+ },
1332
+ {
1333
+ "epoch": 3.0555555555555554,
1334
+ "grad_norm": 1.0077015146202375,
1335
+ "learning_rate": 2.2899175663847823e-05,
1336
+ "loss": 0.1605,
1337
+ "loss_nan_ranks": 0,
1338
+ "loss_rank_avg": 0.17284418642520905,
1339
+ "step": 605,
1340
+ "valid_targets_mean": 1323.3,
1341
+ "valid_targets_min": 681
1342
+ },
1343
+ {
1344
+ "epoch": 3.080808080808081,
1345
+ "grad_norm": 0.9055532065630415,
1346
+ "learning_rate": 2.2608095764426602e-05,
1347
+ "loss": 0.1609,
1348
+ "loss_nan_ranks": 0,
1349
+ "loss_rank_avg": 0.172052800655365,
1350
+ "step": 610,
1351
+ "valid_targets_mean": 1438.2,
1352
+ "valid_targets_min": 785
1353
+ },
1354
+ {
1355
+ "epoch": 3.106060606060606,
1356
+ "grad_norm": 0.9289877404583321,
1357
+ "learning_rate": 2.2316452746158063e-05,
1358
+ "loss": 0.1649,
1359
+ "loss_nan_ranks": 0,
1360
+ "loss_rank_avg": 0.18071796000003815,
1361
+ "step": 615,
1362
+ "valid_targets_mean": 1361.3,
1363
+ "valid_targets_min": 618
1364
+ },
1365
+ {
1366
+ "epoch": 3.1313131313131315,
1367
+ "grad_norm": 0.9997329115169467,
1368
+ "learning_rate": 2.2024309578233174e-05,
1369
+ "loss": 0.161,
1370
+ "loss_nan_ranks": 0,
1371
+ "loss_rank_avg": 0.16634033620357513,
1372
+ "step": 620,
1373
+ "valid_targets_mean": 1173.6,
1374
+ "valid_targets_min": 690
1375
+ },
1376
+ {
1377
+ "epoch": 3.1565656565656566,
1378
+ "grad_norm": 1.009976573995706,
1379
+ "learning_rate": 2.1731729337831173e-05,
1380
+ "loss": 0.1574,
1381
+ "loss_nan_ranks": 0,
1382
+ "loss_rank_avg": 0.16698682308197021,
1383
+ "step": 625,
1384
+ "valid_targets_mean": 1469.7,
1385
+ "valid_targets_min": 677
1386
+ },
1387
+ {
1388
+ "epoch": 3.1818181818181817,
1389
+ "grad_norm": 0.9404190990393296,
1390
+ "learning_rate": 2.143877519650042e-05,
1391
+ "loss": 0.165,
1392
+ "loss_nan_ranks": 0,
1393
+ "loss_rank_avg": 0.16389097273349762,
1394
+ "step": 630,
1395
+ "valid_targets_mean": 1243.7,
1396
+ "valid_targets_min": 655
1397
+ },
1398
+ {
1399
+ "epoch": 3.207070707070707,
1400
+ "grad_norm": 0.9313377151581581,
1401
+ "learning_rate": 2.1145510406518928e-05,
1402
+ "loss": 0.1654,
1403
+ "loss_nan_ranks": 0,
1404
+ "loss_rank_avg": 0.17033018171787262,
1405
+ "step": 635,
1406
+ "valid_targets_mean": 1205.6,
1407
+ "valid_targets_min": 742
1408
+ },
1409
+ {
1410
+ "epoch": 3.2323232323232323,
1411
+ "grad_norm": 0.8337581545510251,
1412
+ "learning_rate": 2.0851998287237452e-05,
1413
+ "loss": 0.1603,
1414
+ "loss_nan_ranks": 0,
1415
+ "loss_rank_avg": 0.14411024749279022,
1416
+ "step": 640,
1417
+ "valid_targets_mean": 1329.4,
1418
+ "valid_targets_min": 746
1419
+ },
1420
+ {
1421
+ "epoch": 3.257575757575758,
1422
+ "grad_norm": 0.8441677147621939,
1423
+ "learning_rate": 2.0558302211408075e-05,
1424
+ "loss": 0.1568,
1425
+ "loss_nan_ranks": 0,
1426
+ "loss_rank_avg": 0.15460258722305298,
1427
+ "step": 645,
1428
+ "valid_targets_mean": 1355.2,
1429
+ "valid_targets_min": 684
1430
+ },
1431
+ {
1432
+ "epoch": 3.282828282828283,
1433
+ "grad_norm": 0.894454117393474,
1434
+ "learning_rate": 2.0264485591501272e-05,
1435
+ "loss": 0.1657,
1436
+ "loss_nan_ranks": 0,
1437
+ "loss_rank_avg": 0.1565285176038742,
1438
+ "step": 650,
1439
+ "valid_targets_mean": 1227.0,
1440
+ "valid_targets_min": 686
1441
+ },
1442
+ {
1443
+ "epoch": 3.308080808080808,
1444
+ "grad_norm": 1.0769409575744318,
1445
+ "learning_rate": 1.9970611866014432e-05,
1446
+ "loss": 0.1626,
1447
+ "loss_nan_ranks": 0,
1448
+ "loss_rank_avg": 0.1580488085746765,
1449
+ "step": 655,
1450
+ "valid_targets_mean": 1152.6,
1451
+ "valid_targets_min": 576
1452
+ },
1453
+ {
1454
+ "epoch": 3.3333333333333335,
1455
+ "grad_norm": 0.9561591809140133,
1456
+ "learning_rate": 1.967674448577471e-05,
1457
+ "loss": 0.1581,
1458
+ "loss_nan_ranks": 0,
1459
+ "loss_rank_avg": 0.15929345786571503,
1460
+ "step": 660,
1461
+ "valid_targets_mean": 1284.5,
1462
+ "valid_targets_min": 629
1463
+ },
1464
+ {
1465
+ "epoch": 3.3585858585858586,
1466
+ "grad_norm": 0.9519946019411174,
1467
+ "learning_rate": 1.9382946900239247e-05,
1468
+ "loss": 0.1586,
1469
+ "loss_nan_ranks": 0,
1470
+ "loss_rank_avg": 0.1578896939754486,
1471
+ "step": 665,
1472
+ "valid_targets_mean": 1273.5,
1473
+ "valid_targets_min": 841
1474
+ },
1475
+ {
1476
+ "epoch": 3.3838383838383836,
1477
+ "grad_norm": 0.9445690376209704,
1478
+ "learning_rate": 1.9089282543795692e-05,
1479
+ "loss": 0.171,
1480
+ "loss_nan_ranks": 0,
1481
+ "loss_rank_avg": 0.16323482990264893,
1482
+ "step": 670,
1483
+ "valid_targets_mean": 1237.7,
1484
+ "valid_targets_min": 704
1485
+ },
1486
+ {
1487
+ "epoch": 3.409090909090909,
1488
+ "grad_norm": 0.9125004973109183,
1489
+ "learning_rate": 1.879581482206592e-05,
1490
+ "loss": 0.1549,
1491
+ "loss_nan_ranks": 0,
1492
+ "loss_rank_avg": 0.1543537974357605,
1493
+ "step": 675,
1494
+ "valid_targets_mean": 1212.9,
1495
+ "valid_targets_min": 608
1496
+ },
1497
+ {
1498
+ "epoch": 3.4343434343434343,
1499
+ "grad_norm": 0.862065205565415,
1500
+ "learning_rate": 1.8502607098216056e-05,
1501
+ "loss": 0.1645,
1502
+ "loss_nan_ranks": 0,
1503
+ "loss_rank_avg": 0.17540498077869415,
1504
+ "step": 680,
1505
+ "valid_targets_mean": 1439.5,
1506
+ "valid_targets_min": 626
1507
+ },
1508
+ {
1509
+ "epoch": 3.45959595959596,
1510
+ "grad_norm": 0.9640325333317653,
1511
+ "learning_rate": 1.8209722679275602e-05,
1512
+ "loss": 0.1636,
1513
+ "loss_nan_ranks": 0,
1514
+ "loss_rank_avg": 0.16301819682121277,
1515
+ "step": 685,
1516
+ "valid_targets_mean": 1403.1,
1517
+ "valid_targets_min": 758
1518
+ },
1519
+ {
1520
+ "epoch": 3.484848484848485,
1521
+ "grad_norm": 0.9118905245785244,
1522
+ "learning_rate": 1.791722480246868e-05,
1523
+ "loss": 0.1615,
1524
+ "loss_nan_ranks": 0,
1525
+ "loss_rank_avg": 0.1672666370868683,
1526
+ "step": 690,
1527
+ "valid_targets_mean": 1300.1,
1528
+ "valid_targets_min": 859
1529
+ },
1530
+ {
1531
+ "epoch": 3.51010101010101,
1532
+ "grad_norm": 0.879350730391716,
1533
+ "learning_rate": 1.762517662156037e-05,
1534
+ "loss": 0.1592,
1535
+ "loss_nan_ranks": 0,
1536
+ "loss_rank_avg": 0.1608966588973999,
1537
+ "step": 695,
1538
+ "valid_targets_mean": 1377.4,
1539
+ "valid_targets_min": 927
1540
+ },
1541
+ {
1542
+ "epoch": 3.5353535353535355,
1543
+ "grad_norm": 0.9462737198336908,
1544
+ "learning_rate": 1.733364119322109e-05,
1545
+ "loss": 0.1648,
1546
+ "loss_nan_ranks": 0,
1547
+ "loss_rank_avg": 0.17058268189430237,
1548
+ "step": 700,
1549
+ "valid_targets_mean": 1403.2,
1550
+ "valid_targets_min": 609
1551
+ },
1552
+ {
1553
+ "epoch": 3.5606060606060606,
1554
+ "grad_norm": 0.9043276008484828,
1555
+ "learning_rate": 1.704268146341185e-05,
1556
+ "loss": 0.1683,
1557
+ "loss_nan_ranks": 0,
1558
+ "loss_rank_avg": 0.17930132150650024,
1559
+ "step": 705,
1560
+ "valid_targets_mean": 1315.9,
1561
+ "valid_targets_min": 678
1562
+ },
1563
+ {
1564
+ "epoch": 3.5858585858585856,
1565
+ "grad_norm": 0.8976270383472466,
1566
+ "learning_rate": 1.675236025379355e-05,
1567
+ "loss": 0.1626,
1568
+ "loss_nan_ranks": 0,
1569
+ "loss_rank_avg": 0.15831458568572998,
1570
+ "step": 710,
1571
+ "valid_targets_mean": 1351.8,
1572
+ "valid_targets_min": 626
1573
+ },
1574
+ {
1575
+ "epoch": 3.611111111111111,
1576
+ "grad_norm": 0.9372138654679043,
1577
+ "learning_rate": 1.6462740248162988e-05,
1578
+ "loss": 0.1628,
1579
+ "loss_nan_ranks": 0,
1580
+ "loss_rank_avg": 0.16674670577049255,
1581
+ "step": 715,
1582
+ "valid_targets_mean": 1407.9,
1583
+ "valid_targets_min": 869
1584
+ },
1585
+ {
1586
+ "epoch": 3.6363636363636362,
1587
+ "grad_norm": 0.8889620690389666,
1588
+ "learning_rate": 1.6173883978918682e-05,
1589
+ "loss": 0.1579,
1590
+ "loss_nan_ranks": 0,
1591
+ "loss_rank_avg": 0.16323408484458923,
1592
+ "step": 720,
1593
+ "valid_targets_mean": 1347.6,
1594
+ "valid_targets_min": 714
1595
+ },
1596
+ {
1597
+ "epoch": 3.6616161616161618,
1598
+ "grad_norm": 0.9461049287423077,
1599
+ "learning_rate": 1.5885853813559392e-05,
1600
+ "loss": 0.1606,
1601
+ "loss_nan_ranks": 0,
1602
+ "loss_rank_avg": 0.16170771420001984,
1603
+ "step": 725,
1604
+ "valid_targets_mean": 1151.1,
1605
+ "valid_targets_min": 474
1606
+ },
1607
+ {
1608
+ "epoch": 3.686868686868687,
1609
+ "grad_norm": 0.9883837352207571,
1610
+ "learning_rate": 1.5598711941218265e-05,
1611
+ "loss": 0.1639,
1612
+ "loss_nan_ranks": 0,
1613
+ "loss_rank_avg": 0.1605217158794403,
1614
+ "step": 730,
1615
+ "valid_targets_mean": 1241.0,
1616
+ "valid_targets_min": 578
1617
+ },
1618
+ {
1619
+ "epoch": 3.712121212121212,
1620
+ "grad_norm": 2.3494111496602836,
1621
+ "learning_rate": 1.531252035923541e-05,
1622
+ "loss": 0.1686,
1623
+ "loss_nan_ranks": 0,
1624
+ "loss_rank_avg": 0.17595025897026062,
1625
+ "step": 735,
1626
+ "valid_targets_mean": 1373.8,
1627
+ "valid_targets_min": 591
1628
+ },
1629
+ {
1630
+ "epoch": 3.7373737373737375,
1631
+ "grad_norm": 0.9593181868358482,
1632
+ "learning_rate": 1.5027340859771972e-05,
1633
+ "loss": 0.1652,
1634
+ "loss_nan_ranks": 0,
1635
+ "loss_rank_avg": 0.15891680121421814,
1636
+ "step": 740,
1637
+ "valid_targets_mean": 1432.9,
1638
+ "valid_targets_min": 785
1639
+ },
1640
+ {
1641
+ "epoch": 3.7626262626262625,
1642
+ "grad_norm": 1.2303147950630564,
1643
+ "learning_rate": 1.4743235016468474e-05,
1644
+ "loss": 0.1628,
1645
+ "loss_nan_ranks": 0,
1646
+ "loss_rank_avg": 0.15962883830070496,
1647
+ "step": 745,
1648
+ "valid_targets_mean": 1254.1,
1649
+ "valid_targets_min": 640
1650
+ },
1651
+ {
1652
+ "epoch": 3.787878787878788,
1653
+ "grad_norm": 0.868720721287915,
1654
+ "learning_rate": 1.4460264171150296e-05,
1655
+ "loss": 0.1614,
1656
+ "loss_nan_ranks": 0,
1657
+ "loss_rank_avg": 0.16074195504188538,
1658
+ "step": 750,
1659
+ "valid_targets_mean": 1357.2,
1660
+ "valid_targets_min": 915
1661
+ },
1662
+ {
1663
+ "epoch": 3.813131313131313,
1664
+ "grad_norm": 0.8933503401903754,
1665
+ "learning_rate": 1.4178489420583297e-05,
1666
+ "loss": 0.1659,
1667
+ "loss_nan_ranks": 0,
1668
+ "loss_rank_avg": 0.16842246055603027,
1669
+ "step": 755,
1670
+ "valid_targets_mean": 1366.2,
1671
+ "valid_targets_min": 494
1672
+ },
1673
+ {
1674
+ "epoch": 3.8383838383838382,
1675
+ "grad_norm": 0.9117330229229211,
1676
+ "learning_rate": 1.3897971603282278e-05,
1677
+ "loss": 0.1595,
1678
+ "loss_nan_ranks": 0,
1679
+ "loss_rank_avg": 0.15995121002197266,
1680
+ "step": 760,
1681
+ "valid_targets_mean": 1271.9,
1682
+ "valid_targets_min": 709
1683
+ },
1684
+ {
1685
+ "epoch": 3.8636363636363638,
1686
+ "grad_norm": 0.9015209816689598,
1687
+ "learning_rate": 1.36187712863752e-05,
1688
+ "loss": 0.166,
1689
+ "loss_nan_ranks": 0,
1690
+ "loss_rank_avg": 0.15097013115882874,
1691
+ "step": 765,
1692
+ "valid_targets_mean": 1283.9,
1693
+ "valid_targets_min": 650
1694
+ },
1695
+ {
1696
+ "epoch": 3.888888888888889,
1697
+ "grad_norm": 1.0021456080906517,
1698
+ "learning_rate": 1.3340948752526069e-05,
1699
+ "loss": 0.1599,
1700
+ "loss_nan_ranks": 0,
1701
+ "loss_rank_avg": 0.16016200184822083,
1702
+ "step": 770,
1703
+ "valid_targets_mean": 1304.5,
1704
+ "valid_targets_min": 508
1705
+ },
1706
+ {
1707
+ "epoch": 3.9141414141414144,
1708
+ "grad_norm": 0.9443204411485612,
1709
+ "learning_rate": 1.3064563986919142e-05,
1710
+ "loss": 0.1596,
1711
+ "loss_nan_ranks": 0,
1712
+ "loss_rank_avg": 0.16480416059494019,
1713
+ "step": 775,
1714
+ "valid_targets_mean": 1187.4,
1715
+ "valid_targets_min": 618
1716
+ },
1717
+ {
1718
+ "epoch": 3.9393939393939394,
1719
+ "grad_norm": 1.0288285211308523,
1720
+ "learning_rate": 1.278967666430745e-05,
1721
+ "loss": 0.1628,
1722
+ "loss_nan_ranks": 0,
1723
+ "loss_rank_avg": 0.16644668579101562,
1724
+ "step": 780,
1725
+ "valid_targets_mean": 1176.1,
1726
+ "valid_targets_min": 711
1727
+ },
1728
+ {
1729
+ "epoch": 3.9646464646464645,
1730
+ "grad_norm": 0.9366989096743141,
1731
+ "learning_rate": 1.2516346136128318e-05,
1732
+ "loss": 0.1641,
1733
+ "loss_nan_ranks": 0,
1734
+ "loss_rank_avg": 0.1586305946111679,
1735
+ "step": 785,
1736
+ "valid_targets_mean": 1275.4,
1737
+ "valid_targets_min": 615
1738
+ },
1739
+ {
1740
+ "epoch": 3.98989898989899,
1741
+ "grad_norm": 1.0032114563953773,
1742
+ "learning_rate": 1.2244631417688632e-05,
1743
+ "loss": 0.1634,
1744
+ "loss_nan_ranks": 0,
1745
+ "loss_rank_avg": 0.16418969631195068,
1746
+ "step": 790,
1747
+ "valid_targets_mean": 1259.9,
1748
+ "valid_targets_min": 784
1749
+ },
1750
+ {
1751
+ "epoch": 4.015151515151516,
1752
+ "grad_norm": 0.8527845312952415,
1753
+ "learning_rate": 1.197459117542278e-05,
1754
+ "loss": 0.1509,
1755
+ "loss_nan_ranks": 0,
1756
+ "loss_rank_avg": 0.14444586634635925,
1757
+ "step": 795,
1758
+ "valid_targets_mean": 1407.8,
1759
+ "valid_targets_min": 683
1760
+ },
1761
+ {
1762
+ "epoch": 4.040404040404041,
1763
+ "grad_norm": 0.9834828014413456,
1764
+ "learning_rate": 1.170628371422587e-05,
1765
+ "loss": 0.1365,
1766
+ "loss_nan_ranks": 0,
1767
+ "loss_rank_avg": 0.1405264139175415,
1768
+ "step": 800,
1769
+ "valid_targets_mean": 1329.9,
1770
+ "valid_targets_min": 688
1771
+ },
1772
+ {
1773
+ "epoch": 4.065656565656566,
1774
+ "grad_norm": 1.0122303972675173,
1775
+ "learning_rate": 1.1439766964864995e-05,
1776
+ "loss": 0.1454,
1777
+ "loss_nan_ranks": 0,
1778
+ "loss_rank_avg": 0.15851764380931854,
1779
+ "step": 805,
1780
+ "valid_targets_mean": 1417.4,
1781
+ "valid_targets_min": 800
1782
+ },
1783
+ {
1784
+ "epoch": 4.090909090909091,
1785
+ "grad_norm": 1.0252397884032003,
1786
+ "learning_rate": 1.117509847147128e-05,
1787
+ "loss": 0.1405,
1788
+ "loss_nan_ranks": 0,
1789
+ "loss_rank_avg": 0.13895373046398163,
1790
+ "step": 810,
1791
+ "valid_targets_mean": 1156.9,
1792
+ "valid_targets_min": 574
1793
+ },
1794
+ {
1795
+ "epoch": 4.116161616161616,
1796
+ "grad_norm": 0.8421871414828316,
1797
+ "learning_rate": 1.0912335379115469e-05,
1798
+ "loss": 0.138,
1799
+ "loss_nan_ranks": 0,
1800
+ "loss_rank_avg": 0.1370278298854828,
1801
+ "step": 815,
1802
+ "valid_targets_mean": 1569.4,
1803
+ "valid_targets_min": 668
1804
+ },
1805
+ {
1806
+ "epoch": 4.141414141414141,
1807
+ "grad_norm": 0.9960492717705738,
1808
+ "learning_rate": 1.0651534421469569e-05,
1809
+ "loss": 0.1441,
1810
+ "loss_nan_ranks": 0,
1811
+ "loss_rank_avg": 0.1409623920917511,
1812
+ "step": 820,
1813
+ "valid_targets_mean": 1150.9,
1814
+ "valid_targets_min": 604
1815
+ },
1816
+ {
1817
+ "epoch": 4.166666666666667,
1818
+ "grad_norm": 0.9475930503556128,
1819
+ "learning_rate": 1.0392751908557406e-05,
1820
+ "loss": 0.1423,
1821
+ "loss_nan_ranks": 0,
1822
+ "loss_rank_avg": 0.14038413763046265,
1823
+ "step": 825,
1824
+ "valid_targets_mean": 1251.2,
1825
+ "valid_targets_min": 741
1826
+ },
1827
+ {
1828
+ "epoch": 4.191919191919192,
1829
+ "grad_norm": 0.9077538390467569,
1830
+ "learning_rate": 1.013604371459663e-05,
1831
+ "loss": 0.1385,
1832
+ "loss_nan_ranks": 0,
1833
+ "loss_rank_avg": 0.1355401575565338,
1834
+ "step": 830,
1835
+ "valid_targets_mean": 1393.7,
1836
+ "valid_targets_min": 680
1837
+ },
1838
+ {
1839
+ "epoch": 4.217171717171717,
1840
+ "grad_norm": 0.9313116277410778,
1841
+ "learning_rate": 9.881465265934802e-06,
1842
+ "loss": 0.1434,
1843
+ "loss_nan_ranks": 0,
1844
+ "loss_rank_avg": 0.13590630888938904,
1845
+ "step": 835,
1846
+ "valid_targets_mean": 1176.8,
1847
+ "valid_targets_min": 785
1848
+ },
1849
+ {
1850
+ "epoch": 4.242424242424242,
1851
+ "grad_norm": 1.0954460592720818,
1852
+ "learning_rate": 9.62907152908215e-06,
1853
+ "loss": 0.1395,
1854
+ "loss_nan_ranks": 0,
1855
+ "loss_rank_avg": 0.14245356619358063,
1856
+ "step": 840,
1857
+ "valid_targets_mean": 1191.3,
1858
+ "valid_targets_min": 619
1859
+ },
1860
+ {
1861
+ "epoch": 4.267676767676767,
1862
+ "grad_norm": 0.9434679350463996,
1863
+ "learning_rate": 9.378916998843716e-06,
1864
+ "loss": 0.145,
1865
+ "loss_nan_ranks": 0,
1866
+ "loss_rank_avg": 0.1527371108531952,
1867
+ "step": 845,
1868
+ "valid_targets_mean": 1231.9,
1869
+ "valid_targets_min": 714
1870
+ },
1871
+ {
1872
+ "epoch": 4.292929292929293,
1873
+ "grad_norm": 0.9392165090395129,
1874
+ "learning_rate": 9.13105568655322e-06,
1875
+ "loss": 0.1407,
1876
+ "loss_nan_ranks": 0,
1877
+ "loss_rank_avg": 0.14035402238368988,
1878
+ "step": 850,
1879
+ "valid_targets_mean": 1247.8,
1880
+ "valid_targets_min": 494
1881
+ },
1882
+ {
1883
+ "epoch": 4.318181818181818,
1884
+ "grad_norm": 0.9610037453369663,
1885
+ "learning_rate": 8.885541108411386e-06,
1886
+ "loss": 0.1433,
1887
+ "loss_nan_ranks": 0,
1888
+ "loss_rank_avg": 0.13992322981357574,
1889
+ "step": 855,
1890
+ "valid_targets_mean": 1307.4,
1891
+ "valid_targets_min": 681
1892
+ },
1893
+ {
1894
+ "epoch": 4.343434343434343,
1895
+ "grad_norm": 0.9683597353689614,
1896
+ "learning_rate": 8.642426273931202e-06,
1897
+ "loss": 0.146,
1898
+ "loss_nan_ranks": 0,
1899
+ "loss_rank_avg": 0.1510647088289261,
1900
+ "step": 860,
1901
+ "valid_targets_mean": 1492.4,
1902
+ "valid_targets_min": 714
1903
+ },
1904
+ {
1905
+ "epoch": 4.3686868686868685,
1906
+ "grad_norm": 0.9533225821806605,
1907
+ "learning_rate": 8.40176367449247e-06,
1908
+ "loss": 0.1386,
1909
+ "loss_nan_ranks": 0,
1910
+ "loss_rank_avg": 0.1331542581319809,
1911
+ "step": 865,
1912
+ "valid_targets_mean": 1215.2,
1913
+ "valid_targets_min": 576
1914
+ },
1915
+ {
1916
+ "epoch": 4.393939393939394,
1917
+ "grad_norm": 1.0553941956845185,
1918
+ "learning_rate": 8.16360527200833e-06,
1919
+ "loss": 0.1461,
1920
+ "loss_nan_ranks": 0,
1921
+ "loss_rank_avg": 0.13314183056354523,
1922
+ "step": 870,
1923
+ "valid_targets_mean": 1251.7,
1924
+ "valid_targets_min": 696
1925
+ },
1926
+ {
1927
+ "epoch": 4.41919191919192,
1928
+ "grad_norm": 0.9400873253173821,
1929
+ "learning_rate": 7.928002487706077e-06,
1930
+ "loss": 0.1456,
1931
+ "loss_nan_ranks": 0,
1932
+ "loss_rank_avg": 0.14997591078281403,
1933
+ "step": 875,
1934
+ "valid_targets_mean": 1381.4,
1935
+ "valid_targets_min": 954
1936
+ },
1937
+ {
1938
+ "epoch": 4.444444444444445,
1939
+ "grad_norm": 0.9551571559692043,
1940
+ "learning_rate": 7.69500619102469e-06,
1941
+ "loss": 0.1423,
1942
+ "loss_nan_ranks": 0,
1943
+ "loss_rank_avg": 0.1366039514541626,
1944
+ "step": 880,
1945
+ "valid_targets_mean": 1127.9,
1946
+ "valid_targets_min": 702
1947
+ },
1948
+ {
1949
+ "epoch": 4.46969696969697,
1950
+ "grad_norm": 3.643078356257917,
1951
+ "learning_rate": 7.464666688631497e-06,
1952
+ "loss": 0.1453,
1953
+ "loss_nan_ranks": 0,
1954
+ "loss_rank_avg": 0.1459706425666809,
1955
+ "step": 885,
1956
+ "valid_targets_mean": 1290.3,
1957
+ "valid_targets_min": 671
1958
+ },
1959
+ {
1960
+ "epoch": 4.494949494949495,
1961
+ "grad_norm": 1.0361986594800512,
1962
+ "learning_rate": 7.237033713560415e-06,
1963
+ "loss": 0.1366,
1964
+ "loss_nan_ranks": 0,
1965
+ "loss_rank_avg": 0.13341154158115387,
1966
+ "step": 890,
1967
+ "valid_targets_mean": 1187.2,
1968
+ "valid_targets_min": 825
1969
+ },
1970
+ {
1971
+ "epoch": 4.52020202020202,
1972
+ "grad_norm": 0.9978450478422115,
1973
+ "learning_rate": 7.01215641447395e-06,
1974
+ "loss": 0.1416,
1975
+ "loss_nan_ranks": 0,
1976
+ "loss_rank_avg": 0.1472000777721405,
1977
+ "step": 895,
1978
+ "valid_targets_mean": 1245.2,
1979
+ "valid_targets_min": 797
1980
+ },
1981
+ {
1982
+ "epoch": 4.545454545454545,
1983
+ "grad_norm": 1.0033638283511224,
1984
+ "learning_rate": 6.790083345051457e-06,
1985
+ "loss": 0.144,
1986
+ "loss_nan_ranks": 0,
1987
+ "loss_rank_avg": 0.16042381525039673,
1988
+ "step": 900,
1989
+ "valid_targets_mean": 1246.5,
1990
+ "valid_targets_min": 655
1991
+ },
1992
+ {
1993
+ "epoch": 4.570707070707071,
1994
+ "grad_norm": 0.9370414548757213,
1995
+ "learning_rate": 6.570862453505793e-06,
1996
+ "loss": 0.142,
1997
+ "loss_nan_ranks": 0,
1998
+ "loss_rank_avg": 0.1394890397787094,
1999
+ "step": 905,
2000
+ "valid_targets_mean": 1438.2,
2001
+ "valid_targets_min": 677
2002
+ },
2003
+ {
2004
+ "epoch": 4.595959595959596,
2005
+ "grad_norm": 0.8845963118772141,
2006
+ "learning_rate": 6.35454107223074e-06,
2007
+ "loss": 0.1398,
2008
+ "loss_nan_ranks": 0,
2009
+ "loss_rank_avg": 0.1384766846895218,
2010
+ "step": 910,
2011
+ "valid_targets_mean": 1392.9,
2012
+ "valid_targets_min": 592
2013
+ },
2014
+ {
2015
+ "epoch": 4.621212121212121,
2016
+ "grad_norm": 0.9355477324402229,
2017
+ "learning_rate": 6.141165907581395e-06,
2018
+ "loss": 0.1419,
2019
+ "loss_nan_ranks": 0,
2020
+ "loss_rank_avg": 0.14917227625846863,
2021
+ "step": 915,
2022
+ "valid_targets_mean": 1327.9,
2023
+ "valid_targets_min": 753
2024
+ },
2025
+ {
2026
+ "epoch": 4.646464646464646,
2027
+ "grad_norm": 0.9640466514670939,
2028
+ "learning_rate": 5.9307830297896755e-06,
2029
+ "loss": 0.138,
2030
+ "loss_nan_ranks": 0,
2031
+ "loss_rank_avg": 0.13884782791137695,
2032
+ "step": 920,
2033
+ "valid_targets_mean": 1263.7,
2034
+ "valid_targets_min": 632
2035
+ },
2036
+ {
2037
+ "epoch": 4.671717171717171,
2038
+ "grad_norm": 0.9141945168421931,
2039
+ "learning_rate": 5.723437863017256e-06,
2040
+ "loss": 0.1378,
2041
+ "loss_nan_ranks": 0,
2042
+ "loss_rank_avg": 0.14294667541980743,
2043
+ "step": 925,
2044
+ "valid_targets_mean": 1302.5,
2045
+ "valid_targets_min": 777
2046
+ },
2047
+ {
2048
+ "epoch": 4.696969696969697,
2049
+ "grad_norm": 0.9123065154375142,
2050
+ "learning_rate": 5.519175175547919e-06,
2051
+ "loss": 0.138,
2052
+ "loss_nan_ranks": 0,
2053
+ "loss_rank_avg": 0.1268976479768753,
2054
+ "step": 930,
2055
+ "valid_targets_mean": 1319.0,
2056
+ "valid_targets_min": 661
2057
+ },
2058
+ {
2059
+ "epoch": 4.722222222222222,
2060
+ "grad_norm": 0.8964668435132555,
2061
+ "learning_rate": 5.318039070121557e-06,
2062
+ "loss": 0.1466,
2063
+ "loss_nan_ranks": 0,
2064
+ "loss_rank_avg": 0.14182919263839722,
2065
+ "step": 935,
2066
+ "valid_targets_mean": 1347.6,
2067
+ "valid_targets_min": 804
2068
+ },
2069
+ {
2070
+ "epoch": 4.747474747474747,
2071
+ "grad_norm": 0.9249960271962788,
2072
+ "learning_rate": 5.120072974411863e-06,
2073
+ "loss": 0.1371,
2074
+ "loss_nan_ranks": 0,
2075
+ "loss_rank_avg": 0.13355281949043274,
2076
+ "step": 940,
2077
+ "valid_targets_mean": 1267.5,
2078
+ "valid_targets_min": 631
2079
+ },
2080
+ {
2081
+ "epoch": 4.7727272727272725,
2082
+ "grad_norm": 0.9555005569091014,
2083
+ "learning_rate": 4.92531963164981e-06,
2084
+ "loss": 0.1408,
2085
+ "loss_nan_ranks": 0,
2086
+ "loss_rank_avg": 0.13473376631736755,
2087
+ "step": 945,
2088
+ "valid_targets_mean": 1280.8,
2089
+ "valid_targets_min": 741
2090
+ },
2091
+ {
2092
+ "epoch": 4.797979797979798,
2093
+ "grad_norm": 0.9026207825389647,
2094
+ "learning_rate": 4.733821091394841e-06,
2095
+ "loss": 0.1377,
2096
+ "loss_nan_ranks": 0,
2097
+ "loss_rank_avg": 0.12557560205459595,
2098
+ "step": 950,
2099
+ "valid_targets_mean": 1251.2,
2100
+ "valid_targets_min": 639
2101
+ },
2102
+ {
2103
+ "epoch": 4.8232323232323235,
2104
+ "grad_norm": 0.8648311888440021,
2105
+ "learning_rate": 4.5456187004558806e-06,
2106
+ "loss": 0.1341,
2107
+ "loss_nan_ranks": 0,
2108
+ "loss_rank_avg": 0.13172218203544617,
2109
+ "step": 955,
2110
+ "valid_targets_mean": 1288.4,
2111
+ "valid_targets_min": 615
2112
+ },
2113
+ {
2114
+ "epoch": 4.848484848484849,
2115
+ "grad_norm": 0.935270097185908,
2116
+ "learning_rate": 4.360753093964094e-06,
2117
+ "loss": 0.1415,
2118
+ "loss_nan_ranks": 0,
2119
+ "loss_rank_avg": 0.15985289216041565,
2120
+ "step": 960,
2121
+ "valid_targets_mean": 1484.0,
2122
+ "valid_targets_min": 806
2123
+ },
2124
+ {
2125
+ "epoch": 4.873737373737374,
2126
+ "grad_norm": 0.9346939564460294,
2127
+ "learning_rate": 4.179264186599239e-06,
2128
+ "loss": 0.1474,
2129
+ "loss_nan_ranks": 0,
2130
+ "loss_rank_avg": 0.14724518358707428,
2131
+ "step": 965,
2132
+ "valid_targets_mean": 1514.4,
2133
+ "valid_targets_min": 457
2134
+ },
2135
+ {
2136
+ "epoch": 4.898989898989899,
2137
+ "grad_norm": 0.9281283556463727,
2138
+ "learning_rate": 4.001191163971645e-06,
2139
+ "loss": 0.1362,
2140
+ "loss_nan_ranks": 0,
2141
+ "loss_rank_avg": 0.1386679857969284,
2142
+ "step": 970,
2143
+ "valid_targets_mean": 1296.4,
2144
+ "valid_targets_min": 817
2145
+ },
2146
+ {
2147
+ "epoch": 4.924242424242424,
2148
+ "grad_norm": 0.9213870761453768,
2149
+ "learning_rate": 3.826572474161565e-06,
2150
+ "loss": 0.1436,
2151
+ "loss_nan_ranks": 0,
2152
+ "loss_rank_avg": 0.13845255970954895,
2153
+ "step": 975,
2154
+ "valid_targets_mean": 1287.9,
2155
+ "valid_targets_min": 702
2156
+ },
2157
+ {
2158
+ "epoch": 4.94949494949495,
2159
+ "grad_norm": 1.0036592428841529,
2160
+ "learning_rate": 3.65544581941776e-06,
2161
+ "loss": 0.1378,
2162
+ "loss_nan_ranks": 0,
2163
+ "loss_rank_avg": 0.12950043380260468,
2164
+ "step": 980,
2165
+ "valid_targets_mean": 1237.9,
2166
+ "valid_targets_min": 618
2167
+ },
2168
+ {
2169
+ "epoch": 4.974747474747475,
2170
+ "grad_norm": 0.9657789228628604,
2171
+ "learning_rate": 3.487848148017161e-06,
2172
+ "loss": 0.1428,
2173
+ "loss_nan_ranks": 0,
2174
+ "loss_rank_avg": 0.13668538630008698,
2175
+ "step": 985,
2176
+ "valid_targets_mean": 1134.9,
2177
+ "valid_targets_min": 692
2178
+ },
2179
+ {
2180
+ "epoch": 5.0,
2181
+ "grad_norm": 0.8611796892921691,
2182
+ "learning_rate": 3.3238156462872937e-06,
2183
+ "loss": 0.1371,
2184
+ "loss_nan_ranks": 0,
2185
+ "loss_rank_avg": 0.12768948078155518,
2186
+ "step": 990,
2187
+ "valid_targets_mean": 1368.5,
2188
+ "valid_targets_min": 651
2189
+ },
2190
+ {
2191
+ "epoch": 5.025252525252525,
2192
+ "grad_norm": 0.9261974676714096,
2193
+ "learning_rate": 3.1633837307932037e-06,
2194
+ "loss": 0.1309,
2195
+ "loss_nan_ranks": 0,
2196
+ "loss_rank_avg": 0.13367873430252075,
2197
+ "step": 995,
2198
+ "valid_targets_mean": 1154.2,
2199
+ "valid_targets_min": 618
2200
+ },
2201
+ {
2202
+ "epoch": 5.05050505050505,
2203
+ "grad_norm": 1.0036400719596783,
2204
+ "learning_rate": 3.0065870406906094e-06,
2205
+ "loss": 0.1292,
2206
+ "loss_nan_ranks": 0,
2207
+ "loss_rank_avg": 0.13053901493549347,
2208
+ "step": 1000,
2209
+ "valid_targets_mean": 1190.0,
2210
+ "valid_targets_min": 633
2211
+ },
2212
+ {
2213
+ "epoch": 5.075757575757576,
2214
+ "grad_norm": 0.9363551041241059,
2215
+ "learning_rate": 2.8534594302469142e-06,
2216
+ "loss": 0.1253,
2217
+ "loss_nan_ranks": 0,
2218
+ "loss_rank_avg": 0.12562397122383118,
2219
+ "step": 1005,
2220
+ "valid_targets_mean": 1375.2,
2221
+ "valid_targets_min": 772
2222
+ },
2223
+ {
2224
+ "epoch": 5.101010101010101,
2225
+ "grad_norm": 1.2700394350667998,
2226
+ "learning_rate": 2.7040339615316315e-06,
2227
+ "loss": 0.1276,
2228
+ "loss_nan_ranks": 0,
2229
+ "loss_rank_avg": 0.1437637358903885,
2230
+ "step": 1010,
2231
+ "valid_targets_mean": 1253.7,
2232
+ "valid_targets_min": 837
2233
+ },
2234
+ {
2235
+ "epoch": 5.126262626262626,
2236
+ "grad_norm": 0.9276952092042188,
2237
+ "learning_rate": 2.5583428972779236e-06,
2238
+ "loss": 0.1323,
2239
+ "loss_nan_ranks": 0,
2240
+ "loss_rank_avg": 0.13029523193836212,
2241
+ "step": 1015,
2242
+ "valid_targets_mean": 1374.9,
2243
+ "valid_targets_min": 771
2244
+ },
2245
+ {
2246
+ "epoch": 5.151515151515151,
2247
+ "grad_norm": 0.962578989323224,
2248
+ "learning_rate": 2.4164176939166883e-06,
2249
+ "loss": 0.1323,
2250
+ "loss_nan_ranks": 0,
2251
+ "loss_rank_avg": 0.12893831729888916,
2252
+ "step": 1020,
2253
+ "valid_targets_mean": 1278.4,
2254
+ "valid_targets_min": 623
2255
+ },
2256
+ {
2257
+ "epoch": 5.1767676767676765,
2258
+ "grad_norm": 0.8740471250815197,
2259
+ "learning_rate": 2.278288994784723e-06,
2260
+ "loss": 0.1353,
2261
+ "loss_nan_ranks": 0,
2262
+ "loss_rank_avg": 0.13151079416275024,
2263
+ "step": 1025,
2264
+ "valid_targets_mean": 1495.8,
2265
+ "valid_targets_min": 695
2266
+ },
2267
+ {
2268
+ "epoch": 5.202020202020202,
2269
+ "grad_norm": 0.8967346790633705,
2270
+ "learning_rate": 2.143986623508478e-06,
2271
+ "loss": 0.1335,
2272
+ "loss_nan_ranks": 0,
2273
+ "loss_rank_avg": 0.15209463238716125,
2274
+ "step": 1030,
2275
+ "valid_targets_mean": 1422.6,
2276
+ "valid_targets_min": 587
2277
+ },
2278
+ {
2279
+ "epoch": 5.2272727272727275,
2280
+ "grad_norm": 0.994338705656964,
2281
+ "learning_rate": 2.0135395775647916e-06,
2282
+ "loss": 0.1302,
2283
+ "loss_nan_ranks": 0,
2284
+ "loss_rank_avg": 0.1367867887020111,
2285
+ "step": 1035,
2286
+ "valid_targets_mean": 1365.3,
2287
+ "valid_targets_min": 648
2288
+ },
2289
+ {
2290
+ "epoch": 5.252525252525253,
2291
+ "grad_norm": 0.9608409928013174,
2292
+ "learning_rate": 1.8869760220199707e-06,
2293
+ "loss": 0.1292,
2294
+ "loss_nan_ranks": 0,
2295
+ "loss_rank_avg": 0.12372744828462601,
2296
+ "step": 1040,
2297
+ "valid_targets_mean": 1244.5,
2298
+ "valid_targets_min": 788
2299
+ },
2300
+ {
2301
+ "epoch": 5.277777777777778,
2302
+ "grad_norm": 0.8629245570874375,
2303
+ "learning_rate": 1.7643232834486347e-06,
2304
+ "loss": 0.1331,
2305
+ "loss_nan_ranks": 0,
2306
+ "loss_rank_avg": 0.12655162811279297,
2307
+ "step": 1045,
2308
+ "valid_targets_mean": 1365.6,
2309
+ "valid_targets_min": 813
2310
+ },
2311
+ {
2312
+ "epoch": 5.303030303030303,
2313
+ "grad_norm": 0.8668280603831705,
2314
+ "learning_rate": 1.6456078440335699e-06,
2315
+ "loss": 0.1284,
2316
+ "loss_nan_ranks": 0,
2317
+ "loss_rank_avg": 0.11772658675909042,
2318
+ "step": 1050,
2319
+ "valid_targets_mean": 1353.5,
2320
+ "valid_targets_min": 891
2321
+ },
2322
+ {
2323
+ "epoch": 5.328282828282829,
2324
+ "grad_norm": 0.9006577141118385,
2325
+ "learning_rate": 1.530855335847916e-06,
2326
+ "loss": 0.1252,
2327
+ "loss_nan_ranks": 0,
2328
+ "loss_rank_avg": 0.12341552972793579,
2329
+ "step": 1055,
2330
+ "valid_targets_mean": 1405.5,
2331
+ "valid_targets_min": 582
2332
+ },
2333
+ {
2334
+ "epoch": 5.353535353535354,
2335
+ "grad_norm": 0.891591633370806,
2336
+ "learning_rate": 1.4200905353209127e-06,
2337
+ "loss": 0.1281,
2338
+ "loss_nan_ranks": 0,
2339
+ "loss_rank_avg": 0.11440576612949371,
2340
+ "step": 1060,
2341
+ "valid_targets_mean": 1307.4,
2342
+ "valid_targets_min": 948
2343
+ },
2344
+ {
2345
+ "epoch": 5.378787878787879,
2346
+ "grad_norm": 0.9342456964635858,
2347
+ "learning_rate": 1.3133373578883557e-06,
2348
+ "loss": 0.1294,
2349
+ "loss_nan_ranks": 0,
2350
+ "loss_rank_avg": 0.130423903465271,
2351
+ "step": 1065,
2352
+ "valid_targets_mean": 1313.1,
2353
+ "valid_targets_min": 618
2354
+ },
2355
+ {
2356
+ "epoch": 5.404040404040404,
2357
+ "grad_norm": 1.0164525546198238,
2358
+ "learning_rate": 1.210618852828962e-06,
2359
+ "loss": 0.1299,
2360
+ "loss_nan_ranks": 0,
2361
+ "loss_rank_avg": 0.11982132494449615,
2362
+ "step": 1070,
2363
+ "valid_targets_mean": 1178.9,
2364
+ "valid_targets_min": 576
2365
+ },
2366
+ {
2367
+ "epoch": 5.429292929292929,
2368
+ "grad_norm": 0.9239002515903104,
2369
+ "learning_rate": 1.111957198287792e-06,
2370
+ "loss": 0.1277,
2371
+ "loss_nan_ranks": 0,
2372
+ "loss_rank_avg": 0.12996883690357208,
2373
+ "step": 1075,
2374
+ "valid_targets_mean": 1293.5,
2375
+ "valid_targets_min": 640
2376
+ },
2377
+ {
2378
+ "epoch": 5.454545454545454,
2379
+ "grad_norm": 0.9511383148715302,
2380
+ "learning_rate": 1.0173736964876867e-06,
2381
+ "loss": 0.1263,
2382
+ "loss_nan_ranks": 0,
2383
+ "loss_rank_avg": 0.12540099024772644,
2384
+ "step": 1080,
2385
+ "valid_targets_mean": 1283.8,
2386
+ "valid_targets_min": 474
2387
+ },
2388
+ {
2389
+ "epoch": 5.47979797979798,
2390
+ "grad_norm": 1.0846233711368571,
2391
+ "learning_rate": 9.268887691298878e-07,
2392
+ "loss": 0.1284,
2393
+ "loss_nan_ranks": 0,
2394
+ "loss_rank_avg": 0.12933316826820374,
2395
+ "step": 1085,
2396
+ "valid_targets_mean": 1257.1,
2397
+ "valid_targets_min": 605
2398
+ },
2399
+ {
2400
+ "epoch": 5.505050505050505,
2401
+ "grad_norm": 0.9384952309399883,
2402
+ "learning_rate": 8.405219529847453e-07,
2403
+ "loss": 0.1277,
2404
+ "loss_nan_ranks": 0,
2405
+ "loss_rank_avg": 0.1350557953119278,
2406
+ "step": 1090,
2407
+ "valid_targets_mean": 1315.2,
2408
+ "valid_targets_min": 629
2409
+ },
2410
+ {
2411
+ "epoch": 5.53030303030303,
2412
+ "grad_norm": 0.9014524057924357,
2413
+ "learning_rate": 7.582918956734909e-07,
2414
+ "loss": 0.1345,
2415
+ "loss_nan_ranks": 0,
2416
+ "loss_rank_avg": 0.12424828857183456,
2417
+ "step": 1095,
2418
+ "valid_targets_mean": 1340.4,
2419
+ "valid_targets_min": 781
2420
+ },
2421
+ {
2422
+ "epoch": 5.555555555555555,
2423
+ "grad_norm": 0.9292653851483377,
2424
+ "learning_rate": 6.802163516419979e-07,
2425
+ "loss": 0.1265,
2426
+ "loss_nan_ranks": 0,
2427
+ "loss_rank_avg": 0.12965571880340576,
2428
+ "step": 1100,
2429
+ "valid_targets_mean": 1274.8,
2430
+ "valid_targets_min": 486
2431
+ },
2432
+ {
2433
+ "epoch": 5.58080808080808,
2434
+ "grad_norm": 0.943887372743222,
2435
+ "learning_rate": 6.063121783273907e-07,
2436
+ "loss": 0.1307,
2437
+ "loss_nan_ranks": 0,
2438
+ "loss_rank_avg": 0.1259051263332367,
2439
+ "step": 1105,
2440
+ "valid_targets_mean": 1327.0,
2441
+ "valid_targets_min": 836
2442
+ },
2443
+ {
2444
+ "epoch": 5.606060606060606,
2445
+ "grad_norm": 0.9737257175600421,
2446
+ "learning_rate": 5.365953325183216e-07,
2447
+ "loss": 0.1277,
2448
+ "loss_nan_ranks": 0,
2449
+ "loss_rank_avg": 0.12629470229148865,
2450
+ "step": 1110,
2451
+ "valid_targets_mean": 1253.1,
2452
+ "valid_targets_min": 762
2453
+ },
2454
+ {
2455
+ "epoch": 5.6313131313131315,
2456
+ "grad_norm": 1.0509566598389861,
2457
+ "learning_rate": 4.7108086690970113e-07,
2458
+ "loss": 0.1323,
2459
+ "loss_nan_ranks": 0,
2460
+ "loss_rank_avg": 0.13950996100902557,
2461
+ "step": 1115,
2462
+ "valid_targets_mean": 1205.4,
2463
+ "valid_targets_min": 655
2464
+ },
2465
+ {
2466
+ "epoch": 5.656565656565657,
2467
+ "grad_norm": 0.8734622130361067,
2468
+ "learning_rate": 4.0978292685265765e-07,
2469
+ "loss": 0.1235,
2470
+ "loss_nan_ranks": 0,
2471
+ "loss_rank_avg": 0.11927177011966705,
2472
+ "step": 1120,
2473
+ "valid_targets_mean": 1288.9,
2474
+ "valid_targets_min": 656
2475
+ },
2476
+ {
2477
+ "epoch": 5.681818181818182,
2478
+ "grad_norm": 1.041895306850947,
2479
+ "learning_rate": 3.5271474730037557e-07,
2480
+ "loss": 0.1302,
2481
+ "loss_nan_ranks": 0,
2482
+ "loss_rank_avg": 0.12915349006652832,
2483
+ "step": 1125,
2484
+ "valid_targets_mean": 1260.8,
2485
+ "valid_targets_min": 804
2486
+ },
2487
+ {
2488
+ "epoch": 5.707070707070707,
2489
+ "grad_norm": 0.9809767376936727,
2490
+ "learning_rate": 2.998886499505171e-07,
2491
+ "loss": 0.127,
2492
+ "loss_nan_ranks": 0,
2493
+ "loss_rank_avg": 0.1269259750843048,
2494
+ "step": 1130,
2495
+ "valid_targets_mean": 1323.3,
2496
+ "valid_targets_min": 741
2497
+ },
2498
+ {
2499
+ "epoch": 5.732323232323233,
2500
+ "grad_norm": 1.2138384974786145,
2501
+ "learning_rate": 2.513160405848303e-07,
2502
+ "loss": 0.1345,
2503
+ "loss_nan_ranks": 0,
2504
+ "loss_rank_avg": 0.14036789536476135,
2505
+ "step": 1135,
2506
+ "valid_targets_mean": 1340.8,
2507
+ "valid_targets_min": 615
2508
+ },
2509
+ {
2510
+ "epoch": 5.757575757575758,
2511
+ "grad_norm": 0.8909820802700256,
2512
+ "learning_rate": 2.0700740660648312e-07,
2513
+ "loss": 0.1328,
2514
+ "loss_nan_ranks": 0,
2515
+ "loss_rank_avg": 0.11737575381994247,
2516
+ "step": 1140,
2517
+ "valid_targets_mean": 1354.6,
2518
+ "valid_targets_min": 864
2519
+ },
2520
+ {
2521
+ "epoch": 5.782828282828283,
2522
+ "grad_norm": 1.0571471194993831,
2523
+ "learning_rate": 1.669723147757263e-07,
2524
+ "loss": 0.1284,
2525
+ "loss_nan_ranks": 0,
2526
+ "loss_rank_avg": 0.12995797395706177,
2527
+ "step": 1145,
2528
+ "valid_targets_mean": 1081.0,
2529
+ "valid_targets_min": 582
2530
+ },
2531
+ {
2532
+ "epoch": 5.808080808080808,
2533
+ "grad_norm": 1.4245132821438702,
2534
+ "learning_rate": 1.312194091443142e-07,
2535
+ "loss": 0.1331,
2536
+ "loss_nan_ranks": 0,
2537
+ "loss_rank_avg": 0.13467878103256226,
2538
+ "step": 1150,
2539
+ "valid_targets_mean": 1349.2,
2540
+ "valid_targets_min": 656
2541
+ },
2542
+ {
2543
+ "epoch": 5.833333333333333,
2544
+ "grad_norm": 0.9507679786850238,
2545
+ "learning_rate": 9.975640918915119e-08,
2546
+ "loss": 0.1306,
2547
+ "loss_nan_ranks": 0,
2548
+ "loss_rank_avg": 0.12804457545280457,
2549
+ "step": 1155,
2550
+ "valid_targets_mean": 1274.2,
2551
+ "valid_targets_min": 574
2552
+ },
2553
+ {
2554
+ "epoch": 5.858585858585858,
2555
+ "grad_norm": 0.9639470109858415,
2556
+ "learning_rate": 7.259010814555378e-08,
2557
+ "loss": 0.1304,
2558
+ "loss_nan_ranks": 0,
2559
+ "loss_rank_avg": 0.12429563701152802,
2560
+ "step": 1160,
2561
+ "valid_targets_mean": 1331.4,
2562
+ "valid_targets_min": 740
2563
+ },
2564
+ {
2565
+ "epoch": 5.883838383838384,
2566
+ "grad_norm": 0.9048090193387625,
2567
+ "learning_rate": 4.972637154052606e-08,
2568
+ "loss": 0.1292,
2569
+ "loss_nan_ranks": 0,
2570
+ "loss_rank_avg": 0.1266717165708542,
2571
+ "step": 1165,
2572
+ "valid_targets_mean": 1540.2,
2573
+ "valid_targets_min": 1033
2574
+ },
2575
+ {
2576
+ "epoch": 5.909090909090909,
2577
+ "grad_norm": 0.9137387217315224,
2578
+ "learning_rate": 3.117013592631501e-08,
2579
+ "loss": 0.1341,
2580
+ "loss_nan_ranks": 0,
2581
+ "loss_rank_avg": 0.13005393743515015,
2582
+ "step": 1170,
2583
+ "valid_targets_mean": 1407.2,
2584
+ "valid_targets_min": 683
2585
+ },
2586
+ {
2587
+ "epoch": 5.934343434343434,
2588
+ "grad_norm": 0.9221913785952132,
2589
+ "learning_rate": 1.6925407814545325e-08,
2590
+ "loss": 0.1267,
2591
+ "loss_nan_ranks": 0,
2592
+ "loss_rank_avg": 0.1224047988653183,
2593
+ "step": 1175,
2594
+ "valid_targets_mean": 1231.8,
2595
+ "valid_targets_min": 782
2596
+ },
2597
+ {
2598
+ "epoch": 5.959595959595959,
2599
+ "grad_norm": 0.89954321451429,
2600
+ "learning_rate": 6.995262811178016e-09,
2601
+ "loss": 0.1297,
2602
+ "loss_nan_ranks": 0,
2603
+ "loss_rank_avg": 0.12393134087324142,
2604
+ "step": 1180,
2605
+ "valid_targets_mean": 1373.0,
2606
+ "valid_targets_min": 746
2607
+ },
2608
+ {
2609
+ "epoch": 5.984848484848484,
2610
+ "grad_norm": 0.9614911698663842,
2611
+ "learning_rate": 1.3818449524416467e-09,
2612
+ "loss": 0.1304,
2613
+ "loss_nan_ranks": 0,
2614
+ "loss_rank_avg": 0.12797527015209198,
2615
+ "step": 1185,
2616
+ "valid_targets_mean": 1194.4,
2617
+ "valid_targets_min": 743
2618
+ },
2619
+ {
2620
+ "epoch": 6.0,
2621
+ "loss_nan_ranks": 0,
2622
+ "loss_rank_avg": 0.133074551820755,
2623
+ "step": 1188,
2624
+ "total_flos": 123574112944128.0,
2625
+ "train_loss": 0.2030692549064906,
2626
+ "train_runtime": 3957.8692,
2627
+ "train_samples_per_second": 4.79,
2628
+ "train_steps_per_second": 0.3,
2629
+ "valid_targets_mean": 1319.8,
2630
+ "valid_targets_min": 875
2631
+ }
2632
+ ],
2633
+ "logging_steps": 5,
2634
+ "max_steps": 1188,
2635
+ "num_input_tokens_seen": 0,
2636
+ "num_train_epochs": 6,
2637
+ "save_steps": 100,
2638
+ "stateful_callbacks": {
2639
+ "TrainerControl": {
2640
+ "args": {
2641
+ "should_epoch_stop": false,
2642
+ "should_evaluate": false,
2643
+ "should_log": false,
2644
+ "should_save": true,
2645
+ "should_training_stop": true
2646
+ },
2647
+ "attributes": {}
2648
+ }
2649
+ },
2650
+ "total_flos": 123574112944128.0,
2651
+ "train_batch_size": 1,
2652
+ "trial_name": null,
2653
+ "trial_params": null
2654
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67bd93f609301173c1d380511c7edcd9a67297c6f5e72816ca28043ba61fc617
3
+ size 8657
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff