Harsh1729 committed on
Commit
7818f69
·
verified ·
1 Parent(s): 328584f

Upload Datamix 2B 80pct DPO HelpSteer3 16k

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: laineyyy/openeurollm-datamix-2b-en-80pct-DPO-HelpSteer3
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: openeurollm-datamix-2b-en-80pct-DPO-HelpSteer3
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # openeurollm-datamix-2b-en-80pct-DPO-HelpSteer3
18
+
19
+ This model is a fine-tuned version of [laineyyy/openeurollm-datamix-2b-en-80pct-DPO-HelpSteer3](https://huggingface.co/laineyyy/openeurollm-datamix-2b-en-80pct-DPO-HelpSteer3) on the long_sft dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 8
44
+ - gradient_accumulation_steps: 2
45
+ - total_train_batch_size: 32
46
+ - total_eval_batch_size: 64
47
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.05
50
+ - num_epochs: 1.0
51
+
52
+ ### Training results
53
+
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.57.0
59
+ - Pytorch 2.6.0+cu124
60
+ - Datasets 4.0.0
61
+ - Tokenizers 0.22.1
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 113508824776704.0,
4
+ "train_loss": 1.074277332074394,
5
+ "train_runtime": 22414.5784,
6
+ "train_samples_per_second": 2.317,
7
+ "train_steps_per_second": 0.072
8
+ }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 2,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 106,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 8192,
15
+ "max_position_embeddings": 16384,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 24,
20
+ "num_key_value_heads": 32,
21
+ "pad_token_id": 0,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_scaling": {
25
+ "factor": 8.0,
26
+ "original_max_position_embeddings": 2048,
27
+ "rope_type": "yarn"
28
+ },
29
+ "rope_theta": 10000,
30
+ "tie_word_embeddings": true,
31
+ "transformers_version": "4.57.0",
32
+ "use_cache": false,
33
+ "vocab_size": 262272
34
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": [
5
+ 106,
6
+ 1
7
+ ],
8
+ "pad_token_id": 0,
9
+ "transformers_version": "4.57.0",
10
+ "use_cache": false
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55c2f844c7a6329e64bee0e1dd9f18635becab6fedf66c4006404fd9d0b954b0
3
+ size 4295717584
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 113508824776704.0,
4
+ "train_loss": 1.074277332074394,
5
+ "train_runtime": 22414.5784,
6
+ "train_samples_per_second": 2.317,
7
+ "train_steps_per_second": 0.072
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 1623, "loss": 1.456, "lr": 9.756097560975611e-06, "epoch": 0.0030807147258163892, "percentage": 0.31, "elapsed_time": "0:01:25", "remaining_time": "7:40:46"}
2
+ {"current_steps": 10, "total_steps": 1623, "loss": 1.6273, "lr": 2.1951219512195124e-05, "epoch": 0.0061614294516327784, "percentage": 0.62, "elapsed_time": "0:02:37", "remaining_time": "7:02:28"}
3
+ {"current_steps": 15, "total_steps": 1623, "loss": 1.3216, "lr": 3.414634146341464e-05, "epoch": 0.009242144177449169, "percentage": 0.92, "elapsed_time": "0:03:52", "remaining_time": "6:54:51"}
4
+ {"current_steps": 20, "total_steps": 1623, "loss": 1.0714, "lr": 4.634146341463415e-05, "epoch": 0.012322858903265557, "percentage": 1.23, "elapsed_time": "0:05:06", "remaining_time": "6:49:18"}
5
+ {"current_steps": 25, "total_steps": 1623, "loss": 0.8755, "lr": 5.853658536585366e-05, "epoch": 0.015403573629081947, "percentage": 1.54, "elapsed_time": "0:05:57", "remaining_time": "6:21:23"}
6
+ {"current_steps": 30, "total_steps": 1623, "loss": 1.0835, "lr": 7.073170731707317e-05, "epoch": 0.018484288354898338, "percentage": 1.85, "elapsed_time": "0:07:03", "remaining_time": "6:14:38"}
7
+ {"current_steps": 35, "total_steps": 1623, "loss": 1.1251, "lr": 8.292682926829268e-05, "epoch": 0.021565003080714726, "percentage": 2.16, "elapsed_time": "0:08:16", "remaining_time": "6:15:42"}
8
+ {"current_steps": 40, "total_steps": 1623, "loss": 1.153, "lr": 9.51219512195122e-05, "epoch": 0.024645717806531114, "percentage": 2.46, "elapsed_time": "0:09:24", "remaining_time": "6:12:30"}
9
+ {"current_steps": 45, "total_steps": 1623, "loss": 1.2438, "lr": 0.00010731707317073172, "epoch": 0.027726432532347505, "percentage": 2.77, "elapsed_time": "0:10:35", "remaining_time": "6:11:33"}
10
+ {"current_steps": 50, "total_steps": 1623, "loss": 0.9104, "lr": 0.00011951219512195122, "epoch": 0.030807147258163893, "percentage": 3.08, "elapsed_time": "0:11:31", "remaining_time": "6:02:45"}
11
+ {"current_steps": 55, "total_steps": 1623, "loss": 1.157, "lr": 0.00013170731707317076, "epoch": 0.033887861983980284, "percentage": 3.39, "elapsed_time": "0:12:51", "remaining_time": "6:06:23"}
12
+ {"current_steps": 60, "total_steps": 1623, "loss": 1.2097, "lr": 0.00014390243902439025, "epoch": 0.036968576709796676, "percentage": 3.7, "elapsed_time": "0:14:02", "remaining_time": "6:05:44"}
13
+ {"current_steps": 65, "total_steps": 1623, "loss": 1.143, "lr": 0.00015609756097560978, "epoch": 0.04004929143561306, "percentage": 4.0, "elapsed_time": "0:15:17", "remaining_time": "6:06:22"}
14
+ {"current_steps": 70, "total_steps": 1623, "loss": 1.246, "lr": 0.00016829268292682927, "epoch": 0.04313000616142945, "percentage": 4.31, "elapsed_time": "0:16:25", "remaining_time": "6:04:13"}
15
+ {"current_steps": 75, "total_steps": 1623, "loss": 0.9725, "lr": 0.0001804878048780488, "epoch": 0.04621072088724584, "percentage": 4.62, "elapsed_time": "0:17:16", "remaining_time": "5:56:41"}
16
+ {"current_steps": 80, "total_steps": 1623, "loss": 1.1749, "lr": 0.0001926829268292683, "epoch": 0.04929143561306223, "percentage": 4.93, "elapsed_time": "0:18:27", "remaining_time": "5:56:08"}
17
+ {"current_steps": 85, "total_steps": 1623, "loss": 1.3022, "lr": 0.0001999991687649223, "epoch": 0.05237215033887862, "percentage": 5.24, "elapsed_time": "0:19:35", "remaining_time": "5:54:28"}
18
+ {"current_steps": 90, "total_steps": 1623, "loss": 1.2844, "lr": 0.00019998981752900036, "epoch": 0.05545286506469501, "percentage": 5.55, "elapsed_time": "0:20:44", "remaining_time": "5:53:21"}
19
+ {"current_steps": 95, "total_steps": 1623, "loss": 1.3119, "lr": 0.00019997007698817557, "epoch": 0.0585335797905114, "percentage": 5.85, "elapsed_time": "0:21:54", "remaining_time": "5:52:28"}
20
+ {"current_steps": 100, "total_steps": 1623, "loss": 1.0251, "lr": 0.00019993994919356167, "epoch": 0.061614294516327786, "percentage": 6.16, "elapsed_time": "0:22:49", "remaining_time": "5:47:33"}
21
+ {"current_steps": 105, "total_steps": 1623, "loss": 1.2598, "lr": 0.00019989943727554598, "epoch": 0.06469500924214418, "percentage": 6.47, "elapsed_time": "0:23:57", "remaining_time": "5:46:17"}
22
+ {"current_steps": 110, "total_steps": 1623, "loss": 1.251, "lr": 0.00019984854544346367, "epoch": 0.06777572396796057, "percentage": 6.78, "elapsed_time": "0:25:08", "remaining_time": "5:45:48"}
23
+ {"current_steps": 115, "total_steps": 1623, "loss": 1.3078, "lr": 0.00019978727898516086, "epoch": 0.07085643869377696, "percentage": 7.09, "elapsed_time": "0:26:11", "remaining_time": "5:43:31"}
24
+ {"current_steps": 120, "total_steps": 1623, "loss": 1.2101, "lr": 0.0001997156442664449, "epoch": 0.07393715341959335, "percentage": 7.39, "elapsed_time": "0:27:22", "remaining_time": "5:42:49"}
25
+ {"current_steps": 125, "total_steps": 1623, "loss": 1.0401, "lr": 0.00019963364873042298, "epoch": 0.07701786814540973, "percentage": 7.7, "elapsed_time": "0:28:16", "remaining_time": "5:38:53"}
26
+ {"current_steps": 130, "total_steps": 1623, "loss": 1.3897, "lr": 0.0001995413008967289, "epoch": 0.08009858287122612, "percentage": 8.01, "elapsed_time": "0:29:25", "remaining_time": "5:37:59"}
27
+ {"current_steps": 135, "total_steps": 1623, "loss": 1.2378, "lr": 0.00019943861036063768, "epoch": 0.08317929759704251, "percentage": 8.32, "elapsed_time": "0:30:41", "remaining_time": "5:38:14"}
28
+ {"current_steps": 140, "total_steps": 1623, "loss": 1.3101, "lr": 0.00019932558779206874, "epoch": 0.0862600123228589, "percentage": 8.63, "elapsed_time": "0:31:57", "remaining_time": "5:38:32"}
29
+ {"current_steps": 145, "total_steps": 1623, "loss": 1.3698, "lr": 0.00019920224493447702, "epoch": 0.0893407270486753, "percentage": 8.93, "elapsed_time": "0:33:08", "remaining_time": "5:37:43"}
30
+ {"current_steps": 150, "total_steps": 1623, "loss": 1.0169, "lr": 0.00019906859460363307, "epoch": 0.09242144177449169, "percentage": 9.24, "elapsed_time": "0:34:03", "remaining_time": "5:34:24"}
31
+ {"current_steps": 155, "total_steps": 1623, "loss": 1.5459, "lr": 0.00019892465068629131, "epoch": 0.09550215650030808, "percentage": 9.55, "elapsed_time": "0:35:18", "remaining_time": "5:34:26"}
32
+ {"current_steps": 160, "total_steps": 1623, "loss": 1.5477, "lr": 0.0001987704281387471, "epoch": 0.09858287122612445, "percentage": 9.86, "elapsed_time": "0:36:37", "remaining_time": "5:34:51"}
33
+ {"current_steps": 165, "total_steps": 1623, "loss": 1.4966, "lr": 0.00019860594298528282, "epoch": 0.10166358595194085, "percentage": 10.17, "elapsed_time": "0:37:54", "remaining_time": "5:35:02"}
34
+ {"current_steps": 170, "total_steps": 1623, "loss": 1.5506, "lr": 0.0001984312123165028, "epoch": 0.10474430067775724, "percentage": 10.47, "elapsed_time": "0:39:08", "remaining_time": "5:34:31"}
35
+ {"current_steps": 175, "total_steps": 1623, "loss": 1.0489, "lr": 0.0001982462542875576, "epoch": 0.10782501540357363, "percentage": 10.78, "elapsed_time": "0:40:03", "remaining_time": "5:31:27"}
36
+ {"current_steps": 180, "total_steps": 1623, "loss": 1.3007, "lr": 0.00019805108811625773, "epoch": 0.11090573012939002, "percentage": 11.09, "elapsed_time": "0:41:14", "remaining_time": "5:30:33"}
37
+ {"current_steps": 185, "total_steps": 1623, "loss": 1.2466, "lr": 0.00019784573408107657, "epoch": 0.11398644485520641, "percentage": 11.4, "elapsed_time": "0:42:27", "remaining_time": "5:30:04"}
38
+ {"current_steps": 190, "total_steps": 1623, "loss": 1.254, "lr": 0.00019763021351904358, "epoch": 0.1170671595810228, "percentage": 11.71, "elapsed_time": "0:43:42", "remaining_time": "5:29:39"}
39
+ {"current_steps": 195, "total_steps": 1623, "loss": 1.2234, "lr": 0.00019740454882352732, "epoch": 0.12014787430683918, "percentage": 12.01, "elapsed_time": "0:44:53", "remaining_time": "5:28:43"}
40
+ {"current_steps": 200, "total_steps": 1623, "loss": 0.9971, "lr": 0.0001971687634419086, "epoch": 0.12322858903265557, "percentage": 12.32, "elapsed_time": "0:45:50", "remaining_time": "5:26:09"}
41
+ {"current_steps": 205, "total_steps": 1623, "loss": 1.3131, "lr": 0.0001969228818731442, "epoch": 0.12630930375847196, "percentage": 12.63, "elapsed_time": "0:47:07", "remaining_time": "5:25:57"}
42
+ {"current_steps": 210, "total_steps": 1623, "loss": 1.3129, "lr": 0.00019666692966522145, "epoch": 0.12939001848428835, "percentage": 12.94, "elapsed_time": "0:48:26", "remaining_time": "5:25:58"}
43
+ {"current_steps": 215, "total_steps": 1623, "loss": 1.2031, "lr": 0.00019640093341250357, "epoch": 0.13247073321010475, "percentage": 13.25, "elapsed_time": "0:49:35", "remaining_time": "5:24:45"}
44
+ {"current_steps": 220, "total_steps": 1623, "loss": 1.2734, "lr": 0.0001961249207529665, "epoch": 0.13555144793592114, "percentage": 13.56, "elapsed_time": "0:50:50", "remaining_time": "5:24:16"}
45
+ {"current_steps": 225, "total_steps": 1623, "loss": 1.0102, "lr": 0.00019583892036532726, "epoch": 0.13863216266173753, "percentage": 13.86, "elapsed_time": "0:51:47", "remaining_time": "5:21:50"}
46
+ {"current_steps": 230, "total_steps": 1623, "loss": 1.2675, "lr": 0.00019554296196606395, "epoch": 0.14171287738755392, "percentage": 14.17, "elapsed_time": "0:52:59", "remaining_time": "5:20:54"}
47
+ {"current_steps": 235, "total_steps": 1623, "loss": 1.205, "lr": 0.00019523707630632835, "epoch": 0.1447935921133703, "percentage": 14.48, "elapsed_time": "0:54:12", "remaining_time": "5:20:08"}
48
+ {"current_steps": 240, "total_steps": 1623, "loss": 1.2834, "lr": 0.00019492129516875055, "epoch": 0.1478743068391867, "percentage": 14.79, "elapsed_time": "0:55:30", "remaining_time": "5:19:52"}
49
+ {"current_steps": 245, "total_steps": 1623, "loss": 1.3999, "lr": 0.00019459565136413666, "epoch": 0.15095502156500307, "percentage": 15.1, "elapsed_time": "0:56:39", "remaining_time": "5:18:37"}
50
+ {"current_steps": 250, "total_steps": 1623, "loss": 1.0177, "lr": 0.0001942601787280598, "epoch": 0.15403573629081946, "percentage": 15.4, "elapsed_time": "0:57:33", "remaining_time": "5:16:06"}
51
+ {"current_steps": 255, "total_steps": 1623, "loss": 1.2871, "lr": 0.00019391491211734425, "epoch": 0.15711645101663585, "percentage": 15.71, "elapsed_time": "0:58:43", "remaining_time": "5:15:03"}
52
+ {"current_steps": 260, "total_steps": 1623, "loss": 1.2824, "lr": 0.0001935598874064438, "epoch": 0.16019716574245224, "percentage": 16.02, "elapsed_time": "0:59:53", "remaining_time": "5:13:56"}
53
+ {"current_steps": 265, "total_steps": 1623, "loss": 1.1658, "lr": 0.00019319514148371435, "epoch": 0.16327788046826863, "percentage": 16.33, "elapsed_time": "1:00:58", "remaining_time": "5:12:28"}
54
+ {"current_steps": 270, "total_steps": 1623, "loss": 1.2408, "lr": 0.00019282071224758091, "epoch": 0.16635859519408502, "percentage": 16.64, "elapsed_time": "1:02:08", "remaining_time": "5:11:24"}
55
+ {"current_steps": 275, "total_steps": 1623, "loss": 1.0435, "lr": 0.00019243663860259993, "epoch": 0.16943930991990142, "percentage": 16.94, "elapsed_time": "1:03:03", "remaining_time": "5:09:04"}
56
+ {"current_steps": 280, "total_steps": 1623, "loss": 1.2576, "lr": 0.00019204296045541685, "epoch": 0.1725200246457178, "percentage": 17.25, "elapsed_time": "1:04:19", "remaining_time": "5:08:29"}
57
+ {"current_steps": 285, "total_steps": 1623, "loss": 1.1854, "lr": 0.0001916397187106199, "epoch": 0.1756007393715342, "percentage": 17.56, "elapsed_time": "1:05:27", "remaining_time": "5:07:16"}
58
+ {"current_steps": 290, "total_steps": 1623, "loss": 1.1993, "lr": 0.00019122695526648968, "epoch": 0.1786814540973506, "percentage": 17.87, "elapsed_time": "1:06:42", "remaining_time": "5:06:39"}
59
+ {"current_steps": 295, "total_steps": 1623, "loss": 1.3542, "lr": 0.00019080471301064598, "epoch": 0.18176216882316698, "percentage": 18.18, "elapsed_time": "1:07:52", "remaining_time": "5:05:31"}
60
+ {"current_steps": 300, "total_steps": 1623, "loss": 1.0038, "lr": 0.00019037303581559143, "epoch": 0.18484288354898337, "percentage": 18.48, "elapsed_time": "1:08:45", "remaining_time": "5:03:14"}
61
+ {"current_steps": 305, "total_steps": 1623, "loss": 1.2003, "lr": 0.00018993196853415317, "epoch": 0.18792359827479976, "percentage": 18.79, "elapsed_time": "1:09:58", "remaining_time": "5:02:22"}
62
+ {"current_steps": 310, "total_steps": 1623, "loss": 1.1497, "lr": 0.00018948155699482244, "epoch": 0.19100431300061615, "percentage": 19.1, "elapsed_time": "1:11:08", "remaining_time": "5:01:17"}
63
+ {"current_steps": 315, "total_steps": 1623, "loss": 1.2854, "lr": 0.00018902184799699263, "epoch": 0.19408502772643252, "percentage": 19.41, "elapsed_time": "1:12:23", "remaining_time": "5:00:35"}
64
+ {"current_steps": 320, "total_steps": 1623, "loss": 1.2265, "lr": 0.00018855288930609692, "epoch": 0.1971657424522489, "percentage": 19.72, "elapsed_time": "1:13:31", "remaining_time": "4:59:23"}
65
+ {"current_steps": 325, "total_steps": 1623, "loss": 1.0144, "lr": 0.00018807472964864515, "epoch": 0.2002464571780653, "percentage": 20.02, "elapsed_time": "1:14:24", "remaining_time": "4:57:12"}
66
+ {"current_steps": 330, "total_steps": 1623, "loss": 1.2764, "lr": 0.00018758741870716092, "epoch": 0.2033271719038817, "percentage": 20.33, "elapsed_time": "1:15:36", "remaining_time": "4:56:14"}
67
+ {"current_steps": 335, "total_steps": 1623, "loss": 1.2892, "lr": 0.00018709100711501955, "epoch": 0.20640788662969808, "percentage": 20.64, "elapsed_time": "1:16:49", "remaining_time": "4:55:23"}
68
+ {"current_steps": 340, "total_steps": 1623, "loss": 1.1708, "lr": 0.0001865855464511869, "epoch": 0.20948860135551448, "percentage": 20.95, "elapsed_time": "1:18:04", "remaining_time": "4:54:35"}
69
+ {"current_steps": 345, "total_steps": 1623, "loss": 1.0886, "lr": 0.00018607108923486025, "epoch": 0.21256931608133087, "percentage": 21.26, "elapsed_time": "1:19:13", "remaining_time": "4:53:27"}
70
+ {"current_steps": 350, "total_steps": 1623, "loss": 0.9796, "lr": 0.00018554768892001136, "epoch": 0.21565003080714726, "percentage": 21.57, "elapsed_time": "1:20:06", "remaining_time": "4:51:22"}
71
+ {"current_steps": 355, "total_steps": 1623, "loss": 1.0829, "lr": 0.00018501539988983234, "epoch": 0.21873074553296365, "percentage": 21.87, "elapsed_time": "1:21:22", "remaining_time": "4:50:38"}
72
+ {"current_steps": 360, "total_steps": 1623, "loss": 1.19, "lr": 0.0001844742774510851, "epoch": 0.22181146025878004, "percentage": 22.18, "elapsed_time": "1:22:35", "remaining_time": "4:49:47"}
73
+ {"current_steps": 365, "total_steps": 1623, "loss": 1.2743, "lr": 0.00018392437782835475, "epoch": 0.22489217498459643, "percentage": 22.49, "elapsed_time": "1:23:48", "remaining_time": "4:48:50"}
74
+ {"current_steps": 370, "total_steps": 1623, "loss": 1.2868, "lr": 0.00018336575815820766, "epoch": 0.22797288971041282, "percentage": 22.8, "elapsed_time": "1:25:01", "remaining_time": "4:47:55"}
75
+ {"current_steps": 375, "total_steps": 1623, "loss": 1.0349, "lr": 0.00018279847648325478, "epoch": 0.23105360443622922, "percentage": 23.11, "elapsed_time": "1:25:57", "remaining_time": "4:46:05"}
76
+ {"current_steps": 380, "total_steps": 1623, "loss": 1.2061, "lr": 0.0001822225917461208, "epoch": 0.2341343191620456, "percentage": 23.41, "elapsed_time": "1:27:14", "remaining_time": "4:45:20"}
77
+ {"current_steps": 385, "total_steps": 1623, "loss": 1.1963, "lr": 0.0001816381637833198, "epoch": 0.23721503388786197, "percentage": 23.72, "elapsed_time": "1:28:23", "remaining_time": "4:44:12"}
78
+ {"current_steps": 390, "total_steps": 1623, "loss": 1.2712, "lr": 0.00018104525331903799, "epoch": 0.24029574861367836, "percentage": 24.03, "elapsed_time": "1:29:36", "remaining_time": "4:43:16"}
79
+ {"current_steps": 395, "total_steps": 1623, "loss": 1.3406, "lr": 0.00018044392195882427, "epoch": 0.24337646333949475, "percentage": 24.34, "elapsed_time": "1:30:51", "remaining_time": "4:42:26"}
80
+ {"current_steps": 400, "total_steps": 1623, "loss": 1.0176, "lr": 0.00017983423218318918, "epoch": 0.24645717806531114, "percentage": 24.65, "elapsed_time": "1:31:48", "remaining_time": "4:40:42"}
81
+ {"current_steps": 405, "total_steps": 1623, "loss": 1.1581, "lr": 0.00017921624734111292, "epoch": 0.24953789279112754, "percentage": 24.95, "elapsed_time": "1:32:59", "remaining_time": "4:39:40"}
82
+ {"current_steps": 410, "total_steps": 1623, "loss": 1.288, "lr": 0.00017859003164346336, "epoch": 0.2526186075169439, "percentage": 25.26, "elapsed_time": "1:34:11", "remaining_time": "4:38:39"}
83
+ {"current_steps": 415, "total_steps": 1623, "loss": 1.217, "lr": 0.0001779556501563239, "epoch": 0.2556993222427603, "percentage": 25.57, "elapsed_time": "1:35:26", "remaining_time": "4:37:49"}
84
+ {"current_steps": 420, "total_steps": 1623, "loss": 1.239, "lr": 0.00017731316879423327, "epoch": 0.2587800369685767, "percentage": 25.88, "elapsed_time": "1:36:37", "remaining_time": "4:36:45"}
85
+ {"current_steps": 425, "total_steps": 1623, "loss": 1.0663, "lr": 0.00017666265431333654, "epoch": 0.2618607516943931, "percentage": 26.19, "elapsed_time": "1:37:34", "remaining_time": "4:35:03"}
86
+ {"current_steps": 430, "total_steps": 1623, "loss": 1.2585, "lr": 0.000176004174304449, "epoch": 0.2649414664202095, "percentage": 26.49, "elapsed_time": "1:38:48", "remaining_time": "4:34:09"}
87
+ {"current_steps": 435, "total_steps": 1623, "loss": 1.1862, "lr": 0.00017533779718603313, "epoch": 0.2680221811460259, "percentage": 26.8, "elapsed_time": "1:40:05", "remaining_time": "4:33:19"}
88
+ {"current_steps": 440, "total_steps": 1623, "loss": 1.218, "lr": 0.00017466359219708985, "epoch": 0.2711028958718423, "percentage": 27.11, "elapsed_time": "1:41:18", "remaining_time": "4:32:23"}
89
+ {"current_steps": 445, "total_steps": 1623, "loss": 1.1601, "lr": 0.00017398162938996422, "epoch": 0.27418361059765867, "percentage": 27.42, "elapsed_time": "1:42:29", "remaining_time": "4:31:19"}
90
+ {"current_steps": 450, "total_steps": 1623, "loss": 1.0049, "lr": 0.00017329197962306664, "epoch": 0.27726432532347506, "percentage": 27.73, "elapsed_time": "1:43:23", "remaining_time": "4:29:30"}
91
+ {"current_steps": 455, "total_steps": 1623, "loss": 1.0931, "lr": 0.00017259471455351072, "epoch": 0.28034504004929145, "percentage": 28.03, "elapsed_time": "1:44:30", "remaining_time": "4:28:17"}
92
+ {"current_steps": 460, "total_steps": 1623, "loss": 1.1999, "lr": 0.0001718899066296675, "epoch": 0.28342575477510784, "percentage": 28.34, "elapsed_time": "1:45:45", "remaining_time": "4:27:24"}
93
+ {"current_steps": 465, "total_steps": 1623, "loss": 1.2684, "lr": 0.000171177629083638, "epoch": 0.28650646950092423, "percentage": 28.65, "elapsed_time": "1:46:57", "remaining_time": "4:26:22"}
94
+ {"current_steps": 470, "total_steps": 1623, "loss": 1.2786, "lr": 0.0001704579559236441, "epoch": 0.2895871842267406, "percentage": 28.96, "elapsed_time": "1:48:06", "remaining_time": "4:25:13"}
95
+ {"current_steps": 475, "total_steps": 1623, "loss": 0.9667, "lr": 0.00016973096192633884, "epoch": 0.292667898952557, "percentage": 29.27, "elapsed_time": "1:49:01", "remaining_time": "4:23:30"}
96
+ {"current_steps": 480, "total_steps": 1623, "loss": 1.1185, "lr": 0.00016899672262903677, "epoch": 0.2957486136783734, "percentage": 29.57, "elapsed_time": "1:50:11", "remaining_time": "4:22:24"}
97
+ {"current_steps": 485, "total_steps": 1623, "loss": 1.2081, "lr": 0.00016825531432186543, "epoch": 0.2988293284041898, "percentage": 29.88, "elapsed_time": "1:51:25", "remaining_time": "4:21:27"}
98
+ {"current_steps": 490, "total_steps": 1623, "loss": 1.3, "lr": 0.00016750681403983846, "epoch": 0.30191004313000613, "percentage": 30.19, "elapsed_time": "1:52:38", "remaining_time": "4:20:28"}
99
+ {"current_steps": 495, "total_steps": 1623, "loss": 1.203, "lr": 0.00016675129955485152, "epoch": 0.3049907578558225, "percentage": 30.5, "elapsed_time": "1:53:51", "remaining_time": "4:19:26"}
100
+ {"current_steps": 500, "total_steps": 1623, "loss": 0.9961, "lr": 0.00016598884936760131, "epoch": 0.3080714725816389, "percentage": 30.81, "elapsed_time": "1:54:50", "remaining_time": "4:17:55"}
101
+ {"current_steps": 505, "total_steps": 1623, "loss": 1.182, "lr": 0.00016521954269942918, "epoch": 0.3111521873074553, "percentage": 31.12, "elapsed_time": "1:56:55", "remaining_time": "4:18:51"}
102
+ {"current_steps": 510, "total_steps": 1623, "loss": 1.1644, "lr": 0.00016444345948408984, "epoch": 0.3142329020332717, "percentage": 31.42, "elapsed_time": "1:58:08", "remaining_time": "4:17:50"}
103
+ {"current_steps": 515, "total_steps": 1623, "loss": 1.2397, "lr": 0.0001636606803594457, "epoch": 0.3173136167590881, "percentage": 31.73, "elapsed_time": "1:59:22", "remaining_time": "4:16:50"}
104
+ {"current_steps": 520, "total_steps": 1623, "loss": 1.227, "lr": 0.0001628712866590885, "epoch": 0.3203943314849045, "percentage": 32.04, "elapsed_time": "2:00:36", "remaining_time": "4:15:49"}
105
+ {"current_steps": 525, "total_steps": 1623, "loss": 0.9743, "lr": 0.00016207536040388845, "epoch": 0.3234750462107209, "percentage": 32.35, "elapsed_time": "2:01:33", "remaining_time": "4:14:13"}
106
+ {"current_steps": 530, "total_steps": 1623, "loss": 1.2353, "lr": 0.0001612729842934718, "epoch": 0.32655576093653726, "percentage": 32.66, "elapsed_time": "2:02:42", "remaining_time": "4:13:03"}
107
+ {"current_steps": 535, "total_steps": 1623, "loss": 1.2453, "lr": 0.00016046424169762827, "epoch": 0.32963647566235366, "percentage": 32.96, "elapsed_time": "2:03:52", "remaining_time": "4:11:55"}
108
+ {"current_steps": 540, "total_steps": 1623, "loss": 1.2025, "lr": 0.0001596492166476485, "epoch": 0.33271719038817005, "percentage": 33.27, "elapsed_time": "2:05:06", "remaining_time": "4:10:54"}
109
+ {"current_steps": 545, "total_steps": 1623, "loss": 1.187, "lr": 0.0001588279938275929, "epoch": 0.33579790511398644, "percentage": 33.58, "elapsed_time": "2:06:14", "remaining_time": "4:09:42"}
110
+ {"current_steps": 550, "total_steps": 1623, "loss": 0.9161, "lr": 0.00015800065856549269, "epoch": 0.33887861983980283, "percentage": 33.89, "elapsed_time": "2:07:11", "remaining_time": "4:08:08"}
111
+ {"current_steps": 555, "total_steps": 1623, "loss": 1.183, "lr": 0.00015716729682448393, "epoch": 0.3419593345656192, "percentage": 34.2, "elapsed_time": "2:08:22", "remaining_time": "4:07:02"}
112
+ {"current_steps": 560, "total_steps": 1623, "loss": 1.1642, "lr": 0.0001563279951938758, "epoch": 0.3450400492914356, "percentage": 34.5, "elapsed_time": "2:09:38", "remaining_time": "4:06:05"}
113
+ {"current_steps": 565, "total_steps": 1623, "loss": 1.1843, "lr": 0.00015548284088015354, "epoch": 0.348120764017252, "percentage": 34.81, "elapsed_time": "2:10:49", "remaining_time": "4:04:59"}
114
+ {"current_steps": 570, "total_steps": 1623, "loss": 1.188, "lr": 0.00015463192169791741, "epoch": 0.3512014787430684, "percentage": 35.12, "elapsed_time": "2:11:58", "remaining_time": "4:03:48"}
115
+ {"current_steps": 575, "total_steps": 1623, "loss": 0.9604, "lr": 0.0001537753260607584, "epoch": 0.3542821934688848, "percentage": 35.43, "elapsed_time": "2:12:53", "remaining_time": "4:02:12"}
116
+ {"current_steps": 580, "total_steps": 1623, "loss": 1.1923, "lr": 0.00015291314297207175, "epoch": 0.3573629081947012, "percentage": 35.74, "elapsed_time": "2:14:08", "remaining_time": "4:01:13"}
117
+ {"current_steps": 585, "total_steps": 1623, "loss": 1.1974, "lr": 0.0001520454620158093, "epoch": 0.36044362292051757, "percentage": 36.04, "elapsed_time": "2:15:18", "remaining_time": "4:00:04"}
118
+ {"current_steps": 590, "total_steps": 1623, "loss": 1.0945, "lr": 0.00015117237334717117, "epoch": 0.36352433764633396, "percentage": 36.35, "elapsed_time": "2:16:32", "remaining_time": "3:59:02"}
119
+ {"current_steps": 595, "total_steps": 1623, "loss": 1.2735, "lr": 0.00015029396768323846, "epoch": 0.36660505237215035, "percentage": 36.66, "elapsed_time": "2:17:41", "remaining_time": "3:57:54"}
120
+ {"current_steps": 600, "total_steps": 1623, "loss": 1.0361, "lr": 0.00014941033629354734, "epoch": 0.36968576709796674, "percentage": 36.97, "elapsed_time": "2:18:39", "remaining_time": "3:56:24"}
121
+ {"current_steps": 605, "total_steps": 1623, "loss": 1.2175, "lr": 0.00014852157099060596, "epoch": 0.37276648182378314, "percentage": 37.28, "elapsed_time": "2:19:52", "remaining_time": "3:55:21"}
122
+ {"current_steps": 610, "total_steps": 1623, "loss": 1.2687, "lr": 0.00014762776412035456, "epoch": 0.3758471965495995, "percentage": 37.58, "elapsed_time": "2:21:09", "remaining_time": "3:54:24"}
123
+ {"current_steps": 615, "total_steps": 1623, "loss": 1.1694, "lr": 0.00014672900855257056, "epoch": 0.3789279112754159, "percentage": 37.89, "elapsed_time": "2:22:21", "remaining_time": "3:53:20"}
124
+ {"current_steps": 620, "total_steps": 1623, "loss": 1.2024, "lr": 0.00014582539767121904, "epoch": 0.3820086260012323, "percentage": 38.2, "elapsed_time": "2:23:31", "remaining_time": "3:52:10"}
125
+ {"current_steps": 625, "total_steps": 1623, "loss": 0.8874, "lr": 0.0001449170253647498, "epoch": 0.3850893407270487, "percentage": 38.51, "elapsed_time": "2:24:26", "remaining_time": "3:50:38"}
126
+ {"current_steps": 630, "total_steps": 1623, "loss": 1.2114, "lr": 0.0001440039860163419, "epoch": 0.38817005545286504, "percentage": 38.82, "elapsed_time": "2:25:42", "remaining_time": "3:49:40"}
127
+ {"current_steps": 635, "total_steps": 1623, "loss": 1.1644, "lr": 0.00014308637449409706, "epoch": 0.39125077017868143, "percentage": 39.13, "elapsed_time": "2:26:58", "remaining_time": "3:48:41"}
128
+ {"current_steps": 640, "total_steps": 1623, "loss": 1.1707, "lr": 0.00014216428614118243, "epoch": 0.3943314849044978, "percentage": 39.43, "elapsed_time": "2:28:12", "remaining_time": "3:47:37"}
129
+ {"current_steps": 645, "total_steps": 1623, "loss": 1.1516, "lr": 0.00014123781676592418, "epoch": 0.3974121996303142, "percentage": 39.74, "elapsed_time": "2:29:22", "remaining_time": "3:46:29"}
130
+ {"current_steps": 650, "total_steps": 1623, "loss": 0.9869, "lr": 0.00014030706263185247, "epoch": 0.4004929143561306, "percentage": 40.05, "elapsed_time": "2:30:18", "remaining_time": "3:44:59"}
131
+ {"current_steps": 655, "total_steps": 1623, "loss": 1.1226, "lr": 0.00013937212044769955, "epoch": 0.403573629081947, "percentage": 40.36, "elapsed_time": "2:31:29", "remaining_time": "3:43:53"}
132
+ {"current_steps": 660, "total_steps": 1623, "loss": 1.2051, "lr": 0.0001384330873573513, "epoch": 0.4066543438077634, "percentage": 40.67, "elapsed_time": "2:32:43", "remaining_time": "3:42:50"}
133
+ {"current_steps": 665, "total_steps": 1623, "loss": 1.2038, "lr": 0.00013749006092975347, "epoch": 0.4097350585335798, "percentage": 40.97, "elapsed_time": "2:33:51", "remaining_time": "3:41:38"}
134
+ {"current_steps": 670, "total_steps": 1623, "loss": 1.202, "lr": 0.00013654313914877414, "epoch": 0.41281577325939617, "percentage": 41.28, "elapsed_time": "2:34:59", "remaining_time": "3:40:27"}
135
+ {"current_steps": 675, "total_steps": 1623, "loss": 0.9386, "lr": 0.00013559242040302272, "epoch": 0.41589648798521256, "percentage": 41.59, "elapsed_time": "2:35:56", "remaining_time": "3:39:00"}
136
+ {"current_steps": 680, "total_steps": 1623, "loss": 1.1778, "lr": 0.00013463800347562706, "epoch": 0.41897720271102895, "percentage": 41.9, "elapsed_time": "2:37:07", "remaining_time": "3:37:54"}
137
+ {"current_steps": 685, "total_steps": 1623, "loss": 1.0864, "lr": 0.00013367998753396944, "epoch": 0.42205791743684534, "percentage": 42.21, "elapsed_time": "2:38:21", "remaining_time": "3:36:50"}
138
+ {"current_steps": 690, "total_steps": 1623, "loss": 1.1923, "lr": 0.00013271847211938285, "epoch": 0.42513863216266173, "percentage": 42.51, "elapsed_time": "2:39:34", "remaining_time": "3:35:46"}
139
+ {"current_steps": 695, "total_steps": 1623, "loss": 1.1341, "lr": 0.0001317535571368082, "epoch": 0.4282193468884781, "percentage": 42.82, "elapsed_time": "2:40:47", "remaining_time": "3:34:41"}
140
+ {"current_steps": 700, "total_steps": 1623, "loss": 1.0109, "lr": 0.00013078534284441382, "epoch": 0.4313000616142945, "percentage": 43.13, "elapsed_time": "2:41:43", "remaining_time": "3:33:15"}
141
+ {"current_steps": 705, "total_steps": 1623, "loss": 1.0852, "lr": 0.00012981392984317834, "epoch": 0.4343807763401109, "percentage": 43.44, "elapsed_time": "2:42:51", "remaining_time": "3:32:04"}
142
+ {"current_steps": 710, "total_steps": 1623, "loss": 1.2037, "lr": 0.00012883941906643786, "epoch": 0.4374614910659273, "percentage": 43.75, "elapsed_time": "2:44:05", "remaining_time": "3:31:00"}
143
+ {"current_steps": 715, "total_steps": 1623, "loss": 1.1009, "lr": 0.00012786191176939848, "epoch": 0.4405422057917437, "percentage": 44.05, "elapsed_time": "2:45:12", "remaining_time": "3:29:48"}
144
+ {"current_steps": 720, "total_steps": 1623, "loss": 1.0769, "lr": 0.00012688150951861582, "epoch": 0.4436229205175601, "percentage": 44.36, "elapsed_time": "2:46:23", "remaining_time": "3:28:40"}
145
+ {"current_steps": 725, "total_steps": 1623, "loss": 0.9434, "lr": 0.00012589831418144154, "epoch": 0.4467036352433765, "percentage": 44.67, "elapsed_time": "2:47:18", "remaining_time": "3:27:13"}
146
+ {"current_steps": 730, "total_steps": 1623, "loss": 1.1517, "lr": 0.00012491242791543922, "epoch": 0.44978434996919286, "percentage": 44.98, "elapsed_time": "2:48:28", "remaining_time": "3:26:05"}
147
+ {"current_steps": 735, "total_steps": 1623, "loss": 1.1303, "lr": 0.00012392395315776963, "epoch": 0.45286506469500926, "percentage": 45.29, "elapsed_time": "2:49:36", "remaining_time": "3:24:55"}
148
+ {"current_steps": 740, "total_steps": 1623, "loss": 1.1156, "lr": 0.00012293299261454725, "epoch": 0.45594577942082565, "percentage": 45.59, "elapsed_time": "2:50:48", "remaining_time": "3:23:49"}
149
+ {"current_steps": 745, "total_steps": 1623, "loss": 1.0727, "lr": 0.00012193964925016872, "epoch": 0.45902649414664204, "percentage": 45.9, "elapsed_time": "2:52:00", "remaining_time": "3:22:42"}
150
+ {"current_steps": 750, "total_steps": 1623, "loss": 0.9198, "lr": 0.00012094402627661447, "epoch": 0.46210720887245843, "percentage": 46.21, "elapsed_time": "2:52:55", "remaining_time": "3:21:17"}
151
+ {"current_steps": 755, "total_steps": 1623, "loss": 1.1152, "lr": 0.00011994622714272448, "epoch": 0.4651879235982748, "percentage": 46.52, "elapsed_time": "2:54:10", "remaining_time": "3:20:14"}
152
+ {"current_steps": 760, "total_steps": 1623, "loss": 1.1196, "lr": 0.00011894635552344975, "epoch": 0.4682686383240912, "percentage": 46.83, "elapsed_time": "2:55:24", "remaining_time": "3:19:10"}
153
+ {"current_steps": 765, "total_steps": 1623, "loss": 1.1229, "lr": 0.00011794451530908011, "epoch": 0.4713493530499076, "percentage": 47.13, "elapsed_time": "2:56:34", "remaining_time": "3:18:02"}
154
+ {"current_steps": 770, "total_steps": 1623, "loss": 1.1024, "lr": 0.00011694081059444946, "epoch": 0.47443006777572394, "percentage": 47.44, "elapsed_time": "2:57:44", "remaining_time": "3:16:54"}
155
+ {"current_steps": 775, "total_steps": 1623, "loss": 0.9312, "lr": 0.0001159353456681201, "epoch": 0.47751078250154033, "percentage": 47.75, "elapsed_time": "2:58:41", "remaining_time": "3:15:31"}
156
+ {"current_steps": 780, "total_steps": 1623, "loss": 1.1166, "lr": 0.00011492822500154667, "epoch": 0.4805914972273567, "percentage": 48.06, "elapsed_time": "2:59:51", "remaining_time": "3:14:22"}
157
+ {"current_steps": 785, "total_steps": 1623, "loss": 1.0645, "lr": 0.00011391955323822126, "epoch": 0.4836722119531731, "percentage": 48.37, "elapsed_time": "3:00:59", "remaining_time": "3:13:13"}
158
+ {"current_steps": 790, "total_steps": 1623, "loss": 1.1993, "lr": 0.00011290943518280057, "epoch": 0.4867529266789895, "percentage": 48.68, "elapsed_time": "3:02:15", "remaining_time": "3:12:11"}
159
+ {"current_steps": 795, "total_steps": 1623, "loss": 1.0944, "lr": 0.0001118979757902162, "epoch": 0.4898336414048059, "percentage": 48.98, "elapsed_time": "3:03:25", "remaining_time": "3:11:02"}
160
+ {"current_steps": 800, "total_steps": 1623, "loss": 0.9793, "lr": 0.00011088528015476964, "epoch": 0.4929143561306223, "percentage": 49.29, "elapsed_time": "3:04:21", "remaining_time": "3:09:39"}
161
+ {"current_steps": 805, "total_steps": 1623, "loss": 1.0437, "lr": 0.00010987145349921251, "epoch": 0.4959950708564387, "percentage": 49.6, "elapsed_time": "3:05:32", "remaining_time": "3:08:32"}
162
+ {"current_steps": 810, "total_steps": 1623, "loss": 1.0949, "lr": 0.0001088566011638134, "epoch": 0.49907578558225507, "percentage": 49.91, "elapsed_time": "3:06:45", "remaining_time": "3:07:27"}
163
+ {"current_steps": 815, "total_steps": 1623, "loss": 1.0549, "lr": 0.00010784082859541292, "epoch": 0.5021565003080715, "percentage": 50.22, "elapsed_time": "3:07:56", "remaining_time": "3:06:19"}
164
+ {"current_steps": 820, "total_steps": 1623, "loss": 1.1154, "lr": 0.0001068242413364671, "epoch": 0.5052372150338879, "percentage": 50.52, "elapsed_time": "3:09:07", "remaining_time": "3:05:12"}
165
+ {"current_steps": 825, "total_steps": 1623, "loss": 0.9391, "lr": 0.00010580694501408138, "epoch": 0.5083179297597042, "percentage": 50.83, "elapsed_time": "3:10:02", "remaining_time": "3:03:49"}
166
+ {"current_steps": 830, "total_steps": 1623, "loss": 1.0181, "lr": 0.00010478904532903535, "epoch": 0.5113986444855206, "percentage": 51.14, "elapsed_time": "3:11:10", "remaining_time": "3:02:38"}
167
+ {"current_steps": 835, "total_steps": 1623, "loss": 1.1629, "lr": 0.00010377064804480025, "epoch": 0.514479359211337, "percentage": 51.45, "elapsed_time": "3:12:26", "remaining_time": "3:01:36"}
168
+ {"current_steps": 840, "total_steps": 1623, "loss": 0.9915, "lr": 0.00010275185897654971, "epoch": 0.5175600739371534, "percentage": 51.76, "elapsed_time": "3:13:37", "remaining_time": "3:00:29"}
169
+ {"current_steps": 845, "total_steps": 1623, "loss": 1.0566, "lr": 0.00010173278398016501, "epoch": 0.5206407886629698, "percentage": 52.06, "elapsed_time": "3:14:47", "remaining_time": "2:59:20"}
170
+ {"current_steps": 850, "total_steps": 1623, "loss": 0.913, "lr": 0.00010071352894123654, "epoch": 0.5237215033887862, "percentage": 52.37, "elapsed_time": "3:15:43", "remaining_time": "2:57:59"}
171
+ {"current_steps": 855, "total_steps": 1623, "loss": 1.0919, "lr": 9.969419976406165e-05, "epoch": 0.5268022181146026, "percentage": 52.68, "elapsed_time": "3:16:51", "remaining_time": "2:56:49"}
172
+ {"current_steps": 860, "total_steps": 1623, "loss": 1.1026, "lr": 9.867490236064108e-05, "epoch": 0.529882932840419, "percentage": 52.99, "elapsed_time": "3:18:05", "remaining_time": "2:55:45"}
173
+ {"current_steps": 865, "total_steps": 1623, "loss": 1.1784, "lr": 9.765574263967396e-05, "epoch": 0.5329636475662354, "percentage": 53.3, "elapsed_time": "3:19:21", "remaining_time": "2:54:41"}
174
+ {"current_steps": 870, "total_steps": 1623, "loss": 0.9908, "lr": 9.66368264955539e-05, "epoch": 0.5360443622920518, "percentage": 53.6, "elapsed_time": "3:20:36", "remaining_time": "2:53:37"}
175
+ {"current_steps": 875, "total_steps": 1623, "loss": 0.92, "lr": 9.56182597973658e-05, "epoch": 0.5391250770178682, "percentage": 53.91, "elapsed_time": "3:21:28", "remaining_time": "2:52:14"}
176
+ {"current_steps": 880, "total_steps": 1623, "loss": 1.0832, "lr": 9.460014837788605e-05, "epoch": 0.5422057917436846, "percentage": 54.22, "elapsed_time": "3:22:39", "remaining_time": "2:51:06"}
177
+ {"current_steps": 885, "total_steps": 1623, "loss": 1.1143, "lr": 9.358259802258581e-05, "epoch": 0.5452865064695009, "percentage": 54.53, "elapsed_time": "3:23:54", "remaining_time": "2:50:02"}
178
+ {"current_steps": 890, "total_steps": 1623, "loss": 1.0344, "lr": 9.256571445863972e-05, "epoch": 0.5483672211953173, "percentage": 54.84, "elapsed_time": "3:25:04", "remaining_time": "2:48:53"}
179
+ {"current_steps": 895, "total_steps": 1623, "loss": 1.0876, "lr": 9.154960334394027e-05, "epoch": 0.5514479359211337, "percentage": 55.14, "elapsed_time": "3:26:18", "remaining_time": "2:47:48"}
180
+ {"current_steps": 900, "total_steps": 1623, "loss": 0.8999, "lr": 9.053437025611973e-05, "epoch": 0.5545286506469501, "percentage": 55.45, "elapsed_time": "3:27:15", "remaining_time": "2:46:29"}
181
+ {"current_steps": 905, "total_steps": 1623, "loss": 1.077, "lr": 8.952012068158027e-05, "epoch": 0.5576093653727665, "percentage": 55.76, "elapsed_time": "3:28:23", "remaining_time": "2:45:19"}
182
+ {"current_steps": 910, "total_steps": 1623, "loss": 1.214, "lr": 8.850696000453326e-05, "epoch": 0.5606900800985829, "percentage": 56.07, "elapsed_time": "3:29:38", "remaining_time": "2:44:15"}
183
+ {"current_steps": 915, "total_steps": 1623, "loss": 1.0412, "lr": 8.749499349604993e-05, "epoch": 0.5637707948243993, "percentage": 56.38, "elapsed_time": "3:30:51", "remaining_time": "2:43:09"}
184
+ {"current_steps": 920, "total_steps": 1623, "loss": 0.9857, "lr": 8.64843263031228e-05, "epoch": 0.5668515095502157, "percentage": 56.69, "elapsed_time": "3:31:59", "remaining_time": "2:41:59"}
185
+ {"current_steps": 925, "total_steps": 1623, "loss": 0.8463, "lr": 8.547506343774097e-05, "epoch": 0.5699322242760321, "percentage": 56.99, "elapsed_time": "3:32:51", "remaining_time": "2:40:37"}
186
+ {"current_steps": 930, "total_steps": 1623, "loss": 1.1347, "lr": 8.446730976597878e-05, "epoch": 0.5730129390018485, "percentage": 57.3, "elapsed_time": "3:34:02", "remaining_time": "2:39:29"}
187
+ {"current_steps": 935, "total_steps": 1623, "loss": 1.04, "lr": 8.346116999709975e-05, "epoch": 0.5760936537276649, "percentage": 57.61, "elapsed_time": "3:35:15", "remaining_time": "2:38:23"}
188
+ {"current_steps": 940, "total_steps": 1623, "loss": 1.0614, "lr": 8.245674867267724e-05, "epoch": 0.5791743684534812, "percentage": 57.92, "elapsed_time": "3:36:32", "remaining_time": "2:37:20"}
189
+ {"current_steps": 945, "total_steps": 1623, "loss": 1.1235, "lr": 8.145415015573183e-05, "epoch": 0.5822550831792976, "percentage": 58.23, "elapsed_time": "3:37:39", "remaining_time": "2:36:09"}
190
+ {"current_steps": 950, "total_steps": 1623, "loss": 0.8619, "lr": 8.045347861988789e-05, "epoch": 0.585335797905114, "percentage": 58.53, "elapsed_time": "3:38:34", "remaining_time": "2:34:50"}
191
+ {"current_steps": 955, "total_steps": 1623, "loss": 1.0461, "lr": 7.945483803854936e-05, "epoch": 0.5884165126309304, "percentage": 58.84, "elapsed_time": "3:39:46", "remaining_time": "2:33:43"}
192
+ {"current_steps": 960, "total_steps": 1623, "loss": 1.1688, "lr": 7.845833217409675e-05, "epoch": 0.5914972273567468, "percentage": 59.15, "elapsed_time": "3:41:00", "remaining_time": "2:32:37"}
193
+ {"current_steps": 965, "total_steps": 1623, "loss": 1.0399, "lr": 7.746406456710564e-05, "epoch": 0.5945779420825632, "percentage": 59.46, "elapsed_time": "3:42:07", "remaining_time": "2:31:27"}
194
+ {"current_steps": 970, "total_steps": 1623, "loss": 1.0488, "lr": 7.64721385255886e-05, "epoch": 0.5976586568083796, "percentage": 59.77, "elapsed_time": "3:43:16", "remaining_time": "2:30:18"}
195
+ {"current_steps": 975, "total_steps": 1623, "loss": 0.8563, "lr": 7.548265711426104e-05, "epoch": 0.600739371534196, "percentage": 60.07, "elapsed_time": "3:44:11", "remaining_time": "2:29:00"}
196
+ {"current_steps": 980, "total_steps": 1623, "loss": 1.1057, "lr": 7.449572314383237e-05, "epoch": 0.6038200862600123, "percentage": 60.38, "elapsed_time": "3:45:31", "remaining_time": "2:27:58"}
197
+ {"current_steps": 985, "total_steps": 1623, "loss": 1.1176, "lr": 7.351143916032374e-05, "epoch": 0.6069008009858287, "percentage": 60.69, "elapsed_time": "3:46:47", "remaining_time": "2:26:53"}
198
+ {"current_steps": 990, "total_steps": 1623, "loss": 1.0584, "lr": 7.252990743441293e-05, "epoch": 0.609981515711645, "percentage": 61.0, "elapsed_time": "3:48:02", "remaining_time": "2:25:48"}
199
+ {"current_steps": 995, "total_steps": 1623, "loss": 1.1065, "lr": 7.155122995080827e-05, "epoch": 0.6130622304374614, "percentage": 61.31, "elapsed_time": "3:49:15", "remaining_time": "2:24:41"}
200
+ {"current_steps": 1000, "total_steps": 1623, "loss": 0.9011, "lr": 7.057550839765188e-05, "epoch": 0.6161429451632778, "percentage": 61.61, "elapsed_time": "3:50:10", "remaining_time": "2:23:24"}
201
+ {"current_steps": 1005, "total_steps": 1623, "loss": 1.0172, "lr": 6.960284415595407e-05, "epoch": 0.6192236598890942, "percentage": 61.92, "elapsed_time": "3:52:18", "remaining_time": "2:22:51"}
202
+ {"current_steps": 1010, "total_steps": 1623, "loss": 1.2291, "lr": 6.863333828905929e-05, "epoch": 0.6223043746149106, "percentage": 62.23, "elapsed_time": "3:53:39", "remaining_time": "2:21:48"}
203
+ {"current_steps": 1015, "total_steps": 1623, "loss": 1.093, "lr": 6.766709153214542e-05, "epoch": 0.625385089340727, "percentage": 62.54, "elapsed_time": "3:54:53", "remaining_time": "2:20:42"}
204
+ {"current_steps": 1020, "total_steps": 1623, "loss": 1.062, "lr": 6.670420428175705e-05, "epoch": 0.6284658040665434, "percentage": 62.85, "elapsed_time": "3:56:06", "remaining_time": "2:19:34"}
205
+ {"current_steps": 1025, "total_steps": 1623, "loss": 0.8716, "lr": 6.574477658537375e-05, "epoch": 0.6315465187923598, "percentage": 63.15, "elapsed_time": "3:57:03", "remaining_time": "2:18:18"}
206
+ {"current_steps": 1030, "total_steps": 1623, "loss": 0.9438, "lr": 6.4788908131015e-05, "epoch": 0.6346272335181762, "percentage": 63.46, "elapsed_time": "3:58:19", "remaining_time": "2:17:12"}
207
+ {"current_steps": 1035, "total_steps": 1623, "loss": 1.0037, "lr": 6.38366982368819e-05, "epoch": 0.6377079482439926, "percentage": 63.77, "elapsed_time": "3:59:31", "remaining_time": "2:16:04"}
208
+ {"current_steps": 1040, "total_steps": 1623, "loss": 1.0592, "lr": 6.288824584103816e-05, "epoch": 0.640788662969809, "percentage": 64.08, "elapsed_time": "4:00:42", "remaining_time": "2:14:56"}
209
+ {"current_steps": 1045, "total_steps": 1623, "loss": 1.0116, "lr": 6.194364949112953e-05, "epoch": 0.6438693776956254, "percentage": 64.39, "elapsed_time": "4:01:53", "remaining_time": "2:13:47"}
210
+ {"current_steps": 1050, "total_steps": 1623, "loss": 0.8504, "lr": 6.100300733414474e-05, "epoch": 0.6469500924214417, "percentage": 64.7, "elapsed_time": "4:02:50", "remaining_time": "2:12:31"}
211
+ {"current_steps": 1055, "total_steps": 1623, "loss": 1.039, "lr": 6.0066417106217455e-05, "epoch": 0.6500308071472581, "percentage": 65.0, "elapsed_time": "4:04:02", "remaining_time": "2:11:23"}
212
+ {"current_steps": 1060, "total_steps": 1623, "loss": 1.0437, "lr": 5.9133976122471214e-05, "epoch": 0.6531115218730745, "percentage": 65.31, "elapsed_time": "4:05:13", "remaining_time": "2:10:14"}
213
+ {"current_steps": 1065, "total_steps": 1623, "loss": 1.0864, "lr": 5.82057812669081e-05, "epoch": 0.6561922365988909, "percentage": 65.62, "elapsed_time": "4:06:24", "remaining_time": "2:09:06"}
214
+ {"current_steps": 1070, "total_steps": 1623, "loss": 0.9514, "lr": 5.728192898234195e-05, "epoch": 0.6592729513247073, "percentage": 65.93, "elapsed_time": "4:07:35", "remaining_time": "2:07:57"}
215
+ {"current_steps": 1075, "total_steps": 1623, "loss": 0.8168, "lr": 5.6362515260377835e-05, "epoch": 0.6623536660505237, "percentage": 66.24, "elapsed_time": "4:08:31", "remaining_time": "2:06:41"}
216
+ {"current_steps": 1080, "total_steps": 1623, "loss": 1.0327, "lr": 5.544763563143793e-05, "epoch": 0.6654343807763401, "percentage": 66.54, "elapsed_time": "4:09:49", "remaining_time": "2:05:36"}
217
+ {"current_steps": 1085, "total_steps": 1623, "loss": 1.0939, "lr": 5.4537385154835864e-05, "epoch": 0.6685150955021565, "percentage": 66.85, "elapsed_time": "4:11:03", "remaining_time": "2:04:29"}
218
+ {"current_steps": 1090, "total_steps": 1623, "loss": 0.9814, "lr": 5.363185840889935e-05, "epoch": 0.6715958102279729, "percentage": 67.16, "elapsed_time": "4:12:14", "remaining_time": "2:03:20"}
219
+ {"current_steps": 1095, "total_steps": 1623, "loss": 1.1126, "lr": 5.273114948114346e-05, "epoch": 0.6746765249537893, "percentage": 67.47, "elapsed_time": "4:13:26", "remaining_time": "2:02:12"}
220
+ {"current_steps": 1100, "total_steps": 1623, "loss": 0.8299, "lr": 5.1835351958494515e-05, "epoch": 0.6777572396796057, "percentage": 67.78, "elapsed_time": "4:14:25", "remaining_time": "2:00:57"}
221
+ {"current_steps": 1105, "total_steps": 1623, "loss": 1.0172, "lr": 5.094455891756587e-05, "epoch": 0.680837954405422, "percentage": 68.08, "elapsed_time": "4:15:44", "remaining_time": "1:59:53"}
222
+ {"current_steps": 1110, "total_steps": 1623, "loss": 1.1171, "lr": 5.00588629149872e-05, "epoch": 0.6839186691312384, "percentage": 68.39, "elapsed_time": "4:17:02", "remaining_time": "1:58:47"}
223
+ {"current_steps": 1115, "total_steps": 1623, "loss": 0.9896, "lr": 4.91783559777873e-05, "epoch": 0.6869993838570548, "percentage": 68.7, "elapsed_time": "4:18:15", "remaining_time": "1:57:39"}
224
+ {"current_steps": 1120, "total_steps": 1623, "loss": 0.9553, "lr": 4.830312959383238e-05, "epoch": 0.6900800985828712, "percentage": 69.01, "elapsed_time": "4:19:23", "remaining_time": "1:56:29"}
225
+ {"current_steps": 1125, "total_steps": 1623, "loss": 0.7778, "lr": 4.7433274702319815e-05, "epoch": 0.6931608133086876, "percentage": 69.32, "elapsed_time": "4:20:18", "remaining_time": "1:55:13"}
226
+ {"current_steps": 1130, "total_steps": 1623, "loss": 1.0221, "lr": 4.656888168432962e-05, "epoch": 0.696241528034504, "percentage": 69.62, "elapsed_time": "4:21:26", "remaining_time": "1:54:03"}
227
+ {"current_steps": 1135, "total_steps": 1623, "loss": 1.0651, "lr": 4.571004035343315e-05, "epoch": 0.6993222427603204, "percentage": 69.93, "elapsed_time": "4:22:37", "remaining_time": "1:52:54"}
228
+ {"current_steps": 1140, "total_steps": 1623, "loss": 1.0022, "lr": 4.485683994636144e-05, "epoch": 0.7024029574861368, "percentage": 70.24, "elapsed_time": "4:23:53", "remaining_time": "1:51:48"}
229
+ {"current_steps": 1145, "total_steps": 1623, "loss": 1.0825, "lr": 4.400936911373308e-05, "epoch": 0.7054836722119532, "percentage": 70.55, "elapsed_time": "4:25:03", "remaining_time": "1:50:39"}
230
+ {"current_steps": 1150, "total_steps": 1623, "loss": 0.8401, "lr": 4.3167715910842966e-05, "epoch": 0.7085643869377696, "percentage": 70.86, "elapsed_time": "4:25:55", "remaining_time": "1:49:22"}
231
+ {"current_steps": 1155, "total_steps": 1623, "loss": 1.0047, "lr": 4.2331967788513295e-05, "epoch": 0.711645101663586, "percentage": 71.16, "elapsed_time": "4:27:04", "remaining_time": "1:48:13"}
232
+ {"current_steps": 1160, "total_steps": 1623, "loss": 1.0024, "lr": 4.1502211584006836e-05, "epoch": 0.7147258163894024, "percentage": 71.47, "elapsed_time": "4:28:11", "remaining_time": "1:47:02"}
233
+ {"current_steps": 1165, "total_steps": 1623, "loss": 1.0983, "lr": 4.067853351200446e-05, "epoch": 0.7178065311152187, "percentage": 71.78, "elapsed_time": "4:29:20", "remaining_time": "1:45:53"}
234
+ {"current_steps": 1170, "total_steps": 1623, "loss": 0.9629, "lr": 3.986101915564695e-05, "epoch": 0.7208872458410351, "percentage": 72.09, "elapsed_time": "4:30:29", "remaining_time": "1:44:43"}
235
+ {"current_steps": 1175, "total_steps": 1623, "loss": 0.8496, "lr": 3.904975345764262e-05, "epoch": 0.7239679605668515, "percentage": 72.4, "elapsed_time": "4:31:22", "remaining_time": "1:43:27"}
236
+ {"current_steps": 1180, "total_steps": 1623, "loss": 1.033, "lr": 3.824482071144163e-05, "epoch": 0.7270486752926679, "percentage": 72.7, "elapsed_time": "4:32:35", "remaining_time": "1:42:20"}
237
+ {"current_steps": 1185, "total_steps": 1623, "loss": 1.0317, "lr": 3.744630455247739e-05, "epoch": 0.7301293900184843, "percentage": 73.01, "elapsed_time": "4:33:49", "remaining_time": "1:41:12"}
238
+ {"current_steps": 1190, "total_steps": 1623, "loss": 1.0313, "lr": 3.6654287949476626e-05, "epoch": 0.7332101047443007, "percentage": 73.32, "elapsed_time": "4:34:57", "remaining_time": "1:40:02"}
239
+ {"current_steps": 1195, "total_steps": 1623, "loss": 1.0512, "lr": 3.586885319583858e-05, "epoch": 0.7362908194701171, "percentage": 73.63, "elapsed_time": "4:36:07", "remaining_time": "1:38:53"}
240
+ {"current_steps": 1200, "total_steps": 1623, "loss": 0.845, "lr": 3.5090081901084525e-05, "epoch": 0.7393715341959335, "percentage": 73.94, "elapsed_time": "4:36:59", "remaining_time": "1:37:38"}
241
+ {"current_steps": 1205, "total_steps": 1623, "loss": 1.0898, "lr": 3.431805498237808e-05, "epoch": 0.7424522489217499, "percentage": 74.25, "elapsed_time": "4:38:09", "remaining_time": "1:36:29"}
242
+ {"current_steps": 1210, "total_steps": 1623, "loss": 1.0914, "lr": 3.355285265611784e-05, "epoch": 0.7455329636475663, "percentage": 74.55, "elapsed_time": "4:39:20", "remaining_time": "1:35:20"}
243
+ {"current_steps": 1215, "total_steps": 1623, "loss": 1.0227, "lr": 3.279455442960238e-05, "epoch": 0.7486136783733827, "percentage": 74.86, "elapsed_time": "4:40:34", "remaining_time": "1:34:13"}
244
+ {"current_steps": 1220, "total_steps": 1623, "loss": 1.0516, "lr": 3.204323909276924e-05, "epoch": 0.751694393099199, "percentage": 75.17, "elapsed_time": "4:41:45", "remaining_time": "1:33:04"}
245
+ {"current_steps": 1225, "total_steps": 1623, "loss": 0.8078, "lr": 3.1298984710008484e-05, "epoch": 0.7547751078250154, "percentage": 75.48, "elapsed_time": "4:42:40", "remaining_time": "1:31:50"}
246
+ {"current_steps": 1230, "total_steps": 1623, "loss": 0.9343, "lr": 3.056186861205136e-05, "epoch": 0.7578558225508318, "percentage": 75.79, "elapsed_time": "4:43:50", "remaining_time": "1:30:41"}
247
+ {"current_steps": 1235, "total_steps": 1623, "loss": 1.0701, "lr": 2.9831967387935467e-05, "epoch": 0.7609365372766482, "percentage": 76.09, "elapsed_time": "4:45:04", "remaining_time": "1:29:33"}
248
+ {"current_steps": 1240, "total_steps": 1623, "loss": 0.879, "lr": 2.9109356877046712e-05, "epoch": 0.7640172520024646, "percentage": 76.4, "elapsed_time": "4:46:12", "remaining_time": "1:28:24"}
249
+ {"current_steps": 1245, "total_steps": 1623, "loss": 1.0175, "lr": 2.8394112161239605e-05, "epoch": 0.767097966728281, "percentage": 76.71, "elapsed_time": "4:47:20", "remaining_time": "1:27:14"}
250
+ {"current_steps": 1250, "total_steps": 1623, "loss": 0.794, "lr": 2.7686307557035685e-05, "epoch": 0.7701786814540974, "percentage": 77.02, "elapsed_time": "4:48:13", "remaining_time": "1:26:00"}
251
+ {"current_steps": 1255, "total_steps": 1623, "loss": 0.9536, "lr": 2.6986016607901908e-05, "epoch": 0.7732593961799138, "percentage": 77.33, "elapsed_time": "4:49:24", "remaining_time": "1:24:51"}
252
+ {"current_steps": 1260, "total_steps": 1623, "loss": 0.9825, "lr": 2.629331207660931e-05, "epoch": 0.7763401109057301, "percentage": 77.63, "elapsed_time": "4:50:30", "remaining_time": "1:23:41"}
253
+ {"current_steps": 1265, "total_steps": 1623, "loss": 1.0014, "lr": 2.5608265937672436e-05, "epoch": 0.7794208256315465, "percentage": 77.94, "elapsed_time": "4:51:38", "remaining_time": "1:22:32"}
254
+ {"current_steps": 1270, "total_steps": 1623, "loss": 1.0176, "lr": 2.4930949369871203e-05, "epoch": 0.7825015403573629, "percentage": 78.25, "elapsed_time": "4:52:48", "remaining_time": "1:21:23"}
255
+ {"current_steps": 1275, "total_steps": 1623, "loss": 0.7798, "lr": 2.426143274885493e-05, "epoch": 0.7855822550831792, "percentage": 78.56, "elapsed_time": "4:53:43", "remaining_time": "1:20:10"}
256
+ {"current_steps": 1280, "total_steps": 1623, "loss": 0.9403, "lr": 2.359978563983022e-05, "epoch": 0.7886629698089956, "percentage": 78.87, "elapsed_time": "4:54:54", "remaining_time": "1:19:01"}
257
+ {"current_steps": 1285, "total_steps": 1623, "loss": 0.9362, "lr": 2.2946076790332827e-05, "epoch": 0.791743684534812, "percentage": 79.17, "elapsed_time": "4:56:03", "remaining_time": "1:17:52"}
258
+ {"current_steps": 1290, "total_steps": 1623, "loss": 0.8806, "lr": 2.2300374123084522e-05, "epoch": 0.7948243992606284, "percentage": 79.48, "elapsed_time": "4:57:15", "remaining_time": "1:16:43"}
259
+ {"current_steps": 1295, "total_steps": 1623, "loss": 0.9592, "lr": 2.166274472893567e-05, "epoch": 0.7979051139864448, "percentage": 79.79, "elapsed_time": "4:58:23", "remaining_time": "1:15:34"}
260
+ {"current_steps": 1300, "total_steps": 1623, "loss": 0.7639, "lr": 2.1033254859894226e-05, "epoch": 0.8009858287122612, "percentage": 80.1, "elapsed_time": "4:59:17", "remaining_time": "1:14:21"}
261
+ {"current_steps": 1305, "total_steps": 1623, "loss": 0.9676, "lr": 2.041196992224206e-05, "epoch": 0.8040665434380776, "percentage": 80.41, "elapsed_time": "5:00:24", "remaining_time": "1:13:12"}
262
+ {"current_steps": 1310, "total_steps": 1623, "loss": 0.9551, "lr": 1.9798954469738762e-05, "epoch": 0.807147258163894, "percentage": 80.71, "elapsed_time": "5:01:34", "remaining_time": "1:12:03"}
263
+ {"current_steps": 1315, "total_steps": 1623, "loss": 1.0136, "lr": 1.919427219691453e-05, "epoch": 0.8102279728897104, "percentage": 81.02, "elapsed_time": "5:02:47", "remaining_time": "1:10:55"}
264
+ {"current_steps": 1320, "total_steps": 1623, "loss": 0.9425, "lr": 1.8597985932451856e-05, "epoch": 0.8133086876155268, "percentage": 81.33, "elapsed_time": "5:03:55", "remaining_time": "1:09:45"}
265
+ {"current_steps": 1325, "total_steps": 1623, "loss": 0.7703, "lr": 1.8010157632657543e-05, "epoch": 0.8163894023413432, "percentage": 81.64, "elapsed_time": "5:04:48", "remaining_time": "1:08:33"}
266
+ {"current_steps": 1330, "total_steps": 1623, "loss": 1.0024, "lr": 1.7430848375025176e-05, "epoch": 0.8194701170671596, "percentage": 81.95, "elapsed_time": "5:05:57", "remaining_time": "1:07:24"}
267
+ {"current_steps": 1335, "total_steps": 1623, "loss": 1.0252, "lr": 1.686011835188891e-05, "epoch": 0.822550831792976, "percentage": 82.26, "elapsed_time": "5:07:04", "remaining_time": "1:06:14"}
268
+ {"current_steps": 1340, "total_steps": 1623, "loss": 1.0137, "lr": 1.6298026864169335e-05, "epoch": 0.8256315465187923, "percentage": 82.56, "elapsed_time": "5:08:11", "remaining_time": "1:05:05"}
269
+ {"current_steps": 1345, "total_steps": 1623, "loss": 0.9795, "lr": 1.5744632315211815e-05, "epoch": 0.8287122612446087, "percentage": 82.87, "elapsed_time": "5:09:21", "remaining_time": "1:03:56"}
270
+ {"current_steps": 1350, "total_steps": 1623, "loss": 0.8058, "lr": 1.5199992204718294e-05, "epoch": 0.8317929759704251, "percentage": 83.18, "elapsed_time": "5:10:16", "remaining_time": "1:02:44"}
271
+ {"current_steps": 1355, "total_steps": 1623, "loss": 0.9304, "lr": 1.4664163122772689e-05, "epoch": 0.8348736906962415, "percentage": 83.49, "elapsed_time": "5:11:28", "remaining_time": "1:01:36"}
272
+ {"current_steps": 1360, "total_steps": 1623, "loss": 0.9923, "lr": 1.4137200743961188e-05, "epoch": 0.8379544054220579, "percentage": 83.8, "elapsed_time": "5:12:42", "remaining_time": "1:00:28"}
273
+ {"current_steps": 1365, "total_steps": 1623, "loss": 0.9683, "lr": 1.3619159821587235e-05, "epoch": 0.8410351201478743, "percentage": 84.1, "elapsed_time": "5:13:50", "remaining_time": "0:59:19"}
274
+ {"current_steps": 1370, "total_steps": 1623, "loss": 0.9893, "lr": 1.3110094181982657e-05, "epoch": 0.8441158348736907, "percentage": 84.41, "elapsed_time": "5:14:56", "remaining_time": "0:58:09"}
275
+ {"current_steps": 1375, "total_steps": 1623, "loss": 0.8361, "lr": 1.261005671891482e-05, "epoch": 0.8471965495995071, "percentage": 84.72, "elapsed_time": "5:15:54", "remaining_time": "0:56:58"}
276
+ {"current_steps": 1380, "total_steps": 1623, "loss": 1.0073, "lr": 1.2119099388090716e-05, "epoch": 0.8502772643253235, "percentage": 85.03, "elapsed_time": "5:17:06", "remaining_time": "0:55:50"}
277
+ {"current_steps": 1385, "total_steps": 1623, "loss": 0.9662, "lr": 1.1637273201758748e-05, "epoch": 0.8533579790511399, "percentage": 85.34, "elapsed_time": "5:18:15", "remaining_time": "0:54:41"}
278
+ {"current_steps": 1390, "total_steps": 1623, "loss": 0.9058, "lr": 1.1164628223408168e-05, "epoch": 0.8564386937769563, "percentage": 85.64, "elapsed_time": "5:19:28", "remaining_time": "0:53:33"}
279
+ {"current_steps": 1395, "total_steps": 1623, "loss": 0.9428, "lr": 1.0701213562567492e-05, "epoch": 0.8595194085027726, "percentage": 85.95, "elapsed_time": "5:20:37", "remaining_time": "0:52:24"}
280
+ {"current_steps": 1400, "total_steps": 1623, "loss": 0.7677, "lr": 1.0247077369701653e-05, "epoch": 0.862600123228589, "percentage": 86.26, "elapsed_time": "5:21:30", "remaining_time": "0:51:12"}
281
+ {"current_steps": 1405, "total_steps": 1623, "loss": 0.933, "lr": 9.802266831209206e-06, "epoch": 0.8656808379544054, "percentage": 86.57, "elapsed_time": "5:22:47", "remaining_time": "0:50:05"}
282
+ {"current_steps": 1410, "total_steps": 1623, "loss": 0.9072, "lr": 9.366828164519258e-06, "epoch": 0.8687615526802218, "percentage": 86.88, "elapsed_time": "5:23:54", "remaining_time": "0:48:55"}
283
+ {"current_steps": 1415, "total_steps": 1623, "loss": 1.0238, "lr": 8.940806613289498e-06, "epoch": 0.8718422674060382, "percentage": 87.18, "elapsed_time": "5:25:04", "remaining_time": "0:47:47"}
284
+ {"current_steps": 1420, "total_steps": 1623, "loss": 0.8856, "lr": 8.524246442705153e-06, "epoch": 0.8749229821318546, "percentage": 87.49, "elapsed_time": "5:26:12", "remaining_time": "0:46:38"}
285
+ {"current_steps": 1425, "total_steps": 1623, "loss": 0.7973, "lr": 8.117190934879593e-06, "epoch": 0.878003696857671, "percentage": 87.8, "elapsed_time": "5:27:07", "remaining_time": "0:45:27"}
286
+ {"current_steps": 1430, "total_steps": 1623, "loss": 0.9935, "lr": 7.719682384357308e-06, "epoch": 0.8810844115834874, "percentage": 88.11, "elapsed_time": "5:28:16", "remaining_time": "0:44:18"}
287
+ {"current_steps": 1435, "total_steps": 1623, "loss": 0.9538, "lr": 7.33176209371923e-06, "epoch": 0.8841651263093038, "percentage": 88.42, "elapsed_time": "5:29:24", "remaining_time": "0:43:09"}
288
+ {"current_steps": 1440, "total_steps": 1623, "loss": 1.0136, "lr": 6.953470369291348e-06, "epoch": 0.8872458410351202, "percentage": 88.72, "elapsed_time": "5:30:41", "remaining_time": "0:42:01"}
289
+ {"current_steps": 1445, "total_steps": 1623, "loss": 1.0357, "lr": 6.5848465169566e-06, "epoch": 0.8903265557609366, "percentage": 89.03, "elapsed_time": "5:31:51", "remaining_time": "0:40:52"}
290
+ {"current_steps": 1450, "total_steps": 1623, "loss": 0.7719, "lr": 6.225928838071016e-06, "epoch": 0.893407270486753, "percentage": 89.34, "elapsed_time": "5:32:45", "remaining_time": "0:39:42"}
291
+ {"current_steps": 1455, "total_steps": 1623, "loss": 0.8551, "lr": 5.876754625483904e-06, "epoch": 0.8964879852125693, "percentage": 89.65, "elapsed_time": "5:33:56", "remaining_time": "0:38:33"}
292
+ {"current_steps": 1460, "total_steps": 1623, "loss": 0.9694, "lr": 5.537360159663108e-06, "epoch": 0.8995686999383857, "percentage": 89.96, "elapsed_time": "5:35:09", "remaining_time": "0:37:25"}
293
+ {"current_steps": 1465, "total_steps": 1623, "loss": 0.9334, "lr": 5.207780704925314e-06, "epoch": 0.9026494146642021, "percentage": 90.26, "elapsed_time": "5:36:19", "remaining_time": "0:36:16"}
294
+ {"current_steps": 1470, "total_steps": 1623, "loss": 1.0187, "lr": 4.888050505771868e-06, "epoch": 0.9057301293900185, "percentage": 90.57, "elapsed_time": "5:37:28", "remaining_time": "0:35:07"}
295
+ {"current_steps": 1475, "total_steps": 1623, "loss": 0.8009, "lr": 4.578202783330799e-06, "epoch": 0.9088108441158349, "percentage": 90.88, "elapsed_time": "5:38:23", "remaining_time": "0:33:57"}
296
+ {"current_steps": 1480, "total_steps": 1623, "loss": 0.9427, "lr": 4.2782697319048605e-06, "epoch": 0.9118915588416513, "percentage": 91.19, "elapsed_time": "5:39:33", "remaining_time": "0:32:48"}
297
+ {"current_steps": 1485, "total_steps": 1623, "loss": 0.8948, "lr": 3.988282515626585e-06, "epoch": 0.9149722735674677, "percentage": 91.5, "elapsed_time": "5:40:42", "remaining_time": "0:31:39"}
298
+ {"current_steps": 1490, "total_steps": 1623, "loss": 0.9136, "lr": 3.7082712652200867e-06, "epoch": 0.9180529882932841, "percentage": 91.81, "elapsed_time": "5:41:56", "remaining_time": "0:30:31"}
299
+ {"current_steps": 1495, "total_steps": 1623, "loss": 1.0515, "lr": 3.438265074870417e-06, "epoch": 0.9211337030191005, "percentage": 92.11, "elapsed_time": "5:43:06", "remaining_time": "0:29:22"}
300
+ {"current_steps": 1500, "total_steps": 1623, "loss": 0.7778, "lr": 3.1782919992006333e-06, "epoch": 0.9242144177449169, "percentage": 92.42, "elapsed_time": "5:44:01", "remaining_time": "0:28:12"}
301
+ {"current_steps": 1505, "total_steps": 1623, "loss": 1.041, "lr": 2.9283790503567222e-06, "epoch": 0.9272951324707333, "percentage": 92.73, "elapsed_time": "5:46:06", "remaining_time": "0:27:08"}
302
+ {"current_steps": 1510, "total_steps": 1623, "loss": 0.9829, "lr": 2.6885521952010105e-06, "epoch": 0.9303758471965496, "percentage": 93.04, "elapsed_time": "5:47:18", "remaining_time": "0:25:59"}
303
+ {"current_steps": 1515, "total_steps": 1623, "loss": 0.9059, "lr": 2.458836352614069e-06, "epoch": 0.933456561922366, "percentage": 93.35, "elapsed_time": "5:48:30", "remaining_time": "0:24:50"}
304
+ {"current_steps": 1520, "total_steps": 1623, "loss": 0.959, "lr": 2.239255390905581e-06, "epoch": 0.9365372766481824, "percentage": 93.65, "elapsed_time": "5:49:40", "remaining_time": "0:23:41"}
305
+ {"current_steps": 1525, "total_steps": 1623, "loss": 0.7814, "lr": 2.029832125334319e-06, "epoch": 0.9396179913739988, "percentage": 93.96, "elapsed_time": "5:50:34", "remaining_time": "0:22:31"}
306
+ {"current_steps": 1530, "total_steps": 1623, "loss": 0.9064, "lr": 1.8305883157375804e-06, "epoch": 0.9426987060998152, "percentage": 94.27, "elapsed_time": "5:51:42", "remaining_time": "0:21:22"}
307
+ {"current_steps": 1535, "total_steps": 1623, "loss": 1.0203, "lr": 1.6415446642702337e-06, "epoch": 0.9457794208256316, "percentage": 94.58, "elapsed_time": "5:52:54", "remaining_time": "0:20:13"}
308
+ {"current_steps": 1540, "total_steps": 1623, "loss": 0.9649, "lr": 1.462720813253682e-06, "epoch": 0.9488601355514479, "percentage": 94.89, "elapsed_time": "5:54:01", "remaining_time": "0:19:04"}
309
+ {"current_steps": 1545, "total_steps": 1623, "loss": 0.942, "lr": 1.2941353431350056e-06, "epoch": 0.9519408502772643, "percentage": 95.19, "elapsed_time": "5:55:12", "remaining_time": "0:17:55"}
310
+ {"current_steps": 1550, "total_steps": 1623, "loss": 0.8007, "lr": 1.135805770556364e-06, "epoch": 0.9550215650030807, "percentage": 95.5, "elapsed_time": "5:56:07", "remaining_time": "0:16:46"}
311
+ {"current_steps": 1555, "total_steps": 1623, "loss": 0.98, "lr": 9.877485465349058e-07, "epoch": 0.958102279728897, "percentage": 95.81, "elapsed_time": "5:57:20", "remaining_time": "0:15:37"}
312
+ {"current_steps": 1560, "total_steps": 1623, "loss": 0.8888, "lr": 8.499790547535025e-07, "epoch": 0.9611829944547134, "percentage": 96.12, "elapsed_time": "5:58:29", "remaining_time": "0:14:28"}
313
+ {"current_steps": 1565, "total_steps": 1623, "loss": 0.8864, "lr": 7.225116099623286e-07, "epoch": 0.9642637091805298, "percentage": 96.43, "elapsed_time": "5:59:41", "remaining_time": "0:13:19"}
314
+ {"current_steps": 1570, "total_steps": 1623, "loss": 0.978, "lr": 6.053594564914611e-07, "epoch": 0.9673444239063462, "percentage": 96.73, "elapsed_time": "6:00:49", "remaining_time": "0:12:10"}
315
+ {"current_steps": 1575, "total_steps": 1623, "loss": 0.7762, "lr": 4.985347668747809e-07, "epoch": 0.9704251386321626, "percentage": 97.04, "elapsed_time": "6:01:43", "remaining_time": "0:11:01"}
316
+ {"current_steps": 1580, "total_steps": 1623, "loss": 1.0036, "lr": 4.0204864058522864e-07, "epoch": 0.973505853357979, "percentage": 97.35, "elapsed_time": "6:02:53", "remaining_time": "0:09:52"}
317
+ {"current_steps": 1585, "total_steps": 1623, "loss": 1.0982, "lr": 3.15911102881461e-07, "epoch": 0.9765865680837954, "percentage": 97.66, "elapsed_time": "6:04:09", "remaining_time": "0:08:43"}
318
+ {"current_steps": 1590, "total_steps": 1623, "loss": 0.9256, "lr": 2.40131103766239e-07, "epoch": 0.9796672828096118, "percentage": 97.97, "elapsed_time": "6:05:16", "remaining_time": "0:07:34"}
319
+ {"current_steps": 1595, "total_steps": 1623, "loss": 1.0309, "lr": 1.747165170564724e-07, "epoch": 0.9827479975354282, "percentage": 98.27, "elapsed_time": "6:06:22", "remaining_time": "0:06:25"}
320
+ {"current_steps": 1600, "total_steps": 1623, "loss": 0.8133, "lr": 1.1967413956510686e-07, "epoch": 0.9858287122612446, "percentage": 98.58, "elapsed_time": "6:07:14", "remaining_time": "0:05:16"}
321
+ {"current_steps": 1605, "total_steps": 1623, "loss": 0.9367, "lr": 7.500969039491157e-08, "epoch": 0.988909426987061, "percentage": 98.89, "elapsed_time": "6:08:20", "remaining_time": "0:04:07"}
322
+ {"current_steps": 1610, "total_steps": 1623, "loss": 1.0131, "lr": 4.0727810344254325e-08, "epoch": 0.9919901417128774, "percentage": 99.2, "elapsed_time": "6:09:33", "remaining_time": "0:02:59"}
323
+ {"current_steps": 1615, "total_steps": 1623, "loss": 0.9065, "lr": 1.6832061424865153e-08, "epoch": 0.9950708564386938, "percentage": 99.51, "elapsed_time": "6:10:44", "remaining_time": "0:01:50"}
324
+ {"current_steps": 1620, "total_steps": 1623, "loss": 0.8729, "lr": 3.3249264917878387e-09, "epoch": 0.9981515711645101, "percentage": 99.82, "elapsed_time": "6:11:52", "remaining_time": "0:00:41"}
325
+ {"current_steps": 1623, "total_steps": 1623, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "6:13:34", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,2311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1623,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0030807147258163892,
14
+ "grad_norm": 6.2553863525390625,
15
+ "learning_rate": 9.756097560975611e-06,
16
+ "loss": 1.456,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.0061614294516327784,
21
+ "grad_norm": 3.4887969493865967,
22
+ "learning_rate": 2.1951219512195124e-05,
23
+ "loss": 1.6273,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.009242144177449169,
28
+ "grad_norm": 3.932939052581787,
29
+ "learning_rate": 3.414634146341464e-05,
30
+ "loss": 1.3216,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.012322858903265557,
35
+ "grad_norm": 4.0206146240234375,
36
+ "learning_rate": 4.634146341463415e-05,
37
+ "loss": 1.0714,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.015403573629081947,
42
+ "grad_norm": 4.394537925720215,
43
+ "learning_rate": 5.853658536585366e-05,
44
+ "loss": 0.8755,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.018484288354898338,
49
+ "grad_norm": 3.1858558654785156,
50
+ "learning_rate": 7.073170731707317e-05,
51
+ "loss": 1.0835,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.021565003080714726,
56
+ "grad_norm": 2.6023573875427246,
57
+ "learning_rate": 8.292682926829268e-05,
58
+ "loss": 1.1251,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.024645717806531114,
63
+ "grad_norm": 4.036273002624512,
64
+ "learning_rate": 9.51219512195122e-05,
65
+ "loss": 1.153,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.027726432532347505,
70
+ "grad_norm": 18.676860809326172,
71
+ "learning_rate": 0.00010731707317073172,
72
+ "loss": 1.2438,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.030807147258163893,
77
+ "grad_norm": 5.359733581542969,
78
+ "learning_rate": 0.00011951219512195122,
79
+ "loss": 0.9104,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.033887861983980284,
84
+ "grad_norm": 3.0401103496551514,
85
+ "learning_rate": 0.00013170731707317076,
86
+ "loss": 1.157,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.036968576709796676,
91
+ "grad_norm": 3.265693426132202,
92
+ "learning_rate": 0.00014390243902439025,
93
+ "loss": 1.2097,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.04004929143561306,
98
+ "grad_norm": 3.6194801330566406,
99
+ "learning_rate": 0.00015609756097560978,
100
+ "loss": 1.143,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.04313000616142945,
105
+ "grad_norm": 5.0640106201171875,
106
+ "learning_rate": 0.00016829268292682927,
107
+ "loss": 1.246,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.04621072088724584,
112
+ "grad_norm": 4.030930042266846,
113
+ "learning_rate": 0.0001804878048780488,
114
+ "loss": 0.9725,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.04929143561306223,
119
+ "grad_norm": 3.715740203857422,
120
+ "learning_rate": 0.0001926829268292683,
121
+ "loss": 1.1749,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.05237215033887862,
126
+ "grad_norm": 2.9077277183532715,
127
+ "learning_rate": 0.0001999991687649223,
128
+ "loss": 1.3022,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.05545286506469501,
133
+ "grad_norm": 2.939671516418457,
134
+ "learning_rate": 0.00019998981752900036,
135
+ "loss": 1.2844,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.0585335797905114,
140
+ "grad_norm": 7.6453776359558105,
141
+ "learning_rate": 0.00019997007698817557,
142
+ "loss": 1.3119,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.061614294516327786,
147
+ "grad_norm": 3.2917823791503906,
148
+ "learning_rate": 0.00019993994919356167,
149
+ "loss": 1.0251,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.06469500924214418,
154
+ "grad_norm": 4.276785373687744,
155
+ "learning_rate": 0.00019989943727554598,
156
+ "loss": 1.2598,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.06777572396796057,
161
+ "grad_norm": 2.5673391819000244,
162
+ "learning_rate": 0.00019984854544346367,
163
+ "loss": 1.251,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.07085643869377696,
168
+ "grad_norm": 4.126647472381592,
169
+ "learning_rate": 0.00019978727898516086,
170
+ "loss": 1.3078,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.07393715341959335,
175
+ "grad_norm": 5.534213066101074,
176
+ "learning_rate": 0.0001997156442664449,
177
+ "loss": 1.2101,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.07701786814540973,
182
+ "grad_norm": 4.482140064239502,
183
+ "learning_rate": 0.00019963364873042298,
184
+ "loss": 1.0401,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.08009858287122612,
189
+ "grad_norm": 3.433528423309326,
190
+ "learning_rate": 0.0001995413008967289,
191
+ "loss": 1.3897,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.08317929759704251,
196
+ "grad_norm": 3.556488275527954,
197
+ "learning_rate": 0.00019943861036063768,
198
+ "loss": 1.2378,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.0862600123228589,
203
+ "grad_norm": 2.2870471477508545,
204
+ "learning_rate": 0.00019932558779206874,
205
+ "loss": 1.3101,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.0893407270486753,
210
+ "grad_norm": 5.174860000610352,
211
+ "learning_rate": 0.00019920224493447702,
212
+ "loss": 1.3698,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.09242144177449169,
217
+ "grad_norm": 2.9989383220672607,
218
+ "learning_rate": 0.00019906859460363307,
219
+ "loss": 1.0169,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.09550215650030808,
224
+ "grad_norm": 2.323556900024414,
225
+ "learning_rate": 0.00019892465068629131,
226
+ "loss": 1.5459,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.09858287122612445,
231
+ "grad_norm": 2.2696855068206787,
232
+ "learning_rate": 0.0001987704281387471,
233
+ "loss": 1.5477,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.10166358595194085,
238
+ "grad_norm": 2.089024305343628,
239
+ "learning_rate": 0.00019860594298528282,
240
+ "loss": 1.4966,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.10474430067775724,
245
+ "grad_norm": 10.206141471862793,
246
+ "learning_rate": 0.0001984312123165028,
247
+ "loss": 1.5506,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.10782501540357363,
252
+ "grad_norm": 2.7964935302734375,
253
+ "learning_rate": 0.0001982462542875576,
254
+ "loss": 1.0489,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.11090573012939002,
259
+ "grad_norm": 2.734079122543335,
260
+ "learning_rate": 0.00019805108811625773,
261
+ "loss": 1.3007,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.11398644485520641,
266
+ "grad_norm": 2.0400795936584473,
267
+ "learning_rate": 0.00019784573408107657,
268
+ "loss": 1.2466,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.1170671595810228,
273
+ "grad_norm": 2.773949146270752,
274
+ "learning_rate": 0.00019763021351904358,
275
+ "loss": 1.254,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.12014787430683918,
280
+ "grad_norm": 3.435565233230591,
281
+ "learning_rate": 0.00019740454882352732,
282
+ "loss": 1.2234,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.12322858903265557,
287
+ "grad_norm": 3.103530168533325,
288
+ "learning_rate": 0.0001971687634419086,
289
+ "loss": 0.9971,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.12630930375847196,
294
+ "grad_norm": 3.090085983276367,
295
+ "learning_rate": 0.0001969228818731442,
296
+ "loss": 1.3131,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.12939001848428835,
301
+ "grad_norm": 2.061624050140381,
302
+ "learning_rate": 0.00019666692966522145,
303
+ "loss": 1.3129,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.13247073321010475,
308
+ "grad_norm": 1.9503087997436523,
309
+ "learning_rate": 0.00019640093341250357,
310
+ "loss": 1.2031,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.13555144793592114,
315
+ "grad_norm": 4.388856410980225,
316
+ "learning_rate": 0.0001961249207529665,
317
+ "loss": 1.2734,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.13863216266173753,
322
+ "grad_norm": 2.8406777381896973,
323
+ "learning_rate": 0.00019583892036532726,
324
+ "loss": 1.0102,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.14171287738755392,
329
+ "grad_norm": 1.941605567932129,
330
+ "learning_rate": 0.00019554296196606395,
331
+ "loss": 1.2675,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.1447935921133703,
336
+ "grad_norm": 1.958099365234375,
337
+ "learning_rate": 0.00019523707630632835,
338
+ "loss": 1.205,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.1478743068391867,
343
+ "grad_norm": 1.868795394897461,
344
+ "learning_rate": 0.00019492129516875055,
345
+ "loss": 1.2834,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.15095502156500307,
350
+ "grad_norm": 4.904277801513672,
351
+ "learning_rate": 0.00019459565136413666,
352
+ "loss": 1.3999,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.15403573629081946,
357
+ "grad_norm": 3.6694881916046143,
358
+ "learning_rate": 0.0001942601787280598,
359
+ "loss": 1.0177,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.15711645101663585,
364
+ "grad_norm": 2.2460737228393555,
365
+ "learning_rate": 0.00019391491211734425,
366
+ "loss": 1.2871,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.16019716574245224,
371
+ "grad_norm": 1.6575430631637573,
372
+ "learning_rate": 0.0001935598874064438,
373
+ "loss": 1.2824,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.16327788046826863,
378
+ "grad_norm": 1.8551292419433594,
379
+ "learning_rate": 0.00019319514148371435,
380
+ "loss": 1.1658,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.16635859519408502,
385
+ "grad_norm": 4.493773937225342,
386
+ "learning_rate": 0.00019282071224758091,
387
+ "loss": 1.2408,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.16943930991990142,
392
+ "grad_norm": 3.0752346515655518,
393
+ "learning_rate": 0.00019243663860259993,
394
+ "loss": 1.0435,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.1725200246457178,
399
+ "grad_norm": 2.145301580429077,
400
+ "learning_rate": 0.00019204296045541685,
401
+ "loss": 1.2576,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.1756007393715342,
406
+ "grad_norm": 2.2302119731903076,
407
+ "learning_rate": 0.0001916397187106199,
408
+ "loss": 1.1854,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.1786814540973506,
413
+ "grad_norm": 1.5676261186599731,
414
+ "learning_rate": 0.00019122695526648968,
415
+ "loss": 1.1993,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.18176216882316698,
420
+ "grad_norm": 4.74669075012207,
421
+ "learning_rate": 0.00019080471301064598,
422
+ "loss": 1.3542,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.18484288354898337,
427
+ "grad_norm": 2.922614574432373,
428
+ "learning_rate": 0.00019037303581559143,
429
+ "loss": 1.0038,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.18792359827479976,
434
+ "grad_norm": 2.0476372241973877,
435
+ "learning_rate": 0.00018993196853415317,
436
+ "loss": 1.2003,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.19100431300061615,
441
+ "grad_norm": 2.0472970008850098,
442
+ "learning_rate": 0.00018948155699482244,
443
+ "loss": 1.1497,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.19408502772643252,
448
+ "grad_norm": 1.9279072284698486,
449
+ "learning_rate": 0.00018902184799699263,
450
+ "loss": 1.2854,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.1971657424522489,
455
+ "grad_norm": 3.2887041568756104,
456
+ "learning_rate": 0.00018855288930609692,
457
+ "loss": 1.2265,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.2002464571780653,
462
+ "grad_norm": 2.4545176029205322,
463
+ "learning_rate": 0.00018807472964864515,
464
+ "loss": 1.0144,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.2033271719038817,
469
+ "grad_norm": 1.9444770812988281,
470
+ "learning_rate": 0.00018758741870716092,
471
+ "loss": 1.2764,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.20640788662969808,
476
+ "grad_norm": 1.92123544216156,
477
+ "learning_rate": 0.00018709100711501955,
478
+ "loss": 1.2892,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.20948860135551448,
483
+ "grad_norm": 1.8828374147415161,
484
+ "learning_rate": 0.0001865855464511869,
485
+ "loss": 1.1708,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.21256931608133087,
490
+ "grad_norm": 3.544203281402588,
491
+ "learning_rate": 0.00018607108923486025,
492
+ "loss": 1.0886,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.21565003080714726,
497
+ "grad_norm": 2.528841257095337,
498
+ "learning_rate": 0.00018554768892001136,
499
+ "loss": 0.9796,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.21873074553296365,
504
+ "grad_norm": 1.9211112260818481,
505
+ "learning_rate": 0.00018501539988983234,
506
+ "loss": 1.0829,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.22181146025878004,
511
+ "grad_norm": 2.294722080230713,
512
+ "learning_rate": 0.0001844742774510851,
513
+ "loss": 1.19,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.22489217498459643,
518
+ "grad_norm": 1.3482877016067505,
519
+ "learning_rate": 0.00018392437782835475,
520
+ "loss": 1.2743,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.22797288971041282,
525
+ "grad_norm": 3.6750497817993164,
526
+ "learning_rate": 0.00018336575815820766,
527
+ "loss": 1.2868,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.23105360443622922,
532
+ "grad_norm": 3.0952634811401367,
533
+ "learning_rate": 0.00018279847648325478,
534
+ "loss": 1.0349,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.2341343191620456,
539
+ "grad_norm": 1.8905701637268066,
540
+ "learning_rate": 0.0001822225917461208,
541
+ "loss": 1.2061,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.23721503388786197,
546
+ "grad_norm": 1.9268580675125122,
547
+ "learning_rate": 0.0001816381637833198,
548
+ "loss": 1.1963,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.24029574861367836,
553
+ "grad_norm": 2.0002329349517822,
554
+ "learning_rate": 0.00018104525331903799,
555
+ "loss": 1.2712,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.24337646333949475,
560
+ "grad_norm": 5.285348415374756,
561
+ "learning_rate": 0.00018044392195882427,
562
+ "loss": 1.3406,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.24645717806531114,
567
+ "grad_norm": 2.668099880218506,
568
+ "learning_rate": 0.00017983423218318918,
569
+ "loss": 1.0176,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.24953789279112754,
574
+ "grad_norm": 1.791987657546997,
575
+ "learning_rate": 0.00017921624734111292,
576
+ "loss": 1.1581,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.2526186075169439,
581
+ "grad_norm": 1.531745195388794,
582
+ "learning_rate": 0.00017859003164346336,
583
+ "loss": 1.288,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.2556993222427603,
588
+ "grad_norm": 1.6432485580444336,
589
+ "learning_rate": 0.0001779556501563239,
590
+ "loss": 1.217,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.2587800369685767,
595
+ "grad_norm": 3.3277509212493896,
596
+ "learning_rate": 0.00017731316879423327,
597
+ "loss": 1.239,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.2618607516943931,
602
+ "grad_norm": 2.6562795639038086,
603
+ "learning_rate": 0.00017666265431333654,
604
+ "loss": 1.0663,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.2649414664202095,
609
+ "grad_norm": 1.8265531063079834,
610
+ "learning_rate": 0.000176004174304449,
611
+ "loss": 1.2585,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.2680221811460259,
616
+ "grad_norm": 1.6376618146896362,
617
+ "learning_rate": 0.00017533779718603313,
618
+ "loss": 1.1862,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.2711028958718423,
623
+ "grad_norm": 1.5394827127456665,
624
+ "learning_rate": 0.00017466359219708985,
625
+ "loss": 1.218,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.27418361059765867,
630
+ "grad_norm": 2.490170955657959,
631
+ "learning_rate": 0.00017398162938996422,
632
+ "loss": 1.1601,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.27726432532347506,
637
+ "grad_norm": 2.091308116912842,
638
+ "learning_rate": 0.00017329197962306664,
639
+ "loss": 1.0049,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.28034504004929145,
644
+ "grad_norm": 1.9662305116653442,
645
+ "learning_rate": 0.00017259471455351072,
646
+ "loss": 1.0931,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.28342575477510784,
651
+ "grad_norm": 1.607360601425171,
652
+ "learning_rate": 0.0001718899066296675,
653
+ "loss": 1.1999,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.28650646950092423,
658
+ "grad_norm": 2.037191152572632,
659
+ "learning_rate": 0.000171177629083638,
660
+ "loss": 1.2684,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.2895871842267406,
665
+ "grad_norm": 3.934429883956909,
666
+ "learning_rate": 0.0001704579559236441,
667
+ "loss": 1.2786,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.292667898952557,
672
+ "grad_norm": 3.046133041381836,
673
+ "learning_rate": 0.00016973096192633884,
674
+ "loss": 0.9667,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.2957486136783734,
679
+ "grad_norm": 1.834618330001831,
680
+ "learning_rate": 0.00016899672262903677,
681
+ "loss": 1.1185,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.2988293284041898,
686
+ "grad_norm": 1.7675973176956177,
687
+ "learning_rate": 0.00016825531432186543,
688
+ "loss": 1.2081,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.30191004313000613,
693
+ "grad_norm": 1.440274715423584,
694
+ "learning_rate": 0.00016750681403983846,
695
+ "loss": 1.3,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.3049907578558225,
700
+ "grad_norm": 3.8798115253448486,
701
+ "learning_rate": 0.00016675129955485152,
702
+ "loss": 1.203,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.3080714725816389,
707
+ "grad_norm": 2.388153314590454,
708
+ "learning_rate": 0.00016598884936760131,
709
+ "loss": 0.9961,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.3111521873074553,
714
+ "grad_norm": 1.9182560443878174,
715
+ "learning_rate": 0.00016521954269942918,
716
+ "loss": 1.182,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.3142329020332717,
721
+ "grad_norm": 1.6203210353851318,
722
+ "learning_rate": 0.00016444345948408984,
723
+ "loss": 1.1644,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.3173136167590881,
728
+ "grad_norm": 1.6294291019439697,
729
+ "learning_rate": 0.0001636606803594457,
730
+ "loss": 1.2397,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.3203943314849045,
735
+ "grad_norm": 10.618769645690918,
736
+ "learning_rate": 0.0001628712866590885,
737
+ "loss": 1.227,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.3234750462107209,
742
+ "grad_norm": 2.6154747009277344,
743
+ "learning_rate": 0.00016207536040388845,
744
+ "loss": 0.9743,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.32655576093653726,
749
+ "grad_norm": 1.7929004430770874,
750
+ "learning_rate": 0.0001612729842934718,
751
+ "loss": 1.2353,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.32963647566235366,
756
+ "grad_norm": 1.581451416015625,
757
+ "learning_rate": 0.00016046424169762827,
758
+ "loss": 1.2453,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.33271719038817005,
763
+ "grad_norm": 1.2970712184906006,
764
+ "learning_rate": 0.0001596492166476485,
765
+ "loss": 1.2025,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.33579790511398644,
770
+ "grad_norm": 3.232973098754883,
771
+ "learning_rate": 0.0001588279938275929,
772
+ "loss": 1.187,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.33887861983980283,
777
+ "grad_norm": 2.832794666290283,
778
+ "learning_rate": 0.00015800065856549269,
779
+ "loss": 0.9161,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.3419593345656192,
784
+ "grad_norm": 1.8040834665298462,
785
+ "learning_rate": 0.00015716729682448393,
786
+ "loss": 1.183,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.3450400492914356,
791
+ "grad_norm": 1.4347432851791382,
792
+ "learning_rate": 0.0001563279951938758,
793
+ "loss": 1.1642,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.348120764017252,
798
+ "grad_norm": 1.390684962272644,
799
+ "learning_rate": 0.00015548284088015354,
800
+ "loss": 1.1843,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.3512014787430684,
805
+ "grad_norm": 3.3264737129211426,
806
+ "learning_rate": 0.00015463192169791741,
807
+ "loss": 1.188,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.3542821934688848,
812
+ "grad_norm": 2.754194974899292,
813
+ "learning_rate": 0.0001537753260607584,
814
+ "loss": 0.9604,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.3573629081947012,
819
+ "grad_norm": 1.5689305067062378,
820
+ "learning_rate": 0.00015291314297207175,
821
+ "loss": 1.1923,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.36044362292051757,
826
+ "grad_norm": 1.2691988945007324,
827
+ "learning_rate": 0.0001520454620158093,
828
+ "loss": 1.1974,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.36352433764633396,
833
+ "grad_norm": 1.3295036554336548,
834
+ "learning_rate": 0.00015117237334717117,
835
+ "loss": 1.0945,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.36660505237215035,
840
+ "grad_norm": 2.945547342300415,
841
+ "learning_rate": 0.00015029396768323846,
842
+ "loss": 1.2735,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.36968576709796674,
847
+ "grad_norm": 2.4595086574554443,
848
+ "learning_rate": 0.00014941033629354734,
849
+ "loss": 1.0361,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.37276648182378314,
854
+ "grad_norm": 1.6354780197143555,
855
+ "learning_rate": 0.00014852157099060596,
856
+ "loss": 1.2175,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.3758471965495995,
861
+ "grad_norm": 1.4186891317367554,
862
+ "learning_rate": 0.00014762776412035456,
863
+ "loss": 1.2687,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.3789279112754159,
868
+ "grad_norm": 1.4597561359405518,
869
+ "learning_rate": 0.00014672900855257056,
870
+ "loss": 1.1694,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.3820086260012323,
875
+ "grad_norm": 2.9169702529907227,
876
+ "learning_rate": 0.00014582539767121904,
877
+ "loss": 1.2024,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.3850893407270487,
882
+ "grad_norm": 2.2185842990875244,
883
+ "learning_rate": 0.0001449170253647498,
884
+ "loss": 0.8874,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.38817005545286504,
889
+ "grad_norm": 1.9062397480010986,
890
+ "learning_rate": 0.0001440039860163419,
891
+ "loss": 1.2114,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.39125077017868143,
896
+ "grad_norm": 1.6245906352996826,
897
+ "learning_rate": 0.00014308637449409706,
898
+ "loss": 1.1644,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.3943314849044978,
903
+ "grad_norm": 1.4193432331085205,
904
+ "learning_rate": 0.00014216428614118243,
905
+ "loss": 1.1707,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.3974121996303142,
910
+ "grad_norm": 2.847059726715088,
911
+ "learning_rate": 0.00014123781676592418,
912
+ "loss": 1.1516,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.4004929143561306,
917
+ "grad_norm": 2.4037296772003174,
918
+ "learning_rate": 0.00014030706263185247,
919
+ "loss": 0.9869,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.403573629081947,
924
+ "grad_norm": 1.6916531324386597,
925
+ "learning_rate": 0.00013937212044769955,
926
+ "loss": 1.1226,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.4066543438077634,
931
+ "grad_norm": 1.8065935373306274,
932
+ "learning_rate": 0.0001384330873573513,
933
+ "loss": 1.2051,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.4097350585335798,
938
+ "grad_norm": 1.243482232093811,
939
+ "learning_rate": 0.00013749006092975347,
940
+ "loss": 1.2038,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.41281577325939617,
945
+ "grad_norm": 3.1718995571136475,
946
+ "learning_rate": 0.00013654313914877414,
947
+ "loss": 1.202,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.41589648798521256,
952
+ "grad_norm": 2.4607560634613037,
953
+ "learning_rate": 0.00013559242040302272,
954
+ "loss": 0.9386,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.41897720271102895,
959
+ "grad_norm": 1.8109748363494873,
960
+ "learning_rate": 0.00013463800347562706,
961
+ "loss": 1.1778,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.42205791743684534,
966
+ "grad_norm": 1.3518471717834473,
967
+ "learning_rate": 0.00013367998753396944,
968
+ "loss": 1.0864,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.42513863216266173,
973
+ "grad_norm": 1.2594832181930542,
974
+ "learning_rate": 0.00013271847211938285,
975
+ "loss": 1.1923,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.4282193468884781,
980
+ "grad_norm": 3.4967703819274902,
981
+ "learning_rate": 0.0001317535571368082,
982
+ "loss": 1.1341,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.4313000616142945,
987
+ "grad_norm": 2.2631278038024902,
988
+ "learning_rate": 0.00013078534284441382,
989
+ "loss": 1.0109,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.4343807763401109,
994
+ "grad_norm": 1.5012656450271606,
995
+ "learning_rate": 0.00012981392984317834,
996
+ "loss": 1.0852,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.4374614910659273,
1001
+ "grad_norm": 1.609452724456787,
1002
+ "learning_rate": 0.00012883941906643786,
1003
+ "loss": 1.2037,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.4405422057917437,
1008
+ "grad_norm": 1.3627073764801025,
1009
+ "learning_rate": 0.00012786191176939848,
1010
+ "loss": 1.1009,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.4436229205175601,
1015
+ "grad_norm": 2.737517833709717,
1016
+ "learning_rate": 0.00012688150951861582,
1017
+ "loss": 1.0769,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.4467036352433765,
1022
+ "grad_norm": 2.6469473838806152,
1023
+ "learning_rate": 0.00012589831418144154,
1024
+ "loss": 0.9434,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.44978434996919286,
1029
+ "grad_norm": 1.5728322267532349,
1030
+ "learning_rate": 0.00012491242791543922,
1031
+ "loss": 1.1517,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.45286506469500926,
1036
+ "grad_norm": 1.3436719179153442,
1037
+ "learning_rate": 0.00012392395315776963,
1038
+ "loss": 1.1303,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.45594577942082565,
1043
+ "grad_norm": 1.7293676137924194,
1044
+ "learning_rate": 0.00012293299261454725,
1045
+ "loss": 1.1156,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.45902649414664204,
1050
+ "grad_norm": 2.8371763229370117,
1051
+ "learning_rate": 0.00012193964925016872,
1052
+ "loss": 1.0727,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.46210720887245843,
1057
+ "grad_norm": 2.760446786880493,
1058
+ "learning_rate": 0.00012094402627661447,
1059
+ "loss": 0.9198,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.4651879235982748,
1064
+ "grad_norm": 2.4913201332092285,
1065
+ "learning_rate": 0.00011994622714272448,
1066
+ "loss": 1.1152,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.4682686383240912,
1071
+ "grad_norm": 1.6550869941711426,
1072
+ "learning_rate": 0.00011894635552344975,
1073
+ "loss": 1.1196,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.4713493530499076,
1078
+ "grad_norm": 1.1215524673461914,
1079
+ "learning_rate": 0.00011794451530908011,
1080
+ "loss": 1.1229,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.47443006777572394,
1085
+ "grad_norm": 3.11061429977417,
1086
+ "learning_rate": 0.00011694081059444946,
1087
+ "loss": 1.1024,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.47751078250154033,
1092
+ "grad_norm": 2.154513120651245,
1093
+ "learning_rate": 0.0001159353456681201,
1094
+ "loss": 0.9312,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.4805914972273567,
1099
+ "grad_norm": 1.4802271127700806,
1100
+ "learning_rate": 0.00011492822500154667,
1101
+ "loss": 1.1166,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.4836722119531731,
1106
+ "grad_norm": 1.447892427444458,
1107
+ "learning_rate": 0.00011391955323822126,
1108
+ "loss": 1.0645,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.4867529266789895,
1113
+ "grad_norm": 1.1939027309417725,
1114
+ "learning_rate": 0.00011290943518280057,
1115
+ "loss": 1.1993,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.4898336414048059,
1120
+ "grad_norm": 2.7073330879211426,
1121
+ "learning_rate": 0.0001118979757902162,
1122
+ "loss": 1.0944,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.4929143561306223,
1127
+ "grad_norm": 2.381559133529663,
1128
+ "learning_rate": 0.00011088528015476964,
1129
+ "loss": 0.9793,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 0.4959950708564387,
1134
+ "grad_norm": 1.5212994813919067,
1135
+ "learning_rate": 0.00010987145349921251,
1136
+ "loss": 1.0437,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 0.49907578558225507,
1141
+ "grad_norm": 1.395302414894104,
1142
+ "learning_rate": 0.0001088566011638134,
1143
+ "loss": 1.0949,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 0.5021565003080715,
1148
+ "grad_norm": 1.248336672782898,
1149
+ "learning_rate": 0.00010784082859541292,
1150
+ "loss": 1.0549,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 0.5052372150338879,
1155
+ "grad_norm": 2.566803455352783,
1156
+ "learning_rate": 0.0001068242413364671,
1157
+ "loss": 1.1154,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 0.5083179297597042,
1162
+ "grad_norm": 2.379218816757202,
1163
+ "learning_rate": 0.00010580694501408138,
1164
+ "loss": 0.9391,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 0.5113986444855206,
1169
+ "grad_norm": 1.4035269021987915,
1170
+ "learning_rate": 0.00010478904532903535,
1171
+ "loss": 1.0181,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 0.514479359211337,
1176
+ "grad_norm": 1.3133803606033325,
1177
+ "learning_rate": 0.00010377064804480025,
1178
+ "loss": 1.1629,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 0.5175600739371534,
1183
+ "grad_norm": 1.1715658903121948,
1184
+ "learning_rate": 0.00010275185897654971,
1185
+ "loss": 0.9915,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 0.5206407886629698,
1190
+ "grad_norm": 2.9421846866607666,
1191
+ "learning_rate": 0.00010173278398016501,
1192
+ "loss": 1.0566,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 0.5237215033887862,
1197
+ "grad_norm": 2.0492677688598633,
1198
+ "learning_rate": 0.00010071352894123654,
1199
+ "loss": 0.913,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 0.5268022181146026,
1204
+ "grad_norm": 1.631962776184082,
1205
+ "learning_rate": 9.969419976406165e-05,
1206
+ "loss": 1.0919,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 0.529882932840419,
1211
+ "grad_norm": 1.5217899084091187,
1212
+ "learning_rate": 9.867490236064108e-05,
1213
+ "loss": 1.1026,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 0.5329636475662354,
1218
+ "grad_norm": 1.4602949619293213,
1219
+ "learning_rate": 9.765574263967396e-05,
1220
+ "loss": 1.1784,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 0.5360443622920518,
1225
+ "grad_norm": 2.078181743621826,
1226
+ "learning_rate": 9.66368264955539e-05,
1227
+ "loss": 0.9908,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 0.5391250770178682,
1232
+ "grad_norm": 2.2240121364593506,
1233
+ "learning_rate": 9.56182597973658e-05,
1234
+ "loss": 0.92,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 0.5422057917436846,
1239
+ "grad_norm": 1.3107519149780273,
1240
+ "learning_rate": 9.460014837788605e-05,
1241
+ "loss": 1.0832,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 0.5452865064695009,
1246
+ "grad_norm": 1.3502496480941772,
1247
+ "learning_rate": 9.358259802258581e-05,
1248
+ "loss": 1.1143,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 0.5483672211953173,
1253
+ "grad_norm": 1.143684983253479,
1254
+ "learning_rate": 9.256571445863972e-05,
1255
+ "loss": 1.0344,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 0.5514479359211337,
1260
+ "grad_norm": 3.2240543365478516,
1261
+ "learning_rate": 9.154960334394027e-05,
1262
+ "loss": 1.0876,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 0.5545286506469501,
1267
+ "grad_norm": 2.4902024269104004,
1268
+ "learning_rate": 9.053437025611973e-05,
1269
+ "loss": 0.8999,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 0.5576093653727665,
1274
+ "grad_norm": 1.845697283744812,
1275
+ "learning_rate": 8.952012068158027e-05,
1276
+ "loss": 1.077,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 0.5606900800985829,
1281
+ "grad_norm": 1.2748725414276123,
1282
+ "learning_rate": 8.850696000453326e-05,
1283
+ "loss": 1.214,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 0.5637707948243993,
1288
+ "grad_norm": 1.1144680976867676,
1289
+ "learning_rate": 8.749499349604993e-05,
1290
+ "loss": 1.0412,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 0.5668515095502157,
1295
+ "grad_norm": 2.4499781131744385,
1296
+ "learning_rate": 8.64843263031228e-05,
1297
+ "loss": 0.9857,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 0.5699322242760321,
1302
+ "grad_norm": 2.452460289001465,
1303
+ "learning_rate": 8.547506343774097e-05,
1304
+ "loss": 0.8463,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 0.5730129390018485,
1309
+ "grad_norm": 1.6317726373672485,
1310
+ "learning_rate": 8.446730976597878e-05,
1311
+ "loss": 1.1347,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 0.5760936537276649,
1316
+ "grad_norm": 1.5841073989868164,
1317
+ "learning_rate": 8.346116999709975e-05,
1318
+ "loss": 1.04,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 0.5791743684534812,
1323
+ "grad_norm": 1.0936833620071411,
1324
+ "learning_rate": 8.245674867267724e-05,
1325
+ "loss": 1.0614,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 0.5822550831792976,
1330
+ "grad_norm": 2.5450921058654785,
1331
+ "learning_rate": 8.145415015573183e-05,
1332
+ "loss": 1.1235,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 0.585335797905114,
1337
+ "grad_norm": 2.3192529678344727,
1338
+ "learning_rate": 8.045347861988789e-05,
1339
+ "loss": 0.8619,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 0.5884165126309304,
1344
+ "grad_norm": 1.6475285291671753,
1345
+ "learning_rate": 7.945483803854936e-05,
1346
+ "loss": 1.0461,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 0.5914972273567468,
1351
+ "grad_norm": 1.3527559041976929,
1352
+ "learning_rate": 7.845833217409675e-05,
1353
+ "loss": 1.1688,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 0.5945779420825632,
1358
+ "grad_norm": 1.1767574548721313,
1359
+ "learning_rate": 7.746406456710564e-05,
1360
+ "loss": 1.0399,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 0.5976586568083796,
1365
+ "grad_norm": 2.8988595008850098,
1366
+ "learning_rate": 7.64721385255886e-05,
1367
+ "loss": 1.0488,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 0.600739371534196,
1372
+ "grad_norm": 2.461327314376831,
1373
+ "learning_rate": 7.548265711426104e-05,
1374
+ "loss": 0.8563,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 0.6038200862600123,
1379
+ "grad_norm": 1.5591062307357788,
1380
+ "learning_rate": 7.449572314383237e-05,
1381
+ "loss": 1.1057,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 0.6069008009858287,
1386
+ "grad_norm": 1.5111265182495117,
1387
+ "learning_rate": 7.351143916032374e-05,
1388
+ "loss": 1.1176,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 0.609981515711645,
1393
+ "grad_norm": 1.2199238538742065,
1394
+ "learning_rate": 7.252990743441293e-05,
1395
+ "loss": 1.0584,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 0.6130622304374614,
1400
+ "grad_norm": 2.5779082775115967,
1401
+ "learning_rate": 7.155122995080827e-05,
1402
+ "loss": 1.1065,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 0.6161429451632778,
1407
+ "grad_norm": 2.7095351219177246,
1408
+ "learning_rate": 7.057550839765188e-05,
1409
+ "loss": 0.9011,
1410
+ "step": 1000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6192236598890942,
1414
+ "grad_norm": 1.7512952089309692,
1415
+ "learning_rate": 6.960284415595407e-05,
1416
+ "loss": 1.0172,
1417
+ "step": 1005
1418
+ },
1419
+ {
1420
+ "epoch": 0.6223043746149106,
1421
+ "grad_norm": 1.3986120223999023,
1422
+ "learning_rate": 6.863333828905929e-05,
1423
+ "loss": 1.2291,
1424
+ "step": 1010
1425
+ },
1426
+ {
1427
+ "epoch": 0.625385089340727,
1428
+ "grad_norm": 1.1490086317062378,
1429
+ "learning_rate": 6.766709153214542e-05,
1430
+ "loss": 1.093,
1431
+ "step": 1015
1432
+ },
1433
+ {
1434
+ "epoch": 0.6284658040665434,
1435
+ "grad_norm": 2.561668634414673,
1436
+ "learning_rate": 6.670420428175705e-05,
1437
+ "loss": 1.062,
1438
+ "step": 1020
1439
+ },
1440
+ {
1441
+ "epoch": 0.6315465187923598,
1442
+ "grad_norm": 2.4279980659484863,
1443
+ "learning_rate": 6.574477658537375e-05,
1444
+ "loss": 0.8716,
1445
+ "step": 1025
1446
+ },
1447
+ {
1448
+ "epoch": 0.6346272335181762,
1449
+ "grad_norm": 1.5312825441360474,
1450
+ "learning_rate": 6.4788908131015e-05,
1451
+ "loss": 0.9438,
1452
+ "step": 1030
1453
+ },
1454
+ {
1455
+ "epoch": 0.6377079482439926,
1456
+ "grad_norm": 1.4004178047180176,
1457
+ "learning_rate": 6.38366982368819e-05,
1458
+ "loss": 1.0037,
1459
+ "step": 1035
1460
+ },
1461
+ {
1462
+ "epoch": 0.640788662969809,
1463
+ "grad_norm": 1.1398247480392456,
1464
+ "learning_rate": 6.288824584103816e-05,
1465
+ "loss": 1.0592,
1466
+ "step": 1040
1467
+ },
1468
+ {
1469
+ "epoch": 0.6438693776956254,
1470
+ "grad_norm": 4.921570777893066,
1471
+ "learning_rate": 6.194364949112953e-05,
1472
+ "loss": 1.0116,
1473
+ "step": 1045
1474
+ },
1475
+ {
1476
+ "epoch": 0.6469500924214417,
1477
+ "grad_norm": 2.320974111557007,
1478
+ "learning_rate": 6.100300733414474e-05,
1479
+ "loss": 0.8504,
1480
+ "step": 1050
1481
+ },
1482
+ {
1483
+ "epoch": 0.6500308071472581,
1484
+ "grad_norm": 1.6045787334442139,
1485
+ "learning_rate": 6.0066417106217455e-05,
1486
+ "loss": 1.039,
1487
+ "step": 1055
1488
+ },
1489
+ {
1490
+ "epoch": 0.6531115218730745,
1491
+ "grad_norm": 1.4648897647857666,
1492
+ "learning_rate": 5.9133976122471214e-05,
1493
+ "loss": 1.0437,
1494
+ "step": 1060
1495
+ },
1496
+ {
1497
+ "epoch": 0.6561922365988909,
1498
+ "grad_norm": 1.1157406568527222,
1499
+ "learning_rate": 5.82057812669081e-05,
1500
+ "loss": 1.0864,
1501
+ "step": 1065
1502
+ },
1503
+ {
1504
+ "epoch": 0.6592729513247073,
1505
+ "grad_norm": 1.9730424880981445,
1506
+ "learning_rate": 5.728192898234195e-05,
1507
+ "loss": 0.9514,
1508
+ "step": 1070
1509
+ },
1510
+ {
1511
+ "epoch": 0.6623536660505237,
1512
+ "grad_norm": 2.416600465774536,
1513
+ "learning_rate": 5.6362515260377835e-05,
1514
+ "loss": 0.8168,
1515
+ "step": 1075
1516
+ },
1517
+ {
1518
+ "epoch": 0.6654343807763401,
1519
+ "grad_norm": 1.5502610206604004,
1520
+ "learning_rate": 5.544763563143793e-05,
1521
+ "loss": 1.0327,
1522
+ "step": 1080
1523
+ },
1524
+ {
1525
+ "epoch": 0.6685150955021565,
1526
+ "grad_norm": 1.3881441354751587,
1527
+ "learning_rate": 5.4537385154835864e-05,
1528
+ "loss": 1.0939,
1529
+ "step": 1085
1530
+ },
1531
+ {
1532
+ "epoch": 0.6715958102279729,
1533
+ "grad_norm": 1.1561568975448608,
1534
+ "learning_rate": 5.363185840889935e-05,
1535
+ "loss": 0.9814,
1536
+ "step": 1090
1537
+ },
1538
+ {
1539
+ "epoch": 0.6746765249537893,
1540
+ "grad_norm": 2.810972213745117,
1541
+ "learning_rate": 5.273114948114346e-05,
1542
+ "loss": 1.1126,
1543
+ "step": 1095
1544
+ },
1545
+ {
1546
+ "epoch": 0.6777572396796057,
1547
+ "grad_norm": 2.332209587097168,
1548
+ "learning_rate": 5.1835351958494515e-05,
1549
+ "loss": 0.8299,
1550
+ "step": 1100
1551
+ },
1552
+ {
1553
+ "epoch": 0.680837954405422,
1554
+ "grad_norm": 1.4874842166900635,
1555
+ "learning_rate": 5.094455891756587e-05,
1556
+ "loss": 1.0172,
1557
+ "step": 1105
1558
+ },
1559
+ {
1560
+ "epoch": 0.6839186691312384,
1561
+ "grad_norm": 1.3436095714569092,
1562
+ "learning_rate": 5.00588629149872e-05,
1563
+ "loss": 1.1171,
1564
+ "step": 1110
1565
+ },
1566
+ {
1567
+ "epoch": 0.6869993838570548,
1568
+ "grad_norm": 1.2062082290649414,
1569
+ "learning_rate": 4.91783559777873e-05,
1570
+ "loss": 0.9896,
1571
+ "step": 1115
1572
+ },
1573
+ {
1574
+ "epoch": 0.6900800985828712,
1575
+ "grad_norm": 2.3433315753936768,
1576
+ "learning_rate": 4.830312959383238e-05,
1577
+ "loss": 0.9553,
1578
+ "step": 1120
1579
+ },
1580
+ {
1581
+ "epoch": 0.6931608133086876,
1582
+ "grad_norm": 1.8677462339401245,
1583
+ "learning_rate": 4.7433274702319815e-05,
1584
+ "loss": 0.7778,
1585
+ "step": 1125
1586
+ },
1587
+ {
1588
+ "epoch": 0.696241528034504,
1589
+ "grad_norm": 1.5704991817474365,
1590
+ "learning_rate": 4.656888168432962e-05,
1591
+ "loss": 1.0221,
1592
+ "step": 1130
1593
+ },
1594
+ {
1595
+ "epoch": 0.6993222427603204,
1596
+ "grad_norm": 1.4350054264068604,
1597
+ "learning_rate": 4.571004035343315e-05,
1598
+ "loss": 1.0651,
1599
+ "step": 1135
1600
+ },
1601
+ {
1602
+ "epoch": 0.7024029574861368,
1603
+ "grad_norm": 1.0175029039382935,
1604
+ "learning_rate": 4.485683994636144e-05,
1605
+ "loss": 1.0022,
1606
+ "step": 1140
1607
+ },
1608
+ {
1609
+ "epoch": 0.7054836722119532,
1610
+ "grad_norm": 2.640902519226074,
1611
+ "learning_rate": 4.400936911373308e-05,
1612
+ "loss": 1.0825,
1613
+ "step": 1145
1614
+ },
1615
+ {
1616
+ "epoch": 0.7085643869377696,
1617
+ "grad_norm": 2.244054079055786,
1618
+ "learning_rate": 4.3167715910842966e-05,
1619
+ "loss": 0.8401,
1620
+ "step": 1150
1621
+ },
1622
+ {
1623
+ "epoch": 0.711645101663586,
1624
+ "grad_norm": 1.691078543663025,
1625
+ "learning_rate": 4.2331967788513295e-05,
1626
+ "loss": 1.0047,
1627
+ "step": 1155
1628
+ },
1629
+ {
1630
+ "epoch": 0.7147258163894024,
1631
+ "grad_norm": 1.4527450799942017,
1632
+ "learning_rate": 4.1502211584006836e-05,
1633
+ "loss": 1.0024,
1634
+ "step": 1160
1635
+ },
1636
+ {
1637
+ "epoch": 0.7178065311152187,
1638
+ "grad_norm": 1.0902256965637207,
1639
+ "learning_rate": 4.067853351200446e-05,
1640
+ "loss": 1.0983,
1641
+ "step": 1165
1642
+ },
1643
+ {
1644
+ "epoch": 0.7208872458410351,
1645
+ "grad_norm": 2.0624313354492188,
1646
+ "learning_rate": 3.986101915564695e-05,
1647
+ "loss": 0.9629,
1648
+ "step": 1170
1649
+ },
1650
+ {
1651
+ "epoch": 0.7239679605668515,
1652
+ "grad_norm": 2.8187415599823,
1653
+ "learning_rate": 3.904975345764262e-05,
1654
+ "loss": 0.8496,
1655
+ "step": 1175
1656
+ },
1657
+ {
1658
+ "epoch": 0.7270486752926679,
1659
+ "grad_norm": 1.491103172302246,
1660
+ "learning_rate": 3.824482071144163e-05,
1661
+ "loss": 1.033,
1662
+ "step": 1180
1663
+ },
1664
+ {
1665
+ "epoch": 0.7301293900184843,
1666
+ "grad_norm": 1.5210243463516235,
1667
+ "learning_rate": 3.744630455247739e-05,
1668
+ "loss": 1.0317,
1669
+ "step": 1185
1670
+ },
1671
+ {
1672
+ "epoch": 0.7332101047443007,
1673
+ "grad_norm": 1.0921562910079956,
1674
+ "learning_rate": 3.6654287949476626e-05,
1675
+ "loss": 1.0313,
1676
+ "step": 1190
1677
+ },
1678
+ {
1679
+ "epoch": 0.7362908194701171,
1680
+ "grad_norm": 2.3343021869659424,
1681
+ "learning_rate": 3.586885319583858e-05,
1682
+ "loss": 1.0512,
1683
+ "step": 1195
1684
+ },
1685
+ {
1686
+ "epoch": 0.7393715341959335,
1687
+ "grad_norm": 2.3604445457458496,
1688
+ "learning_rate": 3.5090081901084525e-05,
1689
+ "loss": 0.845,
1690
+ "step": 1200
1691
+ },
1692
+ {
1693
+ "epoch": 0.7424522489217499,
1694
+ "grad_norm": 1.939579963684082,
1695
+ "learning_rate": 3.431805498237808e-05,
1696
+ "loss": 1.0898,
1697
+ "step": 1205
1698
+ },
1699
+ {
1700
+ "epoch": 0.7455329636475663,
1701
+ "grad_norm": 1.371645212173462,
1702
+ "learning_rate": 3.355285265611784e-05,
1703
+ "loss": 1.0914,
1704
+ "step": 1210
1705
+ },
1706
+ {
1707
+ "epoch": 0.7486136783733827,
1708
+ "grad_norm": 1.1028584241867065,
1709
+ "learning_rate": 3.279455442960238e-05,
1710
+ "loss": 1.0227,
1711
+ "step": 1215
1712
+ },
1713
+ {
1714
+ "epoch": 0.751694393099199,
1715
+ "grad_norm": 2.2882368564605713,
1716
+ "learning_rate": 3.204323909276924e-05,
1717
+ "loss": 1.0516,
1718
+ "step": 1220
1719
+ },
1720
+ {
1721
+ "epoch": 0.7547751078250154,
1722
+ "grad_norm": 1.9783731698989868,
1723
+ "learning_rate": 3.1298984710008484e-05,
1724
+ "loss": 0.8078,
1725
+ "step": 1225
1726
+ },
1727
+ {
1728
+ "epoch": 0.7578558225508318,
1729
+ "grad_norm": 1.7184314727783203,
1730
+ "learning_rate": 3.056186861205136e-05,
1731
+ "loss": 0.9343,
1732
+ "step": 1230
1733
+ },
1734
+ {
1735
+ "epoch": 0.7609365372766482,
1736
+ "grad_norm": 1.4005725383758545,
1737
+ "learning_rate": 2.9831967387935467e-05,
1738
+ "loss": 1.0701,
1739
+ "step": 1235
1740
+ },
1741
+ {
1742
+ "epoch": 0.7640172520024646,
1743
+ "grad_norm": 1.1493009328842163,
1744
+ "learning_rate": 2.9109356877046712e-05,
1745
+ "loss": 0.879,
1746
+ "step": 1240
1747
+ },
1748
+ {
1749
+ "epoch": 0.767097966728281,
1750
+ "grad_norm": 3.0429165363311768,
1751
+ "learning_rate": 2.8394112161239605e-05,
1752
+ "loss": 1.0175,
1753
+ "step": 1245
1754
+ },
1755
+ {
1756
+ "epoch": 0.7701786814540974,
1757
+ "grad_norm": 2.5007412433624268,
1758
+ "learning_rate": 2.7686307557035685e-05,
1759
+ "loss": 0.794,
1760
+ "step": 1250
1761
+ },
1762
+ {
1763
+ "epoch": 0.7732593961799138,
1764
+ "grad_norm": 1.3818840980529785,
1765
+ "learning_rate": 2.6986016607901908e-05,
1766
+ "loss": 0.9536,
1767
+ "step": 1255
1768
+ },
1769
+ {
1770
+ "epoch": 0.7763401109057301,
1771
+ "grad_norm": 1.3459044694900513,
1772
+ "learning_rate": 2.629331207660931e-05,
1773
+ "loss": 0.9825,
1774
+ "step": 1260
1775
+ },
1776
+ {
1777
+ "epoch": 0.7794208256315465,
1778
+ "grad_norm": 1.134709119796753,
1779
+ "learning_rate": 2.5608265937672436e-05,
1780
+ "loss": 1.0014,
1781
+ "step": 1265
1782
+ },
1783
+ {
1784
+ "epoch": 0.7825015403573629,
1785
+ "grad_norm": 2.850844383239746,
1786
+ "learning_rate": 2.4930949369871203e-05,
1787
+ "loss": 1.0176,
1788
+ "step": 1270
1789
+ },
1790
+ {
1791
+ "epoch": 0.7855822550831792,
1792
+ "grad_norm": 3.2245547771453857,
1793
+ "learning_rate": 2.426143274885493e-05,
1794
+ "loss": 0.7798,
1795
+ "step": 1275
1796
+ },
1797
+ {
1798
+ "epoch": 0.7886629698089956,
1799
+ "grad_norm": 1.8153865337371826,
1800
+ "learning_rate": 2.359978563983022e-05,
1801
+ "loss": 0.9403,
1802
+ "step": 1280
1803
+ },
1804
+ {
1805
+ "epoch": 0.791743684534812,
1806
+ "grad_norm": 1.3627381324768066,
1807
+ "learning_rate": 2.2946076790332827e-05,
1808
+ "loss": 0.9362,
1809
+ "step": 1285
1810
+ },
1811
+ {
1812
+ "epoch": 0.7948243992606284,
1813
+ "grad_norm": 1.1612952947616577,
1814
+ "learning_rate": 2.2300374123084522e-05,
1815
+ "loss": 0.8806,
1816
+ "step": 1290
1817
+ },
1818
+ {
1819
+ "epoch": 0.7979051139864448,
1820
+ "grad_norm": 2.919231414794922,
1821
+ "learning_rate": 2.166274472893567e-05,
1822
+ "loss": 0.9592,
1823
+ "step": 1295
1824
+ },
1825
+ {
1826
+ "epoch": 0.8009858287122612,
1827
+ "grad_norm": 2.081076145172119,
1828
+ "learning_rate": 2.1033254859894226e-05,
1829
+ "loss": 0.7639,
1830
+ "step": 1300
1831
+ },
1832
+ {
1833
+ "epoch": 0.8040665434380776,
1834
+ "grad_norm": 1.3334945440292358,
1835
+ "learning_rate": 2.041196992224206e-05,
1836
+ "loss": 0.9676,
1837
+ "step": 1305
1838
+ },
1839
+ {
1840
+ "epoch": 0.807147258163894,
1841
+ "grad_norm": 1.4571508169174194,
1842
+ "learning_rate": 1.9798954469738762e-05,
1843
+ "loss": 0.9551,
1844
+ "step": 1310
1845
+ },
1846
+ {
1847
+ "epoch": 0.8102279728897104,
1848
+ "grad_norm": 1.0584170818328857,
1849
+ "learning_rate": 1.919427219691453e-05,
1850
+ "loss": 1.0136,
1851
+ "step": 1315
1852
+ },
1853
+ {
1854
+ "epoch": 0.8133086876155268,
1855
+ "grad_norm": 2.4761879444122314,
1856
+ "learning_rate": 1.8597985932451856e-05,
1857
+ "loss": 0.9425,
1858
+ "step": 1320
1859
+ },
1860
+ {
1861
+ "epoch": 0.8163894023413432,
1862
+ "grad_norm": 2.397019863128662,
1863
+ "learning_rate": 1.8010157632657543e-05,
1864
+ "loss": 0.7703,
1865
+ "step": 1325
1866
+ },
1867
+ {
1868
+ "epoch": 0.8194701170671596,
1869
+ "grad_norm": 1.5761799812316895,
1870
+ "learning_rate": 1.7430848375025176e-05,
1871
+ "loss": 1.0024,
1872
+ "step": 1330
1873
+ },
1874
+ {
1875
+ "epoch": 0.822550831792976,
1876
+ "grad_norm": 1.4736329317092896,
1877
+ "learning_rate": 1.686011835188891e-05,
1878
+ "loss": 1.0252,
1879
+ "step": 1335
1880
+ },
1881
+ {
1882
+ "epoch": 0.8256315465187923,
1883
+ "grad_norm": 0.9777911305427551,
1884
+ "learning_rate": 1.6298026864169335e-05,
1885
+ "loss": 1.0137,
1886
+ "step": 1340
1887
+ },
1888
+ {
1889
+ "epoch": 0.8287122612446087,
1890
+ "grad_norm": 2.404789686203003,
1891
+ "learning_rate": 1.5744632315211815e-05,
1892
+ "loss": 0.9795,
1893
+ "step": 1345
1894
+ },
1895
+ {
1896
+ "epoch": 0.8317929759704251,
1897
+ "grad_norm": 2.4838790893554688,
1898
+ "learning_rate": 1.5199992204718294e-05,
1899
+ "loss": 0.8058,
1900
+ "step": 1350
1901
+ },
1902
+ {
1903
+ "epoch": 0.8348736906962415,
1904
+ "grad_norm": 1.6645960807800293,
1905
+ "learning_rate": 1.4664163122772689e-05,
1906
+ "loss": 0.9304,
1907
+ "step": 1355
1908
+ },
1909
+ {
1910
+ "epoch": 0.8379544054220579,
1911
+ "grad_norm": 1.737648367881775,
1912
+ "learning_rate": 1.4137200743961188e-05,
1913
+ "loss": 0.9923,
1914
+ "step": 1360
1915
+ },
1916
+ {
1917
+ "epoch": 0.8410351201478743,
1918
+ "grad_norm": 1.0121675729751587,
1919
+ "learning_rate": 1.3619159821587235e-05,
1920
+ "loss": 0.9683,
1921
+ "step": 1365
1922
+ },
1923
+ {
1924
+ "epoch": 0.8441158348736907,
1925
+ "grad_norm": 2.5697550773620605,
1926
+ "learning_rate": 1.3110094181982657e-05,
1927
+ "loss": 0.9893,
1928
+ "step": 1370
1929
+ },
1930
+ {
1931
+ "epoch": 0.8471965495995071,
1932
+ "grad_norm": 2.4901211261749268,
1933
+ "learning_rate": 1.261005671891482e-05,
1934
+ "loss": 0.8361,
1935
+ "step": 1375
1936
+ },
1937
+ {
1938
+ "epoch": 0.8502772643253235,
1939
+ "grad_norm": 2.0964717864990234,
1940
+ "learning_rate": 1.2119099388090716e-05,
1941
+ "loss": 1.0073,
1942
+ "step": 1380
1943
+ },
1944
+ {
1945
+ "epoch": 0.8533579790511399,
1946
+ "grad_norm": 1.3931251764297485,
1947
+ "learning_rate": 1.1637273201758748e-05,
1948
+ "loss": 0.9662,
1949
+ "step": 1385
1950
+ },
1951
+ {
1952
+ "epoch": 0.8564386937769563,
1953
+ "grad_norm": 1.1147106885910034,
1954
+ "learning_rate": 1.1164628223408168e-05,
1955
+ "loss": 0.9058,
1956
+ "step": 1390
1957
+ },
1958
+ {
1959
+ "epoch": 0.8595194085027726,
1960
+ "grad_norm": 2.7236523628234863,
1961
+ "learning_rate": 1.0701213562567492e-05,
1962
+ "loss": 0.9428,
1963
+ "step": 1395
1964
+ },
1965
+ {
1966
+ "epoch": 0.862600123228589,
1967
+ "grad_norm": 2.7776854038238525,
1968
+ "learning_rate": 1.0247077369701653e-05,
1969
+ "loss": 0.7677,
1970
+ "step": 1400
1971
+ },
1972
+ {
1973
+ "epoch": 0.8656808379544054,
1974
+ "grad_norm": 1.4170615673065186,
1975
+ "learning_rate": 9.802266831209206e-06,
1976
+ "loss": 0.933,
1977
+ "step": 1405
1978
+ },
1979
+ {
1980
+ "epoch": 0.8687615526802218,
1981
+ "grad_norm": 1.3592872619628906,
1982
+ "learning_rate": 9.366828164519258e-06,
1983
+ "loss": 0.9072,
1984
+ "step": 1410
1985
+ },
1986
+ {
1987
+ "epoch": 0.8718422674060382,
1988
+ "grad_norm": 1.1270769834518433,
1989
+ "learning_rate": 8.940806613289498e-06,
1990
+ "loss": 1.0238,
1991
+ "step": 1415
1992
+ },
1993
+ {
1994
+ "epoch": 0.8749229821318546,
1995
+ "grad_norm": 2.6970269680023193,
1996
+ "learning_rate": 8.524246442705153e-06,
1997
+ "loss": 0.8856,
1998
+ "step": 1420
1999
+ },
2000
+ {
2001
+ "epoch": 0.878003696857671,
2002
+ "grad_norm": 3.5226802825927734,
2003
+ "learning_rate": 8.117190934879593e-06,
2004
+ "loss": 0.7973,
2005
+ "step": 1425
2006
+ },
2007
+ {
2008
+ "epoch": 0.8810844115834874,
2009
+ "grad_norm": 1.69875967502594,
2010
+ "learning_rate": 7.719682384357308e-06,
2011
+ "loss": 0.9935,
2012
+ "step": 1430
2013
+ },
2014
+ {
2015
+ "epoch": 0.8841651263093038,
2016
+ "grad_norm": 1.3742725849151611,
2017
+ "learning_rate": 7.33176209371923e-06,
2018
+ "loss": 0.9538,
2019
+ "step": 1435
2020
+ },
2021
+ {
2022
+ "epoch": 0.8872458410351202,
2023
+ "grad_norm": 1.2222424745559692,
2024
+ "learning_rate": 6.953470369291348e-06,
2025
+ "loss": 1.0136,
2026
+ "step": 1440
2027
+ },
2028
+ {
2029
+ "epoch": 0.8903265557609366,
2030
+ "grad_norm": 2.7292568683624268,
2031
+ "learning_rate": 6.5848465169566e-06,
2032
+ "loss": 1.0357,
2033
+ "step": 1445
2034
+ },
2035
+ {
2036
+ "epoch": 0.893407270486753,
2037
+ "grad_norm": 2.0563595294952393,
2038
+ "learning_rate": 6.225928838071016e-06,
2039
+ "loss": 0.7719,
2040
+ "step": 1450
2041
+ },
2042
+ {
2043
+ "epoch": 0.8964879852125693,
2044
+ "grad_norm": 1.679767370223999,
2045
+ "learning_rate": 5.876754625483904e-06,
2046
+ "loss": 0.8551,
2047
+ "step": 1455
2048
+ },
2049
+ {
2050
+ "epoch": 0.8995686999383857,
2051
+ "grad_norm": 1.287404179573059,
2052
+ "learning_rate": 5.537360159663108e-06,
2053
+ "loss": 0.9694,
2054
+ "step": 1460
2055
+ },
2056
+ {
2057
+ "epoch": 0.9026494146642021,
2058
+ "grad_norm": 1.1878061294555664,
2059
+ "learning_rate": 5.207780704925314e-06,
2060
+ "loss": 0.9334,
2061
+ "step": 1465
2062
+ },
2063
+ {
2064
+ "epoch": 0.9057301293900185,
2065
+ "grad_norm": 2.4936001300811768,
2066
+ "learning_rate": 4.888050505771868e-06,
2067
+ "loss": 1.0187,
2068
+ "step": 1470
2069
+ },
2070
+ {
2071
+ "epoch": 0.9088108441158349,
2072
+ "grad_norm": 2.331899642944336,
2073
+ "learning_rate": 4.578202783330799e-06,
2074
+ "loss": 0.8009,
2075
+ "step": 1475
2076
+ },
2077
+ {
2078
+ "epoch": 0.9118915588416513,
2079
+ "grad_norm": 1.3719042539596558,
2080
+ "learning_rate": 4.2782697319048605e-06,
2081
+ "loss": 0.9427,
2082
+ "step": 1480
2083
+ },
2084
+ {
2085
+ "epoch": 0.9149722735674677,
2086
+ "grad_norm": 1.3383433818817139,
2087
+ "learning_rate": 3.988282515626585e-06,
2088
+ "loss": 0.8948,
2089
+ "step": 1485
2090
+ },
2091
+ {
2092
+ "epoch": 0.9180529882932841,
2093
+ "grad_norm": 1.298445701599121,
2094
+ "learning_rate": 3.7082712652200867e-06,
2095
+ "loss": 0.9136,
2096
+ "step": 1490
2097
+ },
2098
+ {
2099
+ "epoch": 0.9211337030191005,
2100
+ "grad_norm": 2.5467193126678467,
2101
+ "learning_rate": 3.438265074870417e-06,
2102
+ "loss": 1.0515,
2103
+ "step": 1495
2104
+ },
2105
+ {
2106
+ "epoch": 0.9242144177449169,
2107
+ "grad_norm": 2.346221923828125,
2108
+ "learning_rate": 3.1782919992006333e-06,
2109
+ "loss": 0.7778,
2110
+ "step": 1500
2111
+ },
2112
+ {
2113
+ "epoch": 0.9272951324707333,
2114
+ "grad_norm": 1.5698750019073486,
2115
+ "learning_rate": 2.9283790503567222e-06,
2116
+ "loss": 1.041,
2117
+ "step": 1505
2118
+ },
2119
+ {
2120
+ "epoch": 0.9303758471965496,
2121
+ "grad_norm": 1.280504822731018,
2122
+ "learning_rate": 2.6885521952010105e-06,
2123
+ "loss": 0.9829,
2124
+ "step": 1510
2125
+ },
2126
+ {
2127
+ "epoch": 0.933456561922366,
2128
+ "grad_norm": 1.1187325716018677,
2129
+ "learning_rate": 2.458836352614069e-06,
2130
+ "loss": 0.9059,
2131
+ "step": 1515
2132
+ },
2133
+ {
2134
+ "epoch": 0.9365372766481824,
2135
+ "grad_norm": 2.8140485286712646,
2136
+ "learning_rate": 2.239255390905581e-06,
2137
+ "loss": 0.959,
2138
+ "step": 1520
2139
+ },
2140
+ {
2141
+ "epoch": 0.9396179913739988,
2142
+ "grad_norm": 2.5837156772613525,
2143
+ "learning_rate": 2.029832125334319e-06,
2144
+ "loss": 0.7814,
2145
+ "step": 1525
2146
+ },
2147
+ {
2148
+ "epoch": 0.9426987060998152,
2149
+ "grad_norm": 1.5361571311950684,
2150
+ "learning_rate": 1.8305883157375804e-06,
2151
+ "loss": 0.9064,
2152
+ "step": 1530
2153
+ },
2154
+ {
2155
+ "epoch": 0.9457794208256316,
2156
+ "grad_norm": 1.3485791683197021,
2157
+ "learning_rate": 1.6415446642702337e-06,
2158
+ "loss": 1.0203,
2159
+ "step": 1535
2160
+ },
2161
+ {
2162
+ "epoch": 0.9488601355514479,
2163
+ "grad_norm": 1.1599912643432617,
2164
+ "learning_rate": 1.462720813253682e-06,
2165
+ "loss": 0.9649,
2166
+ "step": 1540
2167
+ },
2168
+ {
2169
+ "epoch": 0.9519408502772643,
2170
+ "grad_norm": 2.5785133838653564,
2171
+ "learning_rate": 1.2941353431350056e-06,
2172
+ "loss": 0.942,
2173
+ "step": 1545
2174
+ },
2175
+ {
2176
+ "epoch": 0.9550215650030807,
2177
+ "grad_norm": 2.7626349925994873,
2178
+ "learning_rate": 1.135805770556364e-06,
2179
+ "loss": 0.8007,
2180
+ "step": 1550
2181
+ },
2182
+ {
2183
+ "epoch": 0.958102279728897,
2184
+ "grad_norm": 1.4761985540390015,
2185
+ "learning_rate": 9.877485465349058e-07,
2186
+ "loss": 0.98,
2187
+ "step": 1555
2188
+ },
2189
+ {
2190
+ "epoch": 0.9611829944547134,
2191
+ "grad_norm": 1.375490665435791,
2192
+ "learning_rate": 8.499790547535025e-07,
2193
+ "loss": 0.8888,
2194
+ "step": 1560
2195
+ },
2196
+ {
2197
+ "epoch": 0.9642637091805298,
2198
+ "grad_norm": 1.0085694789886475,
2199
+ "learning_rate": 7.225116099623286e-07,
2200
+ "loss": 0.8864,
2201
+ "step": 1565
2202
+ },
2203
+ {
2204
+ "epoch": 0.9673444239063462,
2205
+ "grad_norm": 2.4306185245513916,
2206
+ "learning_rate": 6.053594564914611e-07,
2207
+ "loss": 0.978,
2208
+ "step": 1570
2209
+ },
2210
+ {
2211
+ "epoch": 0.9704251386321626,
2212
+ "grad_norm": 2.7361271381378174,
2213
+ "learning_rate": 4.985347668747809e-07,
2214
+ "loss": 0.7762,
2215
+ "step": 1575
2216
+ },
2217
+ {
2218
+ "epoch": 0.973505853357979,
2219
+ "grad_norm": 1.6107145547866821,
2220
+ "learning_rate": 4.0204864058522864e-07,
2221
+ "loss": 1.0036,
2222
+ "step": 1580
2223
+ },
2224
+ {
2225
+ "epoch": 0.9765865680837954,
2226
+ "grad_norm": 1.0771360397338867,
2227
+ "learning_rate": 3.15911102881461e-07,
2228
+ "loss": 1.0982,
2229
+ "step": 1585
2230
+ },
2231
+ {
2232
+ "epoch": 0.9796672828096118,
2233
+ "grad_norm": 1.1698962450027466,
2234
+ "learning_rate": 2.40131103766239e-07,
2235
+ "loss": 0.9256,
2236
+ "step": 1590
2237
+ },
2238
+ {
2239
+ "epoch": 0.9827479975354282,
2240
+ "grad_norm": 2.6601476669311523,
2241
+ "learning_rate": 1.747165170564724e-07,
2242
+ "loss": 1.0309,
2243
+ "step": 1595
2244
+ },
2245
+ {
2246
+ "epoch": 0.9858287122612446,
2247
+ "grad_norm": 2.3130533695220947,
2248
+ "learning_rate": 1.1967413956510686e-07,
2249
+ "loss": 0.8133,
2250
+ "step": 1600
2251
+ },
2252
+ {
2253
+ "epoch": 0.988909426987061,
2254
+ "grad_norm": 1.792394757270813,
2255
+ "learning_rate": 7.500969039491157e-08,
2256
+ "loss": 0.9367,
2257
+ "step": 1605
2258
+ },
2259
+ {
2260
+ "epoch": 0.9919901417128774,
2261
+ "grad_norm": 1.530282974243164,
2262
+ "learning_rate": 4.0727810344254325e-08,
2263
+ "loss": 1.0131,
2264
+ "step": 1610
2265
+ },
2266
+ {
2267
+ "epoch": 0.9950708564386938,
2268
+ "grad_norm": 1.0605932474136353,
2269
+ "learning_rate": 1.6832061424865153e-08,
2270
+ "loss": 0.9065,
2271
+ "step": 1615
2272
+ },
2273
+ {
2274
+ "epoch": 0.9981515711645101,
2275
+ "grad_norm": 1.6831053495407104,
2276
+ "learning_rate": 3.3249264917878387e-09,
2277
+ "loss": 0.8729,
2278
+ "step": 1620
2279
+ },
2280
+ {
2281
+ "epoch": 1.0,
2282
+ "step": 1623,
2283
+ "total_flos": 113508824776704.0,
2284
+ "train_loss": 1.074277332074394,
2285
+ "train_runtime": 22414.5784,
2286
+ "train_samples_per_second": 2.317,
2287
+ "train_steps_per_second": 0.072
2288
+ }
2289
+ ],
2290
+ "logging_steps": 5,
2291
+ "max_steps": 1623,
2292
+ "num_input_tokens_seen": 0,
2293
+ "num_train_epochs": 1,
2294
+ "save_steps": 500,
2295
+ "stateful_callbacks": {
2296
+ "TrainerControl": {
2297
+ "args": {
2298
+ "should_epoch_stop": false,
2299
+ "should_evaluate": false,
2300
+ "should_log": false,
2301
+ "should_save": true,
2302
+ "should_training_stop": true
2303
+ },
2304
+ "attributes": {}
2305
+ }
2306
+ },
2307
+ "total_flos": 113508824776704.0,
2308
+ "train_batch_size": 2,
2309
+ "trial_name": null,
2310
+ "trial_params": null
2311
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83d3c6130a8f7a9da0bc74df8ce7b85d707c21add10a8b8134c1a8b57eafda65
3
+ size 7864
training_loss.png ADDED