sizhkhy committed on
Commit
f34bc76
·
verified ·
1 Parent(s): 7601da0
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  library_name: peft
3
  license: other
4
- base_model: unsloth/Llama-3.2-3B-Instruct
5
  tags:
6
  - llama-factory
7
  - lora
@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  This model is a fine-tuned version of [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) on the Klystroglobal dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.0174
23
 
24
  ## Model description
25
 
@@ -51,16 +51,15 @@ The following hyperparameters were used during training:
51
 
52
  | Training Loss | Epoch | Step | Validation Loss |
53
  |:-------------:|:-----:|:----:|:---------------:|
54
- | 0.0437 | 2.5 | 25 | 0.0340 |
55
- | 0.0098 | 5.0 | 50 | 0.0166 |
56
- | 0.0039 | 7.5 | 75 | 0.0165 |
57
- | 0.0021 | 10.0 | 100 | 0.0174 |
58
- | 0.0031 | 12.5 | 125 | 0.0145 |
59
- | 0.0022 | 15.0 | 150 | 0.0235 |
60
- | 0.0013 | 17.5 | 175 | 0.0187 |
61
- | 0.0012 | 20.0 | 200 | 0.0246 |
62
- | 0.0013 | 22.5 | 225 | 0.0264 |
63
- | 0.0013 | 25.0 | 250 | 0.0277 |
64
 
65
 
66
  ### Framework versions
 
1
  ---
2
  library_name: peft
3
  license: other
4
+ base_model: unsloth/llama-3.2-3b-instruct-bnb-4bit
5
  tags:
6
  - llama-factory
7
  - lora
 
19
 
20
  This model is a fine-tuned version of [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) on the Klystroglobal dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.0154
23
 
24
  ## Model description
25
 
 
51
 
52
  | Training Loss | Epoch | Step | Validation Loss |
53
  |:-------------:|:-----:|:----:|:---------------:|
54
+ | 0.0459 | 2.5 | 25 | 0.0362 |
55
+ | 0.0083 | 5.0 | 50 | 0.0184 |
56
+ | 0.0058 | 7.5 | 75 | 0.0171 |
57
+ | 0.0024 | 10.0 | 100 | 0.0154 |
58
+ | 0.0015 | 12.5 | 125 | 0.0164 |
59
+ | 0.0038 | 15.0 | 150 | 0.0210 |
60
+ | 0.0013 | 17.5 | 175 | 0.0282 |
61
+ | 0.0012 | 20.0 | 200 | 0.0279 |
62
+ | 0.0013 | 22.5 | 225 | 0.0306 |
 
63
 
64
 
65
  ### Framework versions
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
- "up_proj",
25
- "gate_proj",
26
  "down_proj",
27
- "v_proj",
28
  "q_proj",
29
- "o_proj"
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-bnb-4bit",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "o_proj",
 
 
24
  "down_proj",
 
25
  "q_proj",
26
+ "gate_proj",
27
+ "k_proj",
28
+ "up_proj",
29
+ "v_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e80bf440e224538c0d7fdaa085ba987418951442628ee238b39a518393c3690b
3
  size 1556140392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:172b5137374a28cd4b588ecaa929dd89904d9e89cc6a384fb463cb0d97bbdcc2
3
  size 1556140392
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 25.0,
3
- "eval_loss": 0.017384245991706848,
4
- "eval_runtime": 7.0041,
5
- "eval_samples_per_second": 14.277,
6
  "eval_steps_per_second": 0.428,
7
- "total_flos": 3.832789293855867e+17,
8
- "train_loss": 0.011958676076494158,
9
- "train_runtime": 2822.6775,
10
- "train_samples_per_second": 5.617,
11
- "train_steps_per_second": 0.124
12
  }
 
1
  {
2
+ "epoch": 22.5,
3
+ "eval_loss": 0.01536454726010561,
4
+ "eval_runtime": 7.0144,
5
+ "eval_samples_per_second": 14.256,
6
  "eval_steps_per_second": 0.428,
7
+ "total_flos": 3.454694940505375e+17,
8
+ "train_loss": 0.013378481343388558,
9
+ "train_runtime": 2354.7794,
10
+ "train_samples_per_second": 6.733,
11
+ "train_steps_per_second": 0.149
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 25.0,
3
- "eval_loss": 0.017384245991706848,
4
- "eval_runtime": 7.0041,
5
- "eval_samples_per_second": 14.277,
6
  "eval_steps_per_second": 0.428
7
  }
 
1
  {
2
+ "epoch": 22.5,
3
+ "eval_loss": 0.01536454726010561,
4
+ "eval_runtime": 7.0144,
5
+ "eval_samples_per_second": 14.256,
6
  "eval_steps_per_second": 0.428
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 25.0,
3
- "total_flos": 3.832789293855867e+17,
4
- "train_loss": 0.011958676076494158,
5
- "train_runtime": 2822.6775,
6
- "train_samples_per_second": 5.617,
7
- "train_steps_per_second": 0.124
8
  }
 
1
  {
2
+ "epoch": 22.5,
3
+ "total_flos": 3.454694940505375e+17,
4
+ "train_loss": 0.013378481343388558,
5
+ "train_runtime": 2354.7794,
6
+ "train_samples_per_second": 6.733,
7
+ "train_steps_per_second": 0.149
8
  }
trainer_log.jsonl CHANGED
@@ -1,261 +1,235 @@
1
- {"current_steps": 1, "total_steps": 350, "loss": 0.1531, "lr": 2.8571428571428573e-06, "epoch": 0.1, "percentage": 0.29, "elapsed_time": "0:00:27", "remaining_time": "2:38:08"}
2
- {"current_steps": 2, "total_steps": 350, "loss": 0.1308, "lr": 5.7142857142857145e-06, "epoch": 0.2, "percentage": 0.57, "elapsed_time": "0:00:45", "remaining_time": "2:11:52"}
3
- {"current_steps": 3, "total_steps": 350, "loss": 0.1619, "lr": 8.571428571428573e-06, "epoch": 0.3, "percentage": 0.86, "elapsed_time": "0:01:04", "remaining_time": "2:04:36"}
4
- {"current_steps": 4, "total_steps": 350, "loss": 0.1325, "lr": 1.1428571428571429e-05, "epoch": 0.4, "percentage": 1.14, "elapsed_time": "0:01:17", "remaining_time": "1:51:35"}
5
- {"current_steps": 5, "total_steps": 350, "loss": 0.1206, "lr": 1.4285714285714285e-05, "epoch": 0.5, "percentage": 1.43, "elapsed_time": "0:01:27", "remaining_time": "1:40:58"}
6
- {"current_steps": 6, "total_steps": 350, "loss": 0.0875, "lr": 1.7142857142857145e-05, "epoch": 0.6, "percentage": 1.71, "elapsed_time": "0:01:38", "remaining_time": "1:33:50"}
7
- {"current_steps": 7, "total_steps": 350, "loss": 0.1067, "lr": 2e-05, "epoch": 0.7, "percentage": 2.0, "elapsed_time": "0:02:04", "remaining_time": "1:41:17"}
8
- {"current_steps": 8, "total_steps": 350, "loss": 0.0966, "lr": 2.2857142857142858e-05, "epoch": 0.8, "percentage": 2.29, "elapsed_time": "0:02:14", "remaining_time": "1:35:49"}
9
- {"current_steps": 9, "total_steps": 350, "loss": 0.1088, "lr": 2.5714285714285714e-05, "epoch": 0.9, "percentage": 2.57, "elapsed_time": "0:02:24", "remaining_time": "1:31:24"}
10
- {"current_steps": 10, "total_steps": 350, "loss": 0.0839, "lr": 2.857142857142857e-05, "epoch": 1.0, "percentage": 2.86, "elapsed_time": "0:02:37", "remaining_time": "1:29:20"}
11
- {"current_steps": 11, "total_steps": 350, "loss": 0.0713, "lr": 3.142857142857143e-05, "epoch": 1.1, "percentage": 3.14, "elapsed_time": "0:02:50", "remaining_time": "1:27:31"}
12
- {"current_steps": 12, "total_steps": 350, "loss": 0.0688, "lr": 3.428571428571429e-05, "epoch": 1.2, "percentage": 3.43, "elapsed_time": "0:03:00", "remaining_time": "1:24:44"}
13
- {"current_steps": 13, "total_steps": 350, "loss": 0.0688, "lr": 3.7142857142857143e-05, "epoch": 1.3, "percentage": 3.71, "elapsed_time": "0:03:12", "remaining_time": "1:23:01"}
14
- {"current_steps": 14, "total_steps": 350, "loss": 0.0668, "lr": 4e-05, "epoch": 1.4, "percentage": 4.0, "elapsed_time": "0:03:26", "remaining_time": "1:22:39"}
15
- {"current_steps": 15, "total_steps": 350, "loss": 0.0613, "lr": 4.2857142857142856e-05, "epoch": 1.5, "percentage": 4.29, "elapsed_time": "0:03:40", "remaining_time": "1:22:10"}
16
- {"current_steps": 16, "total_steps": 350, "loss": 0.052, "lr": 4.5714285714285716e-05, "epoch": 1.6, "percentage": 4.57, "elapsed_time": "0:03:53", "remaining_time": "1:21:24"}
17
- {"current_steps": 17, "total_steps": 350, "loss": 0.0581, "lr": 4.8571428571428576e-05, "epoch": 1.7, "percentage": 4.86, "elapsed_time": "0:04:07", "remaining_time": "1:20:46"}
18
- {"current_steps": 18, "total_steps": 350, "loss": 0.0482, "lr": 5.142857142857143e-05, "epoch": 1.8, "percentage": 5.14, "elapsed_time": "0:04:19", "remaining_time": "1:19:48"}
19
- {"current_steps": 19, "total_steps": 350, "loss": 0.0533, "lr": 5.428571428571428e-05, "epoch": 1.9, "percentage": 5.43, "elapsed_time": "0:04:30", "remaining_time": "1:18:32"}
20
- {"current_steps": 20, "total_steps": 350, "loss": 0.0513, "lr": 5.714285714285714e-05, "epoch": 2.0, "percentage": 5.71, "elapsed_time": "0:04:35", "remaining_time": "1:15:50"}
21
- {"current_steps": 21, "total_steps": 350, "loss": 0.042, "lr": 6e-05, "epoch": 2.1, "percentage": 6.0, "elapsed_time": "0:04:51", "remaining_time": "1:16:05"}
22
- {"current_steps": 22, "total_steps": 350, "loss": 0.0401, "lr": 6.285714285714286e-05, "epoch": 2.2, "percentage": 6.29, "elapsed_time": "0:05:05", "remaining_time": "1:15:51"}
23
- {"current_steps": 23, "total_steps": 350, "loss": 0.0382, "lr": 6.571428571428571e-05, "epoch": 2.3, "percentage": 6.57, "elapsed_time": "0:05:16", "remaining_time": "1:14:57"}
24
- {"current_steps": 24, "total_steps": 350, "loss": 0.034, "lr": 6.857142857142858e-05, "epoch": 2.4, "percentage": 6.86, "elapsed_time": "0:05:30", "remaining_time": "1:14:49"}
25
- {"current_steps": 25, "total_steps": 350, "loss": 0.0437, "lr": 7.142857142857143e-05, "epoch": 2.5, "percentage": 7.14, "elapsed_time": "0:05:44", "remaining_time": "1:14:38"}
26
- {"current_steps": 25, "total_steps": 350, "eval_loss": 0.03402441740036011, "epoch": 2.5, "percentage": 7.14, "elapsed_time": "0:06:03", "remaining_time": "1:18:51"}
27
- {"current_steps": 26, "total_steps": 350, "loss": 0.0422, "lr": 7.428571428571429e-05, "epoch": 2.6, "percentage": 7.43, "elapsed_time": "0:06:16", "remaining_time": "1:18:17"}
28
- {"current_steps": 27, "total_steps": 350, "loss": 0.0264, "lr": 7.714285714285715e-05, "epoch": 2.7, "percentage": 7.71, "elapsed_time": "0:06:30", "remaining_time": "1:17:48"}
29
- {"current_steps": 28, "total_steps": 350, "loss": 0.0377, "lr": 8e-05, "epoch": 2.8, "percentage": 8.0, "elapsed_time": "0:06:41", "remaining_time": "1:16:57"}
30
- {"current_steps": 29, "total_steps": 350, "loss": 0.0289, "lr": 8.285714285714287e-05, "epoch": 2.9, "percentage": 8.29, "elapsed_time": "0:06:54", "remaining_time": "1:16:23"}
31
- {"current_steps": 30, "total_steps": 350, "loss": 0.0299, "lr": 8.571428571428571e-05, "epoch": 3.0, "percentage": 8.57, "elapsed_time": "0:06:59", "remaining_time": "1:14:35"}
32
- {"current_steps": 31, "total_steps": 350, "loss": 0.0216, "lr": 8.857142857142857e-05, "epoch": 3.1, "percentage": 8.86, "elapsed_time": "0:07:15", "remaining_time": "1:14:40"}
33
- {"current_steps": 32, "total_steps": 350, "loss": 0.0359, "lr": 9.142857142857143e-05, "epoch": 3.2, "percentage": 9.14, "elapsed_time": "0:07:26", "remaining_time": "1:13:56"}
34
- {"current_steps": 33, "total_steps": 350, "loss": 0.0233, "lr": 9.428571428571429e-05, "epoch": 3.3, "percentage": 9.43, "elapsed_time": "0:07:39", "remaining_time": "1:13:36"}
35
- {"current_steps": 34, "total_steps": 350, "loss": 0.0254, "lr": 9.714285714285715e-05, "epoch": 3.4, "percentage": 9.71, "elapsed_time": "0:07:52", "remaining_time": "1:13:14"}
36
- {"current_steps": 35, "total_steps": 350, "loss": 0.0202, "lr": 0.0001, "epoch": 3.5, "percentage": 10.0, "elapsed_time": "0:08:07", "remaining_time": "1:13:07"}
37
- {"current_steps": 36, "total_steps": 350, "loss": 0.0197, "lr": 9.999751334779716e-05, "epoch": 3.6, "percentage": 10.29, "elapsed_time": "0:08:23", "remaining_time": "1:13:13"}
38
- {"current_steps": 37, "total_steps": 350, "loss": 0.0206, "lr": 9.999005363852618e-05, "epoch": 3.7, "percentage": 10.57, "elapsed_time": "0:08:38", "remaining_time": "1:13:05"}
39
- {"current_steps": 38, "total_steps": 350, "loss": 0.0197, "lr": 9.997762161417517e-05, "epoch": 3.8, "percentage": 10.86, "elapsed_time": "0:08:51", "remaining_time": "1:12:44"}
40
- {"current_steps": 39, "total_steps": 350, "loss": 0.0178, "lr": 9.996021851130897e-05, "epoch": 3.9, "percentage": 11.14, "elapsed_time": "0:09:03", "remaining_time": "1:12:14"}
41
- {"current_steps": 40, "total_steps": 350, "loss": 0.0141, "lr": 9.993784606094612e-05, "epoch": 4.0, "percentage": 11.43, "elapsed_time": "0:09:10", "remaining_time": "1:11:03"}
42
- {"current_steps": 41, "total_steps": 350, "loss": 0.012, "lr": 9.991050648838675e-05, "epoch": 4.1, "percentage": 11.71, "elapsed_time": "0:09:24", "remaining_time": "1:10:51"}
43
- {"current_steps": 42, "total_steps": 350, "loss": 0.0124, "lr": 9.987820251299122e-05, "epoch": 4.2, "percentage": 12.0, "elapsed_time": "0:09:34", "remaining_time": "1:10:13"}
44
- {"current_steps": 43, "total_steps": 350, "loss": 0.017, "lr": 9.984093734790956e-05, "epoch": 4.3, "percentage": 12.29, "elapsed_time": "0:09:47", "remaining_time": "1:09:55"}
45
- {"current_steps": 44, "total_steps": 350, "loss": 0.0132, "lr": 9.979871469976196e-05, "epoch": 4.4, "percentage": 12.57, "elapsed_time": "0:10:01", "remaining_time": "1:09:40"}
46
- {"current_steps": 45, "total_steps": 350, "loss": 0.0169, "lr": 9.975153876827008e-05, "epoch": 4.5, "percentage": 12.86, "elapsed_time": "0:10:14", "remaining_time": "1:09:23"}
47
- {"current_steps": 46, "total_steps": 350, "loss": 0.0145, "lr": 9.969941424583926e-05, "epoch": 4.6, "percentage": 13.14, "elapsed_time": "0:10:26", "remaining_time": "1:09:01"}
48
- {"current_steps": 47, "total_steps": 350, "loss": 0.0151, "lr": 9.964234631709187e-05, "epoch": 4.7, "percentage": 13.43, "elapsed_time": "0:10:40", "remaining_time": "1:08:51"}
49
- {"current_steps": 48, "total_steps": 350, "loss": 0.011, "lr": 9.958034065835151e-05, "epoch": 4.8, "percentage": 13.71, "elapsed_time": "0:10:54", "remaining_time": "1:08:38"}
50
- {"current_steps": 49, "total_steps": 350, "loss": 0.012, "lr": 9.951340343707852e-05, "epoch": 4.9, "percentage": 14.0, "elapsed_time": "0:11:08", "remaining_time": "1:08:24"}
51
- {"current_steps": 50, "total_steps": 350, "loss": 0.0098, "lr": 9.944154131125642e-05, "epoch": 5.0, "percentage": 14.29, "elapsed_time": "0:11:13", "remaining_time": "1:07:21"}
52
- {"current_steps": 50, "total_steps": 350, "eval_loss": 0.016623547300696373, "epoch": 5.0, "percentage": 14.29, "elapsed_time": "0:11:29", "remaining_time": "1:08:54"}
53
- {"current_steps": 51, "total_steps": 350, "loss": 0.011, "lr": 9.936476142872979e-05, "epoch": 5.1, "percentage": 14.57, "elapsed_time": "0:11:46", "remaining_time": "1:09:03"}
54
- {"current_steps": 52, "total_steps": 350, "loss": 0.0082, "lr": 9.928307142649316e-05, "epoch": 5.2, "percentage": 14.86, "elapsed_time": "0:12:00", "remaining_time": "1:08:49"}
55
- {"current_steps": 53, "total_steps": 350, "loss": 0.0069, "lr": 9.919647942993148e-05, "epoch": 5.3, "percentage": 15.14, "elapsed_time": "0:12:15", "remaining_time": "1:08:40"}
56
- {"current_steps": 54, "total_steps": 350, "loss": 0.0091, "lr": 9.910499405201195e-05, "epoch": 5.4, "percentage": 15.43, "elapsed_time": "0:12:28", "remaining_time": "1:08:23"}
57
- {"current_steps": 55, "total_steps": 350, "loss": 0.0062, "lr": 9.900862439242719e-05, "epoch": 5.5, "percentage": 15.71, "elapsed_time": "0:13:06", "remaining_time": "1:10:19"}
58
- {"current_steps": 56, "total_steps": 350, "loss": 0.0052, "lr": 9.890738003669029e-05, "epoch": 5.6, "percentage": 16.0, "elapsed_time": "0:13:19", "remaining_time": "1:09:59"}
59
- {"current_steps": 57, "total_steps": 350, "loss": 0.0076, "lr": 9.880127105518122e-05, "epoch": 5.7, "percentage": 16.29, "elapsed_time": "0:13:34", "remaining_time": "1:09:44"}
60
- {"current_steps": 58, "total_steps": 350, "loss": 0.0107, "lr": 9.869030800214532e-05, "epoch": 5.8, "percentage": 16.57, "elapsed_time": "0:13:44", "remaining_time": "1:09:11"}
61
- {"current_steps": 59, "total_steps": 350, "loss": 0.0081, "lr": 9.857450191464337e-05, "epoch": 5.9, "percentage": 16.86, "elapsed_time": "0:13:56", "remaining_time": "1:08:46"}
62
- {"current_steps": 60, "total_steps": 350, "loss": 0.0063, "lr": 9.84538643114539e-05, "epoch": 6.0, "percentage": 17.14, "elapsed_time": "0:14:01", "remaining_time": "1:07:49"}
63
- {"current_steps": 61, "total_steps": 350, "loss": 0.0037, "lr": 9.832840719192736e-05, "epoch": 6.1, "percentage": 17.43, "elapsed_time": "0:14:17", "remaining_time": "1:07:42"}
64
- {"current_steps": 62, "total_steps": 350, "loss": 0.0049, "lr": 9.819814303479267e-05, "epoch": 6.2, "percentage": 17.71, "elapsed_time": "0:14:31", "remaining_time": "1:07:28"}
65
- {"current_steps": 63, "total_steps": 350, "loss": 0.0051, "lr": 9.806308479691595e-05, "epoch": 6.3, "percentage": 18.0, "elapsed_time": "0:14:45", "remaining_time": "1:07:15"}
66
- {"current_steps": 64, "total_steps": 350, "loss": 0.0052, "lr": 9.792324591201179e-05, "epoch": 6.4, "percentage": 18.29, "elapsed_time": "0:15:02", "remaining_time": "1:07:14"}
67
- {"current_steps": 65, "total_steps": 350, "loss": 0.0046, "lr": 9.777864028930705e-05, "epoch": 6.5, "percentage": 18.57, "elapsed_time": "0:15:18", "remaining_time": "1:07:06"}
68
- {"current_steps": 66, "total_steps": 350, "loss": 0.0064, "lr": 9.76292823121573e-05, "epoch": 6.6, "percentage": 18.86, "elapsed_time": "0:15:30", "remaining_time": "1:06:45"}
69
- {"current_steps": 67, "total_steps": 350, "loss": 0.0044, "lr": 9.747518683661631e-05, "epoch": 6.7, "percentage": 19.14, "elapsed_time": "0:15:41", "remaining_time": "1:06:15"}
70
- {"current_steps": 68, "total_steps": 350, "loss": 0.0064, "lr": 9.731636918995821e-05, "epoch": 6.8, "percentage": 19.43, "elapsed_time": "0:15:51", "remaining_time": "1:05:46"}
71
- {"current_steps": 69, "total_steps": 350, "loss": 0.0045, "lr": 9.715284516915303e-05, "epoch": 6.9, "percentage": 19.71, "elapsed_time": "0:16:01", "remaining_time": "1:05:17"}
72
- {"current_steps": 70, "total_steps": 350, "loss": 0.0067, "lr": 9.698463103929542e-05, "epoch": 7.0, "percentage": 20.0, "elapsed_time": "0:16:06", "remaining_time": "1:04:26"}
73
- {"current_steps": 71, "total_steps": 350, "loss": 0.0037, "lr": 9.681174353198687e-05, "epoch": 7.1, "percentage": 20.29, "elapsed_time": "0:16:17", "remaining_time": "1:03:59"}
74
- {"current_steps": 72, "total_steps": 350, "loss": 0.0027, "lr": 9.663419984367139e-05, "epoch": 7.2, "percentage": 20.57, "elapsed_time": "0:16:29", "remaining_time": "1:03:41"}
75
- {"current_steps": 73, "total_steps": 350, "loss": 0.0046, "lr": 9.645201763392513e-05, "epoch": 7.3, "percentage": 20.86, "elapsed_time": "0:16:40", "remaining_time": "1:03:16"}
76
- {"current_steps": 74, "total_steps": 350, "loss": 0.0054, "lr": 9.626521502369984e-05, "epoch": 7.4, "percentage": 21.14, "elapsed_time": "0:16:50", "remaining_time": "1:02:49"}
77
- {"current_steps": 75, "total_steps": 350, "loss": 0.0039, "lr": 9.607381059352038e-05, "epoch": 7.5, "percentage": 21.43, "elapsed_time": "0:17:01", "remaining_time": "1:02:24"}
78
- {"current_steps": 75, "total_steps": 350, "eval_loss": 0.016471313312649727, "epoch": 7.5, "percentage": 21.43, "elapsed_time": "0:17:08", "remaining_time": "1:02:49"}
79
- {"current_steps": 76, "total_steps": 350, "loss": 0.0035, "lr": 9.587782338163669e-05, "epoch": 7.6, "percentage": 21.71, "elapsed_time": "0:17:18", "remaining_time": "1:02:23"}
80
- {"current_steps": 77, "total_steps": 350, "loss": 0.0047, "lr": 9.567727288213005e-05, "epoch": 7.7, "percentage": 22.0, "elapsed_time": "0:17:28", "remaining_time": "1:01:58"}
81
- {"current_steps": 78, "total_steps": 350, "loss": 0.0028, "lr": 9.547217904297411e-05, "epoch": 7.8, "percentage": 22.29, "elapsed_time": "0:17:39", "remaining_time": "1:01:33"}
82
- {"current_steps": 79, "total_steps": 350, "loss": 0.0054, "lr": 9.526256226405075e-05, "epoch": 7.9, "percentage": 22.57, "elapsed_time": "0:17:50", "remaining_time": "1:01:10"}
83
- {"current_steps": 80, "total_steps": 350, "loss": 0.0025, "lr": 9.504844339512095e-05, "epoch": 8.0, "percentage": 22.86, "elapsed_time": "0:17:54", "remaining_time": "1:00:26"}
84
- {"current_steps": 81, "total_steps": 350, "loss": 0.0037, "lr": 9.482984373375105e-05, "epoch": 8.1, "percentage": 23.14, "elapsed_time": "0:18:05", "remaining_time": "1:00:04"}
85
- {"current_steps": 82, "total_steps": 350, "loss": 0.0026, "lr": 9.460678502319418e-05, "epoch": 8.2, "percentage": 23.43, "elapsed_time": "0:18:16", "remaining_time": "0:59:42"}
86
- {"current_steps": 83, "total_steps": 350, "loss": 0.0049, "lr": 9.437928945022771e-05, "epoch": 8.3, "percentage": 23.71, "elapsed_time": "0:18:26", "remaining_time": "0:59:20"}
87
- {"current_steps": 84, "total_steps": 350, "loss": 0.0037, "lr": 9.414737964294636e-05, "epoch": 8.4, "percentage": 24.0, "elapsed_time": "0:18:37", "remaining_time": "0:58:58"}
88
- {"current_steps": 85, "total_steps": 350, "loss": 0.0025, "lr": 9.391107866851143e-05, "epoch": 8.5, "percentage": 24.29, "elapsed_time": "0:18:48", "remaining_time": "0:58:36"}
89
- {"current_steps": 86, "total_steps": 350, "loss": 0.0032, "lr": 9.367041003085649e-05, "epoch": 8.6, "percentage": 24.57, "elapsed_time": "0:18:58", "remaining_time": "0:58:14"}
90
- {"current_steps": 87, "total_steps": 350, "loss": 0.0028, "lr": 9.342539766834946e-05, "epoch": 8.7, "percentage": 24.86, "elapsed_time": "0:19:09", "remaining_time": "0:57:53"}
91
- {"current_steps": 88, "total_steps": 350, "loss": 0.0027, "lr": 9.317606595141154e-05, "epoch": 8.8, "percentage": 25.14, "elapsed_time": "0:19:19", "remaining_time": "0:57:32"}
92
- {"current_steps": 89, "total_steps": 350, "loss": 0.0029, "lr": 9.292243968009331e-05, "epoch": 8.9, "percentage": 25.43, "elapsed_time": "0:19:30", "remaining_time": "0:57:12"}
93
- {"current_steps": 90, "total_steps": 350, "loss": 0.0034, "lr": 9.266454408160779e-05, "epoch": 9.0, "percentage": 25.71, "elapsed_time": "0:19:35", "remaining_time": "0:56:34"}
94
- {"current_steps": 91, "total_steps": 350, "loss": 0.0023, "lr": 9.24024048078213e-05, "epoch": 9.1, "percentage": 26.0, "elapsed_time": "0:19:46", "remaining_time": "0:56:15"}
95
- {"current_steps": 92, "total_steps": 350, "loss": 0.0024, "lr": 9.213604793270196e-05, "epoch": 9.2, "percentage": 26.29, "elapsed_time": "0:19:57", "remaining_time": "0:55:56"}
96
- {"current_steps": 93, "total_steps": 350, "loss": 0.0031, "lr": 9.186549994972618e-05, "epoch": 9.3, "percentage": 26.57, "elapsed_time": "0:20:07", "remaining_time": "0:55:37"}
97
- {"current_steps": 94, "total_steps": 350, "loss": 0.0029, "lr": 9.159078776924346e-05, "epoch": 9.4, "percentage": 26.86, "elapsed_time": "0:20:18", "remaining_time": "0:55:17"}
98
- {"current_steps": 95, "total_steps": 350, "loss": 0.0017, "lr": 9.131193871579975e-05, "epoch": 9.5, "percentage": 27.14, "elapsed_time": "0:20:28", "remaining_time": "0:54:57"}
99
- {"current_steps": 96, "total_steps": 350, "loss": 0.0022, "lr": 9.102898052541958e-05, "epoch": 9.6, "percentage": 27.43, "elapsed_time": "0:20:39", "remaining_time": "0:54:38"}
100
- {"current_steps": 97, "total_steps": 350, "loss": 0.0025, "lr": 9.074194134284726e-05, "epoch": 9.7, "percentage": 27.71, "elapsed_time": "0:20:49", "remaining_time": "0:54:19"}
101
- {"current_steps": 98, "total_steps": 350, "loss": 0.002, "lr": 9.045084971874738e-05, "epoch": 9.8, "percentage": 28.0, "elapsed_time": "0:21:00", "remaining_time": "0:54:00"}
102
- {"current_steps": 99, "total_steps": 350, "loss": 0.0026, "lr": 9.015573460686509e-05, "epoch": 9.9, "percentage": 28.29, "elapsed_time": "0:21:10", "remaining_time": "0:53:41"}
103
- {"current_steps": 100, "total_steps": 350, "loss": 0.0021, "lr": 8.985662536114613e-05, "epoch": 10.0, "percentage": 28.57, "elapsed_time": "0:21:15", "remaining_time": "0:53:08"}
104
- {"current_steps": 100, "total_steps": 350, "eval_loss": 0.017384279519319534, "epoch": 10.0, "percentage": 28.57, "elapsed_time": "0:21:22", "remaining_time": "0:53:26"}
105
- {"current_steps": 101, "total_steps": 350, "loss": 0.0017, "lr": 8.955355173281708e-05, "epoch": 10.1, "percentage": 28.86, "elapsed_time": "0:21:51", "remaining_time": "0:53:54"}
106
- {"current_steps": 102, "total_steps": 350, "loss": 0.0018, "lr": 8.924654386742613e-05, "epoch": 10.2, "percentage": 29.14, "elapsed_time": "0:22:02", "remaining_time": "0:53:34"}
107
- {"current_steps": 103, "total_steps": 350, "loss": 0.0025, "lr": 8.89356323018447e-05, "epoch": 10.3, "percentage": 29.43, "elapsed_time": "0:22:12", "remaining_time": "0:53:15"}
108
- {"current_steps": 104, "total_steps": 350, "loss": 0.0017, "lr": 8.862084796122998e-05, "epoch": 10.4, "percentage": 29.71, "elapsed_time": "0:22:22", "remaining_time": "0:52:56"}
109
- {"current_steps": 105, "total_steps": 350, "loss": 0.0029, "lr": 8.83022221559489e-05, "epoch": 10.5, "percentage": 30.0, "elapsed_time": "0:22:33", "remaining_time": "0:52:37"}
110
- {"current_steps": 106, "total_steps": 350, "loss": 0.0022, "lr": 8.797978657846391e-05, "epoch": 10.6, "percentage": 30.29, "elapsed_time": "0:22:43", "remaining_time": "0:52:18"}
111
- {"current_steps": 107, "total_steps": 350, "loss": 0.0024, "lr": 8.765357330018056e-05, "epoch": 10.7, "percentage": 30.57, "elapsed_time": "0:22:54", "remaining_time": "0:52:00"}
112
- {"current_steps": 108, "total_steps": 350, "loss": 0.0028, "lr": 8.732361476825752e-05, "epoch": 10.8, "percentage": 30.86, "elapsed_time": "0:23:04", "remaining_time": "0:51:42"}
113
- {"current_steps": 109, "total_steps": 350, "loss": 0.0018, "lr": 8.69899438023792e-05, "epoch": 10.9, "percentage": 31.14, "elapsed_time": "0:23:15", "remaining_time": "0:51:24"}
114
- {"current_steps": 110, "total_steps": 350, "loss": 0.0018, "lr": 8.665259359149132e-05, "epoch": 11.0, "percentage": 31.43, "elapsed_time": "0:23:19", "remaining_time": "0:50:53"}
115
- {"current_steps": 111, "total_steps": 350, "loss": 0.0028, "lr": 8.631159769049965e-05, "epoch": 11.1, "percentage": 31.71, "elapsed_time": "0:23:30", "remaining_time": "0:50:36"}
116
- {"current_steps": 112, "total_steps": 350, "loss": 0.0018, "lr": 8.596699001693255e-05, "epoch": 11.2, "percentage": 32.0, "elapsed_time": "0:23:40", "remaining_time": "0:50:18"}
117
- {"current_steps": 113, "total_steps": 350, "loss": 0.0018, "lr": 8.561880484756725e-05, "epoch": 11.3, "percentage": 32.29, "elapsed_time": "0:23:50", "remaining_time": "0:50:00"}
118
- {"current_steps": 114, "total_steps": 350, "loss": 0.0036, "lr": 8.526707681502044e-05, "epoch": 11.4, "percentage": 32.57, "elapsed_time": "0:24:01", "remaining_time": "0:49:43"}
119
- {"current_steps": 115, "total_steps": 350, "loss": 0.0019, "lr": 8.491184090430364e-05, "epoch": 11.5, "percentage": 32.86, "elapsed_time": "0:24:12", "remaining_time": "0:49:27"}
120
- {"current_steps": 116, "total_steps": 350, "loss": 0.0024, "lr": 8.455313244934324e-05, "epoch": 11.6, "percentage": 33.14, "elapsed_time": "0:24:22", "remaining_time": "0:49:10"}
121
- {"current_steps": 117, "total_steps": 350, "loss": 0.0033, "lr": 8.419098712946601e-05, "epoch": 11.7, "percentage": 33.43, "elapsed_time": "0:24:32", "remaining_time": "0:48:53"}
122
- {"current_steps": 118, "total_steps": 350, "loss": 0.0032, "lr": 8.382544096585027e-05, "epoch": 11.8, "percentage": 33.71, "elapsed_time": "0:24:43", "remaining_time": "0:48:36"}
123
- {"current_steps": 119, "total_steps": 350, "loss": 0.0032, "lr": 8.345653031794292e-05, "epoch": 11.9, "percentage": 34.0, "elapsed_time": "0:24:53", "remaining_time": "0:48:19"}
124
- {"current_steps": 120, "total_steps": 350, "loss": 0.0044, "lr": 8.308429187984297e-05, "epoch": 12.0, "percentage": 34.29, "elapsed_time": "0:24:58", "remaining_time": "0:47:51"}
125
- {"current_steps": 121, "total_steps": 350, "loss": 0.0025, "lr": 8.270876267665173e-05, "epoch": 12.1, "percentage": 34.57, "elapsed_time": "0:25:08", "remaining_time": "0:47:35"}
126
- {"current_steps": 122, "total_steps": 350, "loss": 0.002, "lr": 8.232998006078997e-05, "epoch": 12.2, "percentage": 34.86, "elapsed_time": "0:25:19", "remaining_time": "0:47:19"}
127
- {"current_steps": 123, "total_steps": 350, "loss": 0.0023, "lr": 8.19479817082828e-05, "epoch": 12.3, "percentage": 35.14, "elapsed_time": "0:25:29", "remaining_time": "0:47:03"}
128
- {"current_steps": 124, "total_steps": 350, "loss": 0.0025, "lr": 8.156280561501195e-05, "epoch": 12.4, "percentage": 35.43, "elapsed_time": "0:25:40", "remaining_time": "0:46:47"}
129
- {"current_steps": 125, "total_steps": 350, "loss": 0.0031, "lr": 8.117449009293668e-05, "epoch": 12.5, "percentage": 35.71, "elapsed_time": "0:25:50", "remaining_time": "0:46:30"}
130
- {"current_steps": 125, "total_steps": 350, "eval_loss": 0.014472348615527153, "epoch": 12.5, "percentage": 35.71, "elapsed_time": "0:25:57", "remaining_time": "0:46:43"}
131
- {"current_steps": 126, "total_steps": 350, "loss": 0.0023, "lr": 8.07830737662829e-05, "epoch": 12.6, "percentage": 36.0, "elapsed_time": "0:26:07", "remaining_time": "0:46:27"}
132
- {"current_steps": 127, "total_steps": 350, "loss": 0.0027, "lr": 8.038859556770151e-05, "epoch": 12.7, "percentage": 36.29, "elapsed_time": "0:26:18", "remaining_time": "0:46:11"}
133
- {"current_steps": 128, "total_steps": 350, "loss": 0.0019, "lr": 7.999109473439569e-05, "epoch": 12.8, "percentage": 36.57, "elapsed_time": "0:26:28", "remaining_time": "0:45:55"}
134
- {"current_steps": 129, "total_steps": 350, "loss": 0.0026, "lr": 7.959061080421839e-05, "epoch": 12.9, "percentage": 36.86, "elapsed_time": "0:26:39", "remaining_time": "0:45:39"}
135
- {"current_steps": 130, "total_steps": 350, "loss": 0.0016, "lr": 7.91871836117395e-05, "epoch": 13.0, "percentage": 37.14, "elapsed_time": "0:26:43", "remaining_time": "0:45:14"}
136
- {"current_steps": 131, "total_steps": 350, "loss": 0.0018, "lr": 7.878085328428369e-05, "epoch": 13.1, "percentage": 37.43, "elapsed_time": "0:26:54", "remaining_time": "0:44:58"}
137
- {"current_steps": 132, "total_steps": 350, "loss": 0.0037, "lr": 7.83716602379391e-05, "epoch": 13.2, "percentage": 37.71, "elapsed_time": "0:27:04", "remaining_time": "0:44:43"}
138
- {"current_steps": 133, "total_steps": 350, "loss": 0.0021, "lr": 7.795964517353735e-05, "epoch": 13.3, "percentage": 38.0, "elapsed_time": "0:27:14", "remaining_time": "0:44:27"}
139
- {"current_steps": 134, "total_steps": 350, "loss": 0.0023, "lr": 7.754484907260513e-05, "epoch": 13.4, "percentage": 38.29, "elapsed_time": "0:27:25", "remaining_time": "0:44:11"}
140
- {"current_steps": 135, "total_steps": 350, "loss": 0.0022, "lr": 7.712731319328798e-05, "epoch": 13.5, "percentage": 38.57, "elapsed_time": "0:27:35", "remaining_time": "0:43:56"}
141
- {"current_steps": 136, "total_steps": 350, "loss": 0.0016, "lr": 7.670707906624644e-05, "epoch": 13.6, "percentage": 38.86, "elapsed_time": "0:27:45", "remaining_time": "0:43:41"}
142
- {"current_steps": 137, "total_steps": 350, "loss": 0.0024, "lr": 7.628418849052523e-05, "epoch": 13.7, "percentage": 39.14, "elapsed_time": "0:27:56", "remaining_time": "0:43:26"}
143
- {"current_steps": 138, "total_steps": 350, "loss": 0.0016, "lr": 7.585868352939563e-05, "epoch": 13.8, "percentage": 39.43, "elapsed_time": "0:28:06", "remaining_time": "0:43:10"}
144
- {"current_steps": 139, "total_steps": 350, "loss": 0.0018, "lr": 7.543060650617158e-05, "epoch": 13.9, "percentage": 39.71, "elapsed_time": "0:28:16", "remaining_time": "0:42:55"}
145
- {"current_steps": 140, "total_steps": 350, "loss": 0.0025, "lr": 7.500000000000001e-05, "epoch": 14.0, "percentage": 40.0, "elapsed_time": "0:28:21", "remaining_time": "0:42:32"}
146
- {"current_steps": 141, "total_steps": 350, "loss": 0.0026, "lr": 7.456690684162557e-05, "epoch": 14.1, "percentage": 40.29, "elapsed_time": "0:28:32", "remaining_time": "0:42:17"}
147
- {"current_steps": 142, "total_steps": 350, "loss": 0.0019, "lr": 7.413137010913054e-05, "epoch": 14.2, "percentage": 40.57, "elapsed_time": "0:28:42", "remaining_time": "0:42:02"}
148
- {"current_steps": 143, "total_steps": 350, "loss": 0.002, "lr": 7.369343312364993e-05, "epoch": 14.3, "percentage": 40.86, "elapsed_time": "0:28:52", "remaining_time": "0:41:48"}
149
- {"current_steps": 144, "total_steps": 350, "loss": 0.0015, "lr": 7.325313944506254e-05, "epoch": 14.4, "percentage": 41.14, "elapsed_time": "0:29:03", "remaining_time": "0:41:33"}
150
- {"current_steps": 145, "total_steps": 350, "loss": 0.0018, "lr": 7.281053286765815e-05, "epoch": 14.5, "percentage": 41.43, "elapsed_time": "0:29:13", "remaining_time": "0:41:18"}
151
- {"current_steps": 146, "total_steps": 350, "loss": 0.0024, "lr": 7.236565741578163e-05, "epoch": 14.6, "percentage": 41.71, "elapsed_time": "0:29:24", "remaining_time": "0:41:04"}
152
- {"current_steps": 147, "total_steps": 350, "loss": 0.0049, "lr": 7.191855733945387e-05, "epoch": 14.7, "percentage": 42.0, "elapsed_time": "0:29:34", "remaining_time": "0:40:50"}
153
- {"current_steps": 148, "total_steps": 350, "loss": 0.0024, "lr": 7.146927710997047e-05, "epoch": 14.8, "percentage": 42.29, "elapsed_time": "0:29:44", "remaining_time": "0:40:36"}
154
- {"current_steps": 149, "total_steps": 350, "loss": 0.0018, "lr": 7.101786141547828e-05, "epoch": 14.9, "percentage": 42.57, "elapsed_time": "0:29:55", "remaining_time": "0:40:21"}
155
- {"current_steps": 150, "total_steps": 350, "loss": 0.0022, "lr": 7.056435515653059e-05, "epoch": 15.0, "percentage": 42.86, "elapsed_time": "0:29:59", "remaining_time": "0:39:59"}
156
- {"current_steps": 150, "total_steps": 350, "eval_loss": 0.023497436195611954, "epoch": 15.0, "percentage": 42.86, "elapsed_time": "0:30:06", "remaining_time": "0:40:09"}
157
- {"current_steps": 151, "total_steps": 350, "loss": 0.0015, "lr": 7.010880344162088e-05, "epoch": 15.1, "percentage": 43.14, "elapsed_time": "0:30:17", "remaining_time": "0:39:54"}
158
- {"current_steps": 152, "total_steps": 350, "loss": 0.0022, "lr": 6.965125158269619e-05, "epoch": 15.2, "percentage": 43.43, "elapsed_time": "0:30:27", "remaining_time": "0:39:40"}
159
- {"current_steps": 153, "total_steps": 350, "loss": 0.0029, "lr": 6.919174509065004e-05, "epoch": 15.3, "percentage": 43.71, "elapsed_time": "0:30:37", "remaining_time": "0:39:26"}
160
- {"current_steps": 154, "total_steps": 350, "loss": 0.0022, "lr": 6.873032967079561e-05, "epoch": 15.4, "percentage": 44.0, "elapsed_time": "0:30:47", "remaining_time": "0:39:11"}
161
- {"current_steps": 155, "total_steps": 350, "loss": 0.0033, "lr": 6.826705121831976e-05, "epoch": 15.5, "percentage": 44.29, "elapsed_time": "0:30:57", "remaining_time": "0:38:57"}
162
- {"current_steps": 156, "total_steps": 350, "loss": 0.0022, "lr": 6.780195581371784e-05, "epoch": 15.6, "percentage": 44.57, "elapsed_time": "0:31:08", "remaining_time": "0:38:43"}
163
- {"current_steps": 157, "total_steps": 350, "loss": 0.0021, "lr": 6.733508971821036e-05, "epoch": 15.7, "percentage": 44.86, "elapsed_time": "0:31:19", "remaining_time": "0:38:30"}
164
- {"current_steps": 158, "total_steps": 350, "loss": 0.0019, "lr": 6.686649936914152e-05, "epoch": 15.8, "percentage": 45.14, "elapsed_time": "0:31:29", "remaining_time": "0:38:16"}
165
- {"current_steps": 159, "total_steps": 350, "loss": 0.002, "lr": 6.639623137536023e-05, "epoch": 15.9, "percentage": 45.43, "elapsed_time": "0:31:39", "remaining_time": "0:38:02"}
166
- {"current_steps": 160, "total_steps": 350, "loss": 0.0014, "lr": 6.592433251258423e-05, "epoch": 16.0, "percentage": 45.71, "elapsed_time": "0:31:44", "remaining_time": "0:37:41"}
167
- {"current_steps": 161, "total_steps": 350, "loss": 0.0017, "lr": 6.545084971874738e-05, "epoch": 16.1, "percentage": 46.0, "elapsed_time": "0:31:54", "remaining_time": "0:37:27"}
168
- {"current_steps": 162, "total_steps": 350, "loss": 0.0014, "lr": 6.497583008933097e-05, "epoch": 16.2, "percentage": 46.29, "elapsed_time": "0:32:05", "remaining_time": "0:37:13"}
169
- {"current_steps": 163, "total_steps": 350, "loss": 0.0016, "lr": 6.449932087267932e-05, "epoch": 16.3, "percentage": 46.57, "elapsed_time": "0:32:15", "remaining_time": "0:37:00"}
170
- {"current_steps": 164, "total_steps": 350, "loss": 0.0013, "lr": 6.402136946530014e-05, "epoch": 16.4, "percentage": 46.86, "elapsed_time": "0:32:26", "remaining_time": "0:36:47"}
171
- {"current_steps": 165, "total_steps": 350, "loss": 0.0016, "lr": 6.354202340715026e-05, "epoch": 16.5, "percentage": 47.14, "elapsed_time": "0:32:36", "remaining_time": "0:36:33"}
172
- {"current_steps": 166, "total_steps": 350, "loss": 0.0019, "lr": 6.306133037690693e-05, "epoch": 16.6, "percentage": 47.43, "elapsed_time": "0:32:46", "remaining_time": "0:36:20"}
173
- {"current_steps": 167, "total_steps": 350, "loss": 0.0016, "lr": 6.257933818722543e-05, "epoch": 16.7, "percentage": 47.71, "elapsed_time": "0:32:56", "remaining_time": "0:36:06"}
174
- {"current_steps": 168, "total_steps": 350, "loss": 0.0025, "lr": 6.209609477998338e-05, "epoch": 16.8, "percentage": 48.0, "elapsed_time": "0:33:07", "remaining_time": "0:35:52"}
175
- {"current_steps": 169, "total_steps": 350, "loss": 0.0018, "lr": 6.161164822151213e-05, "epoch": 16.9, "percentage": 48.29, "elapsed_time": "0:33:17", "remaining_time": "0:35:39"}
176
- {"current_steps": 170, "total_steps": 350, "loss": 0.0017, "lr": 6.112604669781572e-05, "epoch": 17.0, "percentage": 48.57, "elapsed_time": "0:33:22", "remaining_time": "0:35:19"}
177
- {"current_steps": 171, "total_steps": 350, "loss": 0.0012, "lr": 6.063933850977811e-05, "epoch": 17.1, "percentage": 48.86, "elapsed_time": "0:33:32", "remaining_time": "0:35:06"}
178
- {"current_steps": 172, "total_steps": 350, "loss": 0.0014, "lr": 6.015157206835881e-05, "epoch": 17.2, "percentage": 49.14, "elapsed_time": "0:33:43", "remaining_time": "0:34:53"}
179
- {"current_steps": 173, "total_steps": 350, "loss": 0.0013, "lr": 5.9662795889777666e-05, "epoch": 17.3, "percentage": 49.43, "elapsed_time": "0:33:53", "remaining_time": "0:34:40"}
180
- {"current_steps": 174, "total_steps": 350, "loss": 0.0013, "lr": 5.917305859068912e-05, "epoch": 17.4, "percentage": 49.71, "elapsed_time": "0:34:03", "remaining_time": "0:34:27"}
181
- {"current_steps": 175, "total_steps": 350, "loss": 0.0013, "lr": 5.868240888334653e-05, "epoch": 17.5, "percentage": 50.0, "elapsed_time": "0:34:14", "remaining_time": "0:34:14"}
182
- {"current_steps": 175, "total_steps": 350, "eval_loss": 0.01870564930140972, "epoch": 17.5, "percentage": 50.0, "elapsed_time": "0:34:21", "remaining_time": "0:34:21"}
183
- {"current_steps": 176, "total_steps": 350, "loss": 0.0018, "lr": 5.819089557075689e-05, "epoch": 17.6, "percentage": 50.29, "elapsed_time": "0:34:31", "remaining_time": "0:34:08"}
184
- {"current_steps": 177, "total_steps": 350, "loss": 0.0014, "lr": 5.7698567541826675e-05, "epoch": 17.7, "percentage": 50.57, "elapsed_time": "0:34:42", "remaining_time": "0:33:55"}
185
- {"current_steps": 178, "total_steps": 350, "loss": 0.0025, "lr": 5.7205473766499005e-05, "epoch": 17.8, "percentage": 50.86, "elapsed_time": "0:34:52", "remaining_time": "0:33:42"}
186
- {"current_steps": 179, "total_steps": 350, "loss": 0.0024, "lr": 5.6711663290882776e-05, "epoch": 17.9, "percentage": 51.14, "elapsed_time": "0:35:02", "remaining_time": "0:33:28"}
187
- {"current_steps": 180, "total_steps": 350, "loss": 0.0014, "lr": 5.621718523237427e-05, "epoch": 18.0, "percentage": 51.43, "elapsed_time": "0:35:07", "remaining_time": "0:33:10"}
188
- {"current_steps": 181, "total_steps": 350, "loss": 0.0016, "lr": 5.57220887747716e-05, "epoch": 18.1, "percentage": 51.71, "elapsed_time": "0:35:17", "remaining_time": "0:32:57"}
189
- {"current_steps": 182, "total_steps": 350, "loss": 0.0013, "lr": 5.522642316338268e-05, "epoch": 18.2, "percentage": 52.0, "elapsed_time": "0:35:28", "remaining_time": "0:32:45"}
190
- {"current_steps": 183, "total_steps": 350, "loss": 0.0017, "lr": 5.473023770012686e-05, "epoch": 18.3, "percentage": 52.29, "elapsed_time": "0:35:39", "remaining_time": "0:32:32"}
191
- {"current_steps": 184, "total_steps": 350, "loss": 0.0013, "lr": 5.4233581738631165e-05, "epoch": 18.4, "percentage": 52.57, "elapsed_time": "0:35:49", "remaining_time": "0:32:19"}
192
- {"current_steps": 185, "total_steps": 350, "loss": 0.0016, "lr": 5.373650467932122e-05, "epoch": 18.5, "percentage": 52.86, "elapsed_time": "0:35:59", "remaining_time": "0:32:06"}
193
- {"current_steps": 186, "total_steps": 350, "loss": 0.0015, "lr": 5.323905596450759e-05, "epoch": 18.6, "percentage": 53.14, "elapsed_time": "0:36:09", "remaining_time": "0:31:53"}
194
- {"current_steps": 187, "total_steps": 350, "loss": 0.0013, "lr": 5.274128507346801e-05, "epoch": 18.7, "percentage": 53.43, "elapsed_time": "0:36:20", "remaining_time": "0:31:40"}
195
- {"current_steps": 188, "total_steps": 350, "loss": 0.0013, "lr": 5.2243241517525754e-05, "epoch": 18.8, "percentage": 53.71, "elapsed_time": "0:36:30", "remaining_time": "0:31:27"}
196
- {"current_steps": 189, "total_steps": 350, "loss": 0.0013, "lr": 5.174497483512506e-05, "epoch": 18.9, "percentage": 54.0, "elapsed_time": "0:36:41", "remaining_time": "0:31:14"}
197
- {"current_steps": 190, "total_steps": 350, "loss": 0.0013, "lr": 5.124653458690365e-05, "epoch": 19.0, "percentage": 54.29, "elapsed_time": "0:36:45", "remaining_time": "0:30:57"}
198
- {"current_steps": 191, "total_steps": 350, "loss": 0.0013, "lr": 5.074797035076319e-05, "epoch": 19.1, "percentage": 54.57, "elapsed_time": "0:36:56", "remaining_time": "0:30:45"}
199
- {"current_steps": 192, "total_steps": 350, "loss": 0.0013, "lr": 5.024933171693791e-05, "epoch": 19.2, "percentage": 54.86, "elapsed_time": "0:37:07", "remaining_time": "0:30:32"}
200
- {"current_steps": 193, "total_steps": 350, "loss": 0.0012, "lr": 4.9750668283062104e-05, "epoch": 19.3, "percentage": 55.14, "elapsed_time": "0:37:17", "remaining_time": "0:30:20"}
201
- {"current_steps": 194, "total_steps": 350, "loss": 0.0013, "lr": 4.925202964923683e-05, "epoch": 19.4, "percentage": 55.43, "elapsed_time": "0:37:27", "remaining_time": "0:30:07"}
202
- {"current_steps": 195, "total_steps": 350, "loss": 0.0017, "lr": 4.875346541309637e-05, "epoch": 19.5, "percentage": 55.71, "elapsed_time": "0:37:38", "remaining_time": "0:29:54"}
203
- {"current_steps": 196, "total_steps": 350, "loss": 0.0013, "lr": 4.825502516487497e-05, "epoch": 19.6, "percentage": 56.0, "elapsed_time": "0:37:48", "remaining_time": "0:29:42"}
204
- {"current_steps": 197, "total_steps": 350, "loss": 0.0013, "lr": 4.775675848247427e-05, "epoch": 19.7, "percentage": 56.29, "elapsed_time": "0:37:58", "remaining_time": "0:29:29"}
205
- {"current_steps": 198, "total_steps": 350, "loss": 0.0013, "lr": 4.725871492653199e-05, "epoch": 19.8, "percentage": 56.57, "elapsed_time": "0:38:08", "remaining_time": "0:29:17"}
206
- {"current_steps": 199, "total_steps": 350, "loss": 0.0014, "lr": 4.6760944035492404e-05, "epoch": 19.9, "percentage": 56.86, "elapsed_time": "0:38:19", "remaining_time": "0:29:04"}
207
- {"current_steps": 200, "total_steps": 350, "loss": 0.0012, "lr": 4.626349532067879e-05, "epoch": 20.0, "percentage": 57.14, "elapsed_time": "0:38:23", "remaining_time": "0:28:47"}
208
- {"current_steps": 200, "total_steps": 350, "eval_loss": 0.02464105747640133, "epoch": 20.0, "percentage": 57.14, "elapsed_time": "0:38:30", "remaining_time": "0:28:53"}
209
- {"current_steps": 201, "total_steps": 350, "loss": 0.0013, "lr": 4.576641826136884e-05, "epoch": 20.1, "percentage": 57.43, "elapsed_time": "0:38:47", "remaining_time": "0:28:45"}
210
- {"current_steps": 202, "total_steps": 350, "loss": 0.0015, "lr": 4.526976229987315e-05, "epoch": 20.2, "percentage": 57.71, "elapsed_time": "0:38:58", "remaining_time": "0:28:33"}
211
- {"current_steps": 203, "total_steps": 350, "loss": 0.0013, "lr": 4.477357683661734e-05, "epoch": 20.3, "percentage": 58.0, "elapsed_time": "0:39:08", "remaining_time": "0:28:20"}
212
- {"current_steps": 204, "total_steps": 350, "loss": 0.0014, "lr": 4.4277911225228414e-05, "epoch": 20.4, "percentage": 58.29, "elapsed_time": "0:39:18", "remaining_time": "0:28:08"}
213
- {"current_steps": 205, "total_steps": 350, "loss": 0.0013, "lr": 4.378281476762576e-05, "epoch": 20.5, "percentage": 58.57, "elapsed_time": "0:39:29", "remaining_time": "0:27:55"}
214
- {"current_steps": 206, "total_steps": 350, "loss": 0.0012, "lr": 4.328833670911724e-05, "epoch": 20.6, "percentage": 58.86, "elapsed_time": "0:39:39", "remaining_time": "0:27:43"}
215
- {"current_steps": 207, "total_steps": 350, "loss": 0.0012, "lr": 4.2794526233501006e-05, "epoch": 20.7, "percentage": 59.14, "elapsed_time": "0:39:49", "remaining_time": "0:27:30"}
216
- {"current_steps": 208, "total_steps": 350, "loss": 0.0012, "lr": 4.230143245817332e-05, "epoch": 20.8, "percentage": 59.43, "elapsed_time": "0:40:00", "remaining_time": "0:27:18"}
217
- {"current_steps": 209, "total_steps": 350, "loss": 0.0013, "lr": 4.180910442924312e-05, "epoch": 20.9, "percentage": 59.71, "elapsed_time": "0:40:10", "remaining_time": "0:27:06"}
218
- {"current_steps": 210, "total_steps": 350, "loss": 0.0012, "lr": 4.131759111665349e-05, "epoch": 21.0, "percentage": 60.0, "elapsed_time": "0:40:15", "remaining_time": "0:26:50"}
219
- {"current_steps": 211, "total_steps": 350, "loss": 0.0013, "lr": 4.082694140931089e-05, "epoch": 21.1, "percentage": 60.29, "elapsed_time": "0:40:25", "remaining_time": "0:26:37"}
220
- {"current_steps": 212, "total_steps": 350, "loss": 0.0012, "lr": 4.0337204110222346e-05, "epoch": 21.2, "percentage": 60.57, "elapsed_time": "0:40:36", "remaining_time": "0:26:25"}
221
- {"current_steps": 213, "total_steps": 350, "loss": 0.0013, "lr": 3.98484279316412e-05, "epoch": 21.3, "percentage": 60.86, "elapsed_time": "0:40:46", "remaining_time": "0:26:13"}
222
- {"current_steps": 214, "total_steps": 350, "loss": 0.0013, "lr": 3.936066149022191e-05, "epoch": 21.4, "percentage": 61.14, "elapsed_time": "0:40:57", "remaining_time": "0:26:01"}
223
- {"current_steps": 215, "total_steps": 350, "loss": 0.0012, "lr": 3.887395330218429e-05, "epoch": 21.5, "percentage": 61.43, "elapsed_time": "0:41:07", "remaining_time": "0:25:49"}
224
- {"current_steps": 216, "total_steps": 350, "loss": 0.0013, "lr": 3.838835177848788e-05, "epoch": 21.6, "percentage": 61.71, "elapsed_time": "0:41:17", "remaining_time": "0:25:37"}
225
- {"current_steps": 217, "total_steps": 350, "loss": 0.0013, "lr": 3.790390522001662e-05, "epoch": 21.7, "percentage": 62.0, "elapsed_time": "0:41:28", "remaining_time": "0:25:25"}
226
- {"current_steps": 218, "total_steps": 350, "loss": 0.0013, "lr": 3.742066181277458e-05, "epoch": 21.8, "percentage": 62.29, "elapsed_time": "0:41:38", "remaining_time": "0:25:13"}
227
- {"current_steps": 219, "total_steps": 350, "loss": 0.0018, "lr": 3.6938669623093084e-05, "epoch": 21.9, "percentage": 62.57, "elapsed_time": "0:41:49", "remaining_time": "0:25:00"}
228
- {"current_steps": 220, "total_steps": 350, "loss": 0.0013, "lr": 3.6457976592849754e-05, "epoch": 22.0, "percentage": 62.86, "elapsed_time": "0:41:53", "remaining_time": "0:24:45"}
229
- {"current_steps": 221, "total_steps": 350, "loss": 0.0012, "lr": 3.597863053469987e-05, "epoch": 22.1, "percentage": 63.14, "elapsed_time": "0:42:04", "remaining_time": "0:24:33"}
230
- {"current_steps": 222, "total_steps": 350, "loss": 0.0012, "lr": 3.550067912732069e-05, "epoch": 22.2, "percentage": 63.43, "elapsed_time": "0:42:14", "remaining_time": "0:24:21"}
231
- {"current_steps": 223, "total_steps": 350, "loss": 0.0012, "lr": 3.502416991066904e-05, "epoch": 22.3, "percentage": 63.71, "elapsed_time": "0:42:25", "remaining_time": "0:24:09"}
232
- {"current_steps": 224, "total_steps": 350, "loss": 0.0013, "lr": 3.4549150281252636e-05, "epoch": 22.4, "percentage": 64.0, "elapsed_time": "0:42:35", "remaining_time": "0:23:57"}
233
- {"current_steps": 225, "total_steps": 350, "loss": 0.0013, "lr": 3.4075667487415785e-05, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:42:45", "remaining_time": "0:23:45"}
234
- {"current_steps": 225, "total_steps": 350, "eval_loss": 0.02635515108704567, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:42:52", "remaining_time": "0:23:49"}
235
- {"current_steps": 226, "total_steps": 350, "loss": 0.0012, "lr": 3.360376862463979e-05, "epoch": 22.6, "percentage": 64.57, "elapsed_time": "0:43:02", "remaining_time": "0:23:37"}
236
- {"current_steps": 227, "total_steps": 350, "loss": 0.0012, "lr": 3.313350063085851e-05, "epoch": 22.7, "percentage": 64.86, "elapsed_time": "0:43:12", "remaining_time": "0:23:24"}
237
- {"current_steps": 228, "total_steps": 350, "loss": 0.0013, "lr": 3.266491028178964e-05, "epoch": 22.8, "percentage": 65.14, "elapsed_time": "0:43:23", "remaining_time": "0:23:12"}
238
- {"current_steps": 229, "total_steps": 350, "loss": 0.0012, "lr": 3.219804418628216e-05, "epoch": 22.9, "percentage": 65.43, "elapsed_time": "0:43:33", "remaining_time": "0:23:01"}
239
- {"current_steps": 230, "total_steps": 350, "loss": 0.0013, "lr": 3.173294878168025e-05, "epoch": 23.0, "percentage": 65.71, "elapsed_time": "0:43:38", "remaining_time": "0:22:46"}
240
- {"current_steps": 231, "total_steps": 350, "loss": 0.0013, "lr": 3.12696703292044e-05, "epoch": 23.1, "percentage": 66.0, "elapsed_time": "0:43:48", "remaining_time": "0:22:34"}
241
- {"current_steps": 232, "total_steps": 350, "loss": 0.0013, "lr": 3.080825490934999e-05, "epoch": 23.2, "percentage": 66.29, "elapsed_time": "0:43:59", "remaining_time": "0:22:22"}
242
- {"current_steps": 233, "total_steps": 350, "loss": 0.0012, "lr": 3.0348748417303823e-05, "epoch": 23.3, "percentage": 66.57, "elapsed_time": "0:44:09", "remaining_time": "0:22:10"}
243
- {"current_steps": 234, "total_steps": 350, "loss": 0.0013, "lr": 2.989119655837913e-05, "epoch": 23.4, "percentage": 66.86, "elapsed_time": "0:44:20", "remaining_time": "0:21:58"}
244
- {"current_steps": 235, "total_steps": 350, "loss": 0.0013, "lr": 2.9435644843469436e-05, "epoch": 23.5, "percentage": 67.14, "elapsed_time": "0:44:30", "remaining_time": "0:21:46"}
245
- {"current_steps": 236, "total_steps": 350, "loss": 0.0013, "lr": 2.8982138584521735e-05, "epoch": 23.6, "percentage": 67.43, "elapsed_time": "0:44:40", "remaining_time": "0:21:34"}
246
- {"current_steps": 237, "total_steps": 350, "loss": 0.0013, "lr": 2.8530722890029537e-05, "epoch": 23.7, "percentage": 67.71, "elapsed_time": "0:44:50", "remaining_time": "0:21:23"}
247
- {"current_steps": 238, "total_steps": 350, "loss": 0.0013, "lr": 2.8081442660546125e-05, "epoch": 23.8, "percentage": 68.0, "elapsed_time": "0:45:01", "remaining_time": "0:21:11"}
248
- {"current_steps": 239, "total_steps": 350, "loss": 0.0013, "lr": 2.7634342584218365e-05, "epoch": 23.9, "percentage": 68.29, "elapsed_time": "0:45:11", "remaining_time": "0:20:59"}
249
- {"current_steps": 240, "total_steps": 350, "loss": 0.0013, "lr": 2.718946713234185e-05, "epoch": 24.0, "percentage": 68.57, "elapsed_time": "0:45:16", "remaining_time": "0:20:44"}
250
- {"current_steps": 241, "total_steps": 350, "loss": 0.0012, "lr": 2.674686055493748e-05, "epoch": 24.1, "percentage": 68.86, "elapsed_time": "0:45:26", "remaining_time": "0:20:33"}
251
- {"current_steps": 242, "total_steps": 350, "loss": 0.0013, "lr": 2.630656687635007e-05, "epoch": 24.2, "percentage": 69.14, "elapsed_time": "0:45:36", "remaining_time": "0:20:21"}
252
- {"current_steps": 243, "total_steps": 350, "loss": 0.0012, "lr": 2.5868629890869468e-05, "epoch": 24.3, "percentage": 69.43, "elapsed_time": "0:45:47", "remaining_time": "0:20:09"}
253
- {"current_steps": 244, "total_steps": 350, "loss": 0.0013, "lr": 2.543309315837444e-05, "epoch": 24.4, "percentage": 69.71, "elapsed_time": "0:45:57", "remaining_time": "0:19:58"}
254
- {"current_steps": 245, "total_steps": 350, "loss": 0.0012, "lr": 2.500000000000001e-05, "epoch": 24.5, "percentage": 70.0, "elapsed_time": "0:46:08", "remaining_time": "0:19:46"}
255
- {"current_steps": 246, "total_steps": 350, "loss": 0.0012, "lr": 2.456939349382843e-05, "epoch": 24.6, "percentage": 70.29, "elapsed_time": "0:46:18", "remaining_time": "0:19:34"}
256
- {"current_steps": 247, "total_steps": 350, "loss": 0.0013, "lr": 2.4141316470604362e-05, "epoch": 24.7, "percentage": 70.57, "elapsed_time": "0:46:28", "remaining_time": "0:19:22"}
257
- {"current_steps": 248, "total_steps": 350, "loss": 0.0012, "lr": 2.371581150947476e-05, "epoch": 24.8, "percentage": 70.86, "elapsed_time": "0:46:39", "remaining_time": "0:19:11"}
258
- {"current_steps": 249, "total_steps": 350, "loss": 0.0013, "lr": 2.3292920933753566e-05, "epoch": 24.9, "percentage": 71.14, "elapsed_time": "0:46:49", "remaining_time": "0:18:59"}
259
- {"current_steps": 250, "total_steps": 350, "loss": 0.0013, "lr": 2.2872686806712035e-05, "epoch": 25.0, "percentage": 71.43, "elapsed_time": "0:46:54", "remaining_time": "0:18:45"}
260
- {"current_steps": 250, "total_steps": 350, "eval_loss": 0.027748363092541695, "epoch": 25.0, "percentage": 71.43, "elapsed_time": "0:47:01", "remaining_time": "0:18:48"}
261
- {"current_steps": 250, "total_steps": 350, "epoch": 25.0, "percentage": 71.43, "elapsed_time": "0:47:01", "remaining_time": "0:18:48"}
 
1
+ {"current_steps": 1, "total_steps": 350, "loss": 0.1562, "lr": 2.8571428571428573e-06, "epoch": 0.1, "percentage": 0.29, "elapsed_time": "0:00:21", "remaining_time": "2:04:19"}
2
+ {"current_steps": 2, "total_steps": 350, "loss": 0.1295, "lr": 5.7142857142857145e-06, "epoch": 0.2, "percentage": 0.57, "elapsed_time": "0:00:31", "remaining_time": "1:30:52"}
3
+ {"current_steps": 3, "total_steps": 350, "loss": 0.1632, "lr": 8.571428571428573e-06, "epoch": 0.3, "percentage": 0.86, "elapsed_time": "0:00:42", "remaining_time": "1:21:07"}
4
+ {"current_steps": 4, "total_steps": 350, "loss": 0.1368, "lr": 1.1428571428571429e-05, "epoch": 0.4, "percentage": 1.14, "elapsed_time": "0:00:52", "remaining_time": "1:15:47"}
5
+ {"current_steps": 5, "total_steps": 350, "loss": 0.1199, "lr": 1.4285714285714285e-05, "epoch": 0.5, "percentage": 1.43, "elapsed_time": "0:01:02", "remaining_time": "1:12:26"}
6
+ {"current_steps": 6, "total_steps": 350, "loss": 0.0867, "lr": 1.7142857142857145e-05, "epoch": 0.6, "percentage": 1.71, "elapsed_time": "0:01:13", "remaining_time": "1:10:08"}
7
+ {"current_steps": 7, "total_steps": 350, "loss": 0.1135, "lr": 2e-05, "epoch": 0.7, "percentage": 2.0, "elapsed_time": "0:01:46", "remaining_time": "1:27:16"}
8
+ {"current_steps": 8, "total_steps": 350, "loss": 0.1035, "lr": 2.2857142857142858e-05, "epoch": 0.8, "percentage": 2.29, "elapsed_time": "0:01:57", "remaining_time": "1:23:38"}
9
+ {"current_steps": 9, "total_steps": 350, "loss": 0.1129, "lr": 2.5714285714285714e-05, "epoch": 0.9, "percentage": 2.57, "elapsed_time": "0:02:07", "remaining_time": "1:20:37"}
10
+ {"current_steps": 10, "total_steps": 350, "loss": 0.0841, "lr": 2.857142857142857e-05, "epoch": 1.0, "percentage": 2.86, "elapsed_time": "0:02:13", "remaining_time": "1:15:48"}
11
+ {"current_steps": 11, "total_steps": 350, "loss": 0.0747, "lr": 3.142857142857143e-05, "epoch": 1.1, "percentage": 3.14, "elapsed_time": "0:02:24", "remaining_time": "1:14:09"}
12
+ {"current_steps": 12, "total_steps": 350, "loss": 0.0697, "lr": 3.428571428571429e-05, "epoch": 1.2, "percentage": 3.43, "elapsed_time": "0:02:34", "remaining_time": "1:12:38"}
13
+ {"current_steps": 13, "total_steps": 350, "loss": 0.0706, "lr": 3.7142857142857143e-05, "epoch": 1.3, "percentage": 3.71, "elapsed_time": "0:02:45", "remaining_time": "1:11:38"}
14
+ {"current_steps": 14, "total_steps": 350, "loss": 0.0698, "lr": 4e-05, "epoch": 1.4, "percentage": 4.0, "elapsed_time": "0:02:56", "remaining_time": "1:10:33"}
15
+ {"current_steps": 15, "total_steps": 350, "loss": 0.0627, "lr": 4.2857142857142856e-05, "epoch": 1.5, "percentage": 4.29, "elapsed_time": "0:03:07", "remaining_time": "1:09:39"}
16
+ {"current_steps": 16, "total_steps": 350, "loss": 0.0528, "lr": 4.5714285714285716e-05, "epoch": 1.6, "percentage": 4.57, "elapsed_time": "0:03:17", "remaining_time": "1:08:46"}
17
+ {"current_steps": 17, "total_steps": 350, "loss": 0.0587, "lr": 4.8571428571428576e-05, "epoch": 1.7, "percentage": 4.86, "elapsed_time": "0:03:28", "remaining_time": "1:08:00"}
18
+ {"current_steps": 18, "total_steps": 350, "loss": 0.05, "lr": 5.142857142857143e-05, "epoch": 1.8, "percentage": 5.14, "elapsed_time": "0:03:38", "remaining_time": "1:07:15"}
19
+ {"current_steps": 19, "total_steps": 350, "loss": 0.0548, "lr": 5.428571428571428e-05, "epoch": 1.9, "percentage": 5.43, "elapsed_time": "0:03:49", "remaining_time": "1:06:34"}
20
+ {"current_steps": 20, "total_steps": 350, "loss": 0.0533, "lr": 5.714285714285714e-05, "epoch": 2.0, "percentage": 5.71, "elapsed_time": "0:03:54", "remaining_time": "1:04:21"}
21
+ {"current_steps": 21, "total_steps": 350, "loss": 0.0436, "lr": 6e-05, "epoch": 2.1, "percentage": 6.0, "elapsed_time": "0:04:05", "remaining_time": "1:04:03"}
22
+ {"current_steps": 22, "total_steps": 350, "loss": 0.0418, "lr": 6.285714285714286e-05, "epoch": 2.2, "percentage": 6.29, "elapsed_time": "0:04:16", "remaining_time": "1:03:37"}
23
+ {"current_steps": 23, "total_steps": 350, "loss": 0.0393, "lr": 6.571428571428571e-05, "epoch": 2.3, "percentage": 6.57, "elapsed_time": "0:04:26", "remaining_time": "1:03:11"}
24
+ {"current_steps": 24, "total_steps": 350, "loss": 0.0336, "lr": 6.857142857142858e-05, "epoch": 2.4, "percentage": 6.86, "elapsed_time": "0:04:37", "remaining_time": "1:02:45"}
25
+ {"current_steps": 25, "total_steps": 350, "loss": 0.0459, "lr": 7.142857142857143e-05, "epoch": 2.5, "percentage": 7.14, "elapsed_time": "0:04:48", "remaining_time": "1:02:24"}
26
+ {"current_steps": 25, "total_steps": 350, "eval_loss": 0.0361514613032341, "epoch": 2.5, "percentage": 7.14, "elapsed_time": "0:04:57", "remaining_time": "1:04:28"}
27
+ {"current_steps": 26, "total_steps": 350, "loss": 0.0427, "lr": 7.428571428571429e-05, "epoch": 2.6, "percentage": 7.43, "elapsed_time": "0:05:08", "remaining_time": "1:04:00"}
28
+ {"current_steps": 27, "total_steps": 350, "loss": 0.0286, "lr": 7.714285714285715e-05, "epoch": 2.7, "percentage": 7.71, "elapsed_time": "0:05:18", "remaining_time": "1:03:33"}
29
+ {"current_steps": 28, "total_steps": 350, "loss": 0.0377, "lr": 8e-05, "epoch": 2.8, "percentage": 8.0, "elapsed_time": "0:05:29", "remaining_time": "1:03:08"}
30
+ {"current_steps": 29, "total_steps": 350, "loss": 0.0304, "lr": 8.285714285714287e-05, "epoch": 2.9, "percentage": 8.29, "elapsed_time": "0:05:39", "remaining_time": "1:02:42"}
31
+ {"current_steps": 30, "total_steps": 350, "loss": 0.031, "lr": 8.571428571428571e-05, "epoch": 3.0, "percentage": 8.57, "elapsed_time": "0:05:44", "remaining_time": "1:01:14"}
32
+ {"current_steps": 31, "total_steps": 350, "loss": 0.0222, "lr": 8.857142857142857e-05, "epoch": 3.1, "percentage": 8.86, "elapsed_time": "0:05:55", "remaining_time": "1:00:56"}
33
+ {"current_steps": 32, "total_steps": 350, "loss": 0.0361, "lr": 9.142857142857143e-05, "epoch": 3.2, "percentage": 9.14, "elapsed_time": "0:06:05", "remaining_time": "1:00:36"}
34
+ {"current_steps": 33, "total_steps": 350, "loss": 0.0234, "lr": 9.428571428571429e-05, "epoch": 3.3, "percentage": 9.43, "elapsed_time": "0:06:16", "remaining_time": "1:00:17"}
35
+ {"current_steps": 34, "total_steps": 350, "loss": 0.0267, "lr": 9.714285714285715e-05, "epoch": 3.4, "percentage": 9.71, "elapsed_time": "0:06:27", "remaining_time": "0:59:58"}
36
+ {"current_steps": 35, "total_steps": 350, "loss": 0.0215, "lr": 0.0001, "epoch": 3.5, "percentage": 10.0, "elapsed_time": "0:06:37", "remaining_time": "0:59:41"}
37
+ {"current_steps": 36, "total_steps": 350, "loss": 0.0198, "lr": 9.999751334779716e-05, "epoch": 3.6, "percentage": 10.29, "elapsed_time": "0:06:48", "remaining_time": "0:59:24"}
38
+ {"current_steps": 37, "total_steps": 350, "loss": 0.021, "lr": 9.999005363852618e-05, "epoch": 3.7, "percentage": 10.57, "elapsed_time": "0:06:59", "remaining_time": "0:59:06"}
39
+ {"current_steps": 38, "total_steps": 350, "loss": 0.0215, "lr": 9.997762161417517e-05, "epoch": 3.8, "percentage": 10.86, "elapsed_time": "0:07:10", "remaining_time": "0:58:53"}
40
+ {"current_steps": 39, "total_steps": 350, "loss": 0.0189, "lr": 9.996021851130897e-05, "epoch": 3.9, "percentage": 11.14, "elapsed_time": "0:07:20", "remaining_time": "0:58:33"}
41
+ {"current_steps": 40, "total_steps": 350, "loss": 0.0156, "lr": 9.993784606094612e-05, "epoch": 4.0, "percentage": 11.43, "elapsed_time": "0:07:25", "remaining_time": "0:57:29"}
42
+ {"current_steps": 41, "total_steps": 350, "loss": 0.0125, "lr": 9.991050648838675e-05, "epoch": 4.1, "percentage": 11.71, "elapsed_time": "0:07:35", "remaining_time": "0:57:16"}
43
+ {"current_steps": 42, "total_steps": 350, "loss": 0.0115, "lr": 9.987820251299122e-05, "epoch": 4.2, "percentage": 12.0, "elapsed_time": "0:07:46", "remaining_time": "0:56:59"}
44
+ {"current_steps": 43, "total_steps": 350, "loss": 0.0164, "lr": 9.984093734790956e-05, "epoch": 4.3, "percentage": 12.29, "elapsed_time": "0:07:56", "remaining_time": "0:56:44"}
45
+ {"current_steps": 44, "total_steps": 350, "loss": 0.0137, "lr": 9.979871469976196e-05, "epoch": 4.4, "percentage": 12.57, "elapsed_time": "0:08:07", "remaining_time": "0:56:31"}
46
+ {"current_steps": 45, "total_steps": 350, "loss": 0.0153, "lr": 9.975153876827008e-05, "epoch": 4.5, "percentage": 12.86, "elapsed_time": "0:08:17", "remaining_time": "0:56:12"}
47
+ {"current_steps": 46, "total_steps": 350, "loss": 0.0128, "lr": 9.969941424583926e-05, "epoch": 4.6, "percentage": 13.14, "elapsed_time": "0:08:28", "remaining_time": "0:56:01"}
48
+ {"current_steps": 47, "total_steps": 350, "loss": 0.016, "lr": 9.964234631709187e-05, "epoch": 4.7, "percentage": 13.43, "elapsed_time": "0:08:39", "remaining_time": "0:55:47"}
49
+ {"current_steps": 48, "total_steps": 350, "loss": 0.0098, "lr": 9.958034065835151e-05, "epoch": 4.8, "percentage": 13.71, "elapsed_time": "0:08:49", "remaining_time": "0:55:33"}
50
+ {"current_steps": 49, "total_steps": 350, "loss": 0.0112, "lr": 9.951340343707852e-05, "epoch": 4.9, "percentage": 14.0, "elapsed_time": "0:09:00", "remaining_time": "0:55:20"}
51
+ {"current_steps": 50, "total_steps": 350, "loss": 0.0083, "lr": 9.944154131125642e-05, "epoch": 5.0, "percentage": 14.29, "elapsed_time": "0:09:05", "remaining_time": "0:54:31"}
52
+ {"current_steps": 50, "total_steps": 350, "eval_loss": 0.018401963636279106, "epoch": 5.0, "percentage": 14.29, "elapsed_time": "0:09:12", "remaining_time": "0:55:15"}
53
+ {"current_steps": 51, "total_steps": 350, "loss": 0.0093, "lr": 9.936476142872979e-05, "epoch": 5.1, "percentage": 14.57, "elapsed_time": "0:09:23", "remaining_time": "0:55:03"}
54
+ {"current_steps": 52, "total_steps": 350, "loss": 0.0075, "lr": 9.928307142649316e-05, "epoch": 5.2, "percentage": 14.86, "elapsed_time": "0:09:33", "remaining_time": "0:54:49"}
55
+ {"current_steps": 53, "total_steps": 350, "loss": 0.0053, "lr": 9.919647942993148e-05, "epoch": 5.3, "percentage": 15.14, "elapsed_time": "0:09:45", "remaining_time": "0:54:38"}
56
+ {"current_steps": 54, "total_steps": 350, "loss": 0.0086, "lr": 9.910499405201195e-05, "epoch": 5.4, "percentage": 15.43, "elapsed_time": "0:09:55", "remaining_time": "0:54:25"}
57
+ {"current_steps": 55, "total_steps": 350, "loss": 0.0069, "lr": 9.900862439242719e-05, "epoch": 5.5, "percentage": 15.71, "elapsed_time": "0:10:13", "remaining_time": "0:54:50"}
58
+ {"current_steps": 56, "total_steps": 350, "loss": 0.0048, "lr": 9.890738003669029e-05, "epoch": 5.6, "percentage": 16.0, "elapsed_time": "0:10:23", "remaining_time": "0:54:35"}
59
+ {"current_steps": 57, "total_steps": 350, "loss": 0.007, "lr": 9.880127105518122e-05, "epoch": 5.7, "percentage": 16.29, "elapsed_time": "0:10:34", "remaining_time": "0:54:21"}
60
+ {"current_steps": 58, "total_steps": 350, "loss": 0.0106, "lr": 9.869030800214532e-05, "epoch": 5.8, "percentage": 16.57, "elapsed_time": "0:10:44", "remaining_time": "0:54:05"}
61
+ {"current_steps": 59, "total_steps": 350, "loss": 0.0067, "lr": 9.857450191464337e-05, "epoch": 5.9, "percentage": 16.86, "elapsed_time": "0:10:55", "remaining_time": "0:53:50"}
62
+ {"current_steps": 60, "total_steps": 350, "loss": 0.0073, "lr": 9.84538643114539e-05, "epoch": 6.0, "percentage": 17.14, "elapsed_time": "0:10:59", "remaining_time": "0:53:08"}
63
+ {"current_steps": 61, "total_steps": 350, "loss": 0.0039, "lr": 9.832840719192736e-05, "epoch": 6.1, "percentage": 17.43, "elapsed_time": "0:11:10", "remaining_time": "0:52:54"}
64
+ {"current_steps": 62, "total_steps": 350, "loss": 0.0066, "lr": 9.819814303479267e-05, "epoch": 6.2, "percentage": 17.71, "elapsed_time": "0:11:20", "remaining_time": "0:52:41"}
65
+ {"current_steps": 63, "total_steps": 350, "loss": 0.0047, "lr": 9.806308479691595e-05, "epoch": 6.3, "percentage": 18.0, "elapsed_time": "0:11:31", "remaining_time": "0:52:29"}
66
+ {"current_steps": 64, "total_steps": 350, "loss": 0.0075, "lr": 9.792324591201179e-05, "epoch": 6.4, "percentage": 18.29, "elapsed_time": "0:11:41", "remaining_time": "0:52:16"}
67
+ {"current_steps": 65, "total_steps": 350, "loss": 0.0061, "lr": 9.777864028930705e-05, "epoch": 6.5, "percentage": 18.57, "elapsed_time": "0:11:52", "remaining_time": "0:52:02"}
68
+ {"current_steps": 66, "total_steps": 350, "loss": 0.0077, "lr": 9.76292823121573e-05, "epoch": 6.6, "percentage": 18.86, "elapsed_time": "0:12:02", "remaining_time": "0:51:49"}
69
+ {"current_steps": 67, "total_steps": 350, "loss": 0.005, "lr": 9.747518683661631e-05, "epoch": 6.7, "percentage": 19.14, "elapsed_time": "0:12:13", "remaining_time": "0:51:36"}
70
+ {"current_steps": 68, "total_steps": 350, "loss": 0.0056, "lr": 9.731636918995821e-05, "epoch": 6.8, "percentage": 19.43, "elapsed_time": "0:12:23", "remaining_time": "0:51:24"}
71
+ {"current_steps": 69, "total_steps": 350, "loss": 0.006, "lr": 9.715284516915303e-05, "epoch": 6.9, "percentage": 19.71, "elapsed_time": "0:12:33", "remaining_time": "0:51:10"}
72
+ {"current_steps": 70, "total_steps": 350, "loss": 0.0069, "lr": 9.698463103929542e-05, "epoch": 7.0, "percentage": 20.0, "elapsed_time": "0:12:38", "remaining_time": "0:50:34"}
73
+ {"current_steps": 71, "total_steps": 350, "loss": 0.0048, "lr": 9.681174353198687e-05, "epoch": 7.1, "percentage": 20.29, "elapsed_time": "0:12:49", "remaining_time": "0:50:22"}
74
+ {"current_steps": 72, "total_steps": 350, "loss": 0.0036, "lr": 9.663419984367139e-05, "epoch": 7.2, "percentage": 20.57, "elapsed_time": "0:13:02", "remaining_time": "0:50:19"}
75
+ {"current_steps": 73, "total_steps": 350, "loss": 0.004, "lr": 9.645201763392513e-05, "epoch": 7.3, "percentage": 20.86, "elapsed_time": "0:13:12", "remaining_time": "0:50:08"}
76
+ {"current_steps": 74, "total_steps": 350, "loss": 0.0054, "lr": 9.626521502369984e-05, "epoch": 7.4, "percentage": 21.14, "elapsed_time": "0:13:23", "remaining_time": "0:49:55"}
77
+ {"current_steps": 75, "total_steps": 350, "loss": 0.0058, "lr": 9.607381059352038e-05, "epoch": 7.5, "percentage": 21.43, "elapsed_time": "0:13:33", "remaining_time": "0:49:42"}
78
+ {"current_steps": 75, "total_steps": 350, "eval_loss": 0.017095288261771202, "epoch": 7.5, "percentage": 21.43, "elapsed_time": "0:13:40", "remaining_time": "0:50:08"}
79
+ {"current_steps": 76, "total_steps": 350, "loss": 0.0037, "lr": 9.587782338163669e-05, "epoch": 7.6, "percentage": 21.71, "elapsed_time": "0:13:50", "remaining_time": "0:49:54"}
80
+ {"current_steps": 77, "total_steps": 350, "loss": 0.0063, "lr": 9.567727288213005e-05, "epoch": 7.7, "percentage": 22.0, "elapsed_time": "0:14:01", "remaining_time": "0:49:42"}
81
+ {"current_steps": 78, "total_steps": 350, "loss": 0.0038, "lr": 9.547217904297411e-05, "epoch": 7.8, "percentage": 22.29, "elapsed_time": "0:14:11", "remaining_time": "0:49:29"}
82
+ {"current_steps": 79, "total_steps": 350, "loss": 0.0053, "lr": 9.526256226405075e-05, "epoch": 7.9, "percentage": 22.57, "elapsed_time": "0:14:22", "remaining_time": "0:49:18"}
83
+ {"current_steps": 80, "total_steps": 350, "loss": 0.005, "lr": 9.504844339512095e-05, "epoch": 8.0, "percentage": 22.86, "elapsed_time": "0:14:26", "remaining_time": "0:48:45"}
84
+ {"current_steps": 81, "total_steps": 350, "loss": 0.0041, "lr": 9.482984373375105e-05, "epoch": 8.1, "percentage": 23.14, "elapsed_time": "0:14:37", "remaining_time": "0:48:33"}
85
+ {"current_steps": 82, "total_steps": 350, "loss": 0.0035, "lr": 9.460678502319418e-05, "epoch": 8.2, "percentage": 23.43, "elapsed_time": "0:14:47", "remaining_time": "0:48:22"}
86
+ {"current_steps": 83, "total_steps": 350, "loss": 0.0039, "lr": 9.437928945022771e-05, "epoch": 8.3, "percentage": 23.71, "elapsed_time": "0:14:58", "remaining_time": "0:48:10"}
87
+ {"current_steps": 84, "total_steps": 350, "loss": 0.005, "lr": 9.414737964294636e-05, "epoch": 8.4, "percentage": 24.0, "elapsed_time": "0:15:08", "remaining_time": "0:47:57"}
88
+ {"current_steps": 85, "total_steps": 350, "loss": 0.0033, "lr": 9.391107866851143e-05, "epoch": 8.5, "percentage": 24.29, "elapsed_time": "0:15:19", "remaining_time": "0:47:45"}
89
+ {"current_steps": 86, "total_steps": 350, "loss": 0.0027, "lr": 9.367041003085649e-05, "epoch": 8.6, "percentage": 24.57, "elapsed_time": "0:15:29", "remaining_time": "0:47:33"}
90
+ {"current_steps": 87, "total_steps": 350, "loss": 0.0039, "lr": 9.342539766834946e-05, "epoch": 8.7, "percentage": 24.86, "elapsed_time": "0:15:39", "remaining_time": "0:47:21"}
91
+ {"current_steps": 88, "total_steps": 350, "loss": 0.0028, "lr": 9.317606595141154e-05, "epoch": 8.8, "percentage": 25.14, "elapsed_time": "0:15:50", "remaining_time": "0:47:09"}
92
+ {"current_steps": 89, "total_steps": 350, "loss": 0.0047, "lr": 9.292243968009331e-05, "epoch": 8.9, "percentage": 25.43, "elapsed_time": "0:16:01", "remaining_time": "0:46:58"}
93
+ {"current_steps": 90, "total_steps": 350, "loss": 0.0043, "lr": 9.266454408160779e-05, "epoch": 9.0, "percentage": 25.71, "elapsed_time": "0:16:05", "remaining_time": "0:46:29"}
94
+ {"current_steps": 91, "total_steps": 350, "loss": 0.0026, "lr": 9.24024048078213e-05, "epoch": 9.1, "percentage": 26.0, "elapsed_time": "0:16:16", "remaining_time": "0:46:18"}
95
+ {"current_steps": 92, "total_steps": 350, "loss": 0.0024, "lr": 9.213604793270196e-05, "epoch": 9.2, "percentage": 26.29, "elapsed_time": "0:16:27", "remaining_time": "0:46:08"}
96
+ {"current_steps": 93, "total_steps": 350, "loss": 0.0037, "lr": 9.186549994972618e-05, "epoch": 9.3, "percentage": 26.57, "elapsed_time": "0:16:37", "remaining_time": "0:45:56"}
97
+ {"current_steps": 94, "total_steps": 350, "loss": 0.0027, "lr": 9.159078776924346e-05, "epoch": 9.4, "percentage": 26.86, "elapsed_time": "0:16:47", "remaining_time": "0:45:45"}
98
+ {"current_steps": 95, "total_steps": 350, "loss": 0.003, "lr": 9.131193871579975e-05, "epoch": 9.5, "percentage": 27.14, "elapsed_time": "0:16:57", "remaining_time": "0:45:32"}
99
+ {"current_steps": 96, "total_steps": 350, "loss": 0.0032, "lr": 9.102898052541958e-05, "epoch": 9.6, "percentage": 27.43, "elapsed_time": "0:17:08", "remaining_time": "0:45:21"}
100
+ {"current_steps": 97, "total_steps": 350, "loss": 0.0022, "lr": 9.074194134284726e-05, "epoch": 9.7, "percentage": 27.71, "elapsed_time": "0:17:18", "remaining_time": "0:45:09"}
101
+ {"current_steps": 98, "total_steps": 350, "loss": 0.002, "lr": 9.045084971874738e-05, "epoch": 9.8, "percentage": 28.0, "elapsed_time": "0:17:29", "remaining_time": "0:44:57"}
102
+ {"current_steps": 99, "total_steps": 350, "loss": 0.0027, "lr": 9.015573460686509e-05, "epoch": 9.9, "percentage": 28.29, "elapsed_time": "0:17:39", "remaining_time": "0:44:45"}
103
+ {"current_steps": 100, "total_steps": 350, "loss": 0.0024, "lr": 8.985662536114613e-05, "epoch": 10.0, "percentage": 28.57, "elapsed_time": "0:17:44", "remaining_time": "0:44:20"}
104
+ {"current_steps": 100, "total_steps": 350, "eval_loss": 0.015364531427621841, "epoch": 10.0, "percentage": 28.57, "elapsed_time": "0:17:51", "remaining_time": "0:44:37"}
105
+ {"current_steps": 101, "total_steps": 350, "loss": 0.0021, "lr": 8.955355173281708e-05, "epoch": 10.1, "percentage": 28.86, "elapsed_time": "0:18:07", "remaining_time": "0:44:41"}
106
+ {"current_steps": 102, "total_steps": 350, "loss": 0.0022, "lr": 8.924654386742613e-05, "epoch": 10.2, "percentage": 29.14, "elapsed_time": "0:18:18", "remaining_time": "0:44:29"}
107
+ {"current_steps": 103, "total_steps": 350, "loss": 0.0015, "lr": 8.89356323018447e-05, "epoch": 10.3, "percentage": 29.43, "elapsed_time": "0:18:28", "remaining_time": "0:44:17"}
108
+ {"current_steps": 104, "total_steps": 350, "loss": 0.0024, "lr": 8.862084796122998e-05, "epoch": 10.4, "percentage": 29.71, "elapsed_time": "0:18:38", "remaining_time": "0:44:06"}
109
+ {"current_steps": 105, "total_steps": 350, "loss": 0.0021, "lr": 8.83022221559489e-05, "epoch": 10.5, "percentage": 30.0, "elapsed_time": "0:18:49", "remaining_time": "0:43:54"}
110
+ {"current_steps": 106, "total_steps": 350, "loss": 0.0016, "lr": 8.797978657846391e-05, "epoch": 10.6, "percentage": 30.29, "elapsed_time": "0:18:59", "remaining_time": "0:43:42"}
111
+ {"current_steps": 107, "total_steps": 350, "loss": 0.0022, "lr": 8.765357330018056e-05, "epoch": 10.7, "percentage": 30.57, "elapsed_time": "0:19:10", "remaining_time": "0:43:31"}
112
+ {"current_steps": 108, "total_steps": 350, "loss": 0.0026, "lr": 8.732361476825752e-05, "epoch": 10.8, "percentage": 30.86, "elapsed_time": "0:19:20", "remaining_time": "0:43:20"}
113
+ {"current_steps": 109, "total_steps": 350, "loss": 0.0014, "lr": 8.69899438023792e-05, "epoch": 10.9, "percentage": 31.14, "elapsed_time": "0:19:31", "remaining_time": "0:43:09"}
114
+ {"current_steps": 110, "total_steps": 350, "loss": 0.0025, "lr": 8.665259359149132e-05, "epoch": 11.0, "percentage": 31.43, "elapsed_time": "0:19:35", "remaining_time": "0:42:44"}
115
+ {"current_steps": 111, "total_steps": 350, "loss": 0.0022, "lr": 8.631159769049965e-05, "epoch": 11.1, "percentage": 31.71, "elapsed_time": "0:19:46", "remaining_time": "0:42:34"}
116
+ {"current_steps": 112, "total_steps": 350, "loss": 0.0019, "lr": 8.596699001693255e-05, "epoch": 11.2, "percentage": 32.0, "elapsed_time": "0:19:56", "remaining_time": "0:42:22"}
117
+ {"current_steps": 113, "total_steps": 350, "loss": 0.0024, "lr": 8.561880484756725e-05, "epoch": 11.3, "percentage": 32.29, "elapsed_time": "0:20:07", "remaining_time": "0:42:11"}
118
+ {"current_steps": 114, "total_steps": 350, "loss": 0.0027, "lr": 8.526707681502044e-05, "epoch": 11.4, "percentage": 32.57, "elapsed_time": "0:20:17", "remaining_time": "0:42:00"}
119
+ {"current_steps": 115, "total_steps": 350, "loss": 0.0016, "lr": 8.491184090430364e-05, "epoch": 11.5, "percentage": 32.86, "elapsed_time": "0:20:28", "remaining_time": "0:41:50"}
120
+ {"current_steps": 116, "total_steps": 350, "loss": 0.0019, "lr": 8.455313244934324e-05, "epoch": 11.6, "percentage": 33.14, "elapsed_time": "0:20:38", "remaining_time": "0:41:39"}
121
+ {"current_steps": 117, "total_steps": 350, "loss": 0.0021, "lr": 8.419098712946601e-05, "epoch": 11.7, "percentage": 33.43, "elapsed_time": "0:20:49", "remaining_time": "0:41:28"}
122
+ {"current_steps": 118, "total_steps": 350, "loss": 0.002, "lr": 8.382544096585027e-05, "epoch": 11.8, "percentage": 33.71, "elapsed_time": "0:20:59", "remaining_time": "0:41:16"}
123
+ {"current_steps": 119, "total_steps": 350, "loss": 0.0019, "lr": 8.345653031794292e-05, "epoch": 11.9, "percentage": 34.0, "elapsed_time": "0:21:10", "remaining_time": "0:41:05"}
124
+ {"current_steps": 120, "total_steps": 350, "loss": 0.0027, "lr": 8.308429187984297e-05, "epoch": 12.0, "percentage": 34.29, "elapsed_time": "0:21:14", "remaining_time": "0:40:43"}
125
+ {"current_steps": 121, "total_steps": 350, "loss": 0.0016, "lr": 8.270876267665173e-05, "epoch": 12.1, "percentage": 34.57, "elapsed_time": "0:21:25", "remaining_time": "0:40:32"}
126
+ {"current_steps": 122, "total_steps": 350, "loss": 0.0016, "lr": 8.232998006078997e-05, "epoch": 12.2, "percentage": 34.86, "elapsed_time": "0:21:36", "remaining_time": "0:40:22"}
127
+ {"current_steps": 123, "total_steps": 350, "loss": 0.0018, "lr": 8.19479817082828e-05, "epoch": 12.3, "percentage": 35.14, "elapsed_time": "0:21:46", "remaining_time": "0:40:11"}
128
+ {"current_steps": 124, "total_steps": 350, "loss": 0.0014, "lr": 8.156280561501195e-05, "epoch": 12.4, "percentage": 35.43, "elapsed_time": "0:21:56", "remaining_time": "0:40:00"}
129
+ {"current_steps": 125, "total_steps": 350, "loss": 0.0015, "lr": 8.117449009293668e-05, "epoch": 12.5, "percentage": 35.71, "elapsed_time": "0:22:07", "remaining_time": "0:39:49"}
130
+ {"current_steps": 125, "total_steps": 350, "eval_loss": 0.016432486474514008, "epoch": 12.5, "percentage": 35.71, "elapsed_time": "0:22:14", "remaining_time": "0:40:01"}
131
+ {"current_steps": 126, "total_steps": 350, "loss": 0.0017, "lr": 8.07830737662829e-05, "epoch": 12.6, "percentage": 36.0, "elapsed_time": "0:22:24", "remaining_time": "0:39:50"}
132
+ {"current_steps": 127, "total_steps": 350, "loss": 0.0017, "lr": 8.038859556770151e-05, "epoch": 12.7, "percentage": 36.29, "elapsed_time": "0:22:35", "remaining_time": "0:39:39"}
133
+ {"current_steps": 128, "total_steps": 350, "loss": 0.0015, "lr": 7.999109473439569e-05, "epoch": 12.8, "percentage": 36.57, "elapsed_time": "0:22:45", "remaining_time": "0:39:28"}
134
+ {"current_steps": 129, "total_steps": 350, "loss": 0.0026, "lr": 7.959061080421839e-05, "epoch": 12.9, "percentage": 36.86, "elapsed_time": "0:22:56", "remaining_time": "0:39:17"}
135
+ {"current_steps": 130, "total_steps": 350, "loss": 0.0018, "lr": 7.91871836117395e-05, "epoch": 13.0, "percentage": 37.14, "elapsed_time": "0:23:00", "remaining_time": "0:38:56"}
136
+ {"current_steps": 131, "total_steps": 350, "loss": 0.0015, "lr": 7.878085328428369e-05, "epoch": 13.1, "percentage": 37.43, "elapsed_time": "0:23:11", "remaining_time": "0:38:46"}
137
+ {"current_steps": 132, "total_steps": 350, "loss": 0.0016, "lr": 7.83716602379391e-05, "epoch": 13.2, "percentage": 37.71, "elapsed_time": "0:23:21", "remaining_time": "0:38:35"}
138
+ {"current_steps": 133, "total_steps": 350, "loss": 0.0015, "lr": 7.795964517353735e-05, "epoch": 13.3, "percentage": 38.0, "elapsed_time": "0:23:32", "remaining_time": "0:38:24"}
139
+ {"current_steps": 134, "total_steps": 350, "loss": 0.0017, "lr": 7.754484907260513e-05, "epoch": 13.4, "percentage": 38.29, "elapsed_time": "0:23:42", "remaining_time": "0:38:12"}
140
+ {"current_steps": 135, "total_steps": 350, "loss": 0.0025, "lr": 7.712731319328798e-05, "epoch": 13.5, "percentage": 38.57, "elapsed_time": "0:23:52", "remaining_time": "0:38:02"}
141
+ {"current_steps": 136, "total_steps": 350, "loss": 0.0014, "lr": 7.670707906624644e-05, "epoch": 13.6, "percentage": 38.86, "elapsed_time": "0:24:03", "remaining_time": "0:37:51"}
142
+ {"current_steps": 137, "total_steps": 350, "loss": 0.0014, "lr": 7.628418849052523e-05, "epoch": 13.7, "percentage": 39.14, "elapsed_time": "0:24:13", "remaining_time": "0:37:40"}
143
+ {"current_steps": 138, "total_steps": 350, "loss": 0.0021, "lr": 7.585868352939563e-05, "epoch": 13.8, "percentage": 39.43, "elapsed_time": "0:24:23", "remaining_time": "0:37:28"}
144
+ {"current_steps": 139, "total_steps": 350, "loss": 0.0031, "lr": 7.543060650617158e-05, "epoch": 13.9, "percentage": 39.71, "elapsed_time": "0:24:34", "remaining_time": "0:37:18"}
145
+ {"current_steps": 140, "total_steps": 350, "loss": 0.0016, "lr": 7.500000000000001e-05, "epoch": 14.0, "percentage": 40.0, "elapsed_time": "0:24:39", "remaining_time": "0:36:58"}
146
+ {"current_steps": 141, "total_steps": 350, "loss": 0.0034, "lr": 7.456690684162557e-05, "epoch": 14.1, "percentage": 40.29, "elapsed_time": "0:24:49", "remaining_time": "0:36:48"}
147
+ {"current_steps": 142, "total_steps": 350, "loss": 0.002, "lr": 7.413137010913054e-05, "epoch": 14.2, "percentage": 40.57, "elapsed_time": "0:24:59", "remaining_time": "0:36:36"}
148
+ {"current_steps": 143, "total_steps": 350, "loss": 0.0017, "lr": 7.369343312364993e-05, "epoch": 14.3, "percentage": 40.86, "elapsed_time": "0:25:10", "remaining_time": "0:36:26"}
149
+ {"current_steps": 144, "total_steps": 350, "loss": 0.002, "lr": 7.325313944506254e-05, "epoch": 14.4, "percentage": 41.14, "elapsed_time": "0:25:20", "remaining_time": "0:36:15"}
150
+ {"current_steps": 145, "total_steps": 350, "loss": 0.0015, "lr": 7.281053286765815e-05, "epoch": 14.5, "percentage": 41.43, "elapsed_time": "0:25:30", "remaining_time": "0:36:04"}
151
+ {"current_steps": 146, "total_steps": 350, "loss": 0.0017, "lr": 7.236565741578163e-05, "epoch": 14.6, "percentage": 41.71, "elapsed_time": "0:25:41", "remaining_time": "0:35:54"}
152
+ {"current_steps": 147, "total_steps": 350, "loss": 0.0068, "lr": 7.191855733945387e-05, "epoch": 14.7, "percentage": 42.0, "elapsed_time": "0:25:52", "remaining_time": "0:35:43"}
153
+ {"current_steps": 148, "total_steps": 350, "loss": 0.0018, "lr": 7.146927710997047e-05, "epoch": 14.8, "percentage": 42.29, "elapsed_time": "0:26:02", "remaining_time": "0:35:32"}
154
+ {"current_steps": 149, "total_steps": 350, "loss": 0.0016, "lr": 7.101786141547828e-05, "epoch": 14.9, "percentage": 42.57, "elapsed_time": "0:26:12", "remaining_time": "0:35:21"}
155
+ {"current_steps": 150, "total_steps": 350, "loss": 0.0038, "lr": 7.056435515653059e-05, "epoch": 15.0, "percentage": 42.86, "elapsed_time": "0:26:17", "remaining_time": "0:35:03"}
156
+ {"current_steps": 150, "total_steps": 350, "eval_loss": 0.02101273089647293, "epoch": 15.0, "percentage": 42.86, "elapsed_time": "0:26:24", "remaining_time": "0:35:12"}
157
+ {"current_steps": 151, "total_steps": 350, "loss": 0.0013, "lr": 7.010880344162088e-05, "epoch": 15.1, "percentage": 43.14, "elapsed_time": "0:26:34", "remaining_time": "0:35:01"}
158
+ {"current_steps": 152, "total_steps": 350, "loss": 0.0016, "lr": 6.965125158269619e-05, "epoch": 15.2, "percentage": 43.43, "elapsed_time": "0:26:45", "remaining_time": "0:34:51"}
159
+ {"current_steps": 153, "total_steps": 350, "loss": 0.0022, "lr": 6.919174509065004e-05, "epoch": 15.3, "percentage": 43.71, "elapsed_time": "0:26:55", "remaining_time": "0:34:40"}
160
+ {"current_steps": 154, "total_steps": 350, "loss": 0.0017, "lr": 6.873032967079561e-05, "epoch": 15.4, "percentage": 44.0, "elapsed_time": "0:27:05", "remaining_time": "0:34:28"}
161
+ {"current_steps": 155, "total_steps": 350, "loss": 0.0019, "lr": 6.826705121831976e-05, "epoch": 15.5, "percentage": 44.29, "elapsed_time": "0:27:15", "remaining_time": "0:34:18"}
162
+ {"current_steps": 156, "total_steps": 350, "loss": 0.0014, "lr": 6.780195581371784e-05, "epoch": 15.6, "percentage": 44.57, "elapsed_time": "0:27:26", "remaining_time": "0:34:07"}
163
+ {"current_steps": 157, "total_steps": 350, "loss": 0.0016, "lr": 6.733508971821036e-05, "epoch": 15.7, "percentage": 44.86, "elapsed_time": "0:27:37", "remaining_time": "0:33:57"}
164
+ {"current_steps": 158, "total_steps": 350, "loss": 0.0016, "lr": 6.686649936914152e-05, "epoch": 15.8, "percentage": 45.14, "elapsed_time": "0:27:47", "remaining_time": "0:33:46"}
165
+ {"current_steps": 159, "total_steps": 350, "loss": 0.0016, "lr": 6.639623137536023e-05, "epoch": 15.9, "percentage": 45.43, "elapsed_time": "0:27:58", "remaining_time": "0:33:35"}
166
+ {"current_steps": 160, "total_steps": 350, "loss": 0.0015, "lr": 6.592433251258423e-05, "epoch": 16.0, "percentage": 45.71, "elapsed_time": "0:28:02", "remaining_time": "0:33:18"}
167
+ {"current_steps": 161, "total_steps": 350, "loss": 0.0016, "lr": 6.545084971874738e-05, "epoch": 16.1, "percentage": 46.0, "elapsed_time": "0:28:13", "remaining_time": "0:33:07"}
168
+ {"current_steps": 162, "total_steps": 350, "loss": 0.0014, "lr": 6.497583008933097e-05, "epoch": 16.2, "percentage": 46.29, "elapsed_time": "0:28:23", "remaining_time": "0:32:56"}
169
+ {"current_steps": 163, "total_steps": 350, "loss": 0.0013, "lr": 6.449932087267932e-05, "epoch": 16.3, "percentage": 46.57, "elapsed_time": "0:28:34", "remaining_time": "0:32:46"}
170
+ {"current_steps": 164, "total_steps": 350, "loss": 0.0013, "lr": 6.402136946530014e-05, "epoch": 16.4, "percentage": 46.86, "elapsed_time": "0:28:44", "remaining_time": "0:32:35"}
171
+ {"current_steps": 165, "total_steps": 350, "loss": 0.0017, "lr": 6.354202340715026e-05, "epoch": 16.5, "percentage": 47.14, "elapsed_time": "0:28:55", "remaining_time": "0:32:25"}
172
+ {"current_steps": 166, "total_steps": 350, "loss": 0.0012, "lr": 6.306133037690693e-05, "epoch": 16.6, "percentage": 47.43, "elapsed_time": "0:29:05", "remaining_time": "0:32:14"}
173
+ {"current_steps": 167, "total_steps": 350, "loss": 0.0013, "lr": 6.257933818722543e-05, "epoch": 16.7, "percentage": 47.71, "elapsed_time": "0:29:15", "remaining_time": "0:32:03"}
174
+ {"current_steps": 168, "total_steps": 350, "loss": 0.0016, "lr": 6.209609477998338e-05, "epoch": 16.8, "percentage": 48.0, "elapsed_time": "0:29:25", "remaining_time": "0:31:53"}
175
+ {"current_steps": 169, "total_steps": 350, "loss": 0.0013, "lr": 6.161164822151213e-05, "epoch": 16.9, "percentage": 48.29, "elapsed_time": "0:29:36", "remaining_time": "0:31:42"}
176
+ {"current_steps": 170, "total_steps": 350, "loss": 0.002, "lr": 6.112604669781572e-05, "epoch": 17.0, "percentage": 48.57, "elapsed_time": "0:29:40", "remaining_time": "0:31:25"}
177
+ {"current_steps": 171, "total_steps": 350, "loss": 0.0013, "lr": 6.063933850977811e-05, "epoch": 17.1, "percentage": 48.86, "elapsed_time": "0:29:51", "remaining_time": "0:31:15"}
178
+ {"current_steps": 172, "total_steps": 350, "loss": 0.0013, "lr": 6.015157206835881e-05, "epoch": 17.2, "percentage": 49.14, "elapsed_time": "0:30:01", "remaining_time": "0:31:04"}
179
+ {"current_steps": 173, "total_steps": 350, "loss": 0.0013, "lr": 5.9662795889777666e-05, "epoch": 17.3, "percentage": 49.43, "elapsed_time": "0:30:11", "remaining_time": "0:30:53"}
180
+ {"current_steps": 174, "total_steps": 350, "loss": 0.0012, "lr": 5.917305859068912e-05, "epoch": 17.4, "percentage": 49.71, "elapsed_time": "0:30:22", "remaining_time": "0:30:43"}
181
+ {"current_steps": 175, "total_steps": 350, "loss": 0.0013, "lr": 5.868240888334653e-05, "epoch": 17.5, "percentage": 50.0, "elapsed_time": "0:30:32", "remaining_time": "0:30:32"}
182
+ {"current_steps": 175, "total_steps": 350, "eval_loss": 0.02819022908806801, "epoch": 17.5, "percentage": 50.0, "elapsed_time": "0:30:39", "remaining_time": "0:30:39"}
183
+ {"current_steps": 176, "total_steps": 350, "loss": 0.0022, "lr": 5.819089557075689e-05, "epoch": 17.6, "percentage": 50.29, "elapsed_time": "0:30:50", "remaining_time": "0:30:29"}
184
+ {"current_steps": 177, "total_steps": 350, "loss": 0.0012, "lr": 5.7698567541826675e-05, "epoch": 17.7, "percentage": 50.57, "elapsed_time": "0:31:01", "remaining_time": "0:30:19"}
185
+ {"current_steps": 178, "total_steps": 350, "loss": 0.0025, "lr": 5.7205473766499005e-05, "epoch": 17.8, "percentage": 50.86, "elapsed_time": "0:31:11", "remaining_time": "0:30:08"}
186
+ {"current_steps": 179, "total_steps": 350, "loss": 0.0013, "lr": 5.6711663290882776e-05, "epoch": 17.9, "percentage": 51.14, "elapsed_time": "0:31:21", "remaining_time": "0:29:57"}
187
+ {"current_steps": 180, "total_steps": 350, "loss": 0.0013, "lr": 5.621718523237427e-05, "epoch": 18.0, "percentage": 51.43, "elapsed_time": "0:31:26", "remaining_time": "0:29:41"}
188
+ {"current_steps": 181, "total_steps": 350, "loss": 0.0019, "lr": 5.57220887747716e-05, "epoch": 18.1, "percentage": 51.71, "elapsed_time": "0:31:36", "remaining_time": "0:29:31"}
189
+ {"current_steps": 182, "total_steps": 350, "loss": 0.0015, "lr": 5.522642316338268e-05, "epoch": 18.2, "percentage": 52.0, "elapsed_time": "0:31:47", "remaining_time": "0:29:21"}
190
+ {"current_steps": 183, "total_steps": 350, "loss": 0.0016, "lr": 5.473023770012686e-05, "epoch": 18.3, "percentage": 52.29, "elapsed_time": "0:31:58", "remaining_time": "0:29:10"}
191
+ {"current_steps": 184, "total_steps": 350, "loss": 0.0013, "lr": 5.4233581738631165e-05, "epoch": 18.4, "percentage": 52.57, "elapsed_time": "0:32:08", "remaining_time": "0:29:00"}
192
+ {"current_steps": 185, "total_steps": 350, "loss": 0.0014, "lr": 5.373650467932122e-05, "epoch": 18.5, "percentage": 52.86, "elapsed_time": "0:32:18", "remaining_time": "0:28:49"}
193
+ {"current_steps": 186, "total_steps": 350, "loss": 0.0014, "lr": 5.323905596450759e-05, "epoch": 18.6, "percentage": 53.14, "elapsed_time": "0:32:29", "remaining_time": "0:28:38"}
194
+ {"current_steps": 187, "total_steps": 350, "loss": 0.0013, "lr": 5.274128507346801e-05, "epoch": 18.7, "percentage": 53.43, "elapsed_time": "0:32:39", "remaining_time": "0:28:28"}
195
+ {"current_steps": 188, "total_steps": 350, "loss": 0.0013, "lr": 5.2243241517525754e-05, "epoch": 18.8, "percentage": 53.71, "elapsed_time": "0:32:50", "remaining_time": "0:28:17"}
196
+ {"current_steps": 189, "total_steps": 350, "loss": 0.0013, "lr": 5.174497483512506e-05, "epoch": 18.9, "percentage": 54.0, "elapsed_time": "0:33:00", "remaining_time": "0:28:07"}
197
+ {"current_steps": 190, "total_steps": 350, "loss": 0.0013, "lr": 5.124653458690365e-05, "epoch": 19.0, "percentage": 54.29, "elapsed_time": "0:33:05", "remaining_time": "0:27:51"}
198
+ {"current_steps": 191, "total_steps": 350, "loss": 0.0013, "lr": 5.074797035076319e-05, "epoch": 19.1, "percentage": 54.57, "elapsed_time": "0:33:16", "remaining_time": "0:27:41"}
199
+ {"current_steps": 192, "total_steps": 350, "loss": 0.0017, "lr": 5.024933171693791e-05, "epoch": 19.2, "percentage": 54.86, "elapsed_time": "0:33:26", "remaining_time": "0:27:31"}
200
+ {"current_steps": 193, "total_steps": 350, "loss": 0.0013, "lr": 4.9750668283062104e-05, "epoch": 19.3, "percentage": 55.14, "elapsed_time": "0:33:37", "remaining_time": "0:27:20"}
201
+ {"current_steps": 194, "total_steps": 350, "loss": 0.0012, "lr": 4.925202964923683e-05, "epoch": 19.4, "percentage": 55.43, "elapsed_time": "0:33:47", "remaining_time": "0:27:10"}
202
+ {"current_steps": 195, "total_steps": 350, "loss": 0.0014, "lr": 4.875346541309637e-05, "epoch": 19.5, "percentage": 55.71, "elapsed_time": "0:33:57", "remaining_time": "0:26:59"}
203
+ {"current_steps": 196, "total_steps": 350, "loss": 0.0014, "lr": 4.825502516487497e-05, "epoch": 19.6, "percentage": 56.0, "elapsed_time": "0:34:08", "remaining_time": "0:26:49"}
204
+ {"current_steps": 197, "total_steps": 350, "loss": 0.0013, "lr": 4.775675848247427e-05, "epoch": 19.7, "percentage": 56.29, "elapsed_time": "0:34:18", "remaining_time": "0:26:38"}
205
+ {"current_steps": 198, "total_steps": 350, "loss": 0.0015, "lr": 4.725871492653199e-05, "epoch": 19.8, "percentage": 56.57, "elapsed_time": "0:34:28", "remaining_time": "0:26:28"}
206
+ {"current_steps": 199, "total_steps": 350, "loss": 0.0014, "lr": 4.6760944035492404e-05, "epoch": 19.9, "percentage": 56.86, "elapsed_time": "0:34:39", "remaining_time": "0:26:17"}
207
+ {"current_steps": 200, "total_steps": 350, "loss": 0.0012, "lr": 4.626349532067879e-05, "epoch": 20.0, "percentage": 57.14, "elapsed_time": "0:34:43", "remaining_time": "0:26:02"}
208
+ {"current_steps": 200, "total_steps": 350, "eval_loss": 0.027936220169067383, "epoch": 20.0, "percentage": 57.14, "elapsed_time": "0:34:50", "remaining_time": "0:26:08"}
209
+ {"current_steps": 201, "total_steps": 350, "loss": 0.0014, "lr": 4.576641826136884e-05, "epoch": 20.1, "percentage": 57.43, "elapsed_time": "0:35:07", "remaining_time": "0:26:02"}
210
+ {"current_steps": 202, "total_steps": 350, "loss": 0.0014, "lr": 4.526976229987315e-05, "epoch": 20.2, "percentage": 57.71, "elapsed_time": "0:35:17", "remaining_time": "0:25:51"}
211
+ {"current_steps": 203, "total_steps": 350, "loss": 0.0013, "lr": 4.477357683661734e-05, "epoch": 20.3, "percentage": 58.0, "elapsed_time": "0:35:27", "remaining_time": "0:25:40"}
212
+ {"current_steps": 204, "total_steps": 350, "loss": 0.0013, "lr": 4.4277911225228414e-05, "epoch": 20.4, "percentage": 58.29, "elapsed_time": "0:35:38", "remaining_time": "0:25:30"}
213
+ {"current_steps": 205, "total_steps": 350, "loss": 0.0014, "lr": 4.378281476762576e-05, "epoch": 20.5, "percentage": 58.57, "elapsed_time": "0:35:49", "remaining_time": "0:25:20"}
214
+ {"current_steps": 206, "total_steps": 350, "loss": 0.0013, "lr": 4.328833670911724e-05, "epoch": 20.6, "percentage": 58.86, "elapsed_time": "0:35:59", "remaining_time": "0:25:09"}
215
+ {"current_steps": 207, "total_steps": 350, "loss": 0.0012, "lr": 4.2794526233501006e-05, "epoch": 20.7, "percentage": 59.14, "elapsed_time": "0:36:09", "remaining_time": "0:24:58"}
216
+ {"current_steps": 208, "total_steps": 350, "loss": 0.0013, "lr": 4.230143245817332e-05, "epoch": 20.8, "percentage": 59.43, "elapsed_time": "0:36:20", "remaining_time": "0:24:48"}
217
+ {"current_steps": 209, "total_steps": 350, "loss": 0.0013, "lr": 4.180910442924312e-05, "epoch": 20.9, "percentage": 59.71, "elapsed_time": "0:36:30", "remaining_time": "0:24:37"}
218
+ {"current_steps": 210, "total_steps": 350, "loss": 0.0013, "lr": 4.131759111665349e-05, "epoch": 21.0, "percentage": 60.0, "elapsed_time": "0:36:35", "remaining_time": "0:24:23"}
219
+ {"current_steps": 211, "total_steps": 350, "loss": 0.0013, "lr": 4.082694140931089e-05, "epoch": 21.1, "percentage": 60.29, "elapsed_time": "0:36:45", "remaining_time": "0:24:13"}
220
+ {"current_steps": 212, "total_steps": 350, "loss": 0.0013, "lr": 4.0337204110222346e-05, "epoch": 21.2, "percentage": 60.57, "elapsed_time": "0:36:56", "remaining_time": "0:24:02"}
221
+ {"current_steps": 213, "total_steps": 350, "loss": 0.0013, "lr": 3.98484279316412e-05, "epoch": 21.3, "percentage": 60.86, "elapsed_time": "0:37:06", "remaining_time": "0:23:52"}
222
+ {"current_steps": 214, "total_steps": 350, "loss": 0.0013, "lr": 3.936066149022191e-05, "epoch": 21.4, "percentage": 61.14, "elapsed_time": "0:37:17", "remaining_time": "0:23:41"}
223
+ {"current_steps": 215, "total_steps": 350, "loss": 0.0013, "lr": 3.887395330218429e-05, "epoch": 21.5, "percentage": 61.43, "elapsed_time": "0:37:27", "remaining_time": "0:23:31"}
224
+ {"current_steps": 216, "total_steps": 350, "loss": 0.0012, "lr": 3.838835177848788e-05, "epoch": 21.6, "percentage": 61.71, "elapsed_time": "0:37:38", "remaining_time": "0:23:20"}
225
+ {"current_steps": 217, "total_steps": 350, "loss": 0.0013, "lr": 3.790390522001662e-05, "epoch": 21.7, "percentage": 62.0, "elapsed_time": "0:37:48", "remaining_time": "0:23:10"}
226
+ {"current_steps": 218, "total_steps": 350, "loss": 0.0013, "lr": 3.742066181277458e-05, "epoch": 21.8, "percentage": 62.29, "elapsed_time": "0:37:59", "remaining_time": "0:23:00"}
227
+ {"current_steps": 219, "total_steps": 350, "loss": 0.0013, "lr": 3.6938669623093084e-05, "epoch": 21.9, "percentage": 62.57, "elapsed_time": "0:38:09", "remaining_time": "0:22:49"}
228
+ {"current_steps": 220, "total_steps": 350, "loss": 0.0013, "lr": 3.6457976592849754e-05, "epoch": 22.0, "percentage": 62.86, "elapsed_time": "0:38:14", "remaining_time": "0:22:35"}
229
+ {"current_steps": 221, "total_steps": 350, "loss": 0.0013, "lr": 3.597863053469987e-05, "epoch": 22.1, "percentage": 63.14, "elapsed_time": "0:38:25", "remaining_time": "0:22:25"}
230
+ {"current_steps": 222, "total_steps": 350, "loss": 0.0013, "lr": 3.550067912732069e-05, "epoch": 22.2, "percentage": 63.43, "elapsed_time": "0:38:35", "remaining_time": "0:22:14"}
231
+ {"current_steps": 223, "total_steps": 350, "loss": 0.0013, "lr": 3.502416991066904e-05, "epoch": 22.3, "percentage": 63.71, "elapsed_time": "0:38:45", "remaining_time": "0:22:04"}
232
+ {"current_steps": 224, "total_steps": 350, "loss": 0.0013, "lr": 3.4549150281252636e-05, "epoch": 22.4, "percentage": 64.0, "elapsed_time": "0:38:56", "remaining_time": "0:21:54"}
233
+ {"current_steps": 225, "total_steps": 350, "loss": 0.0013, "lr": 3.4075667487415785e-05, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:39:06", "remaining_time": "0:21:43"}
234
+ {"current_steps": 225, "total_steps": 350, "eval_loss": 0.030632568523287773, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:39:13", "remaining_time": "0:21:47"}
235
+ {"current_steps": 225, "total_steps": 350, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:39:13", "remaining_time": "0:21:47"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
trainer_state.json CHANGED
@@ -1,1851 +1,1668 @@
1
  {
2
- "best_metric": 0.017384279519319534,
3
  "best_model_checkpoint": "/home/paperspace/Data/models/Klystroglobal/llm3br256-v1.5/checkpoint-100",
4
- "epoch": 25.0,
5
  "eval_steps": 25,
6
- "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1,
13
- "grad_norm": 1.2220871448516846,
14
  "learning_rate": 2.8571428571428573e-06,
15
- "loss": 0.1531,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.2,
20
- "grad_norm": 1.2097222805023193,
21
  "learning_rate": 5.7142857142857145e-06,
22
- "loss": 0.1308,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.3,
27
- "grad_norm": 1.2176955938339233,
28
  "learning_rate": 8.571428571428573e-06,
29
- "loss": 0.1619,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.4,
34
- "grad_norm": 0.7081905007362366,
35
  "learning_rate": 1.1428571428571429e-05,
36
- "loss": 0.1325,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.5,
41
- "grad_norm": 0.529013991355896,
42
  "learning_rate": 1.4285714285714285e-05,
43
- "loss": 0.1206,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.6,
48
- "grad_norm": 0.32227373123168945,
49
  "learning_rate": 1.7142857142857145e-05,
50
- "loss": 0.0875,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.7,
55
- "grad_norm": 0.34188932180404663,
56
  "learning_rate": 2e-05,
57
- "loss": 0.1067,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.8,
62
- "grad_norm": 0.513898491859436,
63
  "learning_rate": 2.2857142857142858e-05,
64
- "loss": 0.0966,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.9,
69
- "grad_norm": 0.47207334637641907,
70
  "learning_rate": 2.5714285714285714e-05,
71
- "loss": 0.1088,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 1.0,
76
- "grad_norm": 0.32782670855522156,
77
  "learning_rate": 2.857142857142857e-05,
78
- "loss": 0.0839,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 1.1,
83
- "grad_norm": 0.17063647508621216,
84
  "learning_rate": 3.142857142857143e-05,
85
- "loss": 0.0713,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 1.2,
90
- "grad_norm": 0.15981265902519226,
91
  "learning_rate": 3.428571428571429e-05,
92
- "loss": 0.0688,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 1.3,
97
- "grad_norm": 0.16717936098575592,
98
  "learning_rate": 3.7142857142857143e-05,
99
- "loss": 0.0688,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 1.4,
104
- "grad_norm": 0.22196544706821442,
105
  "learning_rate": 4e-05,
106
- "loss": 0.0668,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 1.5,
111
- "grad_norm": 0.20881694555282593,
112
  "learning_rate": 4.2857142857142856e-05,
113
- "loss": 0.0613,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 1.6,
118
- "grad_norm": 0.14273549616336823,
119
  "learning_rate": 4.5714285714285716e-05,
120
- "loss": 0.052,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 1.7,
125
- "grad_norm": 0.12841083109378815,
126
  "learning_rate": 4.8571428571428576e-05,
127
- "loss": 0.0581,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 1.8,
132
- "grad_norm": 0.1572558879852295,
133
  "learning_rate": 5.142857142857143e-05,
134
- "loss": 0.0482,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 1.9,
139
- "grad_norm": 0.19301706552505493,
140
  "learning_rate": 5.428571428571428e-05,
141
- "loss": 0.0533,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 2.0,
146
- "grad_norm": 0.1539728194475174,
147
  "learning_rate": 5.714285714285714e-05,
148
- "loss": 0.0513,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 2.1,
153
- "grad_norm": 0.11170095205307007,
154
  "learning_rate": 6e-05,
155
- "loss": 0.042,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 2.2,
160
- "grad_norm": 0.09418804198503494,
161
  "learning_rate": 6.285714285714286e-05,
162
- "loss": 0.0401,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 2.3,
167
- "grad_norm": 0.11917826533317566,
168
  "learning_rate": 6.571428571428571e-05,
169
- "loss": 0.0382,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 2.4,
174
- "grad_norm": 0.10801587998867035,
175
  "learning_rate": 6.857142857142858e-05,
176
- "loss": 0.034,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 2.5,
181
- "grad_norm": 0.11365531384944916,
182
  "learning_rate": 7.142857142857143e-05,
183
- "loss": 0.0437,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 2.5,
188
- "eval_loss": 0.03402441740036011,
189
- "eval_runtime": 19.4202,
190
- "eval_samples_per_second": 5.149,
191
- "eval_steps_per_second": 0.154,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 2.6,
196
- "grad_norm": 0.1079014241695404,
197
  "learning_rate": 7.428571428571429e-05,
198
- "loss": 0.0422,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 2.7,
203
- "grad_norm": 0.08936240524053574,
204
  "learning_rate": 7.714285714285715e-05,
205
- "loss": 0.0264,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 2.8,
210
- "grad_norm": 0.12060200423002243,
211
  "learning_rate": 8e-05,
212
  "loss": 0.0377,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 2.9,
217
- "grad_norm": 0.08112004399299622,
218
  "learning_rate": 8.285714285714287e-05,
219
- "loss": 0.0289,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 3.0,
224
- "grad_norm": 0.12806135416030884,
225
  "learning_rate": 8.571428571428571e-05,
226
- "loss": 0.0299,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 3.1,
231
- "grad_norm": 0.06607820093631744,
232
  "learning_rate": 8.857142857142857e-05,
233
- "loss": 0.0216,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 3.2,
238
- "grad_norm": 0.08246105909347534,
239
  "learning_rate": 9.142857142857143e-05,
240
- "loss": 0.0359,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 3.3,
245
- "grad_norm": 0.07171958684921265,
246
  "learning_rate": 9.428571428571429e-05,
247
- "loss": 0.0233,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 3.4,
252
- "grad_norm": 0.07688147574663162,
253
  "learning_rate": 9.714285714285715e-05,
254
- "loss": 0.0254,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 3.5,
259
- "grad_norm": 0.07434146851301193,
260
  "learning_rate": 0.0001,
261
- "loss": 0.0202,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 3.6,
266
- "grad_norm": 0.06925389170646667,
267
  "learning_rate": 9.999751334779716e-05,
268
- "loss": 0.0197,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 3.7,
273
- "grad_norm": 0.06520260870456696,
274
  "learning_rate": 9.999005363852618e-05,
275
- "loss": 0.0206,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 3.8,
280
- "grad_norm": 0.07232938706874847,
281
  "learning_rate": 9.997762161417517e-05,
282
- "loss": 0.0197,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 3.9,
287
- "grad_norm": 0.08089913427829742,
288
  "learning_rate": 9.996021851130897e-05,
289
- "loss": 0.0178,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 4.0,
294
- "grad_norm": 0.12080717831850052,
295
  "learning_rate": 9.993784606094612e-05,
296
- "loss": 0.0141,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 4.1,
301
- "grad_norm": 0.05649913102388382,
302
  "learning_rate": 9.991050648838675e-05,
303
- "loss": 0.012,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 4.2,
308
- "grad_norm": 0.09042762964963913,
309
  "learning_rate": 9.987820251299122e-05,
310
- "loss": 0.0124,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 4.3,
315
- "grad_norm": 0.07907257974147797,
316
  "learning_rate": 9.984093734790956e-05,
317
- "loss": 0.017,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 4.4,
322
- "grad_norm": 0.07241521775722504,
323
  "learning_rate": 9.979871469976196e-05,
324
- "loss": 0.0132,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 4.5,
329
- "grad_norm": 0.10079007595777512,
330
  "learning_rate": 9.975153876827008e-05,
331
- "loss": 0.0169,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 4.6,
336
- "grad_norm": 0.09246091544628143,
337
  "learning_rate": 9.969941424583926e-05,
338
- "loss": 0.0145,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 4.7,
343
- "grad_norm": 0.0651487484574318,
344
  "learning_rate": 9.964234631709187e-05,
345
- "loss": 0.0151,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 4.8,
350
- "grad_norm": 0.06992605328559875,
351
  "learning_rate": 9.958034065835151e-05,
352
- "loss": 0.011,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 4.9,
357
- "grad_norm": 0.06309088319540024,
358
  "learning_rate": 9.951340343707852e-05,
359
- "loss": 0.012,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 5.0,
364
- "grad_norm": 0.06862813979387283,
365
  "learning_rate": 9.944154131125642e-05,
366
- "loss": 0.0098,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 5.0,
371
- "eval_loss": 0.016623547300696373,
372
- "eval_runtime": 15.4785,
373
- "eval_samples_per_second": 6.461,
374
- "eval_steps_per_second": 0.194,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 5.1,
379
- "grad_norm": 0.06234560161828995,
380
  "learning_rate": 9.936476142872979e-05,
381
- "loss": 0.011,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 5.2,
386
- "grad_norm": 0.05178332328796387,
387
  "learning_rate": 9.928307142649316e-05,
388
- "loss": 0.0082,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 5.3,
393
- "grad_norm": 0.0584288015961647,
394
  "learning_rate": 9.919647942993148e-05,
395
- "loss": 0.0069,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 5.4,
400
- "grad_norm": 0.05619216337800026,
401
  "learning_rate": 9.910499405201195e-05,
402
- "loss": 0.0091,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 5.5,
407
- "grad_norm": 0.052176207304000854,
408
  "learning_rate": 9.900862439242719e-05,
409
- "loss": 0.0062,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 5.6,
414
- "grad_norm": 0.058783914893865585,
415
  "learning_rate": 9.890738003669029e-05,
416
- "loss": 0.0052,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 5.7,
421
- "grad_norm": 0.08193694055080414,
422
  "learning_rate": 9.880127105518122e-05,
423
- "loss": 0.0076,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 5.8,
428
- "grad_norm": 0.09745576977729797,
429
  "learning_rate": 9.869030800214532e-05,
430
- "loss": 0.0107,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 5.9,
435
- "grad_norm": 0.07822689414024353,
436
  "learning_rate": 9.857450191464337e-05,
437
- "loss": 0.0081,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 6.0,
442
- "grad_norm": 0.06525323539972305,
443
  "learning_rate": 9.84538643114539e-05,
444
- "loss": 0.0063,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 6.1,
449
- "grad_norm": 0.03879164531826973,
450
  "learning_rate": 9.832840719192736e-05,
451
- "loss": 0.0037,
452
  "step": 61
453
  },
454
  {
455
  "epoch": 6.2,
456
- "grad_norm": 0.05432894080877304,
457
  "learning_rate": 9.819814303479267e-05,
458
- "loss": 0.0049,
459
  "step": 62
460
  },
461
  {
462
  "epoch": 6.3,
463
- "grad_norm": 0.04752165079116821,
464
  "learning_rate": 9.806308479691595e-05,
465
- "loss": 0.0051,
466
  "step": 63
467
  },
468
  {
469
  "epoch": 6.4,
470
- "grad_norm": 0.0588836595416069,
471
  "learning_rate": 9.792324591201179e-05,
472
- "loss": 0.0052,
473
  "step": 64
474
  },
475
  {
476
  "epoch": 6.5,
477
- "grad_norm": 0.07457052916288376,
478
  "learning_rate": 9.777864028930705e-05,
479
- "loss": 0.0046,
480
  "step": 65
481
  },
482
  {
483
  "epoch": 6.6,
484
- "grad_norm": 0.06699630618095398,
485
  "learning_rate": 9.76292823121573e-05,
486
- "loss": 0.0064,
487
  "step": 66
488
  },
489
  {
490
  "epoch": 6.7,
491
- "grad_norm": 0.05367649346590042,
492
  "learning_rate": 9.747518683661631e-05,
493
- "loss": 0.0044,
494
  "step": 67
495
  },
496
  {
497
  "epoch": 6.8,
498
- "grad_norm": 0.06585957109928131,
499
  "learning_rate": 9.731636918995821e-05,
500
- "loss": 0.0064,
501
  "step": 68
502
  },
503
  {
504
  "epoch": 6.9,
505
- "grad_norm": 0.05559472367167473,
506
  "learning_rate": 9.715284516915303e-05,
507
- "loss": 0.0045,
508
  "step": 69
509
  },
510
  {
511
  "epoch": 7.0,
512
- "grad_norm": 0.1440582275390625,
513
  "learning_rate": 9.698463103929542e-05,
514
- "loss": 0.0067,
515
  "step": 70
516
  },
517
  {
518
  "epoch": 7.1,
519
- "grad_norm": 0.04040021821856499,
520
  "learning_rate": 9.681174353198687e-05,
521
- "loss": 0.0037,
522
  "step": 71
523
  },
524
  {
525
  "epoch": 7.2,
526
- "grad_norm": 0.06325013935565948,
527
  "learning_rate": 9.663419984367139e-05,
528
- "loss": 0.0027,
529
  "step": 72
530
  },
531
  {
532
  "epoch": 7.3,
533
- "grad_norm": 0.11049168556928635,
534
  "learning_rate": 9.645201763392513e-05,
535
- "loss": 0.0046,
536
  "step": 73
537
  },
538
  {
539
  "epoch": 7.4,
540
- "grad_norm": 0.0775715634226799,
541
  "learning_rate": 9.626521502369984e-05,
542
  "loss": 0.0054,
543
  "step": 74
544
  },
545
  {
546
  "epoch": 7.5,
547
- "grad_norm": 0.08004690706729889,
548
  "learning_rate": 9.607381059352038e-05,
549
- "loss": 0.0039,
550
  "step": 75
551
  },
552
  {
553
  "epoch": 7.5,
554
- "eval_loss": 0.016471313312649727,
555
- "eval_runtime": 7.0024,
556
- "eval_samples_per_second": 14.281,
557
- "eval_steps_per_second": 0.428,
558
  "step": 75
559
  },
560
  {
561
  "epoch": 7.6,
562
- "grad_norm": 0.12311126291751862,
563
  "learning_rate": 9.587782338163669e-05,
564
- "loss": 0.0035,
565
  "step": 76
566
  },
567
  {
568
  "epoch": 7.7,
569
- "grad_norm": 0.05487671494483948,
570
  "learning_rate": 9.567727288213005e-05,
571
- "loss": 0.0047,
572
  "step": 77
573
  },
574
  {
575
  "epoch": 7.8,
576
- "grad_norm": 0.03079923987388611,
577
  "learning_rate": 9.547217904297411e-05,
578
- "loss": 0.0028,
579
  "step": 78
580
  },
581
  {
582
  "epoch": 7.9,
583
- "grad_norm": 0.09893915802240372,
584
  "learning_rate": 9.526256226405075e-05,
585
- "loss": 0.0054,
586
  "step": 79
587
  },
588
  {
589
  "epoch": 8.0,
590
- "grad_norm": 0.06392358988523483,
591
  "learning_rate": 9.504844339512095e-05,
592
- "loss": 0.0025,
593
  "step": 80
594
  },
595
  {
596
  "epoch": 8.1,
597
- "grad_norm": 0.04920504242181778,
598
  "learning_rate": 9.482984373375105e-05,
599
- "loss": 0.0037,
600
  "step": 81
601
  },
602
  {
603
  "epoch": 8.2,
604
- "grad_norm": 0.044106096029281616,
605
  "learning_rate": 9.460678502319418e-05,
606
- "loss": 0.0026,
607
  "step": 82
608
  },
609
  {
610
  "epoch": 8.3,
611
- "grad_norm": 0.07550745457410812,
612
  "learning_rate": 9.437928945022771e-05,
613
- "loss": 0.0049,
614
  "step": 83
615
  },
616
  {
617
  "epoch": 8.4,
618
- "grad_norm": 0.06214550510048866,
619
  "learning_rate": 9.414737964294636e-05,
620
- "loss": 0.0037,
621
  "step": 84
622
  },
623
  {
624
  "epoch": 8.5,
625
- "grad_norm": 0.057385075837373734,
626
  "learning_rate": 9.391107866851143e-05,
627
- "loss": 0.0025,
628
  "step": 85
629
  },
630
  {
631
  "epoch": 8.6,
632
- "grad_norm": 0.0968804582953453,
633
  "learning_rate": 9.367041003085649e-05,
634
- "loss": 0.0032,
635
  "step": 86
636
  },
637
  {
638
  "epoch": 8.7,
639
- "grad_norm": 0.03738746419548988,
640
  "learning_rate": 9.342539766834946e-05,
641
- "loss": 0.0028,
642
  "step": 87
643
  },
644
  {
645
  "epoch": 8.8,
646
- "grad_norm": 0.04243948310613632,
647
  "learning_rate": 9.317606595141154e-05,
648
- "loss": 0.0027,
649
  "step": 88
650
  },
651
  {
652
  "epoch": 8.9,
653
- "grad_norm": 0.034692391753196716,
654
  "learning_rate": 9.292243968009331e-05,
655
- "loss": 0.0029,
656
  "step": 89
657
  },
658
  {
659
  "epoch": 9.0,
660
- "grad_norm": 0.06521083414554596,
661
  "learning_rate": 9.266454408160779e-05,
662
- "loss": 0.0034,
663
  "step": 90
664
  },
665
  {
666
  "epoch": 9.1,
667
- "grad_norm": 0.04499003291130066,
668
  "learning_rate": 9.24024048078213e-05,
669
- "loss": 0.0023,
670
  "step": 91
671
  },
672
  {
673
  "epoch": 9.2,
674
- "grad_norm": 0.03955000266432762,
675
  "learning_rate": 9.213604793270196e-05,
676
  "loss": 0.0024,
677
  "step": 92
678
  },
679
  {
680
  "epoch": 9.3,
681
- "grad_norm": 0.03790497034788132,
682
  "learning_rate": 9.186549994972618e-05,
683
- "loss": 0.0031,
684
  "step": 93
685
  },
686
  {
687
  "epoch": 9.4,
688
- "grad_norm": 0.053670890629291534,
689
  "learning_rate": 9.159078776924346e-05,
690
- "loss": 0.0029,
691
  "step": 94
692
  },
693
  {
694
  "epoch": 9.5,
695
- "grad_norm": 0.016972996294498444,
696
  "learning_rate": 9.131193871579975e-05,
697
- "loss": 0.0017,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 9.6,
702
- "grad_norm": 0.12130908668041229,
703
  "learning_rate": 9.102898052541958e-05,
704
- "loss": 0.0022,
705
  "step": 96
706
  },
707
  {
708
  "epoch": 9.7,
709
- "grad_norm": 0.04438166692852974,
710
  "learning_rate": 9.074194134284726e-05,
711
- "loss": 0.0025,
712
  "step": 97
713
  },
714
  {
715
  "epoch": 9.8,
716
- "grad_norm": 0.05157145857810974,
717
  "learning_rate": 9.045084971874738e-05,
718
  "loss": 0.002,
719
  "step": 98
720
  },
721
  {
722
  "epoch": 9.9,
723
- "grad_norm": 0.03810460492968559,
724
  "learning_rate": 9.015573460686509e-05,
725
- "loss": 0.0026,
726
  "step": 99
727
  },
728
  {
729
  "epoch": 10.0,
730
- "grad_norm": 0.06720886379480362,
731
  "learning_rate": 8.985662536114613e-05,
732
- "loss": 0.0021,
733
  "step": 100
734
  },
735
  {
736
  "epoch": 10.0,
737
- "eval_loss": 0.017384279519319534,
738
- "eval_runtime": 7.4008,
739
- "eval_samples_per_second": 13.512,
740
- "eval_steps_per_second": 0.405,
741
  "step": 100
742
  },
743
  {
744
  "epoch": 10.1,
745
- "grad_norm": 0.017021650448441505,
746
  "learning_rate": 8.955355173281708e-05,
747
- "loss": 0.0017,
748
  "step": 101
749
  },
750
  {
751
  "epoch": 10.2,
752
- "grad_norm": 0.03386203572154045,
753
  "learning_rate": 8.924654386742613e-05,
754
- "loss": 0.0018,
755
  "step": 102
756
  },
757
  {
758
  "epoch": 10.3,
759
- "grad_norm": 0.06196419894695282,
760
  "learning_rate": 8.89356323018447e-05,
761
- "loss": 0.0025,
762
  "step": 103
763
  },
764
  {
765
  "epoch": 10.4,
766
- "grad_norm": 0.02523985505104065,
767
  "learning_rate": 8.862084796122998e-05,
768
- "loss": 0.0017,
769
  "step": 104
770
  },
771
  {
772
  "epoch": 10.5,
773
- "grad_norm": 0.05176355317234993,
774
  "learning_rate": 8.83022221559489e-05,
775
- "loss": 0.0029,
776
  "step": 105
777
  },
778
  {
779
  "epoch": 10.6,
780
- "grad_norm": 0.05031086131930351,
781
  "learning_rate": 8.797978657846391e-05,
782
- "loss": 0.0022,
783
  "step": 106
784
  },
785
  {
786
  "epoch": 10.7,
787
- "grad_norm": 0.06354419887065887,
788
  "learning_rate": 8.765357330018056e-05,
789
- "loss": 0.0024,
790
  "step": 107
791
  },
792
  {
793
  "epoch": 10.8,
794
- "grad_norm": 0.06342065334320068,
795
  "learning_rate": 8.732361476825752e-05,
796
- "loss": 0.0028,
797
  "step": 108
798
  },
799
  {
800
  "epoch": 10.9,
801
- "grad_norm": 0.03949422389268875,
802
  "learning_rate": 8.69899438023792e-05,
803
- "loss": 0.0018,
804
  "step": 109
805
  },
806
  {
807
  "epoch": 11.0,
808
- "grad_norm": 0.02962133288383484,
809
  "learning_rate": 8.665259359149132e-05,
810
- "loss": 0.0018,
811
  "step": 110
812
  },
813
  {
814
  "epoch": 11.1,
815
- "grad_norm": 0.10264372825622559,
816
  "learning_rate": 8.631159769049965e-05,
817
- "loss": 0.0028,
818
  "step": 111
819
  },
820
  {
821
  "epoch": 11.2,
822
- "grad_norm": 0.021233167499303818,
823
  "learning_rate": 8.596699001693255e-05,
824
- "loss": 0.0018,
825
  "step": 112
826
  },
827
  {
828
  "epoch": 11.3,
829
- "grad_norm": 0.06390991806983948,
830
  "learning_rate": 8.561880484756725e-05,
831
- "loss": 0.0018,
832
  "step": 113
833
  },
834
  {
835
  "epoch": 11.4,
836
- "grad_norm": 0.1139807403087616,
837
  "learning_rate": 8.526707681502044e-05,
838
- "loss": 0.0036,
839
  "step": 114
840
  },
841
  {
842
  "epoch": 11.5,
843
- "grad_norm": 0.018219145014882088,
844
  "learning_rate": 8.491184090430364e-05,
845
- "loss": 0.0019,
846
  "step": 115
847
  },
848
  {
849
  "epoch": 11.6,
850
- "grad_norm": 0.03801802918314934,
851
  "learning_rate": 8.455313244934324e-05,
852
- "loss": 0.0024,
853
  "step": 116
854
  },
855
  {
856
  "epoch": 11.7,
857
- "grad_norm": 0.052779678255319595,
858
  "learning_rate": 8.419098712946601e-05,
859
- "loss": 0.0033,
860
  "step": 117
861
  },
862
  {
863
  "epoch": 11.8,
864
- "grad_norm": 0.15576517581939697,
865
  "learning_rate": 8.382544096585027e-05,
866
- "loss": 0.0032,
867
  "step": 118
868
  },
869
  {
870
  "epoch": 11.9,
871
- "grad_norm": 0.050439249724149704,
872
  "learning_rate": 8.345653031794292e-05,
873
- "loss": 0.0032,
874
  "step": 119
875
  },
876
  {
877
  "epoch": 12.0,
878
- "grad_norm": 0.13610731065273285,
879
  "learning_rate": 8.308429187984297e-05,
880
- "loss": 0.0044,
881
  "step": 120
882
  },
883
  {
884
  "epoch": 12.1,
885
- "grad_norm": 0.03380730748176575,
886
  "learning_rate": 8.270876267665173e-05,
887
- "loss": 0.0025,
888
  "step": 121
889
  },
890
  {
891
  "epoch": 12.2,
892
- "grad_norm": 0.032273851335048676,
893
  "learning_rate": 8.232998006078997e-05,
894
- "loss": 0.002,
895
  "step": 122
896
  },
897
  {
898
  "epoch": 12.3,
899
- "grad_norm": 0.021625736728310585,
900
  "learning_rate": 8.19479817082828e-05,
901
- "loss": 0.0023,
902
  "step": 123
903
  },
904
  {
905
  "epoch": 12.4,
906
- "grad_norm": 0.050165340304374695,
907
  "learning_rate": 8.156280561501195e-05,
908
- "loss": 0.0025,
909
  "step": 124
910
  },
911
  {
912
  "epoch": 12.5,
913
- "grad_norm": 0.052705712616443634,
914
  "learning_rate": 8.117449009293668e-05,
915
- "loss": 0.0031,
916
  "step": 125
917
  },
918
  {
919
  "epoch": 12.5,
920
- "eval_loss": 0.014472348615527153,
921
- "eval_runtime": 7.0196,
922
- "eval_samples_per_second": 14.246,
923
- "eval_steps_per_second": 0.427,
924
  "step": 125
925
  },
926
  {
927
  "epoch": 12.6,
928
- "grad_norm": 0.05300145596265793,
929
  "learning_rate": 8.07830737662829e-05,
930
- "loss": 0.0023,
931
  "step": 126
932
  },
933
  {
934
  "epoch": 12.7,
935
- "grad_norm": 0.06016397848725319,
936
  "learning_rate": 8.038859556770151e-05,
937
- "loss": 0.0027,
938
  "step": 127
939
  },
940
  {
941
  "epoch": 12.8,
942
- "grad_norm": 0.06083128601312637,
943
  "learning_rate": 7.999109473439569e-05,
944
- "loss": 0.0019,
945
  "step": 128
946
  },
947
  {
948
  "epoch": 12.9,
949
- "grad_norm": 0.036125779151916504,
950
  "learning_rate": 7.959061080421839e-05,
951
  "loss": 0.0026,
952
  "step": 129
953
  },
954
  {
955
  "epoch": 13.0,
956
- "grad_norm": 0.03736874461174011,
957
  "learning_rate": 7.91871836117395e-05,
958
- "loss": 0.0016,
959
  "step": 130
960
  },
961
  {
962
  "epoch": 13.1,
963
- "grad_norm": 0.0378425307571888,
964
  "learning_rate": 7.878085328428369e-05,
965
- "loss": 0.0018,
966
  "step": 131
967
  },
968
  {
969
  "epoch": 13.2,
970
- "grad_norm": 0.06520125269889832,
971
  "learning_rate": 7.83716602379391e-05,
972
- "loss": 0.0037,
973
  "step": 132
974
  },
975
  {
976
  "epoch": 13.3,
977
- "grad_norm": 0.06993651390075684,
978
  "learning_rate": 7.795964517353735e-05,
979
- "loss": 0.0021,
980
  "step": 133
981
  },
982
  {
983
  "epoch": 13.4,
984
- "grad_norm": 0.0514182485640049,
985
  "learning_rate": 7.754484907260513e-05,
986
- "loss": 0.0023,
987
  "step": 134
988
  },
989
  {
990
  "epoch": 13.5,
991
- "grad_norm": 0.0771847516298294,
992
  "learning_rate": 7.712731319328798e-05,
993
- "loss": 0.0022,
994
  "step": 135
995
  },
996
  {
997
  "epoch": 13.6,
998
- "grad_norm": 0.02829659916460514,
999
  "learning_rate": 7.670707906624644e-05,
1000
- "loss": 0.0016,
1001
  "step": 136
1002
  },
1003
  {
1004
  "epoch": 13.7,
1005
- "grad_norm": 0.08551648259162903,
1006
  "learning_rate": 7.628418849052523e-05,
1007
- "loss": 0.0024,
1008
  "step": 137
1009
  },
1010
  {
1011
  "epoch": 13.8,
1012
- "grad_norm": 0.09427579492330551,
1013
  "learning_rate": 7.585868352939563e-05,
1014
- "loss": 0.0016,
1015
  "step": 138
1016
  },
1017
  {
1018
  "epoch": 13.9,
1019
- "grad_norm": 0.04036640748381615,
1020
  "learning_rate": 7.543060650617158e-05,
1021
- "loss": 0.0018,
1022
  "step": 139
1023
  },
1024
  {
1025
  "epoch": 14.0,
1026
- "grad_norm": 0.19952990114688873,
1027
  "learning_rate": 7.500000000000001e-05,
1028
- "loss": 0.0025,
1029
  "step": 140
1030
  },
1031
  {
1032
  "epoch": 14.1,
1033
- "grad_norm": 0.11951940506696701,
1034
  "learning_rate": 7.456690684162557e-05,
1035
- "loss": 0.0026,
1036
  "step": 141
1037
  },
1038
  {
1039
  "epoch": 14.2,
1040
- "grad_norm": 0.043521635234355927,
1041
  "learning_rate": 7.413137010913054e-05,
1042
- "loss": 0.0019,
1043
  "step": 142
1044
  },
1045
  {
1046
  "epoch": 14.3,
1047
- "grad_norm": 0.07670493423938751,
1048
  "learning_rate": 7.369343312364993e-05,
1049
- "loss": 0.002,
1050
  "step": 143
1051
  },
1052
  {
1053
  "epoch": 14.4,
1054
- "grad_norm": 0.027879884466528893,
1055
  "learning_rate": 7.325313944506254e-05,
1056
- "loss": 0.0015,
1057
  "step": 144
1058
  },
1059
  {
1060
  "epoch": 14.5,
1061
- "grad_norm": 0.05514749884605408,
1062
  "learning_rate": 7.281053286765815e-05,
1063
- "loss": 0.0018,
1064
  "step": 145
1065
  },
1066
  {
1067
  "epoch": 14.6,
1068
- "grad_norm": 0.06391794979572296,
1069
  "learning_rate": 7.236565741578163e-05,
1070
- "loss": 0.0024,
1071
  "step": 146
1072
  },
1073
  {
1074
  "epoch": 14.7,
1075
- "grad_norm": 0.08744440227746964,
1076
  "learning_rate": 7.191855733945387e-05,
1077
- "loss": 0.0049,
1078
  "step": 147
1079
  },
1080
  {
1081
  "epoch": 14.8,
1082
- "grad_norm": 0.056523509323596954,
1083
  "learning_rate": 7.146927710997047e-05,
1084
- "loss": 0.0024,
1085
  "step": 148
1086
  },
1087
  {
1088
  "epoch": 14.9,
1089
- "grad_norm": 0.028166329488158226,
1090
  "learning_rate": 7.101786141547828e-05,
1091
- "loss": 0.0018,
1092
  "step": 149
1093
  },
1094
  {
1095
  "epoch": 15.0,
1096
- "grad_norm": 0.09874721616506577,
1097
  "learning_rate": 7.056435515653059e-05,
1098
- "loss": 0.0022,
1099
  "step": 150
1100
  },
1101
  {
1102
  "epoch": 15.0,
1103
- "eval_loss": 0.023497436195611954,
1104
- "eval_runtime": 7.0483,
1105
- "eval_samples_per_second": 14.188,
1106
  "eval_steps_per_second": 0.426,
1107
  "step": 150
1108
  },
1109
  {
1110
  "epoch": 15.1,
1111
- "grad_norm": 0.020559396594762802,
1112
  "learning_rate": 7.010880344162088e-05,
1113
- "loss": 0.0015,
1114
  "step": 151
1115
  },
1116
  {
1117
  "epoch": 15.2,
1118
- "grad_norm": 0.06717398762702942,
1119
  "learning_rate": 6.965125158269619e-05,
1120
- "loss": 0.0022,
1121
  "step": 152
1122
  },
1123
  {
1124
  "epoch": 15.3,
1125
- "grad_norm": 0.052798088639974594,
1126
  "learning_rate": 6.919174509065004e-05,
1127
- "loss": 0.0029,
1128
  "step": 153
1129
  },
1130
  {
1131
  "epoch": 15.4,
1132
- "grad_norm": 0.04526599869132042,
1133
  "learning_rate": 6.873032967079561e-05,
1134
- "loss": 0.0022,
1135
  "step": 154
1136
  },
1137
  {
1138
  "epoch": 15.5,
1139
- "grad_norm": 0.045334987342357635,
1140
  "learning_rate": 6.826705121831976e-05,
1141
- "loss": 0.0033,
1142
  "step": 155
1143
  },
1144
  {
1145
  "epoch": 15.6,
1146
- "grad_norm": 0.02370765618979931,
1147
  "learning_rate": 6.780195581371784e-05,
1148
- "loss": 0.0022,
1149
  "step": 156
1150
  },
1151
  {
1152
  "epoch": 15.7,
1153
- "grad_norm": 0.034078944474458694,
1154
  "learning_rate": 6.733508971821036e-05,
1155
- "loss": 0.0021,
1156
  "step": 157
1157
  },
1158
  {
1159
  "epoch": 15.8,
1160
- "grad_norm": 0.04473605379462242,
1161
  "learning_rate": 6.686649936914152e-05,
1162
- "loss": 0.0019,
1163
  "step": 158
1164
  },
1165
  {
1166
  "epoch": 15.9,
1167
- "grad_norm": 0.03901509568095207,
1168
  "learning_rate": 6.639623137536023e-05,
1169
- "loss": 0.002,
1170
  "step": 159
1171
  },
1172
  {
1173
  "epoch": 16.0,
1174
- "grad_norm": 0.027788842096924782,
1175
  "learning_rate": 6.592433251258423e-05,
1176
- "loss": 0.0014,
1177
  "step": 160
1178
  },
1179
  {
1180
  "epoch": 16.1,
1181
- "grad_norm": 0.02930135279893875,
1182
  "learning_rate": 6.545084971874738e-05,
1183
- "loss": 0.0017,
1184
  "step": 161
1185
  },
1186
  {
1187
  "epoch": 16.2,
1188
- "grad_norm": 0.010466442443430424,
1189
  "learning_rate": 6.497583008933097e-05,
1190
  "loss": 0.0014,
1191
  "step": 162
1192
  },
1193
  {
1194
  "epoch": 16.3,
1195
- "grad_norm": 0.021891970187425613,
1196
  "learning_rate": 6.449932087267932e-05,
1197
- "loss": 0.0016,
1198
  "step": 163
1199
  },
1200
  {
1201
  "epoch": 16.4,
1202
- "grad_norm": 0.012705606408417225,
1203
  "learning_rate": 6.402136946530014e-05,
1204
  "loss": 0.0013,
1205
  "step": 164
1206
  },
1207
  {
1208
  "epoch": 16.5,
1209
- "grad_norm": 0.019639883190393448,
1210
  "learning_rate": 6.354202340715026e-05,
1211
- "loss": 0.0016,
1212
  "step": 165
1213
  },
1214
  {
1215
  "epoch": 16.6,
1216
- "grad_norm": 0.03136239945888519,
1217
  "learning_rate": 6.306133037690693e-05,
1218
- "loss": 0.0019,
1219
  "step": 166
1220
  },
1221
  {
1222
  "epoch": 16.7,
1223
- "grad_norm": 0.04432203993201256,
1224
  "learning_rate": 6.257933818722543e-05,
1225
- "loss": 0.0016,
1226
  "step": 167
1227
  },
1228
  {
1229
  "epoch": 16.8,
1230
- "grad_norm": 0.06362082064151764,
1231
  "learning_rate": 6.209609477998338e-05,
1232
- "loss": 0.0025,
1233
  "step": 168
1234
  },
1235
  {
1236
  "epoch": 16.9,
1237
- "grad_norm": 0.03577618673443794,
1238
  "learning_rate": 6.161164822151213e-05,
1239
- "loss": 0.0018,
1240
  "step": 169
1241
  },
1242
  {
1243
  "epoch": 17.0,
1244
- "grad_norm": 0.033404137939214706,
1245
  "learning_rate": 6.112604669781572e-05,
1246
- "loss": 0.0017,
1247
  "step": 170
1248
  },
1249
  {
1250
  "epoch": 17.1,
1251
- "grad_norm": 0.0031848133075982332,
1252
  "learning_rate": 6.063933850977811e-05,
1253
- "loss": 0.0012,
1254
  "step": 171
1255
  },
1256
  {
1257
  "epoch": 17.2,
1258
- "grad_norm": 0.02553616650402546,
1259
  "learning_rate": 6.015157206835881e-05,
1260
- "loss": 0.0014,
1261
  "step": 172
1262
  },
1263
  {
1264
  "epoch": 17.3,
1265
- "grad_norm": 0.019564760848879814,
1266
  "learning_rate": 5.9662795889777666e-05,
1267
  "loss": 0.0013,
1268
  "step": 173
1269
  },
1270
  {
1271
  "epoch": 17.4,
1272
- "grad_norm": 0.00845835916697979,
1273
  "learning_rate": 5.917305859068912e-05,
1274
- "loss": 0.0013,
1275
  "step": 174
1276
  },
1277
  {
1278
  "epoch": 17.5,
1279
- "grad_norm": 0.008497758768498898,
1280
  "learning_rate": 5.868240888334653e-05,
1281
  "loss": 0.0013,
1282
  "step": 175
1283
  },
1284
  {
1285
  "epoch": 17.5,
1286
- "eval_loss": 0.01870564930140972,
1287
- "eval_runtime": 7.0134,
1288
- "eval_samples_per_second": 14.258,
1289
- "eval_steps_per_second": 0.428,
1290
  "step": 175
1291
  },
1292
  {
1293
  "epoch": 17.6,
1294
- "grad_norm": 0.04741276800632477,
1295
  "learning_rate": 5.819089557075689e-05,
1296
- "loss": 0.0018,
1297
  "step": 176
1298
  },
1299
  {
1300
  "epoch": 17.7,
1301
- "grad_norm": 0.014859266579151154,
1302
  "learning_rate": 5.7698567541826675e-05,
1303
- "loss": 0.0014,
1304
  "step": 177
1305
  },
1306
  {
1307
  "epoch": 17.8,
1308
- "grad_norm": 0.05082236975431442,
1309
  "learning_rate": 5.7205473766499005e-05,
1310
  "loss": 0.0025,
1311
  "step": 178
1312
  },
1313
  {
1314
  "epoch": 17.9,
1315
- "grad_norm": 0.05401023477315903,
1316
  "learning_rate": 5.6711663290882776e-05,
1317
- "loss": 0.0024,
1318
  "step": 179
1319
  },
1320
  {
1321
  "epoch": 18.0,
1322
- "grad_norm": 0.010000503621995449,
1323
  "learning_rate": 5.621718523237427e-05,
1324
- "loss": 0.0014,
1325
  "step": 180
1326
  },
1327
  {
1328
  "epoch": 18.1,
1329
- "grad_norm": 0.020556163042783737,
1330
  "learning_rate": 5.57220887747716e-05,
1331
- "loss": 0.0016,
1332
  "step": 181
1333
  },
1334
  {
1335
  "epoch": 18.2,
1336
- "grad_norm": 0.004740948788821697,
1337
  "learning_rate": 5.522642316338268e-05,
1338
- "loss": 0.0013,
1339
  "step": 182
1340
  },
1341
  {
1342
  "epoch": 18.3,
1343
- "grad_norm": 0.014636721462011337,
1344
  "learning_rate": 5.473023770012686e-05,
1345
- "loss": 0.0017,
1346
  "step": 183
1347
  },
1348
  {
1349
  "epoch": 18.4,
1350
- "grad_norm": 0.004370884504169226,
1351
  "learning_rate": 5.4233581738631165e-05,
1352
  "loss": 0.0013,
1353
  "step": 184
1354
  },
1355
  {
1356
  "epoch": 18.5,
1357
- "grad_norm": 0.03240854665637016,
1358
  "learning_rate": 5.373650467932122e-05,
1359
- "loss": 0.0016,
1360
  "step": 185
1361
  },
1362
  {
1363
  "epoch": 18.6,
1364
- "grad_norm": 0.04714665934443474,
1365
  "learning_rate": 5.323905596450759e-05,
1366
- "loss": 0.0015,
1367
  "step": 186
1368
  },
1369
  {
1370
  "epoch": 18.7,
1371
- "grad_norm": 0.018981872126460075,
1372
  "learning_rate": 5.274128507346801e-05,
1373
  "loss": 0.0013,
1374
  "step": 187
1375
  },
1376
  {
1377
  "epoch": 18.8,
1378
- "grad_norm": 0.013816704973578453,
1379
  "learning_rate": 5.2243241517525754e-05,
1380
  "loss": 0.0013,
1381
  "step": 188
1382
  },
1383
  {
1384
  "epoch": 18.9,
1385
- "grad_norm": 0.01641033962368965,
1386
  "learning_rate": 5.174497483512506e-05,
1387
  "loss": 0.0013,
1388
  "step": 189
1389
  },
1390
  {
1391
  "epoch": 19.0,
1392
- "grad_norm": 0.01083611510694027,
1393
  "learning_rate": 5.124653458690365e-05,
1394
  "loss": 0.0013,
1395
  "step": 190
1396
  },
1397
  {
1398
  "epoch": 19.1,
1399
- "grad_norm": 0.0032175600063055754,
1400
  "learning_rate": 5.074797035076319e-05,
1401
  "loss": 0.0013,
1402
  "step": 191
1403
  },
1404
  {
1405
  "epoch": 19.2,
1406
- "grad_norm": 0.0029091965407133102,
1407
  "learning_rate": 5.024933171693791e-05,
1408
- "loss": 0.0013,
1409
  "step": 192
1410
  },
1411
  {
1412
  "epoch": 19.3,
1413
- "grad_norm": 0.0017372623551636934,
1414
  "learning_rate": 4.9750668283062104e-05,
1415
- "loss": 0.0012,
1416
  "step": 193
1417
  },
1418
  {
1419
  "epoch": 19.4,
1420
- "grad_norm": 0.018875645473599434,
1421
  "learning_rate": 4.925202964923683e-05,
1422
- "loss": 0.0013,
1423
  "step": 194
1424
  },
1425
  {
1426
  "epoch": 19.5,
1427
- "grad_norm": 0.08334866166114807,
1428
  "learning_rate": 4.875346541309637e-05,
1429
- "loss": 0.0017,
1430
  "step": 195
1431
  },
1432
  {
1433
  "epoch": 19.6,
1434
- "grad_norm": 0.02136778086423874,
1435
  "learning_rate": 4.825502516487497e-05,
1436
- "loss": 0.0013,
1437
  "step": 196
1438
  },
1439
  {
1440
  "epoch": 19.7,
1441
- "grad_norm": 0.015435784123837948,
1442
  "learning_rate": 4.775675848247427e-05,
1443
  "loss": 0.0013,
1444
  "step": 197
1445
  },
1446
  {
1447
  "epoch": 19.8,
1448
- "grad_norm": 0.0207098126411438,
1449
  "learning_rate": 4.725871492653199e-05,
1450
- "loss": 0.0013,
1451
  "step": 198
1452
  },
1453
  {
1454
  "epoch": 19.9,
1455
- "grad_norm": 0.02912098728120327,
1456
  "learning_rate": 4.6760944035492404e-05,
1457
  "loss": 0.0014,
1458
  "step": 199
1459
  },
1460
  {
1461
  "epoch": 20.0,
1462
- "grad_norm": 0.0012635978637263179,
1463
  "learning_rate": 4.626349532067879e-05,
1464
  "loss": 0.0012,
1465
  "step": 200
1466
  },
1467
  {
1468
  "epoch": 20.0,
1469
- "eval_loss": 0.02464105747640133,
1470
- "eval_runtime": 7.0042,
1471
- "eval_samples_per_second": 14.277,
1472
- "eval_steps_per_second": 0.428,
1473
  "step": 200
1474
  },
1475
  {
1476
  "epoch": 20.1,
1477
- "grad_norm": 0.024539776146411896,
1478
  "learning_rate": 4.576641826136884e-05,
1479
- "loss": 0.0013,
1480
  "step": 201
1481
  },
1482
  {
1483
  "epoch": 20.2,
1484
- "grad_norm": 0.04463370889425278,
1485
  "learning_rate": 4.526976229987315e-05,
1486
- "loss": 0.0015,
1487
  "step": 202
1488
  },
1489
  {
1490
  "epoch": 20.3,
1491
- "grad_norm": 0.002574489451944828,
1492
  "learning_rate": 4.477357683661734e-05,
1493
  "loss": 0.0013,
1494
  "step": 203
1495
  },
1496
  {
1497
  "epoch": 20.4,
1498
- "grad_norm": 0.022832421585917473,
1499
  "learning_rate": 4.4277911225228414e-05,
1500
- "loss": 0.0014,
1501
  "step": 204
1502
  },
1503
  {
1504
  "epoch": 20.5,
1505
- "grad_norm": 0.011037301272153854,
1506
  "learning_rate": 4.378281476762576e-05,
1507
- "loss": 0.0013,
1508
  "step": 205
1509
  },
1510
  {
1511
  "epoch": 20.6,
1512
- "grad_norm": 0.0005778741906397045,
1513
  "learning_rate": 4.328833670911724e-05,
1514
- "loss": 0.0012,
1515
  "step": 206
1516
  },
1517
  {
1518
  "epoch": 20.7,
1519
- "grad_norm": 0.0034062073100358248,
1520
  "learning_rate": 4.2794526233501006e-05,
1521
  "loss": 0.0012,
1522
  "step": 207
1523
  },
1524
  {
1525
  "epoch": 20.8,
1526
- "grad_norm": 0.00416824035346508,
1527
  "learning_rate": 4.230143245817332e-05,
1528
- "loss": 0.0012,
1529
  "step": 208
1530
  },
1531
  {
1532
  "epoch": 20.9,
1533
- "grad_norm": 0.0014664290938526392,
1534
  "learning_rate": 4.180910442924312e-05,
1535
  "loss": 0.0013,
1536
  "step": 209
1537
  },
1538
  {
1539
  "epoch": 21.0,
1540
- "grad_norm": 0.0013818548759445548,
1541
  "learning_rate": 4.131759111665349e-05,
1542
- "loss": 0.0012,
1543
  "step": 210
1544
  },
1545
  {
1546
  "epoch": 21.1,
1547
- "grad_norm": 0.014141053892672062,
1548
  "learning_rate": 4.082694140931089e-05,
1549
  "loss": 0.0013,
1550
  "step": 211
1551
  },
1552
  {
1553
  "epoch": 21.2,
1554
- "grad_norm": 0.0033045061863958836,
1555
  "learning_rate": 4.0337204110222346e-05,
1556
- "loss": 0.0012,
1557
  "step": 212
1558
  },
1559
  {
1560
  "epoch": 21.3,
1561
- "grad_norm": 0.01347325835376978,
1562
  "learning_rate": 3.98484279316412e-05,
1563
  "loss": 0.0013,
1564
  "step": 213
1565
  },
1566
  {
1567
  "epoch": 21.4,
1568
- "grad_norm": 0.0014377759071066976,
1569
  "learning_rate": 3.936066149022191e-05,
1570
  "loss": 0.0013,
1571
  "step": 214
1572
  },
1573
  {
1574
  "epoch": 21.5,
1575
- "grad_norm": 0.004647238180041313,
1576
  "learning_rate": 3.887395330218429e-05,
1577
- "loss": 0.0012,
1578
  "step": 215
1579
  },
1580
  {
1581
  "epoch": 21.6,
1582
- "grad_norm": 0.00935914646834135,
1583
  "learning_rate": 3.838835177848788e-05,
1584
- "loss": 0.0013,
1585
  "step": 216
1586
  },
1587
  {
1588
  "epoch": 21.7,
1589
- "grad_norm": 0.0008201024029403925,
1590
  "learning_rate": 3.790390522001662e-05,
1591
  "loss": 0.0013,
1592
  "step": 217
1593
  },
1594
  {
1595
  "epoch": 21.8,
1596
- "grad_norm": 0.0038301898166537285,
1597
  "learning_rate": 3.742066181277458e-05,
1598
  "loss": 0.0013,
1599
  "step": 218
1600
  },
1601
  {
1602
  "epoch": 21.9,
1603
- "grad_norm": 0.023225074633955956,
1604
  "learning_rate": 3.6938669623093084e-05,
1605
- "loss": 0.0018,
1606
  "step": 219
1607
  },
1608
  {
1609
  "epoch": 22.0,
1610
- "grad_norm": 0.0008843315881676972,
1611
  "learning_rate": 3.6457976592849754e-05,
1612
  "loss": 0.0013,
1613
  "step": 220
1614
  },
1615
  {
1616
  "epoch": 22.1,
1617
- "grad_norm": 0.0008087409660220146,
1618
  "learning_rate": 3.597863053469987e-05,
1619
- "loss": 0.0012,
1620
  "step": 221
1621
  },
1622
  {
1623
  "epoch": 22.2,
1624
- "grad_norm": 0.0007809096714481711,
1625
  "learning_rate": 3.550067912732069e-05,
1626
- "loss": 0.0012,
1627
  "step": 222
1628
  },
1629
  {
1630
  "epoch": 22.3,
1631
- "grad_norm": 0.0003785623121075332,
1632
  "learning_rate": 3.502416991066904e-05,
1633
- "loss": 0.0012,
1634
  "step": 223
1635
  },
1636
  {
1637
  "epoch": 22.4,
1638
- "grad_norm": 0.0011643291218206286,
1639
  "learning_rate": 3.4549150281252636e-05,
1640
  "loss": 0.0013,
1641
  "step": 224
1642
  },
1643
  {
1644
  "epoch": 22.5,
1645
- "grad_norm": 0.00037547224201261997,
1646
  "learning_rate": 3.4075667487415785e-05,
1647
  "loss": 0.0013,
1648
  "step": 225
1649
  },
1650
  {
1651
  "epoch": 22.5,
1652
- "eval_loss": 0.02635515108704567,
1653
- "eval_runtime": 7.0286,
1654
- "eval_samples_per_second": 14.228,
1655
- "eval_steps_per_second": 0.427,
1656
  "step": 225
1657
  },
1658
  {
1659
- "epoch": 22.6,
1660
- "grad_norm": 0.002436364535242319,
1661
- "learning_rate": 3.360376862463979e-05,
1662
- "loss": 0.0012,
1663
- "step": 226
1664
- },
1665
- {
1666
- "epoch": 22.7,
1667
- "grad_norm": 0.000468397862277925,
1668
- "learning_rate": 3.313350063085851e-05,
1669
- "loss": 0.0012,
1670
- "step": 227
1671
- },
1672
- {
1673
- "epoch": 22.8,
1674
- "grad_norm": 0.0013973162276670337,
1675
- "learning_rate": 3.266491028178964e-05,
1676
- "loss": 0.0013,
1677
- "step": 228
1678
- },
1679
- {
1680
- "epoch": 22.9,
1681
- "grad_norm": 0.000565136200748384,
1682
- "learning_rate": 3.219804418628216e-05,
1683
- "loss": 0.0012,
1684
- "step": 229
1685
- },
1686
- {
1687
- "epoch": 23.0,
1688
- "grad_norm": 0.0004575321509037167,
1689
- "learning_rate": 3.173294878168025e-05,
1690
- "loss": 0.0013,
1691
- "step": 230
1692
- },
1693
- {
1694
- "epoch": 23.1,
1695
- "grad_norm": 0.0003873241657856852,
1696
- "learning_rate": 3.12696703292044e-05,
1697
- "loss": 0.0013,
1698
- "step": 231
1699
- },
1700
- {
1701
- "epoch": 23.2,
1702
- "grad_norm": 0.00041245773900300264,
1703
- "learning_rate": 3.080825490934999e-05,
1704
- "loss": 0.0013,
1705
- "step": 232
1706
- },
1707
- {
1708
- "epoch": 23.3,
1709
- "grad_norm": 0.0005566985928453505,
1710
- "learning_rate": 3.0348748417303823e-05,
1711
- "loss": 0.0012,
1712
- "step": 233
1713
- },
1714
- {
1715
- "epoch": 23.4,
1716
- "grad_norm": 0.002370474860072136,
1717
- "learning_rate": 2.989119655837913e-05,
1718
- "loss": 0.0013,
1719
- "step": 234
1720
- },
1721
- {
1722
- "epoch": 23.5,
1723
- "grad_norm": 0.0008109980844892561,
1724
- "learning_rate": 2.9435644843469436e-05,
1725
- "loss": 0.0013,
1726
- "step": 235
1727
- },
1728
- {
1729
- "epoch": 23.6,
1730
- "grad_norm": 0.0003989999822806567,
1731
- "learning_rate": 2.8982138584521735e-05,
1732
- "loss": 0.0013,
1733
- "step": 236
1734
- },
1735
- {
1736
- "epoch": 23.7,
1737
- "grad_norm": 0.0007184173446148634,
1738
- "learning_rate": 2.8530722890029537e-05,
1739
- "loss": 0.0013,
1740
- "step": 237
1741
- },
1742
- {
1743
- "epoch": 23.8,
1744
- "grad_norm": 0.0005140411667525768,
1745
- "learning_rate": 2.8081442660546125e-05,
1746
- "loss": 0.0013,
1747
- "step": 238
1748
- },
1749
- {
1750
- "epoch": 23.9,
1751
- "grad_norm": 0.000472583866212517,
1752
- "learning_rate": 2.7634342584218365e-05,
1753
- "loss": 0.0013,
1754
- "step": 239
1755
- },
1756
- {
1757
- "epoch": 24.0,
1758
- "grad_norm": 0.0009467861382290721,
1759
- "learning_rate": 2.718946713234185e-05,
1760
- "loss": 0.0013,
1761
- "step": 240
1762
- },
1763
- {
1764
- "epoch": 24.1,
1765
- "grad_norm": 0.0005134555394761264,
1766
- "learning_rate": 2.674686055493748e-05,
1767
- "loss": 0.0012,
1768
- "step": 241
1769
- },
1770
- {
1771
- "epoch": 24.2,
1772
- "grad_norm": 0.0004058448248542845,
1773
- "learning_rate": 2.630656687635007e-05,
1774
- "loss": 0.0013,
1775
- "step": 242
1776
- },
1777
- {
1778
- "epoch": 24.3,
1779
- "grad_norm": 0.0005244086496531963,
1780
- "learning_rate": 2.5868629890869468e-05,
1781
- "loss": 0.0012,
1782
- "step": 243
1783
- },
1784
- {
1785
- "epoch": 24.4,
1786
- "grad_norm": 0.0005328291445039213,
1787
- "learning_rate": 2.543309315837444e-05,
1788
- "loss": 0.0013,
1789
- "step": 244
1790
- },
1791
- {
1792
- "epoch": 24.5,
1793
- "grad_norm": 0.0020896121859550476,
1794
- "learning_rate": 2.500000000000001e-05,
1795
- "loss": 0.0012,
1796
- "step": 245
1797
- },
1798
- {
1799
- "epoch": 24.6,
1800
- "grad_norm": 0.000433528795838356,
1801
- "learning_rate": 2.456939349382843e-05,
1802
- "loss": 0.0012,
1803
- "step": 246
1804
- },
1805
- {
1806
- "epoch": 24.7,
1807
- "grad_norm": 0.00044738021097145975,
1808
- "learning_rate": 2.4141316470604362e-05,
1809
- "loss": 0.0013,
1810
- "step": 247
1811
- },
1812
- {
1813
- "epoch": 24.8,
1814
- "grad_norm": 0.0004753637476824224,
1815
- "learning_rate": 2.371581150947476e-05,
1816
- "loss": 0.0012,
1817
- "step": 248
1818
- },
1819
- {
1820
- "epoch": 24.9,
1821
- "grad_norm": 0.0004613220226019621,
1822
- "learning_rate": 2.3292920933753566e-05,
1823
- "loss": 0.0013,
1824
- "step": 249
1825
- },
1826
- {
1827
- "epoch": 25.0,
1828
- "grad_norm": 0.0004400379257276654,
1829
- "learning_rate": 2.2872686806712035e-05,
1830
- "loss": 0.0013,
1831
- "step": 250
1832
- },
1833
- {
1834
- "epoch": 25.0,
1835
- "eval_loss": 0.027748363092541695,
1836
- "eval_runtime": 7.0062,
1837
- "eval_samples_per_second": 14.273,
1838
- "eval_steps_per_second": 0.428,
1839
- "step": 250
1840
- },
1841
- {
1842
- "epoch": 25.0,
1843
- "step": 250,
1844
- "total_flos": 3.832789293855867e+17,
1845
- "train_loss": 0.011958676076494158,
1846
- "train_runtime": 2822.6775,
1847
- "train_samples_per_second": 5.617,
1848
- "train_steps_per_second": 0.124
1849
  }
1850
  ],
1851
  "logging_steps": 1,
@@ -1860,7 +1677,7 @@
1860
  "early_stopping_threshold": 0.0
1861
  },
1862
  "attributes": {
1863
- "early_stopping_patience_counter": 3
1864
  }
1865
  },
1866
  "TrainerControl": {
@@ -1874,7 +1691,7 @@
1874
  "attributes": {}
1875
  }
1876
  },
1877
- "total_flos": 3.832789293855867e+17,
1878
  "train_batch_size": 48,
1879
  "trial_name": null,
1880
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.015364531427621841,
3
  "best_model_checkpoint": "/home/paperspace/Data/models/Klystroglobal/llm3br256-v1.5/checkpoint-100",
4
+ "epoch": 22.5,
5
  "eval_steps": 25,
6
+ "global_step": 225,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1,
13
+ "grad_norm": 0.8908795714378357,
14
  "learning_rate": 2.8571428571428573e-06,
15
+ "loss": 0.1562,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.2,
20
+ "grad_norm": 0.8964537382125854,
21
  "learning_rate": 5.7142857142857145e-06,
22
+ "loss": 0.1295,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.3,
27
+ "grad_norm": 0.9193893671035767,
28
  "learning_rate": 8.571428571428573e-06,
29
+ "loss": 0.1632,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.4,
34
+ "grad_norm": 0.7271831035614014,
35
  "learning_rate": 1.1428571428571429e-05,
36
+ "loss": 0.1368,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.5,
41
+ "grad_norm": 0.5309418439865112,
42
  "learning_rate": 1.4285714285714285e-05,
43
+ "loss": 0.1199,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.6,
48
+ "grad_norm": 0.33410048484802246,
49
  "learning_rate": 1.7142857142857145e-05,
50
+ "loss": 0.0867,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.7,
55
+ "grad_norm": 0.47341036796569824,
56
  "learning_rate": 2e-05,
57
+ "loss": 0.1135,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.8,
62
+ "grad_norm": 0.6067326664924622,
63
  "learning_rate": 2.2857142857142858e-05,
64
+ "loss": 0.1035,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.9,
69
+ "grad_norm": 0.5303863883018494,
70
  "learning_rate": 2.5714285714285714e-05,
71
+ "loss": 0.1129,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "grad_norm": 0.34315383434295654,
77
  "learning_rate": 2.857142857142857e-05,
78
+ "loss": 0.0841,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 1.1,
83
+ "grad_norm": 0.1800101101398468,
84
  "learning_rate": 3.142857142857143e-05,
85
+ "loss": 0.0747,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 1.2,
90
+ "grad_norm": 0.16804951429367065,
91
  "learning_rate": 3.428571428571429e-05,
92
+ "loss": 0.0697,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 1.3,
97
+ "grad_norm": 0.1874297857284546,
98
  "learning_rate": 3.7142857142857143e-05,
99
+ "loss": 0.0706,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 1.4,
104
+ "grad_norm": 0.23576809465885162,
105
  "learning_rate": 4e-05,
106
+ "loss": 0.0698,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 1.5,
111
+ "grad_norm": 0.20643283426761627,
112
  "learning_rate": 4.2857142857142856e-05,
113
+ "loss": 0.0627,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 1.6,
118
+ "grad_norm": 0.13785526156425476,
119
  "learning_rate": 4.5714285714285716e-05,
120
+ "loss": 0.0528,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 1.7,
125
+ "grad_norm": 0.1274898201227188,
126
  "learning_rate": 4.8571428571428576e-05,
127
+ "loss": 0.0587,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 1.8,
132
+ "grad_norm": 0.15090015530586243,
133
  "learning_rate": 5.142857142857143e-05,
134
+ "loss": 0.05,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 1.9,
139
+ "grad_norm": 0.17184874415397644,
140
  "learning_rate": 5.428571428571428e-05,
141
+ "loss": 0.0548,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 2.0,
146
+ "grad_norm": 0.16863049566745758,
147
  "learning_rate": 5.714285714285714e-05,
148
+ "loss": 0.0533,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 2.1,
153
+ "grad_norm": 0.12445452064275742,
154
  "learning_rate": 6e-05,
155
+ "loss": 0.0436,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 2.2,
160
+ "grad_norm": 0.0973101332783699,
161
  "learning_rate": 6.285714285714286e-05,
162
+ "loss": 0.0418,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 2.3,
167
+ "grad_norm": 0.11407855898141861,
168
  "learning_rate": 6.571428571428571e-05,
169
+ "loss": 0.0393,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 2.4,
174
+ "grad_norm": 0.10259675979614258,
175
  "learning_rate": 6.857142857142858e-05,
176
+ "loss": 0.0336,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 2.5,
181
+ "grad_norm": 0.11567441374063492,
182
  "learning_rate": 7.142857142857143e-05,
183
+ "loss": 0.0459,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 2.5,
188
+ "eval_loss": 0.0361514613032341,
189
+ "eval_runtime": 9.5371,
190
+ "eval_samples_per_second": 10.485,
191
+ "eval_steps_per_second": 0.315,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 2.6,
196
+ "grad_norm": 0.1091904267668724,
197
  "learning_rate": 7.428571428571429e-05,
198
+ "loss": 0.0427,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 2.7,
203
+ "grad_norm": 0.10863610357046127,
204
  "learning_rate": 7.714285714285715e-05,
205
+ "loss": 0.0286,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 2.8,
210
+ "grad_norm": 0.10053255409002304,
211
  "learning_rate": 8e-05,
212
  "loss": 0.0377,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 2.9,
217
+ "grad_norm": 0.08568549901247025,
218
  "learning_rate": 8.285714285714287e-05,
219
+ "loss": 0.0304,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 3.0,
224
+ "grad_norm": 0.12581464648246765,
225
  "learning_rate": 8.571428571428571e-05,
226
+ "loss": 0.031,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 3.1,
231
+ "grad_norm": 0.07566319406032562,
232
  "learning_rate": 8.857142857142857e-05,
233
+ "loss": 0.0222,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 3.2,
238
+ "grad_norm": 0.08769939094781876,
239
  "learning_rate": 9.142857142857143e-05,
240
+ "loss": 0.0361,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 3.3,
245
+ "grad_norm": 0.07193590700626373,
246
  "learning_rate": 9.428571428571429e-05,
247
+ "loss": 0.0234,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 3.4,
252
+ "grad_norm": 0.08443808555603027,
253
  "learning_rate": 9.714285714285715e-05,
254
+ "loss": 0.0267,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 3.5,
259
+ "grad_norm": 0.08259449154138565,
260
  "learning_rate": 0.0001,
261
+ "loss": 0.0215,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 3.6,
266
+ "grad_norm": 0.06618565320968628,
267
  "learning_rate": 9.999751334779716e-05,
268
+ "loss": 0.0198,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 3.7,
273
+ "grad_norm": 0.07735524326562881,
274
  "learning_rate": 9.999005363852618e-05,
275
+ "loss": 0.021,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 3.8,
280
+ "grad_norm": 0.08196611702442169,
281
  "learning_rate": 9.997762161417517e-05,
282
+ "loss": 0.0215,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 3.9,
287
+ "grad_norm": 0.07763365656137466,
288
  "learning_rate": 9.996021851130897e-05,
289
+ "loss": 0.0189,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 4.0,
294
+ "grad_norm": 0.09625112265348434,
295
  "learning_rate": 9.993784606094612e-05,
296
+ "loss": 0.0156,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 4.1,
301
+ "grad_norm": 0.06971453130245209,
302
  "learning_rate": 9.991050648838675e-05,
303
+ "loss": 0.0125,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 4.2,
308
+ "grad_norm": 0.06588689982891083,
309
  "learning_rate": 9.987820251299122e-05,
310
+ "loss": 0.0115,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 4.3,
315
+ "grad_norm": 0.0661124512553215,
316
  "learning_rate": 9.984093734790956e-05,
317
+ "loss": 0.0164,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 4.4,
322
+ "grad_norm": 0.06647904217243195,
323
  "learning_rate": 9.979871469976196e-05,
324
+ "loss": 0.0137,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 4.5,
329
+ "grad_norm": 0.07278849184513092,
330
  "learning_rate": 9.975153876827008e-05,
331
+ "loss": 0.0153,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 4.6,
336
+ "grad_norm": 0.08231022208929062,
337
  "learning_rate": 9.969941424583926e-05,
338
+ "loss": 0.0128,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 4.7,
343
+ "grad_norm": 0.09107533097267151,
344
  "learning_rate": 9.964234631709187e-05,
345
+ "loss": 0.016,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 4.8,
350
+ "grad_norm": 0.059027716517448425,
351
  "learning_rate": 9.958034065835151e-05,
352
+ "loss": 0.0098,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 4.9,
357
+ "grad_norm": 0.07697892934083939,
358
  "learning_rate": 9.951340343707852e-05,
359
+ "loss": 0.0112,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 5.0,
364
+ "grad_norm": 0.07257463783025742,
365
  "learning_rate": 9.944154131125642e-05,
366
+ "loss": 0.0083,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 5.0,
371
+ "eval_loss": 0.018401963636279106,
372
+ "eval_runtime": 7.3766,
373
+ "eval_samples_per_second": 13.556,
374
+ "eval_steps_per_second": 0.407,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 5.1,
379
+ "grad_norm": 0.06557884067296982,
380
  "learning_rate": 9.936476142872979e-05,
381
+ "loss": 0.0093,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 5.2,
386
+ "grad_norm": 0.0577961802482605,
387
  "learning_rate": 9.928307142649316e-05,
388
+ "loss": 0.0075,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 5.3,
393
+ "grad_norm": 0.045840684324502945,
394
  "learning_rate": 9.919647942993148e-05,
395
+ "loss": 0.0053,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 5.4,
400
+ "grad_norm": 0.0694541484117508,
401
  "learning_rate": 9.910499405201195e-05,
402
+ "loss": 0.0086,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 5.5,
407
+ "grad_norm": 0.10548929870128632,
408
  "learning_rate": 9.900862439242719e-05,
409
+ "loss": 0.0069,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 5.6,
414
+ "grad_norm": 0.05059473589062691,
415
  "learning_rate": 9.890738003669029e-05,
416
+ "loss": 0.0048,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 5.7,
421
+ "grad_norm": 0.06550820171833038,
422
  "learning_rate": 9.880127105518122e-05,
423
+ "loss": 0.007,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 5.8,
428
+ "grad_norm": 0.137360081076622,
429
  "learning_rate": 9.869030800214532e-05,
430
+ "loss": 0.0106,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 5.9,
435
+ "grad_norm": 0.08213279396295547,
436
  "learning_rate": 9.857450191464337e-05,
437
+ "loss": 0.0067,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 6.0,
442
+ "grad_norm": 0.14191403985023499,
443
  "learning_rate": 9.84538643114539e-05,
444
+ "loss": 0.0073,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 6.1,
449
+ "grad_norm": 0.0513039231300354,
450
  "learning_rate": 9.832840719192736e-05,
451
+ "loss": 0.0039,
452
  "step": 61
453
  },
454
  {
455
  "epoch": 6.2,
456
+ "grad_norm": 0.10549885779619217,
457
  "learning_rate": 9.819814303479267e-05,
458
+ "loss": 0.0066,
459
  "step": 62
460
  },
461
  {
462
  "epoch": 6.3,
463
+ "grad_norm": 0.08000359684228897,
464
  "learning_rate": 9.806308479691595e-05,
465
+ "loss": 0.0047,
466
  "step": 63
467
  },
468
  {
469
  "epoch": 6.4,
470
+ "grad_norm": 0.07948028296232224,
471
  "learning_rate": 9.792324591201179e-05,
472
+ "loss": 0.0075,
473
  "step": 64
474
  },
475
  {
476
  "epoch": 6.5,
477
+ "grad_norm": 0.10712066292762756,
478
  "learning_rate": 9.777864028930705e-05,
479
+ "loss": 0.0061,
480
  "step": 65
481
  },
482
  {
483
  "epoch": 6.6,
484
+ "grad_norm": 0.07607916742563248,
485
  "learning_rate": 9.76292823121573e-05,
486
+ "loss": 0.0077,
487
  "step": 66
488
  },
489
  {
490
  "epoch": 6.7,
491
+ "grad_norm": 0.04418592527508736,
492
  "learning_rate": 9.747518683661631e-05,
493
+ "loss": 0.005,
494
  "step": 67
495
  },
496
  {
497
  "epoch": 6.8,
498
+ "grad_norm": 0.06003979593515396,
499
  "learning_rate": 9.731636918995821e-05,
500
+ "loss": 0.0056,
501
  "step": 68
502
  },
503
  {
504
  "epoch": 6.9,
505
+ "grad_norm": 0.1066676676273346,
506
  "learning_rate": 9.715284516915303e-05,
507
+ "loss": 0.006,
508
  "step": 69
509
  },
510
  {
511
  "epoch": 7.0,
512
+ "grad_norm": 0.1791234314441681,
513
  "learning_rate": 9.698463103929542e-05,
514
+ "loss": 0.0069,
515
  "step": 70
516
  },
517
  {
518
  "epoch": 7.1,
519
+ "grad_norm": 0.05221160128712654,
520
  "learning_rate": 9.681174353198687e-05,
521
+ "loss": 0.0048,
522
  "step": 71
523
  },
524
  {
525
  "epoch": 7.2,
526
+ "grad_norm": 0.05503857508301735,
527
  "learning_rate": 9.663419984367139e-05,
528
+ "loss": 0.0036,
529
  "step": 72
530
  },
531
  {
532
  "epoch": 7.3,
533
+ "grad_norm": 0.0550958476960659,
534
  "learning_rate": 9.645201763392513e-05,
535
+ "loss": 0.004,
536
  "step": 73
537
  },
538
  {
539
  "epoch": 7.4,
540
+ "grad_norm": 0.13011421263217926,
541
  "learning_rate": 9.626521502369984e-05,
542
  "loss": 0.0054,
543
  "step": 74
544
  },
545
  {
546
  "epoch": 7.5,
547
+ "grad_norm": 0.07773306965827942,
548
  "learning_rate": 9.607381059352038e-05,
549
+ "loss": 0.0058,
550
  "step": 75
551
  },
552
  {
553
  "epoch": 7.5,
554
+ "eval_loss": 0.017095288261771202,
555
+ "eval_runtime": 7.0724,
556
+ "eval_samples_per_second": 14.139,
557
+ "eval_steps_per_second": 0.424,
558
  "step": 75
559
  },
560
  {
561
  "epoch": 7.6,
562
+ "grad_norm": 0.043721869587898254,
563
  "learning_rate": 9.587782338163669e-05,
564
+ "loss": 0.0037,
565
  "step": 76
566
  },
567
  {
568
  "epoch": 7.7,
569
+ "grad_norm": 0.08070283383131027,
570
  "learning_rate": 9.567727288213005e-05,
571
+ "loss": 0.0063,
572
  "step": 77
573
  },
574
  {
575
  "epoch": 7.8,
576
+ "grad_norm": 0.05962779000401497,
577
  "learning_rate": 9.547217904297411e-05,
578
+ "loss": 0.0038,
579
  "step": 78
580
  },
581
  {
582
  "epoch": 7.9,
583
+ "grad_norm": 0.07050912082195282,
584
  "learning_rate": 9.526256226405075e-05,
585
+ "loss": 0.0053,
586
  "step": 79
587
  },
588
  {
589
  "epoch": 8.0,
590
+ "grad_norm": 0.12537063658237457,
591
  "learning_rate": 9.504844339512095e-05,
592
+ "loss": 0.005,
593
  "step": 80
594
  },
595
  {
596
  "epoch": 8.1,
597
+ "grad_norm": 0.05943499132990837,
598
  "learning_rate": 9.482984373375105e-05,
599
+ "loss": 0.0041,
600
  "step": 81
601
  },
602
  {
603
  "epoch": 8.2,
604
+ "grad_norm": 0.06273437291383743,
605
  "learning_rate": 9.460678502319418e-05,
606
+ "loss": 0.0035,
607
  "step": 82
608
  },
609
  {
610
  "epoch": 8.3,
611
+ "grad_norm": 0.051491495221853256,
612
  "learning_rate": 9.437928945022771e-05,
613
+ "loss": 0.0039,
614
  "step": 83
615
  },
616
  {
617
  "epoch": 8.4,
618
+ "grad_norm": 0.08781920373439789,
619
  "learning_rate": 9.414737964294636e-05,
620
+ "loss": 0.005,
621
  "step": 84
622
  },
623
  {
624
  "epoch": 8.5,
625
+ "grad_norm": 0.0535728819668293,
626
  "learning_rate": 9.391107866851143e-05,
627
+ "loss": 0.0033,
628
  "step": 85
629
  },
630
  {
631
  "epoch": 8.6,
632
+ "grad_norm": 0.048977237194776535,
633
  "learning_rate": 9.367041003085649e-05,
634
+ "loss": 0.0027,
635
  "step": 86
636
  },
637
  {
638
  "epoch": 8.7,
639
+ "grad_norm": 0.07540930062532425,
640
  "learning_rate": 9.342539766834946e-05,
641
+ "loss": 0.0039,
642
  "step": 87
643
  },
644
  {
645
  "epoch": 8.8,
646
+ "grad_norm": 0.05821846053004265,
647
  "learning_rate": 9.317606595141154e-05,
648
+ "loss": 0.0028,
649
  "step": 88
650
  },
651
  {
652
  "epoch": 8.9,
653
+ "grad_norm": 0.11574677377939224,
654
  "learning_rate": 9.292243968009331e-05,
655
+ "loss": 0.0047,
656
  "step": 89
657
  },
658
  {
659
  "epoch": 9.0,
660
+ "grad_norm": 0.07857176661491394,
661
  "learning_rate": 9.266454408160779e-05,
662
+ "loss": 0.0043,
663
  "step": 90
664
  },
665
  {
666
  "epoch": 9.1,
667
+ "grad_norm": 0.03134455904364586,
668
  "learning_rate": 9.24024048078213e-05,
669
+ "loss": 0.0026,
670
  "step": 91
671
  },
672
  {
673
  "epoch": 9.2,
674
+ "grad_norm": 0.0362420380115509,
675
  "learning_rate": 9.213604793270196e-05,
676
  "loss": 0.0024,
677
  "step": 92
678
  },
679
  {
680
  "epoch": 9.3,
681
+ "grad_norm": 0.0467870719730854,
682
  "learning_rate": 9.186549994972618e-05,
683
+ "loss": 0.0037,
684
  "step": 93
685
  },
686
  {
687
  "epoch": 9.4,
688
+ "grad_norm": 0.04397619143128395,
689
  "learning_rate": 9.159078776924346e-05,
690
+ "loss": 0.0027,
691
  "step": 94
692
  },
693
  {
694
  "epoch": 9.5,
695
+ "grad_norm": 0.06318747252225876,
696
  "learning_rate": 9.131193871579975e-05,
697
+ "loss": 0.003,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 9.6,
702
+ "grad_norm": 0.05562931299209595,
703
  "learning_rate": 9.102898052541958e-05,
704
+ "loss": 0.0032,
705
  "step": 96
706
  },
707
  {
708
  "epoch": 9.7,
709
+ "grad_norm": 0.030015502125024796,
710
  "learning_rate": 9.074194134284726e-05,
711
+ "loss": 0.0022,
712
  "step": 97
713
  },
714
  {
715
  "epoch": 9.8,
716
+ "grad_norm": 0.025300586596131325,
717
  "learning_rate": 9.045084971874738e-05,
718
  "loss": 0.002,
719
  "step": 98
720
  },
721
  {
722
  "epoch": 9.9,
723
+ "grad_norm": 0.052620694041252136,
724
  "learning_rate": 9.015573460686509e-05,
725
+ "loss": 0.0027,
726
  "step": 99
727
  },
728
  {
729
  "epoch": 10.0,
730
+ "grad_norm": 0.06170989200472832,
731
  "learning_rate": 8.985662536114613e-05,
732
+ "loss": 0.0024,
733
  "step": 100
734
  },
735
  {
736
  "epoch": 10.0,
737
+ "eval_loss": 0.015364531427621841,
738
+ "eval_runtime": 7.0336,
739
+ "eval_samples_per_second": 14.217,
740
+ "eval_steps_per_second": 0.427,
741
  "step": 100
742
  },
743
  {
744
  "epoch": 10.1,
745
+ "grad_norm": 0.04944446310400963,
746
  "learning_rate": 8.955355173281708e-05,
747
+ "loss": 0.0021,
748
  "step": 101
749
  },
750
  {
751
  "epoch": 10.2,
752
+ "grad_norm": 0.04514516517519951,
753
  "learning_rate": 8.924654386742613e-05,
754
+ "loss": 0.0022,
755
  "step": 102
756
  },
757
  {
758
  "epoch": 10.3,
759
+ "grad_norm": 0.024107567965984344,
760
  "learning_rate": 8.89356323018447e-05,
761
+ "loss": 0.0015,
762
  "step": 103
763
  },
764
  {
765
  "epoch": 10.4,
766
+ "grad_norm": 0.03177887946367264,
767
  "learning_rate": 8.862084796122998e-05,
768
+ "loss": 0.0024,
769
  "step": 104
770
  },
771
  {
772
  "epoch": 10.5,
773
+ "grad_norm": 0.03080836869776249,
774
  "learning_rate": 8.83022221559489e-05,
775
+ "loss": 0.0021,
776
  "step": 105
777
  },
778
  {
779
  "epoch": 10.6,
780
+ "grad_norm": 0.027285495772957802,
781
  "learning_rate": 8.797978657846391e-05,
782
+ "loss": 0.0016,
783
  "step": 106
784
  },
785
  {
786
  "epoch": 10.7,
787
+ "grad_norm": 0.04299917444586754,
788
  "learning_rate": 8.765357330018056e-05,
789
+ "loss": 0.0022,
790
  "step": 107
791
  },
792
  {
793
  "epoch": 10.8,
794
+ "grad_norm": 0.045274410396814346,
795
  "learning_rate": 8.732361476825752e-05,
796
+ "loss": 0.0026,
797
  "step": 108
798
  },
799
  {
800
  "epoch": 10.9,
801
+ "grad_norm": 0.011724736541509628,
802
  "learning_rate": 8.69899438023792e-05,
803
+ "loss": 0.0014,
804
  "step": 109
805
  },
806
  {
807
  "epoch": 11.0,
808
+ "grad_norm": 0.045926447957754135,
809
  "learning_rate": 8.665259359149132e-05,
810
+ "loss": 0.0025,
811
  "step": 110
812
  },
813
  {
814
  "epoch": 11.1,
815
+ "grad_norm": 0.04422765597701073,
816
  "learning_rate": 8.631159769049965e-05,
817
+ "loss": 0.0022,
818
  "step": 111
819
  },
820
  {
821
  "epoch": 11.2,
822
+ "grad_norm": 0.04244349151849747,
823
  "learning_rate": 8.596699001693255e-05,
824
+ "loss": 0.0019,
825
  "step": 112
826
  },
827
  {
828
  "epoch": 11.3,
829
+ "grad_norm": 0.042645107954740524,
830
  "learning_rate": 8.561880484756725e-05,
831
+ "loss": 0.0024,
832
  "step": 113
833
  },
834
  {
835
  "epoch": 11.4,
836
+ "grad_norm": 0.060319993644952774,
837
  "learning_rate": 8.526707681502044e-05,
838
+ "loss": 0.0027,
839
  "step": 114
840
  },
841
  {
842
  "epoch": 11.5,
843
+ "grad_norm": 0.015948163345456123,
844
  "learning_rate": 8.491184090430364e-05,
845
+ "loss": 0.0016,
846
  "step": 115
847
  },
848
  {
849
  "epoch": 11.6,
850
+ "grad_norm": 0.02832942269742489,
851
  "learning_rate": 8.455313244934324e-05,
852
+ "loss": 0.0019,
853
  "step": 116
854
  },
855
  {
856
  "epoch": 11.7,
857
+ "grad_norm": 0.03249937668442726,
858
  "learning_rate": 8.419098712946601e-05,
859
+ "loss": 0.0021,
860
  "step": 117
861
  },
862
  {
863
  "epoch": 11.8,
864
+ "grad_norm": 0.04278368130326271,
865
  "learning_rate": 8.382544096585027e-05,
866
+ "loss": 0.002,
867
  "step": 118
868
  },
869
  {
870
  "epoch": 11.9,
871
+ "grad_norm": 0.05821336805820465,
872
  "learning_rate": 8.345653031794292e-05,
873
+ "loss": 0.0019,
874
  "step": 119
875
  },
876
  {
877
  "epoch": 12.0,
878
+ "grad_norm": 0.07387255877256393,
879
  "learning_rate": 8.308429187984297e-05,
880
+ "loss": 0.0027,
881
  "step": 120
882
  },
883
  {
884
  "epoch": 12.1,
885
+ "grad_norm": 0.012116311118006706,
886
  "learning_rate": 8.270876267665173e-05,
887
+ "loss": 0.0016,
888
  "step": 121
889
  },
890
  {
891
  "epoch": 12.2,
892
+ "grad_norm": 0.009292635135352612,
893
  "learning_rate": 8.232998006078997e-05,
894
+ "loss": 0.0016,
895
  "step": 122
896
  },
897
  {
898
  "epoch": 12.3,
899
+ "grad_norm": 0.01784040406346321,
900
  "learning_rate": 8.19479817082828e-05,
901
+ "loss": 0.0018,
902
  "step": 123
903
  },
904
  {
905
  "epoch": 12.4,
906
+ "grad_norm": 0.005965951830148697,
907
  "learning_rate": 8.156280561501195e-05,
908
+ "loss": 0.0014,
909
  "step": 124
910
  },
911
  {
912
  "epoch": 12.5,
913
+ "grad_norm": 0.009619847871363163,
914
  "learning_rate": 8.117449009293668e-05,
915
+ "loss": 0.0015,
916
  "step": 125
917
  },
918
  {
919
  "epoch": 12.5,
920
+ "eval_loss": 0.016432486474514008,
921
+ "eval_runtime": 7.0756,
922
+ "eval_samples_per_second": 14.133,
923
+ "eval_steps_per_second": 0.424,
924
  "step": 125
925
  },
926
  {
927
  "epoch": 12.6,
928
+ "grad_norm": 0.0500672347843647,
929
  "learning_rate": 8.07830737662829e-05,
930
+ "loss": 0.0017,
931
  "step": 126
932
  },
933
  {
934
  "epoch": 12.7,
935
+ "grad_norm": 0.02237800695002079,
936
  "learning_rate": 8.038859556770151e-05,
937
+ "loss": 0.0017,
938
  "step": 127
939
  },
940
  {
941
  "epoch": 12.8,
942
+ "grad_norm": 0.00639017578214407,
943
  "learning_rate": 7.999109473439569e-05,
944
+ "loss": 0.0015,
945
  "step": 128
946
  },
947
  {
948
  "epoch": 12.9,
949
+ "grad_norm": 0.0382903516292572,
950
  "learning_rate": 7.959061080421839e-05,
951
  "loss": 0.0026,
952
  "step": 129
953
  },
954
  {
955
  "epoch": 13.0,
956
+ "grad_norm": 0.042706046253442764,
957
  "learning_rate": 7.91871836117395e-05,
958
+ "loss": 0.0018,
959
  "step": 130
960
  },
961
  {
962
  "epoch": 13.1,
963
+ "grad_norm": 0.04088183492422104,
964
  "learning_rate": 7.878085328428369e-05,
965
+ "loss": 0.0015,
966
  "step": 131
967
  },
968
  {
969
  "epoch": 13.2,
970
+ "grad_norm": 0.021482018753886223,
971
  "learning_rate": 7.83716602379391e-05,
972
+ "loss": 0.0016,
973
  "step": 132
974
  },
975
  {
976
  "epoch": 13.3,
977
+ "grad_norm": 0.01334257610142231,
978
  "learning_rate": 7.795964517353735e-05,
979
+ "loss": 0.0015,
980
  "step": 133
981
  },
982
  {
983
  "epoch": 13.4,
984
+ "grad_norm": 0.049743931740522385,
985
  "learning_rate": 7.754484907260513e-05,
986
+ "loss": 0.0017,
987
  "step": 134
988
  },
989
  {
990
  "epoch": 13.5,
991
+ "grad_norm": 0.13533641397953033,
992
  "learning_rate": 7.712731319328798e-05,
993
+ "loss": 0.0025,
994
  "step": 135
995
  },
996
  {
997
  "epoch": 13.6,
998
+ "grad_norm": 0.014479673467576504,
999
  "learning_rate": 7.670707906624644e-05,
1000
+ "loss": 0.0014,
1001
  "step": 136
1002
  },
1003
  {
1004
  "epoch": 13.7,
1005
+ "grad_norm": 0.010055056773126125,
1006
  "learning_rate": 7.628418849052523e-05,
1007
+ "loss": 0.0014,
1008
  "step": 137
1009
  },
1010
  {
1011
  "epoch": 13.8,
1012
+ "grad_norm": 0.07842899858951569,
1013
  "learning_rate": 7.585868352939563e-05,
1014
+ "loss": 0.0021,
1015
  "step": 138
1016
  },
1017
  {
1018
  "epoch": 13.9,
1019
+ "grad_norm": 0.030145661905407906,
1020
  "learning_rate": 7.543060650617158e-05,
1021
+ "loss": 0.0031,
1022
  "step": 139
1023
  },
1024
  {
1025
  "epoch": 14.0,
1026
+ "grad_norm": 0.12752747535705566,
1027
  "learning_rate": 7.500000000000001e-05,
1028
+ "loss": 0.0016,
1029
  "step": 140
1030
  },
1031
  {
1032
  "epoch": 14.1,
1033
+ "grad_norm": 0.0319565013051033,
1034
  "learning_rate": 7.456690684162557e-05,
1035
+ "loss": 0.0034,
1036
  "step": 141
1037
  },
1038
  {
1039
  "epoch": 14.2,
1040
+ "grad_norm": 0.08731327950954437,
1041
  "learning_rate": 7.413137010913054e-05,
1042
+ "loss": 0.002,
1043
  "step": 142
1044
  },
1045
  {
1046
  "epoch": 14.3,
1047
+ "grad_norm": 0.02548266015946865,
1048
  "learning_rate": 7.369343312364993e-05,
1049
+ "loss": 0.0017,
1050
  "step": 143
1051
  },
1052
  {
1053
  "epoch": 14.4,
1054
+ "grad_norm": 0.022404348477721214,
1055
  "learning_rate": 7.325313944506254e-05,
1056
+ "loss": 0.002,
1057
  "step": 144
1058
  },
1059
  {
1060
  "epoch": 14.5,
1061
+ "grad_norm": 0.01952187716960907,
1062
  "learning_rate": 7.281053286765815e-05,
1063
+ "loss": 0.0015,
1064
  "step": 145
1065
  },
1066
  {
1067
  "epoch": 14.6,
1068
+ "grad_norm": 0.02216988056898117,
1069
  "learning_rate": 7.236565741578163e-05,
1070
+ "loss": 0.0017,
1071
  "step": 146
1072
  },
1073
  {
1074
  "epoch": 14.7,
1075
+ "grad_norm": 0.09338480979204178,
1076
  "learning_rate": 7.191855733945387e-05,
1077
+ "loss": 0.0068,
1078
  "step": 147
1079
  },
1080
  {
1081
  "epoch": 14.8,
1082
+ "grad_norm": 0.027766218408942223,
1083
  "learning_rate": 7.146927710997047e-05,
1084
+ "loss": 0.0018,
1085
  "step": 148
1086
  },
1087
  {
1088
  "epoch": 14.9,
1089
+ "grad_norm": 0.01007885206490755,
1090
  "learning_rate": 7.101786141547828e-05,
1091
+ "loss": 0.0016,
1092
  "step": 149
1093
  },
1094
  {
1095
  "epoch": 15.0,
1096
+ "grad_norm": 0.04051733762025833,
1097
  "learning_rate": 7.056435515653059e-05,
1098
+ "loss": 0.0038,
1099
  "step": 150
1100
  },
1101
  {
1102
  "epoch": 15.0,
1103
+ "eval_loss": 0.02101273089647293,
1104
+ "eval_runtime": 7.0416,
1105
+ "eval_samples_per_second": 14.201,
1106
  "eval_steps_per_second": 0.426,
1107
  "step": 150
1108
  },
1109
  {
1110
  "epoch": 15.1,
1111
+ "grad_norm": 0.006852968595921993,
1112
  "learning_rate": 7.010880344162088e-05,
1113
+ "loss": 0.0013,
1114
  "step": 151
1115
  },
1116
  {
1117
  "epoch": 15.2,
1118
+ "grad_norm": 0.018146201968193054,
1119
  "learning_rate": 6.965125158269619e-05,
1120
+ "loss": 0.0016,
1121
  "step": 152
1122
  },
1123
  {
1124
  "epoch": 15.3,
1125
+ "grad_norm": 0.021478457376360893,
1126
  "learning_rate": 6.919174509065004e-05,
1127
+ "loss": 0.0022,
1128
  "step": 153
1129
  },
1130
  {
1131
  "epoch": 15.4,
1132
+ "grad_norm": 0.0223145242780447,
1133
  "learning_rate": 6.873032967079561e-05,
1134
+ "loss": 0.0017,
1135
  "step": 154
1136
  },
1137
  {
1138
  "epoch": 15.5,
1139
+ "grad_norm": 0.015200883150100708,
1140
  "learning_rate": 6.826705121831976e-05,
1141
+ "loss": 0.0019,
1142
  "step": 155
1143
  },
1144
  {
1145
  "epoch": 15.6,
1146
+ "grad_norm": 0.004565316252410412,
1147
  "learning_rate": 6.780195581371784e-05,
1148
+ "loss": 0.0014,
1149
  "step": 156
1150
  },
1151
  {
1152
  "epoch": 15.7,
1153
+ "grad_norm": 0.014399299398064613,
1154
  "learning_rate": 6.733508971821036e-05,
1155
+ "loss": 0.0016,
1156
  "step": 157
1157
  },
1158
  {
1159
  "epoch": 15.8,
1160
+ "grad_norm": 0.055854879319667816,
1161
  "learning_rate": 6.686649936914152e-05,
1162
+ "loss": 0.0016,
1163
  "step": 158
1164
  },
1165
  {
1166
  "epoch": 15.9,
1167
+ "grad_norm": 0.02653384581208229,
1168
  "learning_rate": 6.639623137536023e-05,
1169
+ "loss": 0.0016,
1170
  "step": 159
1171
  },
1172
  {
1173
  "epoch": 16.0,
1174
+ "grad_norm": 0.05519720911979675,
1175
  "learning_rate": 6.592433251258423e-05,
1176
+ "loss": 0.0015,
1177
  "step": 160
1178
  },
1179
  {
1180
  "epoch": 16.1,
1181
+ "grad_norm": 0.013170337304472923,
1182
  "learning_rate": 6.545084971874738e-05,
1183
+ "loss": 0.0016,
1184
  "step": 161
1185
  },
1186
  {
1187
  "epoch": 16.2,
1188
+ "grad_norm": 0.0030152976978570223,
1189
  "learning_rate": 6.497583008933097e-05,
1190
  "loss": 0.0014,
1191
  "step": 162
1192
  },
1193
  {
1194
  "epoch": 16.3,
1195
+ "grad_norm": 0.0028813318349421024,
1196
  "learning_rate": 6.449932087267932e-05,
1197
+ "loss": 0.0013,
1198
  "step": 163
1199
  },
1200
  {
1201
  "epoch": 16.4,
1202
+ "grad_norm": 0.004150426480919123,
1203
  "learning_rate": 6.402136946530014e-05,
1204
  "loss": 0.0013,
1205
  "step": 164
1206
  },
1207
  {
1208
  "epoch": 16.5,
1209
+ "grad_norm": 0.08170194923877716,
1210
  "learning_rate": 6.354202340715026e-05,
1211
+ "loss": 0.0017,
1212
  "step": 165
1213
  },
1214
  {
1215
  "epoch": 16.6,
1216
+ "grad_norm": 0.005082490388303995,
1217
  "learning_rate": 6.306133037690693e-05,
1218
+ "loss": 0.0012,
1219
  "step": 166
1220
  },
1221
  {
1222
  "epoch": 16.7,
1223
+ "grad_norm": 0.0027544551994651556,
1224
  "learning_rate": 6.257933818722543e-05,
1225
+ "loss": 0.0013,
1226
  "step": 167
1227
  },
1228
  {
1229
  "epoch": 16.8,
1230
+ "grad_norm": 0.04007953405380249,
1231
  "learning_rate": 6.209609477998338e-05,
1232
+ "loss": 0.0016,
1233
  "step": 168
1234
  },
1235
  {
1236
  "epoch": 16.9,
1237
+ "grad_norm": 0.0036666980013251305,
1238
  "learning_rate": 6.161164822151213e-05,
1239
+ "loss": 0.0013,
1240
  "step": 169
1241
  },
1242
  {
1243
  "epoch": 17.0,
1244
+ "grad_norm": 0.04919775202870369,
1245
  "learning_rate": 6.112604669781572e-05,
1246
+ "loss": 0.002,
1247
  "step": 170
1248
  },
1249
  {
1250
  "epoch": 17.1,
1251
+ "grad_norm": 0.0016860866453498602,
1252
  "learning_rate": 6.063933850977811e-05,
1253
+ "loss": 0.0013,
1254
  "step": 171
1255
  },
1256
  {
1257
  "epoch": 17.2,
1258
+ "grad_norm": 0.007339112926274538,
1259
  "learning_rate": 6.015157206835881e-05,
1260
+ "loss": 0.0013,
1261
  "step": 172
1262
  },
1263
  {
1264
  "epoch": 17.3,
1265
+ "grad_norm": 0.007601634599268436,
1266
  "learning_rate": 5.9662795889777666e-05,
1267
  "loss": 0.0013,
1268
  "step": 173
1269
  },
1270
  {
1271
  "epoch": 17.4,
1272
+ "grad_norm": 0.005811419803649187,
1273
  "learning_rate": 5.917305859068912e-05,
1274
+ "loss": 0.0012,
1275
  "step": 174
1276
  },
1277
  {
1278
  "epoch": 17.5,
1279
+ "grad_norm": 0.006914922967553139,
1280
  "learning_rate": 5.868240888334653e-05,
1281
  "loss": 0.0013,
1282
  "step": 175
1283
  },
1284
  {
1285
  "epoch": 17.5,
1286
+ "eval_loss": 0.02819022908806801,
1287
+ "eval_runtime": 7.0272,
1288
+ "eval_samples_per_second": 14.23,
1289
+ "eval_steps_per_second": 0.427,
1290
  "step": 175
1291
  },
1292
  {
1293
  "epoch": 17.6,
1294
+ "grad_norm": 0.0504387766122818,
1295
  "learning_rate": 5.819089557075689e-05,
1296
+ "loss": 0.0022,
1297
  "step": 176
1298
  },
1299
  {
1300
  "epoch": 17.7,
1301
+ "grad_norm": 0.002415669383481145,
1302
  "learning_rate": 5.7698567541826675e-05,
1303
+ "loss": 0.0012,
1304
  "step": 177
1305
  },
1306
  {
1307
  "epoch": 17.8,
1308
+ "grad_norm": 0.031064577400684357,
1309
  "learning_rate": 5.7205473766499005e-05,
1310
  "loss": 0.0025,
1311
  "step": 178
1312
  },
1313
  {
1314
  "epoch": 17.9,
1315
+ "grad_norm": 0.0033903273288160563,
1316
  "learning_rate": 5.6711663290882776e-05,
1317
+ "loss": 0.0013,
1318
  "step": 179
1319
  },
1320
  {
1321
  "epoch": 18.0,
1322
+ "grad_norm": 0.005813250318169594,
1323
  "learning_rate": 5.621718523237427e-05,
1324
+ "loss": 0.0013,
1325
  "step": 180
1326
  },
1327
  {
1328
  "epoch": 18.1,
1329
+ "grad_norm": 0.03409822657704353,
1330
  "learning_rate": 5.57220887747716e-05,
1331
+ "loss": 0.0019,
1332
  "step": 181
1333
  },
1334
  {
1335
  "epoch": 18.2,
1336
+ "grad_norm": 0.022658348083496094,
1337
  "learning_rate": 5.522642316338268e-05,
1338
+ "loss": 0.0015,
1339
  "step": 182
1340
  },
1341
  {
1342
  "epoch": 18.3,
1343
+ "grad_norm": 0.0084981769323349,
1344
  "learning_rate": 5.473023770012686e-05,
1345
+ "loss": 0.0016,
1346
  "step": 183
1347
  },
1348
  {
1349
  "epoch": 18.4,
1350
+ "grad_norm": 0.002898683538660407,
1351
  "learning_rate": 5.4233581738631165e-05,
1352
  "loss": 0.0013,
1353
  "step": 184
1354
  },
1355
  {
1356
  "epoch": 18.5,
1357
+ "grad_norm": 0.014665622264146805,
1358
  "learning_rate": 5.373650467932122e-05,
1359
+ "loss": 0.0014,
1360
  "step": 185
1361
  },
1362
  {
1363
  "epoch": 18.6,
1364
+ "grad_norm": 0.02535739168524742,
1365
  "learning_rate": 5.323905596450759e-05,
1366
+ "loss": 0.0014,
1367
  "step": 186
1368
  },
1369
  {
1370
  "epoch": 18.7,
1371
+ "grad_norm": 0.004365747328847647,
1372
  "learning_rate": 5.274128507346801e-05,
1373
  "loss": 0.0013,
1374
  "step": 187
1375
  },
1376
  {
1377
  "epoch": 18.8,
1378
+ "grad_norm": 0.00725106755271554,
1379
  "learning_rate": 5.2243241517525754e-05,
1380
  "loss": 0.0013,
1381
  "step": 188
1382
  },
1383
  {
1384
  "epoch": 18.9,
1385
+ "grad_norm": 0.0034797799307852983,
1386
  "learning_rate": 5.174497483512506e-05,
1387
  "loss": 0.0013,
1388
  "step": 189
1389
  },
1390
  {
1391
  "epoch": 19.0,
1392
+ "grad_norm": 0.0022317483089864254,
1393
  "learning_rate": 5.124653458690365e-05,
1394
  "loss": 0.0013,
1395
  "step": 190
1396
  },
1397
  {
1398
  "epoch": 19.1,
1399
+ "grad_norm": 0.0011345412349328399,
1400
  "learning_rate": 5.074797035076319e-05,
1401
  "loss": 0.0013,
1402
  "step": 191
1403
  },
1404
  {
1405
  "epoch": 19.2,
1406
+ "grad_norm": 0.010673880577087402,
1407
  "learning_rate": 5.024933171693791e-05,
1408
+ "loss": 0.0017,
1409
  "step": 192
1410
  },
1411
  {
1412
  "epoch": 19.3,
1413
+ "grad_norm": 0.0016955966129899025,
1414
  "learning_rate": 4.9750668283062104e-05,
1415
+ "loss": 0.0013,
1416
  "step": 193
1417
  },
1418
  {
1419
  "epoch": 19.4,
1420
+ "grad_norm": 0.0017608372727409005,
1421
  "learning_rate": 4.925202964923683e-05,
1422
+ "loss": 0.0012,
1423
  "step": 194
1424
  },
1425
  {
1426
  "epoch": 19.5,
1427
+ "grad_norm": 0.007348727900534868,
1428
  "learning_rate": 4.875346541309637e-05,
1429
+ "loss": 0.0014,
1430
  "step": 195
1431
  },
1432
  {
1433
  "epoch": 19.6,
1434
+ "grad_norm": 0.012571705505251884,
1435
  "learning_rate": 4.825502516487497e-05,
1436
+ "loss": 0.0014,
1437
  "step": 196
1438
  },
1439
  {
1440
  "epoch": 19.7,
1441
+ "grad_norm": 0.0025126473046839237,
1442
  "learning_rate": 4.775675848247427e-05,
1443
  "loss": 0.0013,
1444
  "step": 197
1445
  },
1446
  {
1447
  "epoch": 19.8,
1448
+ "grad_norm": 0.005391189828515053,
1449
  "learning_rate": 4.725871492653199e-05,
1450
+ "loss": 0.0015,
1451
  "step": 198
1452
  },
1453
  {
1454
  "epoch": 19.9,
1455
+ "grad_norm": 0.004390457179397345,
1456
  "learning_rate": 4.6760944035492404e-05,
1457
  "loss": 0.0014,
1458
  "step": 199
1459
  },
1460
  {
1461
  "epoch": 20.0,
1462
+ "grad_norm": 0.009553831070661545,
1463
  "learning_rate": 4.626349532067879e-05,
1464
  "loss": 0.0012,
1465
  "step": 200
1466
  },
1467
  {
1468
  "epoch": 20.0,
1469
+ "eval_loss": 0.027936220169067383,
1470
+ "eval_runtime": 7.0334,
1471
+ "eval_samples_per_second": 14.218,
1472
+ "eval_steps_per_second": 0.427,
1473
  "step": 200
1474
  },
1475
  {
1476
  "epoch": 20.1,
1477
+ "grad_norm": 0.007283430080860853,
1478
  "learning_rate": 4.576641826136884e-05,
1479
+ "loss": 0.0014,
1480
  "step": 201
1481
  },
1482
  {
1483
  "epoch": 20.2,
1484
+ "grad_norm": 0.0043794638477265835,
1485
  "learning_rate": 4.526976229987315e-05,
1486
+ "loss": 0.0014,
1487
  "step": 202
1488
  },
1489
  {
1490
  "epoch": 20.3,
1491
+ "grad_norm": 0.0032108803279697895,
1492
  "learning_rate": 4.477357683661734e-05,
1493
  "loss": 0.0013,
1494
  "step": 203
1495
  },
1496
  {
1497
  "epoch": 20.4,
1498
+ "grad_norm": 0.001851178240031004,
1499
  "learning_rate": 4.4277911225228414e-05,
1500
+ "loss": 0.0013,
1501
  "step": 204
1502
  },
1503
  {
1504
  "epoch": 20.5,
1505
+ "grad_norm": 0.023678354918956757,
1506
  "learning_rate": 4.378281476762576e-05,
1507
+ "loss": 0.0014,
1508
  "step": 205
1509
  },
1510
  {
1511
  "epoch": 20.6,
1512
+ "grad_norm": 0.0037609227001667023,
1513
  "learning_rate": 4.328833670911724e-05,
1514
+ "loss": 0.0013,
1515
  "step": 206
1516
  },
1517
  {
1518
  "epoch": 20.7,
1519
+ "grad_norm": 0.0009916301351040602,
1520
  "learning_rate": 4.2794526233501006e-05,
1521
  "loss": 0.0012,
1522
  "step": 207
1523
  },
1524
  {
1525
  "epoch": 20.8,
1526
+ "grad_norm": 0.0015994661953300238,
1527
  "learning_rate": 4.230143245817332e-05,
1528
+ "loss": 0.0013,
1529
  "step": 208
1530
  },
1531
  {
1532
  "epoch": 20.9,
1533
+ "grad_norm": 0.0015283317770808935,
1534
  "learning_rate": 4.180910442924312e-05,
1535
  "loss": 0.0013,
1536
  "step": 209
1537
  },
1538
  {
1539
  "epoch": 21.0,
1540
+ "grad_norm": 0.0010106766130775213,
1541
  "learning_rate": 4.131759111665349e-05,
1542
+ "loss": 0.0013,
1543
  "step": 210
1544
  },
1545
  {
1546
  "epoch": 21.1,
1547
+ "grad_norm": 0.0013305904576554894,
1548
  "learning_rate": 4.082694140931089e-05,
1549
  "loss": 0.0013,
1550
  "step": 211
1551
  },
1552
  {
1553
  "epoch": 21.2,
1554
+ "grad_norm": 0.0009969357633963227,
1555
  "learning_rate": 4.0337204110222346e-05,
1556
+ "loss": 0.0013,
1557
  "step": 212
1558
  },
1559
  {
1560
  "epoch": 21.3,
1561
+ "grad_norm": 0.00287212198600173,
1562
  "learning_rate": 3.98484279316412e-05,
1563
  "loss": 0.0013,
1564
  "step": 213
1565
  },
1566
  {
1567
  "epoch": 21.4,
1568
+ "grad_norm": 0.0017720722826197743,
1569
  "learning_rate": 3.936066149022191e-05,
1570
  "loss": 0.0013,
1571
  "step": 214
1572
  },
1573
  {
1574
  "epoch": 21.5,
1575
+ "grad_norm": 0.0008634412661194801,
1576
  "learning_rate": 3.887395330218429e-05,
1577
+ "loss": 0.0013,
1578
  "step": 215
1579
  },
1580
  {
1581
  "epoch": 21.6,
1582
+ "grad_norm": 0.0005406832206062973,
1583
  "learning_rate": 3.838835177848788e-05,
1584
+ "loss": 0.0012,
1585
  "step": 216
1586
  },
1587
  {
1588
  "epoch": 21.7,
1589
+ "grad_norm": 0.0013466555392369628,
1590
  "learning_rate": 3.790390522001662e-05,
1591
  "loss": 0.0013,
1592
  "step": 217
1593
  },
1594
  {
1595
  "epoch": 21.8,
1596
+ "grad_norm": 0.0007291603251360357,
1597
  "learning_rate": 3.742066181277458e-05,
1598
  "loss": 0.0013,
1599
  "step": 218
1600
  },
1601
  {
1602
  "epoch": 21.9,
1603
+ "grad_norm": 0.0028708558529615402,
1604
  "learning_rate": 3.6938669623093084e-05,
1605
+ "loss": 0.0013,
1606
  "step": 219
1607
  },
1608
  {
1609
  "epoch": 22.0,
1610
+ "grad_norm": 0.0008626269409433007,
1611
  "learning_rate": 3.6457976592849754e-05,
1612
  "loss": 0.0013,
1613
  "step": 220
1614
  },
1615
  {
1616
  "epoch": 22.1,
1617
+ "grad_norm": 0.0005283806240186095,
1618
  "learning_rate": 3.597863053469987e-05,
1619
+ "loss": 0.0013,
1620
  "step": 221
1621
  },
1622
  {
1623
  "epoch": 22.2,
1624
+ "grad_norm": 0.0006139932083897293,
1625
  "learning_rate": 3.550067912732069e-05,
1626
+ "loss": 0.0013,
1627
  "step": 222
1628
  },
1629
  {
1630
  "epoch": 22.3,
1631
+ "grad_norm": 0.001044943812303245,
1632
  "learning_rate": 3.502416991066904e-05,
1633
+ "loss": 0.0013,
1634
  "step": 223
1635
  },
1636
  {
1637
  "epoch": 22.4,
1638
+ "grad_norm": 0.0007318172720260918,
1639
  "learning_rate": 3.4549150281252636e-05,
1640
  "loss": 0.0013,
1641
  "step": 224
1642
  },
1643
  {
1644
  "epoch": 22.5,
1645
+ "grad_norm": 0.0004887795075774193,
1646
  "learning_rate": 3.4075667487415785e-05,
1647
  "loss": 0.0013,
1648
  "step": 225
1649
  },
1650
  {
1651
  "epoch": 22.5,
1652
+ "eval_loss": 0.030632568523287773,
1653
+ "eval_runtime": 7.0504,
1654
+ "eval_samples_per_second": 14.184,
1655
+ "eval_steps_per_second": 0.426,
1656
  "step": 225
1657
  },
1658
  {
1659
+ "epoch": 22.5,
1660
+ "step": 225,
1661
+ "total_flos": 3.454694940505375e+17,
1662
+ "train_loss": 0.013378481343388558,
1663
+ "train_runtime": 2354.7794,
1664
+ "train_samples_per_second": 6.733,
1665
+ "train_steps_per_second": 0.149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1666
  }
1667
  ],
1668
  "logging_steps": 1,
 
1677
  "early_stopping_threshold": 0.0
1678
  },
1679
  "attributes": {
1680
+ "early_stopping_patience_counter": 4
1681
  }
1682
  },
1683
  "TrainerControl": {
 
1691
  "attributes": {}
1692
  }
1693
  },
1694
+ "total_flos": 3.454694940505375e+17,
1695
  "train_batch_size": 48,
1696
  "trial_name": null,
1697
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eff46ab602c60d8c5d1c8d5d90dd3e078e4d5b0c7f9bfc0ed5d7c21920a4d63a
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4dc902acccebfb538dc58f222aff08d75a70fcbefd922c05e6f499e0e1cc414
3
  size 5432
training_eval_loss.png CHANGED
training_loss.png CHANGED