sizhkhy committed on
Commit
7601da0
·
verified ·
1 Parent(s): 5fb87ed
README.md CHANGED
@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  This model is a fine-tuned version of [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) on the Klystroglobal dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.0481
23
 
24
  ## Model description
25
 
@@ -45,10 +45,22 @@ The following hyperparameters were used during training:
45
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: cosine
47
  - lr_scheduler_warmup_ratio: 0.1
48
- - num_epochs: 1
49
 
50
  ### Training results
51
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
  ### Framework versions
 
19
 
20
  This model is a fine-tuned version of [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) on the Klystroglobal dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.0174
23
 
24
  ## Model description
25
 
 
45
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: cosine
47
  - lr_scheduler_warmup_ratio: 0.1
48
+ - num_epochs: 35
49
 
50
  ### Training results
51
 
52
+ | Training Loss | Epoch | Step | Validation Loss |
53
+ |:-------------:|:-----:|:----:|:---------------:|
54
+ | 0.0437 | 2.5 | 25 | 0.0340 |
55
+ | 0.0098 | 5.0 | 50 | 0.0166 |
56
+ | 0.0039 | 7.5 | 75 | 0.0165 |
57
+ | 0.0021 | 10.0 | 100 | 0.0174 |
58
+ | 0.0031 | 12.5 | 125 | 0.0145 |
59
+ | 0.0022 | 15.0 | 150 | 0.0235 |
60
+ | 0.0013 | 17.5 | 175 | 0.0187 |
61
+ | 0.0012 | 20.0 | 200 | 0.0246 |
62
+ | 0.0013 | 22.5 | 225 | 0.0264 |
63
+ | 0.0013 | 25.0 | 250 | 0.0277 |
64
 
65
 
66
  ### Framework versions
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "o_proj",
24
- "q_proj",
25
  "k_proj",
26
- "gate_proj",
27
  "up_proj",
 
 
28
  "v_proj",
29
- "down_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "k_proj",
 
24
  "up_proj",
25
+ "gate_proj",
26
+ "down_proj",
27
  "v_proj",
28
+ "q_proj",
29
+ "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c880d7f07bed6bd1deafc70a6cbafec879332cdd0d2b72d6d47889e9d417069
3
  size 1556140392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80bf440e224538c0d7fdaa085ba987418951442628ee238b39a518393c3690b
3
  size 1556140392
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 0.04813718795776367,
4
- "eval_runtime": 8.1182,
5
- "eval_samples_per_second": 12.318,
6
- "eval_steps_per_second": 0.37,
7
- "total_flos": 1.5338473968402432e+16,
8
- "train_loss": 0.09183733761310578,
9
- "train_runtime": 197.912,
10
- "train_samples_per_second": 2.289,
11
- "train_steps_per_second": 0.051
12
  }
 
1
  {
2
+ "epoch": 25.0,
3
+ "eval_loss": 0.017384245991706848,
4
+ "eval_runtime": 7.0041,
5
+ "eval_samples_per_second": 14.277,
6
+ "eval_steps_per_second": 0.428,
7
+ "total_flos": 3.832789293855867e+17,
8
+ "train_loss": 0.011958676076494158,
9
+ "train_runtime": 2822.6775,
10
+ "train_samples_per_second": 5.617,
11
+ "train_steps_per_second": 0.124
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 0.04813718795776367,
4
- "eval_runtime": 8.1182,
5
- "eval_samples_per_second": 12.318,
6
- "eval_steps_per_second": 0.37
7
  }
 
1
  {
2
+ "epoch": 25.0,
3
+ "eval_loss": 0.017384245991706848,
4
+ "eval_runtime": 7.0041,
5
+ "eval_samples_per_second": 14.277,
6
+ "eval_steps_per_second": 0.428
7
  }
experiment.config ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ vision_config {
2
+ vision_api: TEXT_DETECTION
3
+ feature_element: WORD
4
+ word_confidence_threshold: -0.1
5
+ return_raw_response: true
6
+ }
7
+ preprocess_config {
8
+ label_overlap_threshold: 0.4
9
+ num_processes: 4
10
+ rotation_fixer: VISION_BASED_FAST
11
+ }
12
+ train_config {
13
+ num_epochs: 1
14
+ }
15
+ version: "3.0.2"
16
+ field_extraction {
17
+ sub_exp_config {
18
+ model_config {
19
+ architecture {
20
+ nova {
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
model.bin ADDED
File without changes
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 1.5338473968402432e+16,
4
- "train_loss": 0.09183733761310578,
5
- "train_runtime": 197.912,
6
- "train_samples_per_second": 2.289,
7
- "train_steps_per_second": 0.051
8
  }
 
1
  {
2
+ "epoch": 25.0,
3
+ "total_flos": 3.832789293855867e+17,
4
+ "train_loss": 0.011958676076494158,
5
+ "train_runtime": 2822.6775,
6
+ "train_samples_per_second": 5.617,
7
+ "train_steps_per_second": 0.124
8
  }
trainer_log.jsonl CHANGED
@@ -1,11 +1,261 @@
1
- {"current_steps": 1, "total_steps": 10, "loss": 0.1531, "lr": 0.0001, "epoch": 0.1, "percentage": 10.0, "elapsed_time": "0:00:30", "remaining_time": "0:04:34"}
2
- {"current_steps": 2, "total_steps": 10, "loss": 0.1308, "lr": 9.698463103929542e-05, "epoch": 0.2, "percentage": 20.0, "elapsed_time": "0:00:47", "remaining_time": "0:03:09"}
3
- {"current_steps": 3, "total_steps": 10, "loss": 0.1702, "lr": 8.83022221559489e-05, "epoch": 0.3, "percentage": 30.0, "elapsed_time": "0:01:05", "remaining_time": "0:02:33"}
4
- {"current_steps": 4, "total_steps": 10, "loss": 0.0813, "lr": 7.500000000000001e-05, "epoch": 0.4, "percentage": 40.0, "elapsed_time": "0:01:24", "remaining_time": "0:02:06"}
5
- {"current_steps": 5, "total_steps": 10, "loss": 0.0786, "lr": 5.868240888334653e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:01:43", "remaining_time": "0:01:43"}
6
- {"current_steps": 6, "total_steps": 10, "loss": 0.0576, "lr": 4.131759111665349e-05, "epoch": 0.6, "percentage": 60.0, "elapsed_time": "0:01:53", "remaining_time": "0:01:15"}
7
- {"current_steps": 7, "total_steps": 10, "loss": 0.0715, "lr": 2.500000000000001e-05, "epoch": 0.7, "percentage": 70.0, "elapsed_time": "0:02:31", "remaining_time": "0:01:05"}
8
- {"current_steps": 8, "total_steps": 10, "loss": 0.0518, "lr": 1.1697777844051105e-05, "epoch": 0.8, "percentage": 80.0, "elapsed_time": "0:02:42", "remaining_time": "0:00:40"}
9
- {"current_steps": 9, "total_steps": 10, "loss": 0.0697, "lr": 3.0153689607045845e-06, "epoch": 0.9, "percentage": 90.0, "elapsed_time": "0:02:52", "remaining_time": "0:00:19"}
10
- {"current_steps": 10, "total_steps": 10, "loss": 0.0538, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:02:59", "remaining_time": "0:00:00"}
11
- {"current_steps": 10, "total_steps": 10, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:03:16", "remaining_time": "0:00:00"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 1, "total_steps": 350, "loss": 0.1531, "lr": 2.8571428571428573e-06, "epoch": 0.1, "percentage": 0.29, "elapsed_time": "0:00:27", "remaining_time": "2:38:08"}
2
+ {"current_steps": 2, "total_steps": 350, "loss": 0.1308, "lr": 5.7142857142857145e-06, "epoch": 0.2, "percentage": 0.57, "elapsed_time": "0:00:45", "remaining_time": "2:11:52"}
3
+ {"current_steps": 3, "total_steps": 350, "loss": 0.1619, "lr": 8.571428571428573e-06, "epoch": 0.3, "percentage": 0.86, "elapsed_time": "0:01:04", "remaining_time": "2:04:36"}
4
+ {"current_steps": 4, "total_steps": 350, "loss": 0.1325, "lr": 1.1428571428571429e-05, "epoch": 0.4, "percentage": 1.14, "elapsed_time": "0:01:17", "remaining_time": "1:51:35"}
5
+ {"current_steps": 5, "total_steps": 350, "loss": 0.1206, "lr": 1.4285714285714285e-05, "epoch": 0.5, "percentage": 1.43, "elapsed_time": "0:01:27", "remaining_time": "1:40:58"}
6
+ {"current_steps": 6, "total_steps": 350, "loss": 0.0875, "lr": 1.7142857142857145e-05, "epoch": 0.6, "percentage": 1.71, "elapsed_time": "0:01:38", "remaining_time": "1:33:50"}
7
+ {"current_steps": 7, "total_steps": 350, "loss": 0.1067, "lr": 2e-05, "epoch": 0.7, "percentage": 2.0, "elapsed_time": "0:02:04", "remaining_time": "1:41:17"}
8
+ {"current_steps": 8, "total_steps": 350, "loss": 0.0966, "lr": 2.2857142857142858e-05, "epoch": 0.8, "percentage": 2.29, "elapsed_time": "0:02:14", "remaining_time": "1:35:49"}
9
+ {"current_steps": 9, "total_steps": 350, "loss": 0.1088, "lr": 2.5714285714285714e-05, "epoch": 0.9, "percentage": 2.57, "elapsed_time": "0:02:24", "remaining_time": "1:31:24"}
10
+ {"current_steps": 10, "total_steps": 350, "loss": 0.0839, "lr": 2.857142857142857e-05, "epoch": 1.0, "percentage": 2.86, "elapsed_time": "0:02:37", "remaining_time": "1:29:20"}
11
+ {"current_steps": 11, "total_steps": 350, "loss": 0.0713, "lr": 3.142857142857143e-05, "epoch": 1.1, "percentage": 3.14, "elapsed_time": "0:02:50", "remaining_time": "1:27:31"}
12
+ {"current_steps": 12, "total_steps": 350, "loss": 0.0688, "lr": 3.428571428571429e-05, "epoch": 1.2, "percentage": 3.43, "elapsed_time": "0:03:00", "remaining_time": "1:24:44"}
13
+ {"current_steps": 13, "total_steps": 350, "loss": 0.0688, "lr": 3.7142857142857143e-05, "epoch": 1.3, "percentage": 3.71, "elapsed_time": "0:03:12", "remaining_time": "1:23:01"}
14
+ {"current_steps": 14, "total_steps": 350, "loss": 0.0668, "lr": 4e-05, "epoch": 1.4, "percentage": 4.0, "elapsed_time": "0:03:26", "remaining_time": "1:22:39"}
15
+ {"current_steps": 15, "total_steps": 350, "loss": 0.0613, "lr": 4.2857142857142856e-05, "epoch": 1.5, "percentage": 4.29, "elapsed_time": "0:03:40", "remaining_time": "1:22:10"}
16
+ {"current_steps": 16, "total_steps": 350, "loss": 0.052, "lr": 4.5714285714285716e-05, "epoch": 1.6, "percentage": 4.57, "elapsed_time": "0:03:53", "remaining_time": "1:21:24"}
17
+ {"current_steps": 17, "total_steps": 350, "loss": 0.0581, "lr": 4.8571428571428576e-05, "epoch": 1.7, "percentage": 4.86, "elapsed_time": "0:04:07", "remaining_time": "1:20:46"}
18
+ {"current_steps": 18, "total_steps": 350, "loss": 0.0482, "lr": 5.142857142857143e-05, "epoch": 1.8, "percentage": 5.14, "elapsed_time": "0:04:19", "remaining_time": "1:19:48"}
19
+ {"current_steps": 19, "total_steps": 350, "loss": 0.0533, "lr": 5.428571428571428e-05, "epoch": 1.9, "percentage": 5.43, "elapsed_time": "0:04:30", "remaining_time": "1:18:32"}
20
+ {"current_steps": 20, "total_steps": 350, "loss": 0.0513, "lr": 5.714285714285714e-05, "epoch": 2.0, "percentage": 5.71, "elapsed_time": "0:04:35", "remaining_time": "1:15:50"}
21
+ {"current_steps": 21, "total_steps": 350, "loss": 0.042, "lr": 6e-05, "epoch": 2.1, "percentage": 6.0, "elapsed_time": "0:04:51", "remaining_time": "1:16:05"}
22
+ {"current_steps": 22, "total_steps": 350, "loss": 0.0401, "lr": 6.285714285714286e-05, "epoch": 2.2, "percentage": 6.29, "elapsed_time": "0:05:05", "remaining_time": "1:15:51"}
23
+ {"current_steps": 23, "total_steps": 350, "loss": 0.0382, "lr": 6.571428571428571e-05, "epoch": 2.3, "percentage": 6.57, "elapsed_time": "0:05:16", "remaining_time": "1:14:57"}
24
+ {"current_steps": 24, "total_steps": 350, "loss": 0.034, "lr": 6.857142857142858e-05, "epoch": 2.4, "percentage": 6.86, "elapsed_time": "0:05:30", "remaining_time": "1:14:49"}
25
+ {"current_steps": 25, "total_steps": 350, "loss": 0.0437, "lr": 7.142857142857143e-05, "epoch": 2.5, "percentage": 7.14, "elapsed_time": "0:05:44", "remaining_time": "1:14:38"}
26
+ {"current_steps": 25, "total_steps": 350, "eval_loss": 0.03402441740036011, "epoch": 2.5, "percentage": 7.14, "elapsed_time": "0:06:03", "remaining_time": "1:18:51"}
27
+ {"current_steps": 26, "total_steps": 350, "loss": 0.0422, "lr": 7.428571428571429e-05, "epoch": 2.6, "percentage": 7.43, "elapsed_time": "0:06:16", "remaining_time": "1:18:17"}
28
+ {"current_steps": 27, "total_steps": 350, "loss": 0.0264, "lr": 7.714285714285715e-05, "epoch": 2.7, "percentage": 7.71, "elapsed_time": "0:06:30", "remaining_time": "1:17:48"}
29
+ {"current_steps": 28, "total_steps": 350, "loss": 0.0377, "lr": 8e-05, "epoch": 2.8, "percentage": 8.0, "elapsed_time": "0:06:41", "remaining_time": "1:16:57"}
30
+ {"current_steps": 29, "total_steps": 350, "loss": 0.0289, "lr": 8.285714285714287e-05, "epoch": 2.9, "percentage": 8.29, "elapsed_time": "0:06:54", "remaining_time": "1:16:23"}
31
+ {"current_steps": 30, "total_steps": 350, "loss": 0.0299, "lr": 8.571428571428571e-05, "epoch": 3.0, "percentage": 8.57, "elapsed_time": "0:06:59", "remaining_time": "1:14:35"}
32
+ {"current_steps": 31, "total_steps": 350, "loss": 0.0216, "lr": 8.857142857142857e-05, "epoch": 3.1, "percentage": 8.86, "elapsed_time": "0:07:15", "remaining_time": "1:14:40"}
33
+ {"current_steps": 32, "total_steps": 350, "loss": 0.0359, "lr": 9.142857142857143e-05, "epoch": 3.2, "percentage": 9.14, "elapsed_time": "0:07:26", "remaining_time": "1:13:56"}
34
+ {"current_steps": 33, "total_steps": 350, "loss": 0.0233, "lr": 9.428571428571429e-05, "epoch": 3.3, "percentage": 9.43, "elapsed_time": "0:07:39", "remaining_time": "1:13:36"}
35
+ {"current_steps": 34, "total_steps": 350, "loss": 0.0254, "lr": 9.714285714285715e-05, "epoch": 3.4, "percentage": 9.71, "elapsed_time": "0:07:52", "remaining_time": "1:13:14"}
36
+ {"current_steps": 35, "total_steps": 350, "loss": 0.0202, "lr": 0.0001, "epoch": 3.5, "percentage": 10.0, "elapsed_time": "0:08:07", "remaining_time": "1:13:07"}
37
+ {"current_steps": 36, "total_steps": 350, "loss": 0.0197, "lr": 9.999751334779716e-05, "epoch": 3.6, "percentage": 10.29, "elapsed_time": "0:08:23", "remaining_time": "1:13:13"}
38
+ {"current_steps": 37, "total_steps": 350, "loss": 0.0206, "lr": 9.999005363852618e-05, "epoch": 3.7, "percentage": 10.57, "elapsed_time": "0:08:38", "remaining_time": "1:13:05"}
39
+ {"current_steps": 38, "total_steps": 350, "loss": 0.0197, "lr": 9.997762161417517e-05, "epoch": 3.8, "percentage": 10.86, "elapsed_time": "0:08:51", "remaining_time": "1:12:44"}
40
+ {"current_steps": 39, "total_steps": 350, "loss": 0.0178, "lr": 9.996021851130897e-05, "epoch": 3.9, "percentage": 11.14, "elapsed_time": "0:09:03", "remaining_time": "1:12:14"}
41
+ {"current_steps": 40, "total_steps": 350, "loss": 0.0141, "lr": 9.993784606094612e-05, "epoch": 4.0, "percentage": 11.43, "elapsed_time": "0:09:10", "remaining_time": "1:11:03"}
42
+ {"current_steps": 41, "total_steps": 350, "loss": 0.012, "lr": 9.991050648838675e-05, "epoch": 4.1, "percentage": 11.71, "elapsed_time": "0:09:24", "remaining_time": "1:10:51"}
43
+ {"current_steps": 42, "total_steps": 350, "loss": 0.0124, "lr": 9.987820251299122e-05, "epoch": 4.2, "percentage": 12.0, "elapsed_time": "0:09:34", "remaining_time": "1:10:13"}
44
+ {"current_steps": 43, "total_steps": 350, "loss": 0.017, "lr": 9.984093734790956e-05, "epoch": 4.3, "percentage": 12.29, "elapsed_time": "0:09:47", "remaining_time": "1:09:55"}
45
+ {"current_steps": 44, "total_steps": 350, "loss": 0.0132, "lr": 9.979871469976196e-05, "epoch": 4.4, "percentage": 12.57, "elapsed_time": "0:10:01", "remaining_time": "1:09:40"}
46
+ {"current_steps": 45, "total_steps": 350, "loss": 0.0169, "lr": 9.975153876827008e-05, "epoch": 4.5, "percentage": 12.86, "elapsed_time": "0:10:14", "remaining_time": "1:09:23"}
47
+ {"current_steps": 46, "total_steps": 350, "loss": 0.0145, "lr": 9.969941424583926e-05, "epoch": 4.6, "percentage": 13.14, "elapsed_time": "0:10:26", "remaining_time": "1:09:01"}
48
+ {"current_steps": 47, "total_steps": 350, "loss": 0.0151, "lr": 9.964234631709187e-05, "epoch": 4.7, "percentage": 13.43, "elapsed_time": "0:10:40", "remaining_time": "1:08:51"}
49
+ {"current_steps": 48, "total_steps": 350, "loss": 0.011, "lr": 9.958034065835151e-05, "epoch": 4.8, "percentage": 13.71, "elapsed_time": "0:10:54", "remaining_time": "1:08:38"}
50
+ {"current_steps": 49, "total_steps": 350, "loss": 0.012, "lr": 9.951340343707852e-05, "epoch": 4.9, "percentage": 14.0, "elapsed_time": "0:11:08", "remaining_time": "1:08:24"}
51
+ {"current_steps": 50, "total_steps": 350, "loss": 0.0098, "lr": 9.944154131125642e-05, "epoch": 5.0, "percentage": 14.29, "elapsed_time": "0:11:13", "remaining_time": "1:07:21"}
52
+ {"current_steps": 50, "total_steps": 350, "eval_loss": 0.016623547300696373, "epoch": 5.0, "percentage": 14.29, "elapsed_time": "0:11:29", "remaining_time": "1:08:54"}
53
+ {"current_steps": 51, "total_steps": 350, "loss": 0.011, "lr": 9.936476142872979e-05, "epoch": 5.1, "percentage": 14.57, "elapsed_time": "0:11:46", "remaining_time": "1:09:03"}
54
+ {"current_steps": 52, "total_steps": 350, "loss": 0.0082, "lr": 9.928307142649316e-05, "epoch": 5.2, "percentage": 14.86, "elapsed_time": "0:12:00", "remaining_time": "1:08:49"}
55
+ {"current_steps": 53, "total_steps": 350, "loss": 0.0069, "lr": 9.919647942993148e-05, "epoch": 5.3, "percentage": 15.14, "elapsed_time": "0:12:15", "remaining_time": "1:08:40"}
56
+ {"current_steps": 54, "total_steps": 350, "loss": 0.0091, "lr": 9.910499405201195e-05, "epoch": 5.4, "percentage": 15.43, "elapsed_time": "0:12:28", "remaining_time": "1:08:23"}
57
+ {"current_steps": 55, "total_steps": 350, "loss": 0.0062, "lr": 9.900862439242719e-05, "epoch": 5.5, "percentage": 15.71, "elapsed_time": "0:13:06", "remaining_time": "1:10:19"}
58
+ {"current_steps": 56, "total_steps": 350, "loss": 0.0052, "lr": 9.890738003669029e-05, "epoch": 5.6, "percentage": 16.0, "elapsed_time": "0:13:19", "remaining_time": "1:09:59"}
59
+ {"current_steps": 57, "total_steps": 350, "loss": 0.0076, "lr": 9.880127105518122e-05, "epoch": 5.7, "percentage": 16.29, "elapsed_time": "0:13:34", "remaining_time": "1:09:44"}
60
+ {"current_steps": 58, "total_steps": 350, "loss": 0.0107, "lr": 9.869030800214532e-05, "epoch": 5.8, "percentage": 16.57, "elapsed_time": "0:13:44", "remaining_time": "1:09:11"}
61
+ {"current_steps": 59, "total_steps": 350, "loss": 0.0081, "lr": 9.857450191464337e-05, "epoch": 5.9, "percentage": 16.86, "elapsed_time": "0:13:56", "remaining_time": "1:08:46"}
62
+ {"current_steps": 60, "total_steps": 350, "loss": 0.0063, "lr": 9.84538643114539e-05, "epoch": 6.0, "percentage": 17.14, "elapsed_time": "0:14:01", "remaining_time": "1:07:49"}
63
+ {"current_steps": 61, "total_steps": 350, "loss": 0.0037, "lr": 9.832840719192736e-05, "epoch": 6.1, "percentage": 17.43, "elapsed_time": "0:14:17", "remaining_time": "1:07:42"}
64
+ {"current_steps": 62, "total_steps": 350, "loss": 0.0049, "lr": 9.819814303479267e-05, "epoch": 6.2, "percentage": 17.71, "elapsed_time": "0:14:31", "remaining_time": "1:07:28"}
65
+ {"current_steps": 63, "total_steps": 350, "loss": 0.0051, "lr": 9.806308479691595e-05, "epoch": 6.3, "percentage": 18.0, "elapsed_time": "0:14:45", "remaining_time": "1:07:15"}
66
+ {"current_steps": 64, "total_steps": 350, "loss": 0.0052, "lr": 9.792324591201179e-05, "epoch": 6.4, "percentage": 18.29, "elapsed_time": "0:15:02", "remaining_time": "1:07:14"}
67
+ {"current_steps": 65, "total_steps": 350, "loss": 0.0046, "lr": 9.777864028930705e-05, "epoch": 6.5, "percentage": 18.57, "elapsed_time": "0:15:18", "remaining_time": "1:07:06"}
68
+ {"current_steps": 66, "total_steps": 350, "loss": 0.0064, "lr": 9.76292823121573e-05, "epoch": 6.6, "percentage": 18.86, "elapsed_time": "0:15:30", "remaining_time": "1:06:45"}
69
+ {"current_steps": 67, "total_steps": 350, "loss": 0.0044, "lr": 9.747518683661631e-05, "epoch": 6.7, "percentage": 19.14, "elapsed_time": "0:15:41", "remaining_time": "1:06:15"}
70
+ {"current_steps": 68, "total_steps": 350, "loss": 0.0064, "lr": 9.731636918995821e-05, "epoch": 6.8, "percentage": 19.43, "elapsed_time": "0:15:51", "remaining_time": "1:05:46"}
71
+ {"current_steps": 69, "total_steps": 350, "loss": 0.0045, "lr": 9.715284516915303e-05, "epoch": 6.9, "percentage": 19.71, "elapsed_time": "0:16:01", "remaining_time": "1:05:17"}
72
+ {"current_steps": 70, "total_steps": 350, "loss": 0.0067, "lr": 9.698463103929542e-05, "epoch": 7.0, "percentage": 20.0, "elapsed_time": "0:16:06", "remaining_time": "1:04:26"}
73
+ {"current_steps": 71, "total_steps": 350, "loss": 0.0037, "lr": 9.681174353198687e-05, "epoch": 7.1, "percentage": 20.29, "elapsed_time": "0:16:17", "remaining_time": "1:03:59"}
74
+ {"current_steps": 72, "total_steps": 350, "loss": 0.0027, "lr": 9.663419984367139e-05, "epoch": 7.2, "percentage": 20.57, "elapsed_time": "0:16:29", "remaining_time": "1:03:41"}
75
+ {"current_steps": 73, "total_steps": 350, "loss": 0.0046, "lr": 9.645201763392513e-05, "epoch": 7.3, "percentage": 20.86, "elapsed_time": "0:16:40", "remaining_time": "1:03:16"}
76
+ {"current_steps": 74, "total_steps": 350, "loss": 0.0054, "lr": 9.626521502369984e-05, "epoch": 7.4, "percentage": 21.14, "elapsed_time": "0:16:50", "remaining_time": "1:02:49"}
77
+ {"current_steps": 75, "total_steps": 350, "loss": 0.0039, "lr": 9.607381059352038e-05, "epoch": 7.5, "percentage": 21.43, "elapsed_time": "0:17:01", "remaining_time": "1:02:24"}
78
+ {"current_steps": 75, "total_steps": 350, "eval_loss": 0.016471313312649727, "epoch": 7.5, "percentage": 21.43, "elapsed_time": "0:17:08", "remaining_time": "1:02:49"}
79
+ {"current_steps": 76, "total_steps": 350, "loss": 0.0035, "lr": 9.587782338163669e-05, "epoch": 7.6, "percentage": 21.71, "elapsed_time": "0:17:18", "remaining_time": "1:02:23"}
80
+ {"current_steps": 77, "total_steps": 350, "loss": 0.0047, "lr": 9.567727288213005e-05, "epoch": 7.7, "percentage": 22.0, "elapsed_time": "0:17:28", "remaining_time": "1:01:58"}
81
+ {"current_steps": 78, "total_steps": 350, "loss": 0.0028, "lr": 9.547217904297411e-05, "epoch": 7.8, "percentage": 22.29, "elapsed_time": "0:17:39", "remaining_time": "1:01:33"}
82
+ {"current_steps": 79, "total_steps": 350, "loss": 0.0054, "lr": 9.526256226405075e-05, "epoch": 7.9, "percentage": 22.57, "elapsed_time": "0:17:50", "remaining_time": "1:01:10"}
83
+ {"current_steps": 80, "total_steps": 350, "loss": 0.0025, "lr": 9.504844339512095e-05, "epoch": 8.0, "percentage": 22.86, "elapsed_time": "0:17:54", "remaining_time": "1:00:26"}
84
+ {"current_steps": 81, "total_steps": 350, "loss": 0.0037, "lr": 9.482984373375105e-05, "epoch": 8.1, "percentage": 23.14, "elapsed_time": "0:18:05", "remaining_time": "1:00:04"}
85
+ {"current_steps": 82, "total_steps": 350, "loss": 0.0026, "lr": 9.460678502319418e-05, "epoch": 8.2, "percentage": 23.43, "elapsed_time": "0:18:16", "remaining_time": "0:59:42"}
86
+ {"current_steps": 83, "total_steps": 350, "loss": 0.0049, "lr": 9.437928945022771e-05, "epoch": 8.3, "percentage": 23.71, "elapsed_time": "0:18:26", "remaining_time": "0:59:20"}
87
+ {"current_steps": 84, "total_steps": 350, "loss": 0.0037, "lr": 9.414737964294636e-05, "epoch": 8.4, "percentage": 24.0, "elapsed_time": "0:18:37", "remaining_time": "0:58:58"}
88
+ {"current_steps": 85, "total_steps": 350, "loss": 0.0025, "lr": 9.391107866851143e-05, "epoch": 8.5, "percentage": 24.29, "elapsed_time": "0:18:48", "remaining_time": "0:58:36"}
89
+ {"current_steps": 86, "total_steps": 350, "loss": 0.0032, "lr": 9.367041003085649e-05, "epoch": 8.6, "percentage": 24.57, "elapsed_time": "0:18:58", "remaining_time": "0:58:14"}
90
+ {"current_steps": 87, "total_steps": 350, "loss": 0.0028, "lr": 9.342539766834946e-05, "epoch": 8.7, "percentage": 24.86, "elapsed_time": "0:19:09", "remaining_time": "0:57:53"}
91
+ {"current_steps": 88, "total_steps": 350, "loss": 0.0027, "lr": 9.317606595141154e-05, "epoch": 8.8, "percentage": 25.14, "elapsed_time": "0:19:19", "remaining_time": "0:57:32"}
92
+ {"current_steps": 89, "total_steps": 350, "loss": 0.0029, "lr": 9.292243968009331e-05, "epoch": 8.9, "percentage": 25.43, "elapsed_time": "0:19:30", "remaining_time": "0:57:12"}
93
+ {"current_steps": 90, "total_steps": 350, "loss": 0.0034, "lr": 9.266454408160779e-05, "epoch": 9.0, "percentage": 25.71, "elapsed_time": "0:19:35", "remaining_time": "0:56:34"}
94
+ {"current_steps": 91, "total_steps": 350, "loss": 0.0023, "lr": 9.24024048078213e-05, "epoch": 9.1, "percentage": 26.0, "elapsed_time": "0:19:46", "remaining_time": "0:56:15"}
95
+ {"current_steps": 92, "total_steps": 350, "loss": 0.0024, "lr": 9.213604793270196e-05, "epoch": 9.2, "percentage": 26.29, "elapsed_time": "0:19:57", "remaining_time": "0:55:56"}
96
+ {"current_steps": 93, "total_steps": 350, "loss": 0.0031, "lr": 9.186549994972618e-05, "epoch": 9.3, "percentage": 26.57, "elapsed_time": "0:20:07", "remaining_time": "0:55:37"}
97
+ {"current_steps": 94, "total_steps": 350, "loss": 0.0029, "lr": 9.159078776924346e-05, "epoch": 9.4, "percentage": 26.86, "elapsed_time": "0:20:18", "remaining_time": "0:55:17"}
98
+ {"current_steps": 95, "total_steps": 350, "loss": 0.0017, "lr": 9.131193871579975e-05, "epoch": 9.5, "percentage": 27.14, "elapsed_time": "0:20:28", "remaining_time": "0:54:57"}
99
+ {"current_steps": 96, "total_steps": 350, "loss": 0.0022, "lr": 9.102898052541958e-05, "epoch": 9.6, "percentage": 27.43, "elapsed_time": "0:20:39", "remaining_time": "0:54:38"}
100
+ {"current_steps": 97, "total_steps": 350, "loss": 0.0025, "lr": 9.074194134284726e-05, "epoch": 9.7, "percentage": 27.71, "elapsed_time": "0:20:49", "remaining_time": "0:54:19"}
101
+ {"current_steps": 98, "total_steps": 350, "loss": 0.002, "lr": 9.045084971874738e-05, "epoch": 9.8, "percentage": 28.0, "elapsed_time": "0:21:00", "remaining_time": "0:54:00"}
102
+ {"current_steps": 99, "total_steps": 350, "loss": 0.0026, "lr": 9.015573460686509e-05, "epoch": 9.9, "percentage": 28.29, "elapsed_time": "0:21:10", "remaining_time": "0:53:41"}
103
+ {"current_steps": 100, "total_steps": 350, "loss": 0.0021, "lr": 8.985662536114613e-05, "epoch": 10.0, "percentage": 28.57, "elapsed_time": "0:21:15", "remaining_time": "0:53:08"}
104
+ {"current_steps": 100, "total_steps": 350, "eval_loss": 0.017384279519319534, "epoch": 10.0, "percentage": 28.57, "elapsed_time": "0:21:22", "remaining_time": "0:53:26"}
105
+ {"current_steps": 101, "total_steps": 350, "loss": 0.0017, "lr": 8.955355173281708e-05, "epoch": 10.1, "percentage": 28.86, "elapsed_time": "0:21:51", "remaining_time": "0:53:54"}
106
+ {"current_steps": 102, "total_steps": 350, "loss": 0.0018, "lr": 8.924654386742613e-05, "epoch": 10.2, "percentage": 29.14, "elapsed_time": "0:22:02", "remaining_time": "0:53:34"}
107
+ {"current_steps": 103, "total_steps": 350, "loss": 0.0025, "lr": 8.89356323018447e-05, "epoch": 10.3, "percentage": 29.43, "elapsed_time": "0:22:12", "remaining_time": "0:53:15"}
108
+ {"current_steps": 104, "total_steps": 350, "loss": 0.0017, "lr": 8.862084796122998e-05, "epoch": 10.4, "percentage": 29.71, "elapsed_time": "0:22:22", "remaining_time": "0:52:56"}
109
+ {"current_steps": 105, "total_steps": 350, "loss": 0.0029, "lr": 8.83022221559489e-05, "epoch": 10.5, "percentage": 30.0, "elapsed_time": "0:22:33", "remaining_time": "0:52:37"}
110
+ {"current_steps": 106, "total_steps": 350, "loss": 0.0022, "lr": 8.797978657846391e-05, "epoch": 10.6, "percentage": 30.29, "elapsed_time": "0:22:43", "remaining_time": "0:52:18"}
111
+ {"current_steps": 107, "total_steps": 350, "loss": 0.0024, "lr": 8.765357330018056e-05, "epoch": 10.7, "percentage": 30.57, "elapsed_time": "0:22:54", "remaining_time": "0:52:00"}
112
+ {"current_steps": 108, "total_steps": 350, "loss": 0.0028, "lr": 8.732361476825752e-05, "epoch": 10.8, "percentage": 30.86, "elapsed_time": "0:23:04", "remaining_time": "0:51:42"}
113
+ {"current_steps": 109, "total_steps": 350, "loss": 0.0018, "lr": 8.69899438023792e-05, "epoch": 10.9, "percentage": 31.14, "elapsed_time": "0:23:15", "remaining_time": "0:51:24"}
114
+ {"current_steps": 110, "total_steps": 350, "loss": 0.0018, "lr": 8.665259359149132e-05, "epoch": 11.0, "percentage": 31.43, "elapsed_time": "0:23:19", "remaining_time": "0:50:53"}
115
+ {"current_steps": 111, "total_steps": 350, "loss": 0.0028, "lr": 8.631159769049965e-05, "epoch": 11.1, "percentage": 31.71, "elapsed_time": "0:23:30", "remaining_time": "0:50:36"}
116
+ {"current_steps": 112, "total_steps": 350, "loss": 0.0018, "lr": 8.596699001693255e-05, "epoch": 11.2, "percentage": 32.0, "elapsed_time": "0:23:40", "remaining_time": "0:50:18"}
117
+ {"current_steps": 113, "total_steps": 350, "loss": 0.0018, "lr": 8.561880484756725e-05, "epoch": 11.3, "percentage": 32.29, "elapsed_time": "0:23:50", "remaining_time": "0:50:00"}
118
+ {"current_steps": 114, "total_steps": 350, "loss": 0.0036, "lr": 8.526707681502044e-05, "epoch": 11.4, "percentage": 32.57, "elapsed_time": "0:24:01", "remaining_time": "0:49:43"}
119
+ {"current_steps": 115, "total_steps": 350, "loss": 0.0019, "lr": 8.491184090430364e-05, "epoch": 11.5, "percentage": 32.86, "elapsed_time": "0:24:12", "remaining_time": "0:49:27"}
120
+ {"current_steps": 116, "total_steps": 350, "loss": 0.0024, "lr": 8.455313244934324e-05, "epoch": 11.6, "percentage": 33.14, "elapsed_time": "0:24:22", "remaining_time": "0:49:10"}
121
+ {"current_steps": 117, "total_steps": 350, "loss": 0.0033, "lr": 8.419098712946601e-05, "epoch": 11.7, "percentage": 33.43, "elapsed_time": "0:24:32", "remaining_time": "0:48:53"}
122
+ {"current_steps": 118, "total_steps": 350, "loss": 0.0032, "lr": 8.382544096585027e-05, "epoch": 11.8, "percentage": 33.71, "elapsed_time": "0:24:43", "remaining_time": "0:48:36"}
123
+ {"current_steps": 119, "total_steps": 350, "loss": 0.0032, "lr": 8.345653031794292e-05, "epoch": 11.9, "percentage": 34.0, "elapsed_time": "0:24:53", "remaining_time": "0:48:19"}
124
+ {"current_steps": 120, "total_steps": 350, "loss": 0.0044, "lr": 8.308429187984297e-05, "epoch": 12.0, "percentage": 34.29, "elapsed_time": "0:24:58", "remaining_time": "0:47:51"}
125
+ {"current_steps": 121, "total_steps": 350, "loss": 0.0025, "lr": 8.270876267665173e-05, "epoch": 12.1, "percentage": 34.57, "elapsed_time": "0:25:08", "remaining_time": "0:47:35"}
126
+ {"current_steps": 122, "total_steps": 350, "loss": 0.002, "lr": 8.232998006078997e-05, "epoch": 12.2, "percentage": 34.86, "elapsed_time": "0:25:19", "remaining_time": "0:47:19"}
127
+ {"current_steps": 123, "total_steps": 350, "loss": 0.0023, "lr": 8.19479817082828e-05, "epoch": 12.3, "percentage": 35.14, "elapsed_time": "0:25:29", "remaining_time": "0:47:03"}
128
+ {"current_steps": 124, "total_steps": 350, "loss": 0.0025, "lr": 8.156280561501195e-05, "epoch": 12.4, "percentage": 35.43, "elapsed_time": "0:25:40", "remaining_time": "0:46:47"}
129
+ {"current_steps": 125, "total_steps": 350, "loss": 0.0031, "lr": 8.117449009293668e-05, "epoch": 12.5, "percentage": 35.71, "elapsed_time": "0:25:50", "remaining_time": "0:46:30"}
130
+ {"current_steps": 125, "total_steps": 350, "eval_loss": 0.014472348615527153, "epoch": 12.5, "percentage": 35.71, "elapsed_time": "0:25:57", "remaining_time": "0:46:43"}
131
+ {"current_steps": 126, "total_steps": 350, "loss": 0.0023, "lr": 8.07830737662829e-05, "epoch": 12.6, "percentage": 36.0, "elapsed_time": "0:26:07", "remaining_time": "0:46:27"}
132
+ {"current_steps": 127, "total_steps": 350, "loss": 0.0027, "lr": 8.038859556770151e-05, "epoch": 12.7, "percentage": 36.29, "elapsed_time": "0:26:18", "remaining_time": "0:46:11"}
133
+ {"current_steps": 128, "total_steps": 350, "loss": 0.0019, "lr": 7.999109473439569e-05, "epoch": 12.8, "percentage": 36.57, "elapsed_time": "0:26:28", "remaining_time": "0:45:55"}
134
+ {"current_steps": 129, "total_steps": 350, "loss": 0.0026, "lr": 7.959061080421839e-05, "epoch": 12.9, "percentage": 36.86, "elapsed_time": "0:26:39", "remaining_time": "0:45:39"}
135
+ {"current_steps": 130, "total_steps": 350, "loss": 0.0016, "lr": 7.91871836117395e-05, "epoch": 13.0, "percentage": 37.14, "elapsed_time": "0:26:43", "remaining_time": "0:45:14"}
136
+ {"current_steps": 131, "total_steps": 350, "loss": 0.0018, "lr": 7.878085328428369e-05, "epoch": 13.1, "percentage": 37.43, "elapsed_time": "0:26:54", "remaining_time": "0:44:58"}
137
+ {"current_steps": 132, "total_steps": 350, "loss": 0.0037, "lr": 7.83716602379391e-05, "epoch": 13.2, "percentage": 37.71, "elapsed_time": "0:27:04", "remaining_time": "0:44:43"}
138
+ {"current_steps": 133, "total_steps": 350, "loss": 0.0021, "lr": 7.795964517353735e-05, "epoch": 13.3, "percentage": 38.0, "elapsed_time": "0:27:14", "remaining_time": "0:44:27"}
139
+ {"current_steps": 134, "total_steps": 350, "loss": 0.0023, "lr": 7.754484907260513e-05, "epoch": 13.4, "percentage": 38.29, "elapsed_time": "0:27:25", "remaining_time": "0:44:11"}
140
+ {"current_steps": 135, "total_steps": 350, "loss": 0.0022, "lr": 7.712731319328798e-05, "epoch": 13.5, "percentage": 38.57, "elapsed_time": "0:27:35", "remaining_time": "0:43:56"}
141
+ {"current_steps": 136, "total_steps": 350, "loss": 0.0016, "lr": 7.670707906624644e-05, "epoch": 13.6, "percentage": 38.86, "elapsed_time": "0:27:45", "remaining_time": "0:43:41"}
142
+ {"current_steps": 137, "total_steps": 350, "loss": 0.0024, "lr": 7.628418849052523e-05, "epoch": 13.7, "percentage": 39.14, "elapsed_time": "0:27:56", "remaining_time": "0:43:26"}
143
+ {"current_steps": 138, "total_steps": 350, "loss": 0.0016, "lr": 7.585868352939563e-05, "epoch": 13.8, "percentage": 39.43, "elapsed_time": "0:28:06", "remaining_time": "0:43:10"}
144
+ {"current_steps": 139, "total_steps": 350, "loss": 0.0018, "lr": 7.543060650617158e-05, "epoch": 13.9, "percentage": 39.71, "elapsed_time": "0:28:16", "remaining_time": "0:42:55"}
145
+ {"current_steps": 140, "total_steps": 350, "loss": 0.0025, "lr": 7.500000000000001e-05, "epoch": 14.0, "percentage": 40.0, "elapsed_time": "0:28:21", "remaining_time": "0:42:32"}
146
+ {"current_steps": 141, "total_steps": 350, "loss": 0.0026, "lr": 7.456690684162557e-05, "epoch": 14.1, "percentage": 40.29, "elapsed_time": "0:28:32", "remaining_time": "0:42:17"}
147
+ {"current_steps": 142, "total_steps": 350, "loss": 0.0019, "lr": 7.413137010913054e-05, "epoch": 14.2, "percentage": 40.57, "elapsed_time": "0:28:42", "remaining_time": "0:42:02"}
148
+ {"current_steps": 143, "total_steps": 350, "loss": 0.002, "lr": 7.369343312364993e-05, "epoch": 14.3, "percentage": 40.86, "elapsed_time": "0:28:52", "remaining_time": "0:41:48"}
149
+ {"current_steps": 144, "total_steps": 350, "loss": 0.0015, "lr": 7.325313944506254e-05, "epoch": 14.4, "percentage": 41.14, "elapsed_time": "0:29:03", "remaining_time": "0:41:33"}
150
+ {"current_steps": 145, "total_steps": 350, "loss": 0.0018, "lr": 7.281053286765815e-05, "epoch": 14.5, "percentage": 41.43, "elapsed_time": "0:29:13", "remaining_time": "0:41:18"}
151
+ {"current_steps": 146, "total_steps": 350, "loss": 0.0024, "lr": 7.236565741578163e-05, "epoch": 14.6, "percentage": 41.71, "elapsed_time": "0:29:24", "remaining_time": "0:41:04"}
152
+ {"current_steps": 147, "total_steps": 350, "loss": 0.0049, "lr": 7.191855733945387e-05, "epoch": 14.7, "percentage": 42.0, "elapsed_time": "0:29:34", "remaining_time": "0:40:50"}
153
+ {"current_steps": 148, "total_steps": 350, "loss": 0.0024, "lr": 7.146927710997047e-05, "epoch": 14.8, "percentage": 42.29, "elapsed_time": "0:29:44", "remaining_time": "0:40:36"}
154
+ {"current_steps": 149, "total_steps": 350, "loss": 0.0018, "lr": 7.101786141547828e-05, "epoch": 14.9, "percentage": 42.57, "elapsed_time": "0:29:55", "remaining_time": "0:40:21"}
155
+ {"current_steps": 150, "total_steps": 350, "loss": 0.0022, "lr": 7.056435515653059e-05, "epoch": 15.0, "percentage": 42.86, "elapsed_time": "0:29:59", "remaining_time": "0:39:59"}
156
+ {"current_steps": 150, "total_steps": 350, "eval_loss": 0.023497436195611954, "epoch": 15.0, "percentage": 42.86, "elapsed_time": "0:30:06", "remaining_time": "0:40:09"}
157
+ {"current_steps": 151, "total_steps": 350, "loss": 0.0015, "lr": 7.010880344162088e-05, "epoch": 15.1, "percentage": 43.14, "elapsed_time": "0:30:17", "remaining_time": "0:39:54"}
158
+ {"current_steps": 152, "total_steps": 350, "loss": 0.0022, "lr": 6.965125158269619e-05, "epoch": 15.2, "percentage": 43.43, "elapsed_time": "0:30:27", "remaining_time": "0:39:40"}
159
+ {"current_steps": 153, "total_steps": 350, "loss": 0.0029, "lr": 6.919174509065004e-05, "epoch": 15.3, "percentage": 43.71, "elapsed_time": "0:30:37", "remaining_time": "0:39:26"}
160
+ {"current_steps": 154, "total_steps": 350, "loss": 0.0022, "lr": 6.873032967079561e-05, "epoch": 15.4, "percentage": 44.0, "elapsed_time": "0:30:47", "remaining_time": "0:39:11"}
161
+ {"current_steps": 155, "total_steps": 350, "loss": 0.0033, "lr": 6.826705121831976e-05, "epoch": 15.5, "percentage": 44.29, "elapsed_time": "0:30:57", "remaining_time": "0:38:57"}
162
+ {"current_steps": 156, "total_steps": 350, "loss": 0.0022, "lr": 6.780195581371784e-05, "epoch": 15.6, "percentage": 44.57, "elapsed_time": "0:31:08", "remaining_time": "0:38:43"}
163
+ {"current_steps": 157, "total_steps": 350, "loss": 0.0021, "lr": 6.733508971821036e-05, "epoch": 15.7, "percentage": 44.86, "elapsed_time": "0:31:19", "remaining_time": "0:38:30"}
164
+ {"current_steps": 158, "total_steps": 350, "loss": 0.0019, "lr": 6.686649936914152e-05, "epoch": 15.8, "percentage": 45.14, "elapsed_time": "0:31:29", "remaining_time": "0:38:16"}
165
+ {"current_steps": 159, "total_steps": 350, "loss": 0.002, "lr": 6.639623137536023e-05, "epoch": 15.9, "percentage": 45.43, "elapsed_time": "0:31:39", "remaining_time": "0:38:02"}
166
+ {"current_steps": 160, "total_steps": 350, "loss": 0.0014, "lr": 6.592433251258423e-05, "epoch": 16.0, "percentage": 45.71, "elapsed_time": "0:31:44", "remaining_time": "0:37:41"}
167
+ {"current_steps": 161, "total_steps": 350, "loss": 0.0017, "lr": 6.545084971874738e-05, "epoch": 16.1, "percentage": 46.0, "elapsed_time": "0:31:54", "remaining_time": "0:37:27"}
168
+ {"current_steps": 162, "total_steps": 350, "loss": 0.0014, "lr": 6.497583008933097e-05, "epoch": 16.2, "percentage": 46.29, "elapsed_time": "0:32:05", "remaining_time": "0:37:13"}
169
+ {"current_steps": 163, "total_steps": 350, "loss": 0.0016, "lr": 6.449932087267932e-05, "epoch": 16.3, "percentage": 46.57, "elapsed_time": "0:32:15", "remaining_time": "0:37:00"}
170
+ {"current_steps": 164, "total_steps": 350, "loss": 0.0013, "lr": 6.402136946530014e-05, "epoch": 16.4, "percentage": 46.86, "elapsed_time": "0:32:26", "remaining_time": "0:36:47"}
171
+ {"current_steps": 165, "total_steps": 350, "loss": 0.0016, "lr": 6.354202340715026e-05, "epoch": 16.5, "percentage": 47.14, "elapsed_time": "0:32:36", "remaining_time": "0:36:33"}
172
+ {"current_steps": 166, "total_steps": 350, "loss": 0.0019, "lr": 6.306133037690693e-05, "epoch": 16.6, "percentage": 47.43, "elapsed_time": "0:32:46", "remaining_time": "0:36:20"}
173
+ {"current_steps": 167, "total_steps": 350, "loss": 0.0016, "lr": 6.257933818722543e-05, "epoch": 16.7, "percentage": 47.71, "elapsed_time": "0:32:56", "remaining_time": "0:36:06"}
174
+ {"current_steps": 168, "total_steps": 350, "loss": 0.0025, "lr": 6.209609477998338e-05, "epoch": 16.8, "percentage": 48.0, "elapsed_time": "0:33:07", "remaining_time": "0:35:52"}
175
+ {"current_steps": 169, "total_steps": 350, "loss": 0.0018, "lr": 6.161164822151213e-05, "epoch": 16.9, "percentage": 48.29, "elapsed_time": "0:33:17", "remaining_time": "0:35:39"}
176
+ {"current_steps": 170, "total_steps": 350, "loss": 0.0017, "lr": 6.112604669781572e-05, "epoch": 17.0, "percentage": 48.57, "elapsed_time": "0:33:22", "remaining_time": "0:35:19"}
177
+ {"current_steps": 171, "total_steps": 350, "loss": 0.0012, "lr": 6.063933850977811e-05, "epoch": 17.1, "percentage": 48.86, "elapsed_time": "0:33:32", "remaining_time": "0:35:06"}
178
+ {"current_steps": 172, "total_steps": 350, "loss": 0.0014, "lr": 6.015157206835881e-05, "epoch": 17.2, "percentage": 49.14, "elapsed_time": "0:33:43", "remaining_time": "0:34:53"}
179
+ {"current_steps": 173, "total_steps": 350, "loss": 0.0013, "lr": 5.9662795889777666e-05, "epoch": 17.3, "percentage": 49.43, "elapsed_time": "0:33:53", "remaining_time": "0:34:40"}
180
+ {"current_steps": 174, "total_steps": 350, "loss": 0.0013, "lr": 5.917305859068912e-05, "epoch": 17.4, "percentage": 49.71, "elapsed_time": "0:34:03", "remaining_time": "0:34:27"}
181
+ {"current_steps": 175, "total_steps": 350, "loss": 0.0013, "lr": 5.868240888334653e-05, "epoch": 17.5, "percentage": 50.0, "elapsed_time": "0:34:14", "remaining_time": "0:34:14"}
182
+ {"current_steps": 175, "total_steps": 350, "eval_loss": 0.01870564930140972, "epoch": 17.5, "percentage": 50.0, "elapsed_time": "0:34:21", "remaining_time": "0:34:21"}
183
+ {"current_steps": 176, "total_steps": 350, "loss": 0.0018, "lr": 5.819089557075689e-05, "epoch": 17.6, "percentage": 50.29, "elapsed_time": "0:34:31", "remaining_time": "0:34:08"}
184
+ {"current_steps": 177, "total_steps": 350, "loss": 0.0014, "lr": 5.7698567541826675e-05, "epoch": 17.7, "percentage": 50.57, "elapsed_time": "0:34:42", "remaining_time": "0:33:55"}
185
+ {"current_steps": 178, "total_steps": 350, "loss": 0.0025, "lr": 5.7205473766499005e-05, "epoch": 17.8, "percentage": 50.86, "elapsed_time": "0:34:52", "remaining_time": "0:33:42"}
186
+ {"current_steps": 179, "total_steps": 350, "loss": 0.0024, "lr": 5.6711663290882776e-05, "epoch": 17.9, "percentage": 51.14, "elapsed_time": "0:35:02", "remaining_time": "0:33:28"}
187
+ {"current_steps": 180, "total_steps": 350, "loss": 0.0014, "lr": 5.621718523237427e-05, "epoch": 18.0, "percentage": 51.43, "elapsed_time": "0:35:07", "remaining_time": "0:33:10"}
188
+ {"current_steps": 181, "total_steps": 350, "loss": 0.0016, "lr": 5.57220887747716e-05, "epoch": 18.1, "percentage": 51.71, "elapsed_time": "0:35:17", "remaining_time": "0:32:57"}
189
+ {"current_steps": 182, "total_steps": 350, "loss": 0.0013, "lr": 5.522642316338268e-05, "epoch": 18.2, "percentage": 52.0, "elapsed_time": "0:35:28", "remaining_time": "0:32:45"}
190
+ {"current_steps": 183, "total_steps": 350, "loss": 0.0017, "lr": 5.473023770012686e-05, "epoch": 18.3, "percentage": 52.29, "elapsed_time": "0:35:39", "remaining_time": "0:32:32"}
191
+ {"current_steps": 184, "total_steps": 350, "loss": 0.0013, "lr": 5.4233581738631165e-05, "epoch": 18.4, "percentage": 52.57, "elapsed_time": "0:35:49", "remaining_time": "0:32:19"}
192
+ {"current_steps": 185, "total_steps": 350, "loss": 0.0016, "lr": 5.373650467932122e-05, "epoch": 18.5, "percentage": 52.86, "elapsed_time": "0:35:59", "remaining_time": "0:32:06"}
193
+ {"current_steps": 186, "total_steps": 350, "loss": 0.0015, "lr": 5.323905596450759e-05, "epoch": 18.6, "percentage": 53.14, "elapsed_time": "0:36:09", "remaining_time": "0:31:53"}
194
+ {"current_steps": 187, "total_steps": 350, "loss": 0.0013, "lr": 5.274128507346801e-05, "epoch": 18.7, "percentage": 53.43, "elapsed_time": "0:36:20", "remaining_time": "0:31:40"}
195
+ {"current_steps": 188, "total_steps": 350, "loss": 0.0013, "lr": 5.2243241517525754e-05, "epoch": 18.8, "percentage": 53.71, "elapsed_time": "0:36:30", "remaining_time": "0:31:27"}
196
+ {"current_steps": 189, "total_steps": 350, "loss": 0.0013, "lr": 5.174497483512506e-05, "epoch": 18.9, "percentage": 54.0, "elapsed_time": "0:36:41", "remaining_time": "0:31:14"}
197
+ {"current_steps": 190, "total_steps": 350, "loss": 0.0013, "lr": 5.124653458690365e-05, "epoch": 19.0, "percentage": 54.29, "elapsed_time": "0:36:45", "remaining_time": "0:30:57"}
198
+ {"current_steps": 191, "total_steps": 350, "loss": 0.0013, "lr": 5.074797035076319e-05, "epoch": 19.1, "percentage": 54.57, "elapsed_time": "0:36:56", "remaining_time": "0:30:45"}
199
+ {"current_steps": 192, "total_steps": 350, "loss": 0.0013, "lr": 5.024933171693791e-05, "epoch": 19.2, "percentage": 54.86, "elapsed_time": "0:37:07", "remaining_time": "0:30:32"}
200
+ {"current_steps": 193, "total_steps": 350, "loss": 0.0012, "lr": 4.9750668283062104e-05, "epoch": 19.3, "percentage": 55.14, "elapsed_time": "0:37:17", "remaining_time": "0:30:20"}
201
+ {"current_steps": 194, "total_steps": 350, "loss": 0.0013, "lr": 4.925202964923683e-05, "epoch": 19.4, "percentage": 55.43, "elapsed_time": "0:37:27", "remaining_time": "0:30:07"}
202
+ {"current_steps": 195, "total_steps": 350, "loss": 0.0017, "lr": 4.875346541309637e-05, "epoch": 19.5, "percentage": 55.71, "elapsed_time": "0:37:38", "remaining_time": "0:29:54"}
203
+ {"current_steps": 196, "total_steps": 350, "loss": 0.0013, "lr": 4.825502516487497e-05, "epoch": 19.6, "percentage": 56.0, "elapsed_time": "0:37:48", "remaining_time": "0:29:42"}
204
+ {"current_steps": 197, "total_steps": 350, "loss": 0.0013, "lr": 4.775675848247427e-05, "epoch": 19.7, "percentage": 56.29, "elapsed_time": "0:37:58", "remaining_time": "0:29:29"}
205
+ {"current_steps": 198, "total_steps": 350, "loss": 0.0013, "lr": 4.725871492653199e-05, "epoch": 19.8, "percentage": 56.57, "elapsed_time": "0:38:08", "remaining_time": "0:29:17"}
206
+ {"current_steps": 199, "total_steps": 350, "loss": 0.0014, "lr": 4.6760944035492404e-05, "epoch": 19.9, "percentage": 56.86, "elapsed_time": "0:38:19", "remaining_time": "0:29:04"}
207
+ {"current_steps": 200, "total_steps": 350, "loss": 0.0012, "lr": 4.626349532067879e-05, "epoch": 20.0, "percentage": 57.14, "elapsed_time": "0:38:23", "remaining_time": "0:28:47"}
208
+ {"current_steps": 200, "total_steps": 350, "eval_loss": 0.02464105747640133, "epoch": 20.0, "percentage": 57.14, "elapsed_time": "0:38:30", "remaining_time": "0:28:53"}
209
+ {"current_steps": 201, "total_steps": 350, "loss": 0.0013, "lr": 4.576641826136884e-05, "epoch": 20.1, "percentage": 57.43, "elapsed_time": "0:38:47", "remaining_time": "0:28:45"}
210
+ {"current_steps": 202, "total_steps": 350, "loss": 0.0015, "lr": 4.526976229987315e-05, "epoch": 20.2, "percentage": 57.71, "elapsed_time": "0:38:58", "remaining_time": "0:28:33"}
211
+ {"current_steps": 203, "total_steps": 350, "loss": 0.0013, "lr": 4.477357683661734e-05, "epoch": 20.3, "percentage": 58.0, "elapsed_time": "0:39:08", "remaining_time": "0:28:20"}
212
+ {"current_steps": 204, "total_steps": 350, "loss": 0.0014, "lr": 4.4277911225228414e-05, "epoch": 20.4, "percentage": 58.29, "elapsed_time": "0:39:18", "remaining_time": "0:28:08"}
213
+ {"current_steps": 205, "total_steps": 350, "loss": 0.0013, "lr": 4.378281476762576e-05, "epoch": 20.5, "percentage": 58.57, "elapsed_time": "0:39:29", "remaining_time": "0:27:55"}
214
+ {"current_steps": 206, "total_steps": 350, "loss": 0.0012, "lr": 4.328833670911724e-05, "epoch": 20.6, "percentage": 58.86, "elapsed_time": "0:39:39", "remaining_time": "0:27:43"}
215
+ {"current_steps": 207, "total_steps": 350, "loss": 0.0012, "lr": 4.2794526233501006e-05, "epoch": 20.7, "percentage": 59.14, "elapsed_time": "0:39:49", "remaining_time": "0:27:30"}
216
+ {"current_steps": 208, "total_steps": 350, "loss": 0.0012, "lr": 4.230143245817332e-05, "epoch": 20.8, "percentage": 59.43, "elapsed_time": "0:40:00", "remaining_time": "0:27:18"}
217
+ {"current_steps": 209, "total_steps": 350, "loss": 0.0013, "lr": 4.180910442924312e-05, "epoch": 20.9, "percentage": 59.71, "elapsed_time": "0:40:10", "remaining_time": "0:27:06"}
218
+ {"current_steps": 210, "total_steps": 350, "loss": 0.0012, "lr": 4.131759111665349e-05, "epoch": 21.0, "percentage": 60.0, "elapsed_time": "0:40:15", "remaining_time": "0:26:50"}
219
+ {"current_steps": 211, "total_steps": 350, "loss": 0.0013, "lr": 4.082694140931089e-05, "epoch": 21.1, "percentage": 60.29, "elapsed_time": "0:40:25", "remaining_time": "0:26:37"}
220
+ {"current_steps": 212, "total_steps": 350, "loss": 0.0012, "lr": 4.0337204110222346e-05, "epoch": 21.2, "percentage": 60.57, "elapsed_time": "0:40:36", "remaining_time": "0:26:25"}
221
+ {"current_steps": 213, "total_steps": 350, "loss": 0.0013, "lr": 3.98484279316412e-05, "epoch": 21.3, "percentage": 60.86, "elapsed_time": "0:40:46", "remaining_time": "0:26:13"}
222
+ {"current_steps": 214, "total_steps": 350, "loss": 0.0013, "lr": 3.936066149022191e-05, "epoch": 21.4, "percentage": 61.14, "elapsed_time": "0:40:57", "remaining_time": "0:26:01"}
223
+ {"current_steps": 215, "total_steps": 350, "loss": 0.0012, "lr": 3.887395330218429e-05, "epoch": 21.5, "percentage": 61.43, "elapsed_time": "0:41:07", "remaining_time": "0:25:49"}
224
+ {"current_steps": 216, "total_steps": 350, "loss": 0.0013, "lr": 3.838835177848788e-05, "epoch": 21.6, "percentage": 61.71, "elapsed_time": "0:41:17", "remaining_time": "0:25:37"}
225
+ {"current_steps": 217, "total_steps": 350, "loss": 0.0013, "lr": 3.790390522001662e-05, "epoch": 21.7, "percentage": 62.0, "elapsed_time": "0:41:28", "remaining_time": "0:25:25"}
226
+ {"current_steps": 218, "total_steps": 350, "loss": 0.0013, "lr": 3.742066181277458e-05, "epoch": 21.8, "percentage": 62.29, "elapsed_time": "0:41:38", "remaining_time": "0:25:13"}
227
+ {"current_steps": 219, "total_steps": 350, "loss": 0.0018, "lr": 3.6938669623093084e-05, "epoch": 21.9, "percentage": 62.57, "elapsed_time": "0:41:49", "remaining_time": "0:25:00"}
228
+ {"current_steps": 220, "total_steps": 350, "loss": 0.0013, "lr": 3.6457976592849754e-05, "epoch": 22.0, "percentage": 62.86, "elapsed_time": "0:41:53", "remaining_time": "0:24:45"}
229
+ {"current_steps": 221, "total_steps": 350, "loss": 0.0012, "lr": 3.597863053469987e-05, "epoch": 22.1, "percentage": 63.14, "elapsed_time": "0:42:04", "remaining_time": "0:24:33"}
230
+ {"current_steps": 222, "total_steps": 350, "loss": 0.0012, "lr": 3.550067912732069e-05, "epoch": 22.2, "percentage": 63.43, "elapsed_time": "0:42:14", "remaining_time": "0:24:21"}
231
+ {"current_steps": 223, "total_steps": 350, "loss": 0.0012, "lr": 3.502416991066904e-05, "epoch": 22.3, "percentage": 63.71, "elapsed_time": "0:42:25", "remaining_time": "0:24:09"}
232
+ {"current_steps": 224, "total_steps": 350, "loss": 0.0013, "lr": 3.4549150281252636e-05, "epoch": 22.4, "percentage": 64.0, "elapsed_time": "0:42:35", "remaining_time": "0:23:57"}
233
+ {"current_steps": 225, "total_steps": 350, "loss": 0.0013, "lr": 3.4075667487415785e-05, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:42:45", "remaining_time": "0:23:45"}
234
+ {"current_steps": 225, "total_steps": 350, "eval_loss": 0.02635515108704567, "epoch": 22.5, "percentage": 64.29, "elapsed_time": "0:42:52", "remaining_time": "0:23:49"}
235
+ {"current_steps": 226, "total_steps": 350, "loss": 0.0012, "lr": 3.360376862463979e-05, "epoch": 22.6, "percentage": 64.57, "elapsed_time": "0:43:02", "remaining_time": "0:23:37"}
236
+ {"current_steps": 227, "total_steps": 350, "loss": 0.0012, "lr": 3.313350063085851e-05, "epoch": 22.7, "percentage": 64.86, "elapsed_time": "0:43:12", "remaining_time": "0:23:24"}
237
+ {"current_steps": 228, "total_steps": 350, "loss": 0.0013, "lr": 3.266491028178964e-05, "epoch": 22.8, "percentage": 65.14, "elapsed_time": "0:43:23", "remaining_time": "0:23:12"}
238
+ {"current_steps": 229, "total_steps": 350, "loss": 0.0012, "lr": 3.219804418628216e-05, "epoch": 22.9, "percentage": 65.43, "elapsed_time": "0:43:33", "remaining_time": "0:23:01"}
239
+ {"current_steps": 230, "total_steps": 350, "loss": 0.0013, "lr": 3.173294878168025e-05, "epoch": 23.0, "percentage": 65.71, "elapsed_time": "0:43:38", "remaining_time": "0:22:46"}
240
+ {"current_steps": 231, "total_steps": 350, "loss": 0.0013, "lr": 3.12696703292044e-05, "epoch": 23.1, "percentage": 66.0, "elapsed_time": "0:43:48", "remaining_time": "0:22:34"}
241
+ {"current_steps": 232, "total_steps": 350, "loss": 0.0013, "lr": 3.080825490934999e-05, "epoch": 23.2, "percentage": 66.29, "elapsed_time": "0:43:59", "remaining_time": "0:22:22"}
242
+ {"current_steps": 233, "total_steps": 350, "loss": 0.0012, "lr": 3.0348748417303823e-05, "epoch": 23.3, "percentage": 66.57, "elapsed_time": "0:44:09", "remaining_time": "0:22:10"}
243
+ {"current_steps": 234, "total_steps": 350, "loss": 0.0013, "lr": 2.989119655837913e-05, "epoch": 23.4, "percentage": 66.86, "elapsed_time": "0:44:20", "remaining_time": "0:21:58"}
244
+ {"current_steps": 235, "total_steps": 350, "loss": 0.0013, "lr": 2.9435644843469436e-05, "epoch": 23.5, "percentage": 67.14, "elapsed_time": "0:44:30", "remaining_time": "0:21:46"}
245
+ {"current_steps": 236, "total_steps": 350, "loss": 0.0013, "lr": 2.8982138584521735e-05, "epoch": 23.6, "percentage": 67.43, "elapsed_time": "0:44:40", "remaining_time": "0:21:34"}
246
+ {"current_steps": 237, "total_steps": 350, "loss": 0.0013, "lr": 2.8530722890029537e-05, "epoch": 23.7, "percentage": 67.71, "elapsed_time": "0:44:50", "remaining_time": "0:21:23"}
247
+ {"current_steps": 238, "total_steps": 350, "loss": 0.0013, "lr": 2.8081442660546125e-05, "epoch": 23.8, "percentage": 68.0, "elapsed_time": "0:45:01", "remaining_time": "0:21:11"}
248
+ {"current_steps": 239, "total_steps": 350, "loss": 0.0013, "lr": 2.7634342584218365e-05, "epoch": 23.9, "percentage": 68.29, "elapsed_time": "0:45:11", "remaining_time": "0:20:59"}
249
+ {"current_steps": 240, "total_steps": 350, "loss": 0.0013, "lr": 2.718946713234185e-05, "epoch": 24.0, "percentage": 68.57, "elapsed_time": "0:45:16", "remaining_time": "0:20:44"}
250
+ {"current_steps": 241, "total_steps": 350, "loss": 0.0012, "lr": 2.674686055493748e-05, "epoch": 24.1, "percentage": 68.86, "elapsed_time": "0:45:26", "remaining_time": "0:20:33"}
251
+ {"current_steps": 242, "total_steps": 350, "loss": 0.0013, "lr": 2.630656687635007e-05, "epoch": 24.2, "percentage": 69.14, "elapsed_time": "0:45:36", "remaining_time": "0:20:21"}
252
+ {"current_steps": 243, "total_steps": 350, "loss": 0.0012, "lr": 2.5868629890869468e-05, "epoch": 24.3, "percentage": 69.43, "elapsed_time": "0:45:47", "remaining_time": "0:20:09"}
253
+ {"current_steps": 244, "total_steps": 350, "loss": 0.0013, "lr": 2.543309315837444e-05, "epoch": 24.4, "percentage": 69.71, "elapsed_time": "0:45:57", "remaining_time": "0:19:58"}
254
+ {"current_steps": 245, "total_steps": 350, "loss": 0.0012, "lr": 2.500000000000001e-05, "epoch": 24.5, "percentage": 70.0, "elapsed_time": "0:46:08", "remaining_time": "0:19:46"}
255
+ {"current_steps": 246, "total_steps": 350, "loss": 0.0012, "lr": 2.456939349382843e-05, "epoch": 24.6, "percentage": 70.29, "elapsed_time": "0:46:18", "remaining_time": "0:19:34"}
256
+ {"current_steps": 247, "total_steps": 350, "loss": 0.0013, "lr": 2.4141316470604362e-05, "epoch": 24.7, "percentage": 70.57, "elapsed_time": "0:46:28", "remaining_time": "0:19:22"}
257
+ {"current_steps": 248, "total_steps": 350, "loss": 0.0012, "lr": 2.371581150947476e-05, "epoch": 24.8, "percentage": 70.86, "elapsed_time": "0:46:39", "remaining_time": "0:19:11"}
258
+ {"current_steps": 249, "total_steps": 350, "loss": 0.0013, "lr": 2.3292920933753566e-05, "epoch": 24.9, "percentage": 71.14, "elapsed_time": "0:46:49", "remaining_time": "0:18:59"}
259
+ {"current_steps": 250, "total_steps": 350, "loss": 0.0013, "lr": 2.2872686806712035e-05, "epoch": 25.0, "percentage": 71.43, "elapsed_time": "0:46:54", "remaining_time": "0:18:45"}
260
+ {"current_steps": 250, "total_steps": 350, "eval_loss": 0.027748363092541695, "epoch": 25.0, "percentage": 71.43, "elapsed_time": "0:47:01", "remaining_time": "0:18:48"}
261
+ {"current_steps": 250, "total_steps": 350, "epoch": 25.0, "percentage": 71.43, "elapsed_time": "0:47:01", "remaining_time": "0:18:48"}
trainer_state.json CHANGED
@@ -1,97 +1,1857 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 25,
6
- "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1,
13
- "grad_norm": 1.222133755683899,
14
- "learning_rate": 0.0001,
15
  "loss": 0.1531,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.2,
20
- "grad_norm": 1.2098262310028076,
21
- "learning_rate": 9.698463103929542e-05,
22
  "loss": 0.1308,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.3,
27
- "grad_norm": 0.9002367258071899,
28
- "learning_rate": 8.83022221559489e-05,
29
- "loss": 0.1702,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.4,
34
- "grad_norm": 0.20591969788074493,
35
- "learning_rate": 7.500000000000001e-05,
36
- "loss": 0.0813,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.5,
41
- "grad_norm": 0.2272542268037796,
42
- "learning_rate": 5.868240888334653e-05,
43
- "loss": 0.0786,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.6,
48
- "grad_norm": 0.20280824601650238,
49
- "learning_rate": 4.131759111665349e-05,
50
- "loss": 0.0576,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.7,
55
- "grad_norm": 0.16879288852214813,
56
- "learning_rate": 2.500000000000001e-05,
57
- "loss": 0.0715,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.8,
62
- "grad_norm": 0.14683185517787933,
63
- "learning_rate": 1.1697777844051105e-05,
64
- "loss": 0.0518,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.9,
69
- "grad_norm": 0.13245084881782532,
70
- "learning_rate": 3.0153689607045845e-06,
71
- "loss": 0.0697,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 1.0,
76
- "grad_norm": 0.12347117066383362,
77
- "learning_rate": 0.0,
78
- "loss": 0.0538,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 1.0,
83
- "step": 10,
84
- "total_flos": 1.5338473968402432e+16,
85
- "train_loss": 0.09183733761310578,
86
- "train_runtime": 197.912,
87
- "train_samples_per_second": 2.289,
88
- "train_steps_per_second": 0.051
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  }
90
  ],
91
  "logging_steps": 1,
92
- "max_steps": 10,
93
  "num_input_tokens_seen": 0,
94
- "num_train_epochs": 1,
95
  "save_steps": 100,
96
  "stateful_callbacks": {
97
  "EarlyStoppingCallback": {
@@ -100,7 +1860,7 @@
100
  "early_stopping_threshold": 0.0
101
  },
102
  "attributes": {
103
- "early_stopping_patience_counter": 0
104
  }
105
  },
106
  "TrainerControl": {
@@ -109,12 +1869,12 @@
109
  "should_evaluate": false,
110
  "should_log": false,
111
  "should_save": true,
112
- "should_training_stop": true
113
  },
114
  "attributes": {}
115
  }
116
  },
117
- "total_flos": 1.5338473968402432e+16,
118
  "train_batch_size": 48,
119
  "trial_name": null,
120
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.017384279519319534,
3
+ "best_model_checkpoint": "/home/paperspace/Data/models/Klystroglobal/llm3br256-v1.5/checkpoint-100",
4
+ "epoch": 25.0,
5
  "eval_steps": 25,
6
+ "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1,
13
+ "grad_norm": 1.2220871448516846,
14
+ "learning_rate": 2.8571428571428573e-06,
15
  "loss": 0.1531,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.2,
20
+ "grad_norm": 1.2097222805023193,
21
+ "learning_rate": 5.7142857142857145e-06,
22
  "loss": 0.1308,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.3,
27
+ "grad_norm": 1.2176955938339233,
28
+ "learning_rate": 8.571428571428573e-06,
29
+ "loss": 0.1619,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.4,
34
+ "grad_norm": 0.7081905007362366,
35
+ "learning_rate": 1.1428571428571429e-05,
36
+ "loss": 0.1325,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.5,
41
+ "grad_norm": 0.529013991355896,
42
+ "learning_rate": 1.4285714285714285e-05,
43
+ "loss": 0.1206,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.6,
48
+ "grad_norm": 0.32227373123168945,
49
+ "learning_rate": 1.7142857142857145e-05,
50
+ "loss": 0.0875,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.7,
55
+ "grad_norm": 0.34188932180404663,
56
+ "learning_rate": 2e-05,
57
+ "loss": 0.1067,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.8,
62
+ "grad_norm": 0.513898491859436,
63
+ "learning_rate": 2.2857142857142858e-05,
64
+ "loss": 0.0966,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.9,
69
+ "grad_norm": 0.47207334637641907,
70
+ "learning_rate": 2.5714285714285714e-05,
71
+ "loss": 0.1088,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "grad_norm": 0.32782670855522156,
77
+ "learning_rate": 2.857142857142857e-05,
78
+ "loss": 0.0839,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 1.1,
83
+ "grad_norm": 0.17063647508621216,
84
+ "learning_rate": 3.142857142857143e-05,
85
+ "loss": 0.0713,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 1.2,
90
+ "grad_norm": 0.15981265902519226,
91
+ "learning_rate": 3.428571428571429e-05,
92
+ "loss": 0.0688,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 1.3,
97
+ "grad_norm": 0.16717936098575592,
98
+ "learning_rate": 3.7142857142857143e-05,
99
+ "loss": 0.0688,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 1.4,
104
+ "grad_norm": 0.22196544706821442,
105
+ "learning_rate": 4e-05,
106
+ "loss": 0.0668,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 1.5,
111
+ "grad_norm": 0.20881694555282593,
112
+ "learning_rate": 4.2857142857142856e-05,
113
+ "loss": 0.0613,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 1.6,
118
+ "grad_norm": 0.14273549616336823,
119
+ "learning_rate": 4.5714285714285716e-05,
120
+ "loss": 0.052,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 1.7,
125
+ "grad_norm": 0.12841083109378815,
126
+ "learning_rate": 4.8571428571428576e-05,
127
+ "loss": 0.0581,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 1.8,
132
+ "grad_norm": 0.1572558879852295,
133
+ "learning_rate": 5.142857142857143e-05,
134
+ "loss": 0.0482,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 1.9,
139
+ "grad_norm": 0.19301706552505493,
140
+ "learning_rate": 5.428571428571428e-05,
141
+ "loss": 0.0533,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 2.0,
146
+ "grad_norm": 0.1539728194475174,
147
+ "learning_rate": 5.714285714285714e-05,
148
+ "loss": 0.0513,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 2.1,
153
+ "grad_norm": 0.11170095205307007,
154
+ "learning_rate": 6e-05,
155
+ "loss": 0.042,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 2.2,
160
+ "grad_norm": 0.09418804198503494,
161
+ "learning_rate": 6.285714285714286e-05,
162
+ "loss": 0.0401,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 2.3,
167
+ "grad_norm": 0.11917826533317566,
168
+ "learning_rate": 6.571428571428571e-05,
169
+ "loss": 0.0382,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 2.4,
174
+ "grad_norm": 0.10801587998867035,
175
+ "learning_rate": 6.857142857142858e-05,
176
+ "loss": 0.034,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 2.5,
181
+ "grad_norm": 0.11365531384944916,
182
+ "learning_rate": 7.142857142857143e-05,
183
+ "loss": 0.0437,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 2.5,
188
+ "eval_loss": 0.03402441740036011,
189
+ "eval_runtime": 19.4202,
190
+ "eval_samples_per_second": 5.149,
191
+ "eval_steps_per_second": 0.154,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 2.6,
196
+ "grad_norm": 0.1079014241695404,
197
+ "learning_rate": 7.428571428571429e-05,
198
+ "loss": 0.0422,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 2.7,
203
+ "grad_norm": 0.08936240524053574,
204
+ "learning_rate": 7.714285714285715e-05,
205
+ "loss": 0.0264,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 2.8,
210
+ "grad_norm": 0.12060200423002243,
211
+ "learning_rate": 8e-05,
212
+ "loss": 0.0377,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 2.9,
217
+ "grad_norm": 0.08112004399299622,
218
+ "learning_rate": 8.285714285714287e-05,
219
+ "loss": 0.0289,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 3.0,
224
+ "grad_norm": 0.12806135416030884,
225
+ "learning_rate": 8.571428571428571e-05,
226
+ "loss": 0.0299,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 3.1,
231
+ "grad_norm": 0.06607820093631744,
232
+ "learning_rate": 8.857142857142857e-05,
233
+ "loss": 0.0216,
234
+ "step": 31
235
+ },
236
+ {
237
+ "epoch": 3.2,
238
+ "grad_norm": 0.08246105909347534,
239
+ "learning_rate": 9.142857142857143e-05,
240
+ "loss": 0.0359,
241
+ "step": 32
242
+ },
243
+ {
244
+ "epoch": 3.3,
245
+ "grad_norm": 0.07171958684921265,
246
+ "learning_rate": 9.428571428571429e-05,
247
+ "loss": 0.0233,
248
+ "step": 33
249
+ },
250
+ {
251
+ "epoch": 3.4,
252
+ "grad_norm": 0.07688147574663162,
253
+ "learning_rate": 9.714285714285715e-05,
254
+ "loss": 0.0254,
255
+ "step": 34
256
+ },
257
+ {
258
+ "epoch": 3.5,
259
+ "grad_norm": 0.07434146851301193,
260
+ "learning_rate": 0.0001,
261
+ "loss": 0.0202,
262
+ "step": 35
263
+ },
264
+ {
265
+ "epoch": 3.6,
266
+ "grad_norm": 0.06925389170646667,
267
+ "learning_rate": 9.999751334779716e-05,
268
+ "loss": 0.0197,
269
+ "step": 36
270
+ },
271
+ {
272
+ "epoch": 3.7,
273
+ "grad_norm": 0.06520260870456696,
274
+ "learning_rate": 9.999005363852618e-05,
275
+ "loss": 0.0206,
276
+ "step": 37
277
+ },
278
+ {
279
+ "epoch": 3.8,
280
+ "grad_norm": 0.07232938706874847,
281
+ "learning_rate": 9.997762161417517e-05,
282
+ "loss": 0.0197,
283
+ "step": 38
284
+ },
285
+ {
286
+ "epoch": 3.9,
287
+ "grad_norm": 0.08089913427829742,
288
+ "learning_rate": 9.996021851130897e-05,
289
+ "loss": 0.0178,
290
+ "step": 39
291
+ },
292
+ {
293
+ "epoch": 4.0,
294
+ "grad_norm": 0.12080717831850052,
295
+ "learning_rate": 9.993784606094612e-05,
296
+ "loss": 0.0141,
297
+ "step": 40
298
+ },
299
+ {
300
+ "epoch": 4.1,
301
+ "grad_norm": 0.05649913102388382,
302
+ "learning_rate": 9.991050648838675e-05,
303
+ "loss": 0.012,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 4.2,
308
+ "grad_norm": 0.09042762964963913,
309
+ "learning_rate": 9.987820251299122e-05,
310
+ "loss": 0.0124,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 4.3,
315
+ "grad_norm": 0.07907257974147797,
316
+ "learning_rate": 9.984093734790956e-05,
317
+ "loss": 0.017,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 4.4,
322
+ "grad_norm": 0.07241521775722504,
323
+ "learning_rate": 9.979871469976196e-05,
324
+ "loss": 0.0132,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 4.5,
329
+ "grad_norm": 0.10079007595777512,
330
+ "learning_rate": 9.975153876827008e-05,
331
+ "loss": 0.0169,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 4.6,
336
+ "grad_norm": 0.09246091544628143,
337
+ "learning_rate": 9.969941424583926e-05,
338
+ "loss": 0.0145,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 4.7,
343
+ "grad_norm": 0.0651487484574318,
344
+ "learning_rate": 9.964234631709187e-05,
345
+ "loss": 0.0151,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 4.8,
350
+ "grad_norm": 0.06992605328559875,
351
+ "learning_rate": 9.958034065835151e-05,
352
+ "loss": 0.011,
353
+ "step": 48
354
+ },
355
+ {
356
+ "epoch": 4.9,
357
+ "grad_norm": 0.06309088319540024,
358
+ "learning_rate": 9.951340343707852e-05,
359
+ "loss": 0.012,
360
+ "step": 49
361
+ },
362
+ {
363
+ "epoch": 5.0,
364
+ "grad_norm": 0.06862813979387283,
365
+ "learning_rate": 9.944154131125642e-05,
366
+ "loss": 0.0098,
367
+ "step": 50
368
+ },
369
+ {
370
+ "epoch": 5.0,
371
+ "eval_loss": 0.016623547300696373,
372
+ "eval_runtime": 15.4785,
373
+ "eval_samples_per_second": 6.461,
374
+ "eval_steps_per_second": 0.194,
375
+ "step": 50
376
+ },
377
+ {
378
+ "epoch": 5.1,
379
+ "grad_norm": 0.06234560161828995,
380
+ "learning_rate": 9.936476142872979e-05,
381
+ "loss": 0.011,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 5.2,
386
+ "grad_norm": 0.05178332328796387,
387
+ "learning_rate": 9.928307142649316e-05,
388
+ "loss": 0.0082,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 5.3,
393
+ "grad_norm": 0.0584288015961647,
394
+ "learning_rate": 9.919647942993148e-05,
395
+ "loss": 0.0069,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 5.4,
400
+ "grad_norm": 0.05619216337800026,
401
+ "learning_rate": 9.910499405201195e-05,
402
+ "loss": 0.0091,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 5.5,
407
+ "grad_norm": 0.052176207304000854,
408
+ "learning_rate": 9.900862439242719e-05,
409
+ "loss": 0.0062,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 5.6,
414
+ "grad_norm": 0.058783914893865585,
415
+ "learning_rate": 9.890738003669029e-05,
416
+ "loss": 0.0052,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 5.7,
421
+ "grad_norm": 0.08193694055080414,
422
+ "learning_rate": 9.880127105518122e-05,
423
+ "loss": 0.0076,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 5.8,
428
+ "grad_norm": 0.09745576977729797,
429
+ "learning_rate": 9.869030800214532e-05,
430
+ "loss": 0.0107,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 5.9,
435
+ "grad_norm": 0.07822689414024353,
436
+ "learning_rate": 9.857450191464337e-05,
437
+ "loss": 0.0081,
438
+ "step": 59
439
+ },
440
+ {
441
+ "epoch": 6.0,
442
+ "grad_norm": 0.06525323539972305,
443
+ "learning_rate": 9.84538643114539e-05,
444
+ "loss": 0.0063,
445
+ "step": 60
446
+ },
447
+ {
448
+ "epoch": 6.1,
449
+ "grad_norm": 0.03879164531826973,
450
+ "learning_rate": 9.832840719192736e-05,
451
+ "loss": 0.0037,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 6.2,
456
+ "grad_norm": 0.05432894080877304,
457
+ "learning_rate": 9.819814303479267e-05,
458
+ "loss": 0.0049,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 6.3,
463
+ "grad_norm": 0.04752165079116821,
464
+ "learning_rate": 9.806308479691595e-05,
465
+ "loss": 0.0051,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 6.4,
470
+ "grad_norm": 0.0588836595416069,
471
+ "learning_rate": 9.792324591201179e-05,
472
+ "loss": 0.0052,
473
+ "step": 64
474
+ },
475
+ {
476
+ "epoch": 6.5,
477
+ "grad_norm": 0.07457052916288376,
478
+ "learning_rate": 9.777864028930705e-05,
479
+ "loss": 0.0046,
480
+ "step": 65
481
+ },
482
+ {
483
+ "epoch": 6.6,
484
+ "grad_norm": 0.06699630618095398,
485
+ "learning_rate": 9.76292823121573e-05,
486
+ "loss": 0.0064,
487
+ "step": 66
488
+ },
489
+ {
490
+ "epoch": 6.7,
491
+ "grad_norm": 0.05367649346590042,
492
+ "learning_rate": 9.747518683661631e-05,
493
+ "loss": 0.0044,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 6.8,
498
+ "grad_norm": 0.06585957109928131,
499
+ "learning_rate": 9.731636918995821e-05,
500
+ "loss": 0.0064,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 6.9,
505
+ "grad_norm": 0.05559472367167473,
506
+ "learning_rate": 9.715284516915303e-05,
507
+ "loss": 0.0045,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 7.0,
512
+ "grad_norm": 0.1440582275390625,
513
+ "learning_rate": 9.698463103929542e-05,
514
+ "loss": 0.0067,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 7.1,
519
+ "grad_norm": 0.04040021821856499,
520
+ "learning_rate": 9.681174353198687e-05,
521
+ "loss": 0.0037,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 7.2,
526
+ "grad_norm": 0.06325013935565948,
527
+ "learning_rate": 9.663419984367139e-05,
528
+ "loss": 0.0027,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 7.3,
533
+ "grad_norm": 0.11049168556928635,
534
+ "learning_rate": 9.645201763392513e-05,
535
+ "loss": 0.0046,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 7.4,
540
+ "grad_norm": 0.0775715634226799,
541
+ "learning_rate": 9.626521502369984e-05,
542
+ "loss": 0.0054,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 7.5,
547
+ "grad_norm": 0.08004690706729889,
548
+ "learning_rate": 9.607381059352038e-05,
549
+ "loss": 0.0039,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 7.5,
554
+ "eval_loss": 0.016471313312649727,
555
+ "eval_runtime": 7.0024,
556
+ "eval_samples_per_second": 14.281,
557
+ "eval_steps_per_second": 0.428,
558
+ "step": 75
559
+ },
560
+ {
561
+ "epoch": 7.6,
562
+ "grad_norm": 0.12311126291751862,
563
+ "learning_rate": 9.587782338163669e-05,
564
+ "loss": 0.0035,
565
+ "step": 76
566
+ },
567
+ {
568
+ "epoch": 7.7,
569
+ "grad_norm": 0.05487671494483948,
570
+ "learning_rate": 9.567727288213005e-05,
571
+ "loss": 0.0047,
572
+ "step": 77
573
+ },
574
+ {
575
+ "epoch": 7.8,
576
+ "grad_norm": 0.03079923987388611,
577
+ "learning_rate": 9.547217904297411e-05,
578
+ "loss": 0.0028,
579
+ "step": 78
580
+ },
581
+ {
582
+ "epoch": 7.9,
583
+ "grad_norm": 0.09893915802240372,
584
+ "learning_rate": 9.526256226405075e-05,
585
+ "loss": 0.0054,
586
+ "step": 79
587
+ },
588
+ {
589
+ "epoch": 8.0,
590
+ "grad_norm": 0.06392358988523483,
591
+ "learning_rate": 9.504844339512095e-05,
592
+ "loss": 0.0025,
593
+ "step": 80
594
+ },
595
+ {
596
+ "epoch": 8.1,
597
+ "grad_norm": 0.04920504242181778,
598
+ "learning_rate": 9.482984373375105e-05,
599
+ "loss": 0.0037,
600
+ "step": 81
601
+ },
602
+ {
603
+ "epoch": 8.2,
604
+ "grad_norm": 0.044106096029281616,
605
+ "learning_rate": 9.460678502319418e-05,
606
+ "loss": 0.0026,
607
+ "step": 82
608
+ },
609
+ {
610
+ "epoch": 8.3,
611
+ "grad_norm": 0.07550745457410812,
612
+ "learning_rate": 9.437928945022771e-05,
613
+ "loss": 0.0049,
614
+ "step": 83
615
+ },
616
+ {
617
+ "epoch": 8.4,
618
+ "grad_norm": 0.06214550510048866,
619
+ "learning_rate": 9.414737964294636e-05,
620
+ "loss": 0.0037,
621
+ "step": 84
622
+ },
623
+ {
624
+ "epoch": 8.5,
625
+ "grad_norm": 0.057385075837373734,
626
+ "learning_rate": 9.391107866851143e-05,
627
+ "loss": 0.0025,
628
+ "step": 85
629
+ },
630
+ {
631
+ "epoch": 8.6,
632
+ "grad_norm": 0.0968804582953453,
633
+ "learning_rate": 9.367041003085649e-05,
634
+ "loss": 0.0032,
635
+ "step": 86
636
+ },
637
+ {
638
+ "epoch": 8.7,
639
+ "grad_norm": 0.03738746419548988,
640
+ "learning_rate": 9.342539766834946e-05,
641
+ "loss": 0.0028,
642
+ "step": 87
643
+ },
644
+ {
645
+ "epoch": 8.8,
646
+ "grad_norm": 0.04243948310613632,
647
+ "learning_rate": 9.317606595141154e-05,
648
+ "loss": 0.0027,
649
+ "step": 88
650
+ },
651
+ {
652
+ "epoch": 8.9,
653
+ "grad_norm": 0.034692391753196716,
654
+ "learning_rate": 9.292243968009331e-05,
655
+ "loss": 0.0029,
656
+ "step": 89
657
+ },
658
+ {
659
+ "epoch": 9.0,
660
+ "grad_norm": 0.06521083414554596,
661
+ "learning_rate": 9.266454408160779e-05,
662
+ "loss": 0.0034,
663
+ "step": 90
664
+ },
665
+ {
666
+ "epoch": 9.1,
667
+ "grad_norm": 0.04499003291130066,
668
+ "learning_rate": 9.24024048078213e-05,
669
+ "loss": 0.0023,
670
+ "step": 91
671
+ },
672
+ {
673
+ "epoch": 9.2,
674
+ "grad_norm": 0.03955000266432762,
675
+ "learning_rate": 9.213604793270196e-05,
676
+ "loss": 0.0024,
677
+ "step": 92
678
+ },
679
+ {
680
+ "epoch": 9.3,
681
+ "grad_norm": 0.03790497034788132,
682
+ "learning_rate": 9.186549994972618e-05,
683
+ "loss": 0.0031,
684
+ "step": 93
685
+ },
686
+ {
687
+ "epoch": 9.4,
688
+ "grad_norm": 0.053670890629291534,
689
+ "learning_rate": 9.159078776924346e-05,
690
+ "loss": 0.0029,
691
+ "step": 94
692
+ },
693
+ {
694
+ "epoch": 9.5,
695
+ "grad_norm": 0.016972996294498444,
696
+ "learning_rate": 9.131193871579975e-05,
697
+ "loss": 0.0017,
698
+ "step": 95
699
+ },
700
+ {
701
+ "epoch": 9.6,
702
+ "grad_norm": 0.12130908668041229,
703
+ "learning_rate": 9.102898052541958e-05,
704
+ "loss": 0.0022,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 9.7,
709
+ "grad_norm": 0.04438166692852974,
710
+ "learning_rate": 9.074194134284726e-05,
711
+ "loss": 0.0025,
712
+ "step": 97
713
+ },
714
+ {
715
+ "epoch": 9.8,
716
+ "grad_norm": 0.05157145857810974,
717
+ "learning_rate": 9.045084971874738e-05,
718
+ "loss": 0.002,
719
+ "step": 98
720
+ },
721
+ {
722
+ "epoch": 9.9,
723
+ "grad_norm": 0.03810460492968559,
724
+ "learning_rate": 9.015573460686509e-05,
725
+ "loss": 0.0026,
726
+ "step": 99
727
+ },
728
+ {
729
+ "epoch": 10.0,
730
+ "grad_norm": 0.06720886379480362,
731
+ "learning_rate": 8.985662536114613e-05,
732
+ "loss": 0.0021,
733
+ "step": 100
734
+ },
735
+ {
736
+ "epoch": 10.0,
737
+ "eval_loss": 0.017384279519319534,
738
+ "eval_runtime": 7.4008,
739
+ "eval_samples_per_second": 13.512,
740
+ "eval_steps_per_second": 0.405,
741
+ "step": 100
742
+ },
743
+ {
744
+ "epoch": 10.1,
745
+ "grad_norm": 0.017021650448441505,
746
+ "learning_rate": 8.955355173281708e-05,
747
+ "loss": 0.0017,
748
+ "step": 101
749
+ },
750
+ {
751
+ "epoch": 10.2,
752
+ "grad_norm": 0.03386203572154045,
753
+ "learning_rate": 8.924654386742613e-05,
754
+ "loss": 0.0018,
755
+ "step": 102
756
+ },
757
+ {
758
+ "epoch": 10.3,
759
+ "grad_norm": 0.06196419894695282,
760
+ "learning_rate": 8.89356323018447e-05,
761
+ "loss": 0.0025,
762
+ "step": 103
763
+ },
764
+ {
765
+ "epoch": 10.4,
766
+ "grad_norm": 0.02523985505104065,
767
+ "learning_rate": 8.862084796122998e-05,
768
+ "loss": 0.0017,
769
+ "step": 104
770
+ },
771
+ {
772
+ "epoch": 10.5,
773
+ "grad_norm": 0.05176355317234993,
774
+ "learning_rate": 8.83022221559489e-05,
775
+ "loss": 0.0029,
776
+ "step": 105
777
+ },
778
+ {
779
+ "epoch": 10.6,
780
+ "grad_norm": 0.05031086131930351,
781
+ "learning_rate": 8.797978657846391e-05,
782
+ "loss": 0.0022,
783
+ "step": 106
784
+ },
785
+ {
786
+ "epoch": 10.7,
787
+ "grad_norm": 0.06354419887065887,
788
+ "learning_rate": 8.765357330018056e-05,
789
+ "loss": 0.0024,
790
+ "step": 107
791
+ },
792
+ {
793
+ "epoch": 10.8,
794
+ "grad_norm": 0.06342065334320068,
795
+ "learning_rate": 8.732361476825752e-05,
796
+ "loss": 0.0028,
797
+ "step": 108
798
+ },
799
+ {
800
+ "epoch": 10.9,
801
+ "grad_norm": 0.03949422389268875,
802
+ "learning_rate": 8.69899438023792e-05,
803
+ "loss": 0.0018,
804
+ "step": 109
805
+ },
806
+ {
807
+ "epoch": 11.0,
808
+ "grad_norm": 0.02962133288383484,
809
+ "learning_rate": 8.665259359149132e-05,
810
+ "loss": 0.0018,
811
+ "step": 110
812
+ },
813
+ {
814
+ "epoch": 11.1,
815
+ "grad_norm": 0.10264372825622559,
816
+ "learning_rate": 8.631159769049965e-05,
817
+ "loss": 0.0028,
818
+ "step": 111
819
+ },
820
+ {
821
+ "epoch": 11.2,
822
+ "grad_norm": 0.021233167499303818,
823
+ "learning_rate": 8.596699001693255e-05,
824
+ "loss": 0.0018,
825
+ "step": 112
826
+ },
827
+ {
828
+ "epoch": 11.3,
829
+ "grad_norm": 0.06390991806983948,
830
+ "learning_rate": 8.561880484756725e-05,
831
+ "loss": 0.0018,
832
+ "step": 113
833
+ },
834
+ {
835
+ "epoch": 11.4,
836
+ "grad_norm": 0.1139807403087616,
837
+ "learning_rate": 8.526707681502044e-05,
838
+ "loss": 0.0036,
839
+ "step": 114
840
+ },
841
+ {
842
+ "epoch": 11.5,
843
+ "grad_norm": 0.018219145014882088,
844
+ "learning_rate": 8.491184090430364e-05,
845
+ "loss": 0.0019,
846
+ "step": 115
847
+ },
848
+ {
849
+ "epoch": 11.6,
850
+ "grad_norm": 0.03801802918314934,
851
+ "learning_rate": 8.455313244934324e-05,
852
+ "loss": 0.0024,
853
+ "step": 116
854
+ },
855
+ {
856
+ "epoch": 11.7,
857
+ "grad_norm": 0.052779678255319595,
858
+ "learning_rate": 8.419098712946601e-05,
859
+ "loss": 0.0033,
860
+ "step": 117
861
+ },
862
+ {
863
+ "epoch": 11.8,
864
+ "grad_norm": 0.15576517581939697,
865
+ "learning_rate": 8.382544096585027e-05,
866
+ "loss": 0.0032,
867
+ "step": 118
868
+ },
869
+ {
870
+ "epoch": 11.9,
871
+ "grad_norm": 0.050439249724149704,
872
+ "learning_rate": 8.345653031794292e-05,
873
+ "loss": 0.0032,
874
+ "step": 119
875
+ },
876
+ {
877
+ "epoch": 12.0,
878
+ "grad_norm": 0.13610731065273285,
879
+ "learning_rate": 8.308429187984297e-05,
880
+ "loss": 0.0044,
881
+ "step": 120
882
+ },
883
+ {
884
+ "epoch": 12.1,
885
+ "grad_norm": 0.03380730748176575,
886
+ "learning_rate": 8.270876267665173e-05,
887
+ "loss": 0.0025,
888
+ "step": 121
889
+ },
890
+ {
891
+ "epoch": 12.2,
892
+ "grad_norm": 0.032273851335048676,
893
+ "learning_rate": 8.232998006078997e-05,
894
+ "loss": 0.002,
895
+ "step": 122
896
+ },
897
+ {
898
+ "epoch": 12.3,
899
+ "grad_norm": 0.021625736728310585,
900
+ "learning_rate": 8.19479817082828e-05,
901
+ "loss": 0.0023,
902
+ "step": 123
903
+ },
904
+ {
905
+ "epoch": 12.4,
906
+ "grad_norm": 0.050165340304374695,
907
+ "learning_rate": 8.156280561501195e-05,
908
+ "loss": 0.0025,
909
+ "step": 124
910
+ },
911
+ {
912
+ "epoch": 12.5,
913
+ "grad_norm": 0.052705712616443634,
914
+ "learning_rate": 8.117449009293668e-05,
915
+ "loss": 0.0031,
916
+ "step": 125
917
+ },
918
+ {
919
+ "epoch": 12.5,
920
+ "eval_loss": 0.014472348615527153,
921
+ "eval_runtime": 7.0196,
922
+ "eval_samples_per_second": 14.246,
923
+ "eval_steps_per_second": 0.427,
924
+ "step": 125
925
+ },
926
+ {
927
+ "epoch": 12.6,
928
+ "grad_norm": 0.05300145596265793,
929
+ "learning_rate": 8.07830737662829e-05,
930
+ "loss": 0.0023,
931
+ "step": 126
932
+ },
933
+ {
934
+ "epoch": 12.7,
935
+ "grad_norm": 0.06016397848725319,
936
+ "learning_rate": 8.038859556770151e-05,
937
+ "loss": 0.0027,
938
+ "step": 127
939
+ },
940
+ {
941
+ "epoch": 12.8,
942
+ "grad_norm": 0.06083128601312637,
943
+ "learning_rate": 7.999109473439569e-05,
944
+ "loss": 0.0019,
945
+ "step": 128
946
+ },
947
+ {
948
+ "epoch": 12.9,
949
+ "grad_norm": 0.036125779151916504,
950
+ "learning_rate": 7.959061080421839e-05,
951
+ "loss": 0.0026,
952
+ "step": 129
953
+ },
954
+ {
955
+ "epoch": 13.0,
956
+ "grad_norm": 0.03736874461174011,
957
+ "learning_rate": 7.91871836117395e-05,
958
+ "loss": 0.0016,
959
+ "step": 130
960
+ },
961
+ {
962
+ "epoch": 13.1,
963
+ "grad_norm": 0.0378425307571888,
964
+ "learning_rate": 7.878085328428369e-05,
965
+ "loss": 0.0018,
966
+ "step": 131
967
+ },
968
+ {
969
+ "epoch": 13.2,
970
+ "grad_norm": 0.06520125269889832,
971
+ "learning_rate": 7.83716602379391e-05,
972
+ "loss": 0.0037,
973
+ "step": 132
974
+ },
975
+ {
976
+ "epoch": 13.3,
977
+ "grad_norm": 0.06993651390075684,
978
+ "learning_rate": 7.795964517353735e-05,
979
+ "loss": 0.0021,
980
+ "step": 133
981
+ },
982
+ {
983
+ "epoch": 13.4,
984
+ "grad_norm": 0.0514182485640049,
985
+ "learning_rate": 7.754484907260513e-05,
986
+ "loss": 0.0023,
987
+ "step": 134
988
+ },
989
+ {
990
+ "epoch": 13.5,
991
+ "grad_norm": 0.0771847516298294,
992
+ "learning_rate": 7.712731319328798e-05,
993
+ "loss": 0.0022,
994
+ "step": 135
995
+ },
996
+ {
997
+ "epoch": 13.6,
998
+ "grad_norm": 0.02829659916460514,
999
+ "learning_rate": 7.670707906624644e-05,
1000
+ "loss": 0.0016,
1001
+ "step": 136
1002
+ },
1003
+ {
1004
+ "epoch": 13.7,
1005
+ "grad_norm": 0.08551648259162903,
1006
+ "learning_rate": 7.628418849052523e-05,
1007
+ "loss": 0.0024,
1008
+ "step": 137
1009
+ },
1010
+ {
1011
+ "epoch": 13.8,
1012
+ "grad_norm": 0.09427579492330551,
1013
+ "learning_rate": 7.585868352939563e-05,
1014
+ "loss": 0.0016,
1015
+ "step": 138
1016
+ },
1017
+ {
1018
+ "epoch": 13.9,
1019
+ "grad_norm": 0.04036640748381615,
1020
+ "learning_rate": 7.543060650617158e-05,
1021
+ "loss": 0.0018,
1022
+ "step": 139
1023
+ },
1024
+ {
1025
+ "epoch": 14.0,
1026
+ "grad_norm": 0.19952990114688873,
1027
+ "learning_rate": 7.500000000000001e-05,
1028
+ "loss": 0.0025,
1029
+ "step": 140
1030
+ },
1031
+ {
1032
+ "epoch": 14.1,
1033
+ "grad_norm": 0.11951940506696701,
1034
+ "learning_rate": 7.456690684162557e-05,
1035
+ "loss": 0.0026,
1036
+ "step": 141
1037
+ },
1038
+ {
1039
+ "epoch": 14.2,
1040
+ "grad_norm": 0.043521635234355927,
1041
+ "learning_rate": 7.413137010913054e-05,
1042
+ "loss": 0.0019,
1043
+ "step": 142
1044
+ },
1045
+ {
1046
+ "epoch": 14.3,
1047
+ "grad_norm": 0.07670493423938751,
1048
+ "learning_rate": 7.369343312364993e-05,
1049
+ "loss": 0.002,
1050
+ "step": 143
1051
+ },
1052
+ {
1053
+ "epoch": 14.4,
1054
+ "grad_norm": 0.027879884466528893,
1055
+ "learning_rate": 7.325313944506254e-05,
1056
+ "loss": 0.0015,
1057
+ "step": 144
1058
+ },
1059
+ {
1060
+ "epoch": 14.5,
1061
+ "grad_norm": 0.05514749884605408,
1062
+ "learning_rate": 7.281053286765815e-05,
1063
+ "loss": 0.0018,
1064
+ "step": 145
1065
+ },
1066
+ {
1067
+ "epoch": 14.6,
1068
+ "grad_norm": 0.06391794979572296,
1069
+ "learning_rate": 7.236565741578163e-05,
1070
+ "loss": 0.0024,
1071
+ "step": 146
1072
+ },
1073
+ {
1074
+ "epoch": 14.7,
1075
+ "grad_norm": 0.08744440227746964,
1076
+ "learning_rate": 7.191855733945387e-05,
1077
+ "loss": 0.0049,
1078
+ "step": 147
1079
+ },
1080
+ {
1081
+ "epoch": 14.8,
1082
+ "grad_norm": 0.056523509323596954,
1083
+ "learning_rate": 7.146927710997047e-05,
1084
+ "loss": 0.0024,
1085
+ "step": 148
1086
+ },
1087
+ {
1088
+ "epoch": 14.9,
1089
+ "grad_norm": 0.028166329488158226,
1090
+ "learning_rate": 7.101786141547828e-05,
1091
+ "loss": 0.0018,
1092
+ "step": 149
1093
+ },
1094
+ {
1095
+ "epoch": 15.0,
1096
+ "grad_norm": 0.09874721616506577,
1097
+ "learning_rate": 7.056435515653059e-05,
1098
+ "loss": 0.0022,
1099
+ "step": 150
1100
+ },
1101
+ {
1102
+ "epoch": 15.0,
1103
+ "eval_loss": 0.023497436195611954,
1104
+ "eval_runtime": 7.0483,
1105
+ "eval_samples_per_second": 14.188,
1106
+ "eval_steps_per_second": 0.426,
1107
+ "step": 150
1108
+ },
1109
+ {
1110
+ "epoch": 15.1,
1111
+ "grad_norm": 0.020559396594762802,
1112
+ "learning_rate": 7.010880344162088e-05,
1113
+ "loss": 0.0015,
1114
+ "step": 151
1115
+ },
1116
+ {
1117
+ "epoch": 15.2,
1118
+ "grad_norm": 0.06717398762702942,
1119
+ "learning_rate": 6.965125158269619e-05,
1120
+ "loss": 0.0022,
1121
+ "step": 152
1122
+ },
1123
+ {
1124
+ "epoch": 15.3,
1125
+ "grad_norm": 0.052798088639974594,
1126
+ "learning_rate": 6.919174509065004e-05,
1127
+ "loss": 0.0029,
1128
+ "step": 153
1129
+ },
1130
+ {
1131
+ "epoch": 15.4,
1132
+ "grad_norm": 0.04526599869132042,
1133
+ "learning_rate": 6.873032967079561e-05,
1134
+ "loss": 0.0022,
1135
+ "step": 154
1136
+ },
1137
+ {
1138
+ "epoch": 15.5,
1139
+ "grad_norm": 0.045334987342357635,
1140
+ "learning_rate": 6.826705121831976e-05,
1141
+ "loss": 0.0033,
1142
+ "step": 155
1143
+ },
1144
+ {
1145
+ "epoch": 15.6,
1146
+ "grad_norm": 0.02370765618979931,
1147
+ "learning_rate": 6.780195581371784e-05,
1148
+ "loss": 0.0022,
1149
+ "step": 156
1150
+ },
1151
+ {
1152
+ "epoch": 15.7,
1153
+ "grad_norm": 0.034078944474458694,
1154
+ "learning_rate": 6.733508971821036e-05,
1155
+ "loss": 0.0021,
1156
+ "step": 157
1157
+ },
1158
+ {
1159
+ "epoch": 15.8,
1160
+ "grad_norm": 0.04473605379462242,
1161
+ "learning_rate": 6.686649936914152e-05,
1162
+ "loss": 0.0019,
1163
+ "step": 158
1164
+ },
1165
+ {
1166
+ "epoch": 15.9,
1167
+ "grad_norm": 0.03901509568095207,
1168
+ "learning_rate": 6.639623137536023e-05,
1169
+ "loss": 0.002,
1170
+ "step": 159
1171
+ },
1172
+ {
1173
+ "epoch": 16.0,
1174
+ "grad_norm": 0.027788842096924782,
1175
+ "learning_rate": 6.592433251258423e-05,
1176
+ "loss": 0.0014,
1177
+ "step": 160
1178
+ },
1179
+ {
1180
+ "epoch": 16.1,
1181
+ "grad_norm": 0.02930135279893875,
1182
+ "learning_rate": 6.545084971874738e-05,
1183
+ "loss": 0.0017,
1184
+ "step": 161
1185
+ },
1186
+ {
1187
+ "epoch": 16.2,
1188
+ "grad_norm": 0.010466442443430424,
1189
+ "learning_rate": 6.497583008933097e-05,
1190
+ "loss": 0.0014,
1191
+ "step": 162
1192
+ },
1193
+ {
1194
+ "epoch": 16.3,
1195
+ "grad_norm": 0.021891970187425613,
1196
+ "learning_rate": 6.449932087267932e-05,
1197
+ "loss": 0.0016,
1198
+ "step": 163
1199
+ },
1200
+ {
1201
+ "epoch": 16.4,
1202
+ "grad_norm": 0.012705606408417225,
1203
+ "learning_rate": 6.402136946530014e-05,
1204
+ "loss": 0.0013,
1205
+ "step": 164
1206
+ },
1207
+ {
1208
+ "epoch": 16.5,
1209
+ "grad_norm": 0.019639883190393448,
1210
+ "learning_rate": 6.354202340715026e-05,
1211
+ "loss": 0.0016,
1212
+ "step": 165
1213
+ },
1214
+ {
1215
+ "epoch": 16.6,
1216
+ "grad_norm": 0.03136239945888519,
1217
+ "learning_rate": 6.306133037690693e-05,
1218
+ "loss": 0.0019,
1219
+ "step": 166
1220
+ },
1221
+ {
1222
+ "epoch": 16.7,
1223
+ "grad_norm": 0.04432203993201256,
1224
+ "learning_rate": 6.257933818722543e-05,
1225
+ "loss": 0.0016,
1226
+ "step": 167
1227
+ },
1228
+ {
1229
+ "epoch": 16.8,
1230
+ "grad_norm": 0.06362082064151764,
1231
+ "learning_rate": 6.209609477998338e-05,
1232
+ "loss": 0.0025,
1233
+ "step": 168
1234
+ },
1235
+ {
1236
+ "epoch": 16.9,
1237
+ "grad_norm": 0.03577618673443794,
1238
+ "learning_rate": 6.161164822151213e-05,
1239
+ "loss": 0.0018,
1240
+ "step": 169
1241
+ },
1242
+ {
1243
+ "epoch": 17.0,
1244
+ "grad_norm": 0.033404137939214706,
1245
+ "learning_rate": 6.112604669781572e-05,
1246
+ "loss": 0.0017,
1247
+ "step": 170
1248
+ },
1249
+ {
1250
+ "epoch": 17.1,
1251
+ "grad_norm": 0.0031848133075982332,
1252
+ "learning_rate": 6.063933850977811e-05,
1253
+ "loss": 0.0012,
1254
+ "step": 171
1255
+ },
1256
+ {
1257
+ "epoch": 17.2,
1258
+ "grad_norm": 0.02553616650402546,
1259
+ "learning_rate": 6.015157206835881e-05,
1260
+ "loss": 0.0014,
1261
+ "step": 172
1262
+ },
1263
+ {
1264
+ "epoch": 17.3,
1265
+ "grad_norm": 0.019564760848879814,
1266
+ "learning_rate": 5.9662795889777666e-05,
1267
+ "loss": 0.0013,
1268
+ "step": 173
1269
+ },
1270
+ {
1271
+ "epoch": 17.4,
1272
+ "grad_norm": 0.00845835916697979,
1273
+ "learning_rate": 5.917305859068912e-05,
1274
+ "loss": 0.0013,
1275
+ "step": 174
1276
+ },
1277
+ {
1278
+ "epoch": 17.5,
1279
+ "grad_norm": 0.008497758768498898,
1280
+ "learning_rate": 5.868240888334653e-05,
1281
+ "loss": 0.0013,
1282
+ "step": 175
1283
+ },
1284
+ {
1285
+ "epoch": 17.5,
1286
+ "eval_loss": 0.01870564930140972,
1287
+ "eval_runtime": 7.0134,
1288
+ "eval_samples_per_second": 14.258,
1289
+ "eval_steps_per_second": 0.428,
1290
+ "step": 175
1291
+ },
1292
+ {
1293
+ "epoch": 17.6,
1294
+ "grad_norm": 0.04741276800632477,
1295
+ "learning_rate": 5.819089557075689e-05,
1296
+ "loss": 0.0018,
1297
+ "step": 176
1298
+ },
1299
+ {
1300
+ "epoch": 17.7,
1301
+ "grad_norm": 0.014859266579151154,
1302
+ "learning_rate": 5.7698567541826675e-05,
1303
+ "loss": 0.0014,
1304
+ "step": 177
1305
+ },
1306
+ {
1307
+ "epoch": 17.8,
1308
+ "grad_norm": 0.05082236975431442,
1309
+ "learning_rate": 5.7205473766499005e-05,
1310
+ "loss": 0.0025,
1311
+ "step": 178
1312
+ },
1313
+ {
1314
+ "epoch": 17.9,
1315
+ "grad_norm": 0.05401023477315903,
1316
+ "learning_rate": 5.6711663290882776e-05,
1317
+ "loss": 0.0024,
1318
+ "step": 179
1319
+ },
1320
+ {
1321
+ "epoch": 18.0,
1322
+ "grad_norm": 0.010000503621995449,
1323
+ "learning_rate": 5.621718523237427e-05,
1324
+ "loss": 0.0014,
1325
+ "step": 180
1326
+ },
1327
+ {
1328
+ "epoch": 18.1,
1329
+ "grad_norm": 0.020556163042783737,
1330
+ "learning_rate": 5.57220887747716e-05,
1331
+ "loss": 0.0016,
1332
+ "step": 181
1333
+ },
1334
+ {
1335
+ "epoch": 18.2,
1336
+ "grad_norm": 0.004740948788821697,
1337
+ "learning_rate": 5.522642316338268e-05,
1338
+ "loss": 0.0013,
1339
+ "step": 182
1340
+ },
1341
+ {
1342
+ "epoch": 18.3,
1343
+ "grad_norm": 0.014636721462011337,
1344
+ "learning_rate": 5.473023770012686e-05,
1345
+ "loss": 0.0017,
1346
+ "step": 183
1347
+ },
1348
+ {
1349
+ "epoch": 18.4,
1350
+ "grad_norm": 0.004370884504169226,
1351
+ "learning_rate": 5.4233581738631165e-05,
1352
+ "loss": 0.0013,
1353
+ "step": 184
1354
+ },
1355
+ {
1356
+ "epoch": 18.5,
1357
+ "grad_norm": 0.03240854665637016,
1358
+ "learning_rate": 5.373650467932122e-05,
1359
+ "loss": 0.0016,
1360
+ "step": 185
1361
+ },
1362
+ {
1363
+ "epoch": 18.6,
1364
+ "grad_norm": 0.04714665934443474,
1365
+ "learning_rate": 5.323905596450759e-05,
1366
+ "loss": 0.0015,
1367
+ "step": 186
1368
+ },
1369
+ {
1370
+ "epoch": 18.7,
1371
+ "grad_norm": 0.018981872126460075,
1372
+ "learning_rate": 5.274128507346801e-05,
1373
+ "loss": 0.0013,
1374
+ "step": 187
1375
+ },
1376
+ {
1377
+ "epoch": 18.8,
1378
+ "grad_norm": 0.013816704973578453,
1379
+ "learning_rate": 5.2243241517525754e-05,
1380
+ "loss": 0.0013,
1381
+ "step": 188
1382
+ },
1383
+ {
1384
+ "epoch": 18.9,
1385
+ "grad_norm": 0.01641033962368965,
1386
+ "learning_rate": 5.174497483512506e-05,
1387
+ "loss": 0.0013,
1388
+ "step": 189
1389
+ },
1390
+ {
1391
+ "epoch": 19.0,
1392
+ "grad_norm": 0.01083611510694027,
1393
+ "learning_rate": 5.124653458690365e-05,
1394
+ "loss": 0.0013,
1395
+ "step": 190
1396
+ },
1397
+ {
1398
+ "epoch": 19.1,
1399
+ "grad_norm": 0.0032175600063055754,
1400
+ "learning_rate": 5.074797035076319e-05,
1401
+ "loss": 0.0013,
1402
+ "step": 191
1403
+ },
1404
+ {
1405
+ "epoch": 19.2,
1406
+ "grad_norm": 0.0029091965407133102,
1407
+ "learning_rate": 5.024933171693791e-05,
1408
+ "loss": 0.0013,
1409
+ "step": 192
1410
+ },
1411
+ {
1412
+ "epoch": 19.3,
1413
+ "grad_norm": 0.0017372623551636934,
1414
+ "learning_rate": 4.9750668283062104e-05,
1415
+ "loss": 0.0012,
1416
+ "step": 193
1417
+ },
1418
+ {
1419
+ "epoch": 19.4,
1420
+ "grad_norm": 0.018875645473599434,
1421
+ "learning_rate": 4.925202964923683e-05,
1422
+ "loss": 0.0013,
1423
+ "step": 194
1424
+ },
1425
+ {
1426
+ "epoch": 19.5,
1427
+ "grad_norm": 0.08334866166114807,
1428
+ "learning_rate": 4.875346541309637e-05,
1429
+ "loss": 0.0017,
1430
+ "step": 195
1431
+ },
1432
+ {
1433
+ "epoch": 19.6,
1434
+ "grad_norm": 0.02136778086423874,
1435
+ "learning_rate": 4.825502516487497e-05,
1436
+ "loss": 0.0013,
1437
+ "step": 196
1438
+ },
1439
+ {
1440
+ "epoch": 19.7,
1441
+ "grad_norm": 0.015435784123837948,
1442
+ "learning_rate": 4.775675848247427e-05,
1443
+ "loss": 0.0013,
1444
+ "step": 197
1445
+ },
1446
+ {
1447
+ "epoch": 19.8,
1448
+ "grad_norm": 0.0207098126411438,
1449
+ "learning_rate": 4.725871492653199e-05,
1450
+ "loss": 0.0013,
1451
+ "step": 198
1452
+ },
1453
+ {
1454
+ "epoch": 19.9,
1455
+ "grad_norm": 0.02912098728120327,
1456
+ "learning_rate": 4.6760944035492404e-05,
1457
+ "loss": 0.0014,
1458
+ "step": 199
1459
+ },
1460
+ {
1461
+ "epoch": 20.0,
1462
+ "grad_norm": 0.0012635978637263179,
1463
+ "learning_rate": 4.626349532067879e-05,
1464
+ "loss": 0.0012,
1465
+ "step": 200
1466
+ },
1467
+ {
1468
+ "epoch": 20.0,
1469
+ "eval_loss": 0.02464105747640133,
1470
+ "eval_runtime": 7.0042,
1471
+ "eval_samples_per_second": 14.277,
1472
+ "eval_steps_per_second": 0.428,
1473
+ "step": 200
1474
+ },
1475
+ {
1476
+ "epoch": 20.1,
1477
+ "grad_norm": 0.024539776146411896,
1478
+ "learning_rate": 4.576641826136884e-05,
1479
+ "loss": 0.0013,
1480
+ "step": 201
1481
+ },
1482
+ {
1483
+ "epoch": 20.2,
1484
+ "grad_norm": 0.04463370889425278,
1485
+ "learning_rate": 4.526976229987315e-05,
1486
+ "loss": 0.0015,
1487
+ "step": 202
1488
+ },
1489
+ {
1490
+ "epoch": 20.3,
1491
+ "grad_norm": 0.002574489451944828,
1492
+ "learning_rate": 4.477357683661734e-05,
1493
+ "loss": 0.0013,
1494
+ "step": 203
1495
+ },
1496
+ {
1497
+ "epoch": 20.4,
1498
+ "grad_norm": 0.022832421585917473,
1499
+ "learning_rate": 4.4277911225228414e-05,
1500
+ "loss": 0.0014,
1501
+ "step": 204
1502
+ },
1503
+ {
1504
+ "epoch": 20.5,
1505
+ "grad_norm": 0.011037301272153854,
1506
+ "learning_rate": 4.378281476762576e-05,
1507
+ "loss": 0.0013,
1508
+ "step": 205
1509
+ },
1510
+ {
1511
+ "epoch": 20.6,
1512
+ "grad_norm": 0.0005778741906397045,
1513
+ "learning_rate": 4.328833670911724e-05,
1514
+ "loss": 0.0012,
1515
+ "step": 206
1516
+ },
1517
+ {
1518
+ "epoch": 20.7,
1519
+ "grad_norm": 0.0034062073100358248,
1520
+ "learning_rate": 4.2794526233501006e-05,
1521
+ "loss": 0.0012,
1522
+ "step": 207
1523
+ },
1524
+ {
1525
+ "epoch": 20.8,
1526
+ "grad_norm": 0.00416824035346508,
1527
+ "learning_rate": 4.230143245817332e-05,
1528
+ "loss": 0.0012,
1529
+ "step": 208
1530
+ },
1531
+ {
1532
+ "epoch": 20.9,
1533
+ "grad_norm": 0.0014664290938526392,
1534
+ "learning_rate": 4.180910442924312e-05,
1535
+ "loss": 0.0013,
1536
+ "step": 209
1537
+ },
1538
+ {
1539
+ "epoch": 21.0,
1540
+ "grad_norm": 0.0013818548759445548,
1541
+ "learning_rate": 4.131759111665349e-05,
1542
+ "loss": 0.0012,
1543
+ "step": 210
1544
+ },
1545
+ {
1546
+ "epoch": 21.1,
1547
+ "grad_norm": 0.014141053892672062,
1548
+ "learning_rate": 4.082694140931089e-05,
1549
+ "loss": 0.0013,
1550
+ "step": 211
1551
+ },
1552
+ {
1553
+ "epoch": 21.2,
1554
+ "grad_norm": 0.0033045061863958836,
1555
+ "learning_rate": 4.0337204110222346e-05,
1556
+ "loss": 0.0012,
1557
+ "step": 212
1558
+ },
1559
+ {
1560
+ "epoch": 21.3,
1561
+ "grad_norm": 0.01347325835376978,
1562
+ "learning_rate": 3.98484279316412e-05,
1563
+ "loss": 0.0013,
1564
+ "step": 213
1565
+ },
1566
+ {
1567
+ "epoch": 21.4,
1568
+ "grad_norm": 0.0014377759071066976,
1569
+ "learning_rate": 3.936066149022191e-05,
1570
+ "loss": 0.0013,
1571
+ "step": 214
1572
+ },
1573
+ {
1574
+ "epoch": 21.5,
1575
+ "grad_norm": 0.004647238180041313,
1576
+ "learning_rate": 3.887395330218429e-05,
1577
+ "loss": 0.0012,
1578
+ "step": 215
1579
+ },
1580
+ {
1581
+ "epoch": 21.6,
1582
+ "grad_norm": 0.00935914646834135,
1583
+ "learning_rate": 3.838835177848788e-05,
1584
+ "loss": 0.0013,
1585
+ "step": 216
1586
+ },
1587
+ {
1588
+ "epoch": 21.7,
1589
+ "grad_norm": 0.0008201024029403925,
1590
+ "learning_rate": 3.790390522001662e-05,
1591
+ "loss": 0.0013,
1592
+ "step": 217
1593
+ },
1594
+ {
1595
+ "epoch": 21.8,
1596
+ "grad_norm": 0.0038301898166537285,
1597
+ "learning_rate": 3.742066181277458e-05,
1598
+ "loss": 0.0013,
1599
+ "step": 218
1600
+ },
1601
+ {
1602
+ "epoch": 21.9,
1603
+ "grad_norm": 0.023225074633955956,
1604
+ "learning_rate": 3.6938669623093084e-05,
1605
+ "loss": 0.0018,
1606
+ "step": 219
1607
+ },
1608
+ {
1609
+ "epoch": 22.0,
1610
+ "grad_norm": 0.0008843315881676972,
1611
+ "learning_rate": 3.6457976592849754e-05,
1612
+ "loss": 0.0013,
1613
+ "step": 220
1614
+ },
1615
+ {
1616
+ "epoch": 22.1,
1617
+ "grad_norm": 0.0008087409660220146,
1618
+ "learning_rate": 3.597863053469987e-05,
1619
+ "loss": 0.0012,
1620
+ "step": 221
1621
+ },
1622
+ {
1623
+ "epoch": 22.2,
1624
+ "grad_norm": 0.0007809096714481711,
1625
+ "learning_rate": 3.550067912732069e-05,
1626
+ "loss": 0.0012,
1627
+ "step": 222
1628
+ },
1629
+ {
1630
+ "epoch": 22.3,
1631
+ "grad_norm": 0.0003785623121075332,
1632
+ "learning_rate": 3.502416991066904e-05,
1633
+ "loss": 0.0012,
1634
+ "step": 223
1635
+ },
1636
+ {
1637
+ "epoch": 22.4,
1638
+ "grad_norm": 0.0011643291218206286,
1639
+ "learning_rate": 3.4549150281252636e-05,
1640
+ "loss": 0.0013,
1641
+ "step": 224
1642
+ },
1643
+ {
1644
+ "epoch": 22.5,
1645
+ "grad_norm": 0.00037547224201261997,
1646
+ "learning_rate": 3.4075667487415785e-05,
1647
+ "loss": 0.0013,
1648
+ "step": 225
1649
+ },
1650
+ {
1651
+ "epoch": 22.5,
1652
+ "eval_loss": 0.02635515108704567,
1653
+ "eval_runtime": 7.0286,
1654
+ "eval_samples_per_second": 14.228,
1655
+ "eval_steps_per_second": 0.427,
1656
+ "step": 225
1657
+ },
1658
+ {
1659
+ "epoch": 22.6,
1660
+ "grad_norm": 0.002436364535242319,
1661
+ "learning_rate": 3.360376862463979e-05,
1662
+ "loss": 0.0012,
1663
+ "step": 226
1664
+ },
1665
+ {
1666
+ "epoch": 22.7,
1667
+ "grad_norm": 0.000468397862277925,
1668
+ "learning_rate": 3.313350063085851e-05,
1669
+ "loss": 0.0012,
1670
+ "step": 227
1671
+ },
1672
+ {
1673
+ "epoch": 22.8,
1674
+ "grad_norm": 0.0013973162276670337,
1675
+ "learning_rate": 3.266491028178964e-05,
1676
+ "loss": 0.0013,
1677
+ "step": 228
1678
+ },
1679
+ {
1680
+ "epoch": 22.9,
1681
+ "grad_norm": 0.000565136200748384,
1682
+ "learning_rate": 3.219804418628216e-05,
1683
+ "loss": 0.0012,
1684
+ "step": 229
1685
+ },
1686
+ {
1687
+ "epoch": 23.0,
1688
+ "grad_norm": 0.0004575321509037167,
1689
+ "learning_rate": 3.173294878168025e-05,
1690
+ "loss": 0.0013,
1691
+ "step": 230
1692
+ },
1693
+ {
1694
+ "epoch": 23.1,
1695
+ "grad_norm": 0.0003873241657856852,
1696
+ "learning_rate": 3.12696703292044e-05,
1697
+ "loss": 0.0013,
1698
+ "step": 231
1699
+ },
1700
+ {
1701
+ "epoch": 23.2,
1702
+ "grad_norm": 0.00041245773900300264,
1703
+ "learning_rate": 3.080825490934999e-05,
1704
+ "loss": 0.0013,
1705
+ "step": 232
1706
+ },
1707
+ {
1708
+ "epoch": 23.3,
1709
+ "grad_norm": 0.0005566985928453505,
1710
+ "learning_rate": 3.0348748417303823e-05,
1711
+ "loss": 0.0012,
1712
+ "step": 233
1713
+ },
1714
+ {
1715
+ "epoch": 23.4,
1716
+ "grad_norm": 0.002370474860072136,
1717
+ "learning_rate": 2.989119655837913e-05,
1718
+ "loss": 0.0013,
1719
+ "step": 234
1720
+ },
1721
+ {
1722
+ "epoch": 23.5,
1723
+ "grad_norm": 0.0008109980844892561,
1724
+ "learning_rate": 2.9435644843469436e-05,
1725
+ "loss": 0.0013,
1726
+ "step": 235
1727
+ },
1728
+ {
1729
+ "epoch": 23.6,
1730
+ "grad_norm": 0.0003989999822806567,
1731
+ "learning_rate": 2.8982138584521735e-05,
1732
+ "loss": 0.0013,
1733
+ "step": 236
1734
+ },
1735
+ {
1736
+ "epoch": 23.7,
1737
+ "grad_norm": 0.0007184173446148634,
1738
+ "learning_rate": 2.8530722890029537e-05,
1739
+ "loss": 0.0013,
1740
+ "step": 237
1741
+ },
1742
+ {
1743
+ "epoch": 23.8,
1744
+ "grad_norm": 0.0005140411667525768,
1745
+ "learning_rate": 2.8081442660546125e-05,
1746
+ "loss": 0.0013,
1747
+ "step": 238
1748
+ },
1749
+ {
1750
+ "epoch": 23.9,
1751
+ "grad_norm": 0.000472583866212517,
1752
+ "learning_rate": 2.7634342584218365e-05,
1753
+ "loss": 0.0013,
1754
+ "step": 239
1755
+ },
1756
+ {
1757
+ "epoch": 24.0,
1758
+ "grad_norm": 0.0009467861382290721,
1759
+ "learning_rate": 2.718946713234185e-05,
1760
+ "loss": 0.0013,
1761
+ "step": 240
1762
+ },
1763
+ {
1764
+ "epoch": 24.1,
1765
+ "grad_norm": 0.0005134555394761264,
1766
+ "learning_rate": 2.674686055493748e-05,
1767
+ "loss": 0.0012,
1768
+ "step": 241
1769
+ },
1770
+ {
1771
+ "epoch": 24.2,
1772
+ "grad_norm": 0.0004058448248542845,
1773
+ "learning_rate": 2.630656687635007e-05,
1774
+ "loss": 0.0013,
1775
+ "step": 242
1776
+ },
1777
+ {
1778
+ "epoch": 24.3,
1779
+ "grad_norm": 0.0005244086496531963,
1780
+ "learning_rate": 2.5868629890869468e-05,
1781
+ "loss": 0.0012,
1782
+ "step": 243
1783
+ },
1784
+ {
1785
+ "epoch": 24.4,
1786
+ "grad_norm": 0.0005328291445039213,
1787
+ "learning_rate": 2.543309315837444e-05,
1788
+ "loss": 0.0013,
1789
+ "step": 244
1790
+ },
1791
+ {
1792
+ "epoch": 24.5,
1793
+ "grad_norm": 0.0020896121859550476,
1794
+ "learning_rate": 2.500000000000001e-05,
1795
+ "loss": 0.0012,
1796
+ "step": 245
1797
+ },
1798
+ {
1799
+ "epoch": 24.6,
1800
+ "grad_norm": 0.000433528795838356,
1801
+ "learning_rate": 2.456939349382843e-05,
1802
+ "loss": 0.0012,
1803
+ "step": 246
1804
+ },
1805
+ {
1806
+ "epoch": 24.7,
1807
+ "grad_norm": 0.00044738021097145975,
1808
+ "learning_rate": 2.4141316470604362e-05,
1809
+ "loss": 0.0013,
1810
+ "step": 247
1811
+ },
1812
+ {
1813
+ "epoch": 24.8,
1814
+ "grad_norm": 0.0004753637476824224,
1815
+ "learning_rate": 2.371581150947476e-05,
1816
+ "loss": 0.0012,
1817
+ "step": 248
1818
+ },
1819
+ {
1820
+ "epoch": 24.9,
1821
+ "grad_norm": 0.0004613220226019621,
1822
+ "learning_rate": 2.3292920933753566e-05,
1823
+ "loss": 0.0013,
1824
+ "step": 249
1825
+ },
1826
+ {
1827
+ "epoch": 25.0,
1828
+ "grad_norm": 0.0004400379257276654,
1829
+ "learning_rate": 2.2872686806712035e-05,
1830
+ "loss": 0.0013,
1831
+ "step": 250
1832
+ },
1833
+ {
1834
+ "epoch": 25.0,
1835
+ "eval_loss": 0.027748363092541695,
1836
+ "eval_runtime": 7.0062,
1837
+ "eval_samples_per_second": 14.273,
1838
+ "eval_steps_per_second": 0.428,
1839
+ "step": 250
1840
+ },
1841
+ {
1842
+ "epoch": 25.0,
1843
+ "step": 250,
1844
+ "total_flos": 3.832789293855867e+17,
1845
+ "train_loss": 0.011958676076494158,
1846
+ "train_runtime": 2822.6775,
1847
+ "train_samples_per_second": 5.617,
1848
+ "train_steps_per_second": 0.124
1849
  }
1850
  ],
1851
  "logging_steps": 1,
1852
+ "max_steps": 350,
1853
  "num_input_tokens_seen": 0,
1854
+ "num_train_epochs": 35,
1855
  "save_steps": 100,
1856
  "stateful_callbacks": {
1857
  "EarlyStoppingCallback": {
 
1860
  "early_stopping_threshold": 0.0
1861
  },
1862
  "attributes": {
1863
+ "early_stopping_patience_counter": 3
1864
  }
1865
  },
1866
  "TrainerControl": {
 
1869
  "should_evaluate": false,
1870
  "should_log": false,
1871
  "should_save": true,
1872
+ "should_training_stop": false
1873
  },
1874
  "attributes": {}
1875
  }
1876
  },
1877
+ "total_flos": 3.832789293855867e+17,
1878
  "train_batch_size": 48,
1879
  "trial_name": null,
1880
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ece316075c71a6297cb5cb7ca7f0cec745f9fc41e42b47318297a2a973691a8
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff46ab602c60d8c5d1c8d5d90dd3e078e4d5b0c7f9bfc0ed5d7c21920a4d63a
3
  size 5432
training_eval_loss.png CHANGED
training_loss.png CHANGED