jbenbudd committed on
Commit
c476fea
·
1 Parent(s): ff6c4f1

train_residue_list_higher_lr

Browse files
README.md CHANGED
@@ -7,19 +7,19 @@ tags:
7
  - lora
8
  - generated_from_trainer
9
  model-index:
10
- - name: train_2025-4-9-1-30-53
11
  results: []
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
17
- # train_2025-4-9-1-30-53
18
 
19
  This model is a fine-tuned version of [GreatCaptainNemo/ProLLaMA_Stage_1](https://huggingface.co/GreatCaptainNemo/ProLLaMA_Stage_1) on the adpr_train dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.1022
22
- - Num Input Tokens Seen: 8775072
23
 
24
  ## Model description
25
 
@@ -53,12 +53,12 @@ The following hyperparameters were used during training:
53
 
54
  | Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen |
55
  |:-------------:|:------:|:----:|:---------------:|:-----------------:|
56
- | 0.1559 | 0.4561 | 100 | 0.1594 | 1338368 |
57
- | 0.1601 | 0.9122 | 200 | 0.1585 | 2677376 |
58
- | 0.1588 | 1.3649 | 300 | 0.1527 | 4005008 |
59
- | 0.1387 | 1.8210 | 400 | 0.1312 | 5343632 |
60
- | 0.1181 | 2.2737 | 500 | 0.1117 | 6672288 |
61
- | 0.1024 | 2.7298 | 600 | 0.1033 | 8012064 |
62
 
63
 
64
  ### Framework versions
@@ -66,5 +66,5 @@ The following hyperparameters were used during training:
66
  - PEFT 0.14.0
67
  - Transformers 4.50.3
68
  - Pytorch 2.3.1+cu121
69
- - Datasets 3.4.1
70
  - Tokenizers 0.21.0
 
7
  - lora
8
  - generated_from_trainer
9
  model-index:
10
+ - name: train_residue_list_higher_lr
11
  results: []
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
17
+ # train_residue_list_higher_lr
18
 
19
  This model is a fine-tuned version of [GreatCaptainNemo/ProLLaMA_Stage_1](https://huggingface.co/GreatCaptainNemo/ProLLaMA_Stage_1) on the adpr_train dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2629
22
+ - Num Input Tokens Seen: 8057088
23
 
24
  ## Model description
25
 
 
53
 
54
  | Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen |
55
  |:-------------:|:------:|:----:|:---------------:|:-----------------:|
56
+ | 0.4645 | 0.4561 | 100 | 0.4702 | 1229824 |
57
+ | 0.4295 | 0.9122 | 200 | 0.4232 | 2457344 |
58
+ | 0.3545 | 1.3649 | 300 | 0.3548 | 3679728 |
59
+ | 0.3391 | 1.8210 | 400 | 0.3154 | 4908144 |
60
+ | 0.2775 | 2.2737 | 500 | 0.2792 | 6131072 |
61
+ | 0.2481 | 2.7298 | 600 | 0.2637 | 7358336 |
62
 
63
 
64
  ### Framework versions
 
66
  - PEFT 0.14.0
67
  - Transformers 4.50.3
68
  - Pytorch 2.3.1+cu121
69
+ - Datasets 3.5.0
70
  - Tokenizers 0.21.0
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "o_proj",
27
  "gate_proj",
28
- "up_proj",
29
  "q_proj",
30
- "k_proj",
31
  "down_proj",
32
- "v_proj"
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "gate_proj",
 
27
  "q_proj",
28
+ "v_proj",
29
  "down_proj",
30
+ "up_proj",
31
+ "k_proj",
32
+ "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4352be4fbf0006045edc8d34c5417c610a9e20acf970d380b54281a980bf810c
3
  size 639691872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:201d6463620877ae049418dbc5ceaca38a9c32234bf85c75929a429afa11e285
3
  size 639691872
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 2.9897377423033067,
3
- "eval_loss": 0.10221381485462189,
4
- "eval_runtime": 35.9572,
5
- "eval_samples_per_second": 86.714,
6
- "eval_steps_per_second": 5.423,
7
- "num_input_tokens_seen": 8775072,
8
- "total_flos": 3.56298712611029e+17,
9
- "train_loss": 0.35758140245438347,
10
- "train_runtime": 3571.0417,
11
- "train_samples_per_second": 23.571,
12
- "train_steps_per_second": 0.184
13
  }
 
1
  {
2
  "epoch": 2.9897377423033067,
3
+ "eval_loss": 0.26290133595466614,
4
+ "eval_runtime": 33.1367,
5
+ "eval_samples_per_second": 94.095,
6
+ "eval_steps_per_second": 5.885,
7
+ "num_input_tokens_seen": 8057088,
8
+ "total_flos": 3.271460429947208e+17,
9
+ "train_loss": 0.5877154775678295,
10
+ "train_runtime": 3293.4355,
11
+ "train_samples_per_second": 25.558,
12
+ "train_steps_per_second": 0.199
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9897377423033067,
3
- "eval_loss": 0.10221381485462189,
4
- "eval_runtime": 35.9572,
5
- "eval_samples_per_second": 86.714,
6
- "eval_steps_per_second": 5.423,
7
- "num_input_tokens_seen": 8775072
8
  }
 
1
  {
2
  "epoch": 2.9897377423033067,
3
+ "eval_loss": 0.26290133595466614,
4
+ "eval_runtime": 33.1367,
5
+ "eval_samples_per_second": 94.095,
6
+ "eval_steps_per_second": 5.885,
7
+ "num_input_tokens_seen": 8057088
8
  }
model_eval_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
trainer_log.jsonl CHANGED
@@ -1,138 +1,138 @@
1
- {"current_steps": 5, "total_steps": 657, "loss": 13.7981, "lr": 1e-05, "epoch": 0.02280501710376283, "percentage": 0.76, "elapsed_time": "0:00:25", "remaining_time": "0:55:51", "throughput": 2599.81, "total_tokens": 66816}
2
- {"current_steps": 10, "total_steps": 657, "loss": 10.0048, "lr": 2e-05, "epoch": 0.04561003420752566, "percentage": 1.52, "elapsed_time": "0:00:51", "remaining_time": "0:55:07", "throughput": 2621.31, "total_tokens": 134016}
3
- {"current_steps": 15, "total_steps": 657, "loss": 3.0982, "lr": 3e-05, "epoch": 0.06841505131128849, "percentage": 2.28, "elapsed_time": "0:01:16", "remaining_time": "0:54:29", "throughput": 2630.83, "total_tokens": 200960}
4
- {"current_steps": 20, "total_steps": 657, "loss": 1.4691, "lr": 4e-05, "epoch": 0.09122006841505131, "percentage": 3.04, "elapsed_time": "0:01:41", "remaining_time": "0:53:54", "throughput": 2636.48, "total_tokens": 267776}
5
- {"current_steps": 25, "total_steps": 657, "loss": 0.3994, "lr": 5e-05, "epoch": 0.11402508551881414, "percentage": 3.81, "elapsed_time": "0:02:06", "remaining_time": "0:53:22", "throughput": 2640.51, "total_tokens": 334464}
6
- {"current_steps": 30, "total_steps": 657, "loss": 0.248, "lr": 6e-05, "epoch": 0.13683010262257697, "percentage": 4.57, "elapsed_time": "0:02:32", "remaining_time": "0:53:01", "throughput": 2639.79, "total_tokens": 401792}
7
- {"current_steps": 35, "total_steps": 657, "loss": 0.2903, "lr": 7e-05, "epoch": 0.15963511972633979, "percentage": 5.33, "elapsed_time": "0:02:57", "remaining_time": "0:52:35", "throughput": 2640.34, "total_tokens": 468864}
8
- {"current_steps": 40, "total_steps": 657, "loss": 0.2275, "lr": 8e-05, "epoch": 0.18244013683010263, "percentage": 6.09, "elapsed_time": "0:03:22", "remaining_time": "0:52:09", "throughput": 2641.33, "total_tokens": 535808}
9
- {"current_steps": 45, "total_steps": 657, "loss": 0.2009, "lr": 9e-05, "epoch": 0.20524515393386544, "percentage": 6.85, "elapsed_time": "0:03:48", "remaining_time": "0:51:44", "throughput": 2641.3, "total_tokens": 603008}
10
- {"current_steps": 50, "total_steps": 657, "loss": 0.2084, "lr": 0.0001, "epoch": 0.22805017103762829, "percentage": 7.61, "elapsed_time": "0:04:13", "remaining_time": "0:51:16", "throughput": 2642.86, "total_tokens": 669696}
11
- {"current_steps": 55, "total_steps": 657, "loss": 0.2382, "lr": 9.998325912536413e-05, "epoch": 0.2508551881413911, "percentage": 8.37, "elapsed_time": "0:04:38", "remaining_time": "0:50:50", "throughput": 2643.43, "total_tokens": 736640}
12
- {"current_steps": 60, "total_steps": 657, "loss": 0.2569, "lr": 9.99330477117318e-05, "epoch": 0.27366020524515394, "percentage": 9.13, "elapsed_time": "0:05:03", "remaining_time": "0:50:23", "throughput": 2644.15, "total_tokens": 803456}
13
- {"current_steps": 65, "total_steps": 657, "loss": 0.238, "lr": 9.98493993824223e-05, "epoch": 0.29646522234891676, "percentage": 9.89, "elapsed_time": "0:05:29", "remaining_time": "0:49:59", "throughput": 2643.64, "total_tokens": 870784}
14
- {"current_steps": 70, "total_steps": 657, "loss": 0.1904, "lr": 9.973237015128338e-05, "epoch": 0.31927023945267957, "percentage": 10.65, "elapsed_time": "0:05:54", "remaining_time": "0:49:33", "throughput": 2644.11, "total_tokens": 937600}
15
- {"current_steps": 75, "total_steps": 657, "loss": 0.1738, "lr": 9.958203838518255e-05, "epoch": 0.34207525655644244, "percentage": 11.42, "elapsed_time": "0:06:19", "remaining_time": "0:49:06", "throughput": 2644.89, "total_tokens": 1004288}
16
- {"current_steps": 80, "total_steps": 657, "loss": 0.1715, "lr": 9.939850475152978e-05, "epoch": 0.36488027366020526, "percentage": 12.18, "elapsed_time": "0:06:44", "remaining_time": "0:48:40", "throughput": 2645.32, "total_tokens": 1071104}
17
- {"current_steps": 85, "total_steps": 657, "loss": 0.1628, "lr": 9.918189215086719e-05, "epoch": 0.38768529076396807, "percentage": 12.94, "elapsed_time": "0:07:09", "remaining_time": "0:48:13", "throughput": 2646.03, "total_tokens": 1137792}
18
- {"current_steps": 90, "total_steps": 657, "loss": 0.1609, "lr": 9.893234563457049e-05, "epoch": 0.4104903078677309, "percentage": 13.7, "elapsed_time": "0:07:35", "remaining_time": "0:47:47", "throughput": 2646.4, "total_tokens": 1204608}
19
- {"current_steps": 95, "total_steps": 657, "loss": 0.1601, "lr": 9.865003230771745e-05, "epoch": 0.43329532497149376, "percentage": 14.46, "elapsed_time": "0:08:00", "remaining_time": "0:47:22", "throughput": 2646.53, "total_tokens": 1271552}
20
- {"current_steps": 100, "total_steps": 657, "loss": 0.1559, "lr": 9.83351412171886e-05, "epoch": 0.45610034207525657, "percentage": 15.22, "elapsed_time": "0:08:25", "remaining_time": "0:46:56", "throughput": 2646.84, "total_tokens": 1338368}
21
- {"current_steps": 100, "total_steps": 657, "eval_loss": 0.15936881303787231, "epoch": 0.45610034207525657, "percentage": 15.22, "elapsed_time": "0:09:01", "remaining_time": "0:50:16", "throughput": 2471.23, "total_tokens": 1338368}
22
- {"current_steps": 105, "total_steps": 657, "loss": 0.1638, "lr": 9.798788322507475e-05, "epoch": 0.4789053591790194, "percentage": 15.98, "elapsed_time": "0:09:33", "remaining_time": "0:50:15", "throughput": 2450.43, "total_tokens": 1405568}
23
- {"current_steps": 110, "total_steps": 657, "loss": 0.2682, "lr": 9.76084908674764e-05, "epoch": 0.5017103762827823, "percentage": 16.74, "elapsed_time": "0:09:58", "remaining_time": "0:49:37", "throughput": 2459.08, "total_tokens": 1472256}
24
- {"current_steps": 115, "total_steps": 657, "loss": 0.1955, "lr": 9.719721819878942e-05, "epoch": 0.5245153933865451, "percentage": 17.5, "elapsed_time": "0:10:23", "remaining_time": "0:49:00", "throughput": 2466.75, "total_tokens": 1539200}
25
- {"current_steps": 120, "total_steps": 657, "loss": 0.1594, "lr": 9.67543406215813e-05, "epoch": 0.5473204104903079, "percentage": 18.26, "elapsed_time": "0:10:49", "remaining_time": "0:48:26", "throughput": 2473.55, "total_tokens": 1606400}
26
- {"current_steps": 125, "total_steps": 657, "loss": 0.1546, "lr": 9.6280154702172e-05, "epoch": 0.5701254275940707, "percentage": 19.03, "elapsed_time": "0:11:15", "remaining_time": "0:47:53", "throughput": 2479.59, "total_tokens": 1673856}
27
- {"current_steps": 130, "total_steps": 657, "loss": 0.163, "lr": 9.577497797204275e-05, "epoch": 0.5929304446978335, "percentage": 19.79, "elapsed_time": "0:11:40", "remaining_time": "0:47:19", "throughput": 2485.66, "total_tokens": 1740800}
28
- {"current_steps": 135, "total_steps": 657, "loss": 0.1565, "lr": 9.523914871520592e-05, "epoch": 0.6157354618015963, "percentage": 20.55, "elapsed_time": "0:12:05", "remaining_time": "0:46:45", "throughput": 2491.3, "total_tokens": 1807744}
29
- {"current_steps": 140, "total_steps": 657, "loss": 0.1545, "lr": 9.467302574167804e-05, "epoch": 0.6385404789053591, "percentage": 21.31, "elapsed_time": "0:12:31", "remaining_time": "0:46:13", "throughput": 2496.26, "total_tokens": 1875072}
30
- {"current_steps": 145, "total_steps": 657, "loss": 0.1561, "lr": 9.407698814720829e-05, "epoch": 0.661345496009122, "percentage": 22.07, "elapsed_time": "0:12:56", "remaining_time": "0:45:41", "throughput": 2501.1, "total_tokens": 1942144}
31
- {"current_steps": 150, "total_steps": 657, "loss": 0.1607, "lr": 9.345143505942254e-05, "epoch": 0.6841505131128849, "percentage": 22.83, "elapsed_time": "0:13:21", "remaining_time": "0:45:09", "throughput": 2505.84, "total_tokens": 2008960}
32
- {"current_steps": 155, "total_steps": 657, "loss": 0.1567, "lr": 9.279678537055364e-05, "epoch": 0.7069555302166477, "percentage": 23.59, "elapsed_time": "0:13:47", "remaining_time": "0:44:38", "throughput": 2510.1, "total_tokens": 2076032}
33
- {"current_steps": 160, "total_steps": 657, "loss": 0.1537, "lr": 9.211347745693642e-05, "epoch": 0.7297605473204105, "percentage": 24.35, "elapsed_time": "0:14:12", "remaining_time": "0:44:07", "throughput": 2514.42, "total_tokens": 2142720}
34
- {"current_steps": 165, "total_steps": 657, "loss": 0.1613, "lr": 9.140196888545571e-05, "epoch": 0.7525655644241733, "percentage": 25.11, "elapsed_time": "0:14:37", "remaining_time": "0:43:36", "throughput": 2518.37, "total_tokens": 2209536}
35
- {"current_steps": 170, "total_steps": 657, "loss": 0.1525, "lr": 9.066273610714337e-05, "epoch": 0.7753705815279361, "percentage": 25.88, "elapsed_time": "0:15:02", "remaining_time": "0:43:05", "throughput": 2522.22, "total_tokens": 2276224}
36
- {"current_steps": 175, "total_steps": 657, "loss": 0.1582, "lr": 8.989627413813019e-05, "epoch": 0.798175598631699, "percentage": 26.64, "elapsed_time": "0:15:27", "remaining_time": "0:42:35", "throughput": 2525.75, "total_tokens": 2343040}
37
- {"current_steps": 180, "total_steps": 657, "loss": 0.1614, "lr": 8.910309622816558e-05, "epoch": 0.8209806157354618, "percentage": 27.4, "elapsed_time": "0:15:52", "remaining_time": "0:42:05", "throughput": 2529.01, "total_tokens": 2409984}
38
- {"current_steps": 185, "total_steps": 657, "loss": 0.1602, "lr": 8.828373351692773e-05, "epoch": 0.8437856328392246, "percentage": 28.16, "elapsed_time": "0:16:18", "remaining_time": "0:41:35", "throughput": 2532.1, "total_tokens": 2476928}
39
- {"current_steps": 190, "total_steps": 657, "loss": 0.1734, "lr": 8.743873467835388e-05, "epoch": 0.8665906499429875, "percentage": 28.92, "elapsed_time": "0:16:43", "remaining_time": "0:41:06", "throughput": 2535.03, "total_tokens": 2543872}
40
- {"current_steps": 195, "total_steps": 657, "loss": 0.1583, "lr": 8.656866555322896e-05, "epoch": 0.8893956670467503, "percentage": 29.68, "elapsed_time": "0:17:08", "remaining_time": "0:40:37", "throughput": 2537.84, "total_tokens": 2610816}
41
- {"current_steps": 200, "total_steps": 657, "loss": 0.1601, "lr": 8.567410877027891e-05, "epoch": 0.9122006841505131, "percentage": 30.44, "elapsed_time": "0:17:33", "remaining_time": "0:40:07", "throughput": 2540.71, "total_tokens": 2677376}
42
- {"current_steps": 200, "total_steps": 657, "eval_loss": 0.1585174947977066, "epoch": 0.9122006841505131, "percentage": 30.44, "elapsed_time": "0:18:09", "remaining_time": "0:41:30", "throughput": 2456.92, "total_tokens": 2677376}
43
- {"current_steps": 205, "total_steps": 657, "loss": 0.1547, "lr": 8.475566335602205e-05, "epoch": 0.935005701254276, "percentage": 31.2, "elapsed_time": "0:18:42", "remaining_time": "0:41:14", "throughput": 2445.59, "total_tokens": 2744320}
44
- {"current_steps": 210, "total_steps": 657, "loss": 0.1596, "lr": 8.381394433364e-05, "epoch": 0.9578107183580388, "percentage": 31.96, "elapsed_time": "0:19:07", "remaining_time": "0:40:42", "throughput": 2450.15, "total_tokens": 2811136}
45
- {"current_steps": 215, "total_steps": 657, "loss": 0.1615, "lr": 8.284958231113656e-05, "epoch": 0.9806157354618016, "percentage": 32.72, "elapsed_time": "0:19:32", "remaining_time": "0:40:10", "throughput": 2454.36, "total_tokens": 2878208}
46
- {"current_steps": 220, "total_steps": 657, "loss": 0.1663, "lr": 8.186322305906066e-05, "epoch": 1.0, "percentage": 33.49, "elapsed_time": "0:19:54", "remaining_time": "0:39:31", "throughput": 2457.57, "total_tokens": 2934672}
47
- {"current_steps": 225, "total_steps": 657, "loss": 0.157, "lr": 8.085552707807567e-05, "epoch": 1.0228050171037628, "percentage": 34.25, "elapsed_time": "0:20:19", "remaining_time": "0:39:01", "throughput": 2461.57, "total_tokens": 3001488}
48
- {"current_steps": 230, "total_steps": 657, "loss": 0.1599, "lr": 7.982716915666515e-05, "epoch": 1.0456100342075256, "percentage": 35.01, "elapsed_time": "0:20:44", "remaining_time": "0:38:30", "throughput": 2465.58, "total_tokens": 3068048}
49
- {"current_steps": 235, "total_steps": 657, "loss": 0.1558, "lr": 7.877883791927082e-05, "epoch": 1.0684150513112884, "percentage": 35.77, "elapsed_time": "0:21:09", "remaining_time": "0:37:59", "throughput": 2469.29, "total_tokens": 3134864}
50
- {"current_steps": 240, "total_steps": 657, "loss": 0.1507, "lr": 7.771123536516558e-05, "epoch": 1.0912200684150513, "percentage": 36.53, "elapsed_time": "0:21:34", "remaining_time": "0:37:30", "throughput": 2472.66, "total_tokens": 3202064}
51
- {"current_steps": 245, "total_steps": 657, "loss": 0.1611, "lr": 7.662507639837018e-05, "epoch": 1.114025085518814, "percentage": 37.29, "elapsed_time": "0:22:00", "remaining_time": "0:36:59", "throughput": 2476.17, "total_tokens": 3268752}
52
- {"current_steps": 250, "total_steps": 657, "loss": 0.1581, "lr": 7.552108834892857e-05, "epoch": 1.1368301026225769, "percentage": 38.05, "elapsed_time": "0:22:25", "remaining_time": "0:36:30", "throughput": 2479.48, "total_tokens": 3335568}
53
- {"current_steps": 255, "total_steps": 657, "loss": 0.1543, "lr": 7.440001048586209e-05, "epoch": 1.1596351197263397, "percentage": 38.81, "elapsed_time": "0:22:50", "remaining_time": "0:36:00", "throughput": 2482.54, "total_tokens": 3402640}
54
- {"current_steps": 260, "total_steps": 657, "loss": 0.1608, "lr": 7.32625935221293e-05, "epoch": 1.1824401368301025, "percentage": 39.57, "elapsed_time": "0:23:15", "remaining_time": "0:35:31", "throughput": 2485.48, "total_tokens": 3469712}
55
- {"current_steps": 265, "total_steps": 657, "loss": 0.1569, "lr": 7.210959911192214e-05, "epoch": 1.2052451539338653, "percentage": 40.33, "elapsed_time": "0:23:41", "remaining_time": "0:35:02", "throughput": 2488.45, "total_tokens": 3536528}
56
- {"current_steps": 270, "total_steps": 657, "loss": 0.1608, "lr": 7.094179934063567e-05, "epoch": 1.2280501710376284, "percentage": 41.1, "elapsed_time": "0:24:06", "remaining_time": "0:34:33", "throughput": 2491.13, "total_tokens": 3603728}
57
- {"current_steps": 275, "total_steps": 657, "loss": 0.1562, "lr": 6.975997620785276e-05, "epoch": 1.2508551881413912, "percentage": 41.86, "elapsed_time": "0:24:31", "remaining_time": "0:34:04", "throughput": 2493.9, "total_tokens": 3670544}
58
- {"current_steps": 280, "total_steps": 657, "loss": 0.159, "lr": 6.856492110368969e-05, "epoch": 1.273660205245154, "percentage": 42.62, "elapsed_time": "0:24:56", "remaining_time": "0:33:35", "throughput": 2496.58, "total_tokens": 3737360}
59
- {"current_steps": 285, "total_steps": 657, "loss": 0.1552, "lr": 6.735743427885375e-05, "epoch": 1.2964652223489168, "percentage": 43.38, "elapsed_time": "0:25:22", "remaining_time": "0:33:07", "throughput": 2498.99, "total_tokens": 3804560}
60
- {"current_steps": 290, "total_steps": 657, "loss": 0.1611, "lr": 6.613832430876727e-05, "epoch": 1.3192702394526796, "percentage": 44.14, "elapsed_time": "0:25:47", "remaining_time": "0:32:38", "throughput": 2501.49, "total_tokens": 3871376}
61
- {"current_steps": 295, "total_steps": 657, "loss": 0.1514, "lr": 6.490840755211736e-05, "epoch": 1.3420752565564424, "percentage": 44.9, "elapsed_time": "0:26:12", "remaining_time": "0:32:09", "throughput": 2503.97, "total_tokens": 3938064}
62
- {"current_steps": 300, "total_steps": 657, "loss": 0.1588, "lr": 6.366850760419341e-05, "epoch": 1.3648802736602053, "percentage": 45.66, "elapsed_time": "0:26:37", "remaining_time": "0:31:41", "throughput": 2506.27, "total_tokens": 4005008}
63
- {"current_steps": 300, "total_steps": 657, "eval_loss": 0.15267899632453918, "epoch": 1.3648802736602053, "percentage": 45.66, "elapsed_time": "0:27:13", "remaining_time": "0:32:24", "throughput": 2451.16, "total_tokens": 4005008}
64
- {"current_steps": 305, "total_steps": 657, "loss": 0.1552, "lr": 6.241945474537901e-05, "epoch": 1.387685290763968, "percentage": 46.42, "elapsed_time": "0:27:43", "remaining_time": "0:31:59", "throughput": 2448.16, "total_tokens": 4071696}
65
- {"current_steps": 310, "total_steps": 657, "loss": 0.1506, "lr": 6.116208538516707e-05, "epoch": 1.4104903078677309, "percentage": 47.18, "elapsed_time": "0:28:08", "remaining_time": "0:31:30", "throughput": 2451.0, "total_tokens": 4139024}
66
- {"current_steps": 315, "total_steps": 657, "loss": 0.1516, "lr": 5.98972415020708e-05, "epoch": 1.4332953249714937, "percentage": 47.95, "elapsed_time": "0:28:33", "remaining_time": "0:31:00", "throughput": 2454.0, "total_tokens": 4205712}
67
- {"current_steps": 320, "total_steps": 657, "loss": 0.1537, "lr": 5.862577007980544e-05, "epoch": 1.4561003420752565, "percentage": 48.71, "elapsed_time": "0:28:59", "remaining_time": "0:30:31", "throughput": 2456.74, "total_tokens": 4272912}
68
- {"current_steps": 325, "total_steps": 657, "loss": 0.1526, "lr": 5.734852254011833e-05, "epoch": 1.4789053591790193, "percentage": 49.47, "elapsed_time": "0:29:24", "remaining_time": "0:30:02", "throughput": 2459.49, "total_tokens": 4339856}
69
- {"current_steps": 330, "total_steps": 657, "loss": 0.1465, "lr": 5.60663541726471e-05, "epoch": 1.5017103762827824, "percentage": 50.23, "elapsed_time": "0:29:49", "remaining_time": "0:29:33", "throughput": 2462.1, "total_tokens": 4406928}
70
- {"current_steps": 335, "total_steps": 657, "loss": 0.1467, "lr": 5.478012356218779e-05, "epoch": 1.5245153933865452, "percentage": 50.99, "elapsed_time": "0:30:15", "remaining_time": "0:29:04", "throughput": 2464.71, "total_tokens": 4473872}
71
- {"current_steps": 340, "total_steps": 657, "loss": 0.1476, "lr": 5.349069201375657e-05, "epoch": 1.547320410490308, "percentage": 51.75, "elapsed_time": "0:30:40", "remaining_time": "0:28:36", "throughput": 2467.14, "total_tokens": 4541072}
72
- {"current_steps": 345, "total_steps": 657, "loss": 0.1486, "lr": 5.2198922975829544e-05, "epoch": 1.5701254275940708, "percentage": 52.51, "elapsed_time": "0:31:05", "remaining_time": "0:28:07", "throughput": 2469.75, "total_tokens": 4607632}
73
- {"current_steps": 350, "total_steps": 657, "loss": 0.1413, "lr": 5.090568146214764e-05, "epoch": 1.5929304446978336, "percentage": 53.27, "elapsed_time": "0:31:30", "remaining_time": "0:27:38", "throughput": 2472.1, "total_tokens": 4674704}
74
- {"current_steps": 355, "total_steps": 657, "loss": 0.1474, "lr": 4.961183347247301e-05, "epoch": 1.6157354618015964, "percentage": 54.03, "elapsed_time": "0:31:56", "remaining_time": "0:27:10", "throughput": 2474.44, "total_tokens": 4741648}
75
- {"current_steps": 360, "total_steps": 657, "loss": 0.1442, "lr": 4.831824541268537e-05, "epoch": 1.6385404789053593, "percentage": 54.79, "elapsed_time": "0:32:21", "remaining_time": "0:26:41", "throughput": 2476.7, "total_tokens": 4808592}
76
- {"current_steps": 365, "total_steps": 657, "loss": 0.14, "lr": 4.702578351460633e-05, "epoch": 1.661345496009122, "percentage": 55.56, "elapsed_time": "0:32:46", "remaining_time": "0:26:13", "throughput": 2478.89, "total_tokens": 4875536}
77
- {"current_steps": 370, "total_steps": 657, "loss": 0.142, "lr": 4.573531325594017e-05, "epoch": 1.6841505131128849, "percentage": 56.32, "elapsed_time": "0:33:12", "remaining_time": "0:25:45", "throughput": 2481.01, "total_tokens": 4942480}
78
- {"current_steps": 375, "total_steps": 657, "loss": 0.138, "lr": 4.444769878071977e-05, "epoch": 1.7069555302166477, "percentage": 57.08, "elapsed_time": "0:33:37", "remaining_time": "0:25:17", "throughput": 2483.1, "total_tokens": 5009424}
79
- {"current_steps": 380, "total_steps": 657, "loss": 0.1388, "lr": 4.316380232064543e-05, "epoch": 1.7297605473204105, "percentage": 57.84, "elapsed_time": "0:34:02", "remaining_time": "0:24:49", "throughput": 2485.16, "total_tokens": 5076368}
80
- {"current_steps": 385, "total_steps": 657, "loss": 0.1372, "lr": 4.188448361770458e-05, "epoch": 1.7525655644241733, "percentage": 58.6, "elapsed_time": "0:34:27", "remaining_time": "0:24:20", "throughput": 2487.24, "total_tokens": 5143056}
81
- {"current_steps": 390, "total_steps": 657, "loss": 0.1351, "lr": 4.061059934845818e-05, "epoch": 1.7753705815279361, "percentage": 59.36, "elapsed_time": "0:34:52", "remaining_time": "0:23:52", "throughput": 2489.28, "total_tokens": 5209744}
82
- {"current_steps": 395, "total_steps": 657, "loss": 0.1327, "lr": 3.93430025503803e-05, "epoch": 1.798175598631699, "percentage": 60.12, "elapsed_time": "0:35:18", "remaining_time": "0:23:24", "throughput": 2491.17, "total_tokens": 5276688}
83
- {"current_steps": 400, "total_steps": 657, "loss": 0.1387, "lr": 3.8082542050634405e-05, "epoch": 1.8209806157354618, "percentage": 60.88, "elapsed_time": "0:35:43", "remaining_time": "0:22:57", "throughput": 2493.02, "total_tokens": 5343632}
84
- {"current_steps": 400, "total_steps": 657, "eval_loss": 0.1311902403831482, "epoch": 1.8209806157354618, "percentage": 60.88, "elapsed_time": "0:36:19", "remaining_time": "0:23:20", "throughput": 2451.89, "total_tokens": 5343632}
85
- {"current_steps": 405, "total_steps": 657, "loss": 0.1328, "lr": 3.6830061897668866e-05, "epoch": 1.8437856328392246, "percentage": 61.64, "elapsed_time": "0:36:51", "remaining_time": "0:22:56", "throughput": 2446.48, "total_tokens": 5410832}
86
- {"current_steps": 410, "total_steps": 657, "loss": 0.1392, "lr": 3.558640079601265e-05, "epoch": 1.8665906499429874, "percentage": 62.4, "elapsed_time": "0:37:17", "remaining_time": "0:22:27", "throughput": 2448.74, "total_tokens": 5477904}
87
- {"current_steps": 415, "total_steps": 657, "loss": 0.1308, "lr": 3.435239154464947e-05, "epoch": 1.8893956670467502, "percentage": 63.17, "elapsed_time": "0:37:42", "remaining_time": "0:21:59", "throughput": 2451.0, "total_tokens": 5544720}
88
- {"current_steps": 420, "total_steps": 657, "loss": 0.1238, "lr": 3.312886047934639e-05, "epoch": 1.912200684150513, "percentage": 63.93, "elapsed_time": "0:38:07", "remaining_time": "0:21:30", "throughput": 2453.22, "total_tokens": 5611536}
89
- {"current_steps": 425, "total_steps": 657, "loss": 0.131, "lr": 3.191662691931051e-05, "epoch": 1.9350057012542758, "percentage": 64.69, "elapsed_time": "0:38:32", "remaining_time": "0:21:02", "throughput": 2455.33, "total_tokens": 5678480}
90
- {"current_steps": 430, "total_steps": 657, "loss": 0.1292, "lr": 3.071650261854414e-05, "epoch": 1.9578107183580387, "percentage": 65.45, "elapsed_time": "0:38:58", "remaining_time": "0:20:34", "throughput": 2457.21, "total_tokens": 5746192}
91
- {"current_steps": 435, "total_steps": 657, "loss": 0.1233, "lr": 2.9529291222265922e-05, "epoch": 1.9806157354618015, "percentage": 66.21, "elapsed_time": "0:39:23", "remaining_time": "0:20:06", "throughput": 2459.23, "total_tokens": 5813264}
92
- {"current_steps": 440, "total_steps": 657, "loss": 0.126, "lr": 2.8355787728761952e-05, "epoch": 2.0, "percentage": 66.97, "elapsed_time": "0:39:45", "remaining_time": "0:19:36", "throughput": 2460.84, "total_tokens": 5869600}
93
- {"current_steps": 445, "total_steps": 657, "loss": 0.1191, "lr": 2.7196777957027013e-05, "epoch": 2.022805017103763, "percentage": 67.73, "elapsed_time": "0:40:10", "remaining_time": "0:19:08", "throughput": 2462.81, "total_tokens": 5936544}
94
- {"current_steps": 450, "total_steps": 657, "loss": 0.1202, "lr": 2.6053038020552685e-05, "epoch": 2.0456100342075256, "percentage": 68.49, "elapsed_time": "0:40:35", "remaining_time": "0:18:40", "throughput": 2464.8, "total_tokens": 6003232}
95
- {"current_steps": 455, "total_steps": 657, "loss": 0.1235, "lr": 2.492533380761466e-05, "epoch": 2.0684150513112884, "percentage": 69.25, "elapsed_time": "0:41:00", "remaining_time": "0:18:12", "throughput": 2466.76, "total_tokens": 6069920}
96
- {"current_steps": 460, "total_steps": 657, "loss": 0.1153, "lr": 2.3814420468407195e-05, "epoch": 2.0912200684150513, "percentage": 70.02, "elapsed_time": "0:41:25", "remaining_time": "0:17:44", "throughput": 2468.65, "total_tokens": 6136736}
97
- {"current_steps": 465, "total_steps": 657, "loss": 0.1138, "lr": 2.2721041909367986e-05, "epoch": 2.114025085518814, "percentage": 70.78, "elapsed_time": "0:41:51", "remaining_time": "0:17:16", "throughput": 2470.46, "total_tokens": 6203680}
98
- {"current_steps": 470, "total_steps": 657, "loss": 0.1198, "lr": 2.164593029503249e-05, "epoch": 2.136830102622577, "percentage": 71.54, "elapsed_time": "0:42:16", "remaining_time": "0:16:49", "throughput": 2472.17, "total_tokens": 6270880}
99
- {"current_steps": 475, "total_steps": 657, "loss": 0.1179, "lr": 2.0589805557750912e-05, "epoch": 2.1596351197263397, "percentage": 72.3, "elapsed_time": "0:42:41", "remaining_time": "0:16:21", "throughput": 2473.98, "total_tokens": 6337568}
100
- {"current_steps": 480, "total_steps": 657, "loss": 0.1198, "lr": 1.9553374915596328e-05, "epoch": 2.1824401368301025, "percentage": 73.06, "elapsed_time": "0:43:06", "remaining_time": "0:15:53", "throughput": 2475.75, "total_tokens": 6404256}
101
- {"current_steps": 485, "total_steps": 657, "loss": 0.1152, "lr": 1.853733239878669e-05, "epoch": 2.2052451539338653, "percentage": 73.82, "elapsed_time": "0:43:32", "remaining_time": "0:15:26", "throughput": 2477.36, "total_tokens": 6471456}
102
- {"current_steps": 490, "total_steps": 657, "loss": 0.115, "lr": 1.754235838493795e-05, "epoch": 2.228050171037628, "percentage": 74.58, "elapsed_time": "0:43:57", "remaining_time": "0:14:58", "throughput": 2479.03, "total_tokens": 6538272}
103
- {"current_steps": 495, "total_steps": 657, "loss": 0.1195, "lr": 1.6569119143459387e-05, "epoch": 2.250855188141391, "percentage": 75.34, "elapsed_time": "0:44:22", "remaining_time": "0:14:31", "throughput": 2480.64, "total_tokens": 6605216}
104
- {"current_steps": 500, "total_steps": 657, "loss": 0.1181, "lr": 1.561826638939628e-05, "epoch": 2.2736602052451538, "percentage": 76.1, "elapsed_time": "0:44:48", "remaining_time": "0:14:04", "throughput": 2482.18, "total_tokens": 6672288}
105
- {"current_steps": 500, "total_steps": 657, "eval_loss": 0.11165652424097061, "epoch": 2.2736602052451538, "percentage": 76.1, "elapsed_time": "0:45:23", "remaining_time": "0:14:15", "throughput": 2449.44, "total_tokens": 6672288}
106
- {"current_steps": 505, "total_steps": 657, "loss": 0.1144, "lr": 1.4690436847018757e-05, "epoch": 2.2964652223489166, "percentage": 76.86, "elapsed_time": "0:45:56", "remaining_time": "0:13:49", "throughput": 2445.0, "total_tokens": 6739488}
107
- {"current_steps": 510, "total_steps": 657, "loss": 0.1133, "lr": 1.3786251823448909e-05, "epoch": 2.3192702394526794, "percentage": 77.63, "elapsed_time": "0:46:21", "remaining_time": "0:13:21", "throughput": 2446.91, "total_tokens": 6806176}
108
- {"current_steps": 515, "total_steps": 657, "loss": 0.1174, "lr": 1.2906316792611828e-05, "epoch": 2.342075256556442, "percentage": 78.39, "elapsed_time": "0:46:46", "remaining_time": "0:12:53", "throughput": 2448.68, "total_tokens": 6873376}
109
- {"current_steps": 520, "total_steps": 657, "loss": 0.103, "lr": 1.2051220989789075e-05, "epoch": 2.364880273660205, "percentage": 79.15, "elapsed_time": "0:47:12", "remaining_time": "0:12:26", "throughput": 2450.49, "total_tokens": 6940192}
110
- {"current_steps": 525, "total_steps": 657, "loss": 0.1038, "lr": 1.1221537017046101e-05, "epoch": 2.387685290763968, "percentage": 79.91, "elapsed_time": "0:47:37", "remaining_time": "0:11:58", "throughput": 2452.24, "total_tokens": 7007136}
111
- {"current_steps": 530, "total_steps": 657, "loss": 0.1111, "lr": 1.0417820459797939e-05, "epoch": 2.4104903078677307, "percentage": 80.67, "elapsed_time": "0:48:02", "remaining_time": "0:11:30", "throughput": 2453.9, "total_tokens": 7074336}
112
- {"current_steps": 535, "total_steps": 657, "loss": 0.1137, "lr": 9.640609514769695e-06, "epoch": 2.433295324971494, "percentage": 81.43, "elapsed_time": "0:48:28", "remaining_time": "0:11:03", "throughput": 2455.62, "total_tokens": 7141152}
113
- {"current_steps": 540, "total_steps": 657, "loss": 0.1059, "lr": 8.890424629601197e-06, "epoch": 2.4561003420752567, "percentage": 82.19, "elapsed_time": "0:48:53", "remaining_time": "0:10:35", "throughput": 2457.29, "total_tokens": 7208096}
114
- {"current_steps": 545, "total_steps": 657, "loss": 0.1065, "lr": 8.167768154337102e-06, "epoch": 2.4789053591790196, "percentage": 82.95, "elapsed_time": "0:49:18", "remaining_time": "0:10:07", "throughput": 2458.98, "total_tokens": 7274784}
115
- {"current_steps": 550, "total_steps": 657, "loss": 0.0998, "lr": 7.47312400503572e-06, "epoch": 2.5017103762827824, "percentage": 83.71, "elapsed_time": "0:49:43", "remaining_time": "0:09:40", "throughput": 2460.56, "total_tokens": 7341856}
116
- {"current_steps": 555, "total_steps": 657, "loss": 0.1117, "lr": 6.806957339721837e-06, "epoch": 2.524515393386545, "percentage": 84.47, "elapsed_time": "0:50:09", "remaining_time": "0:09:13", "throughput": 2462.09, "total_tokens": 7409056}
117
- {"current_steps": 560, "total_steps": 657, "loss": 0.1048, "lr": 6.169714246900693e-06, "epoch": 2.547320410490308, "percentage": 85.24, "elapsed_time": "0:50:34", "remaining_time": "0:08:45", "throughput": 2463.64, "total_tokens": 7476000}
118
- {"current_steps": 565, "total_steps": 657, "loss": 0.1053, "lr": 5.561821446841431e-06, "epoch": 2.570125427594071, "percentage": 86.0, "elapsed_time": "0:50:59", "remaining_time": "0:08:18", "throughput": 2465.22, "total_tokens": 7542688}
119
- {"current_steps": 570, "total_steps": 657, "loss": 0.1046, "lr": 4.983686005830407e-06, "epoch": 2.5929304446978336, "percentage": 86.76, "elapsed_time": "0:51:25", "remaining_time": "0:07:50", "throughput": 2466.66, "total_tokens": 7609888}
120
- {"current_steps": 575, "total_steps": 657, "loss": 0.1067, "lr": 4.435695063585221e-06, "epoch": 2.6157354618015964, "percentage": 87.52, "elapsed_time": "0:51:50", "remaining_time": "0:07:23", "throughput": 2468.09, "total_tokens": 7677088}
121
- {"current_steps": 580, "total_steps": 657, "loss": 0.1064, "lr": 3.918215574012501e-06, "epoch": 2.6385404789053593, "percentage": 88.28, "elapsed_time": "0:52:15", "remaining_time": "0:06:56", "throughput": 2469.53, "total_tokens": 7744160}
122
- {"current_steps": 585, "total_steps": 657, "loss": 0.1052, "lr": 3.4315940594827233e-06, "epoch": 2.661345496009122, "percentage": 89.04, "elapsed_time": "0:52:41", "remaining_time": "0:06:29", "throughput": 2470.96, "total_tokens": 7811104}
123
- {"current_steps": 590, "total_steps": 657, "loss": 0.107, "lr": 2.9761563787866708e-06, "epoch": 2.684150513112885, "percentage": 89.8, "elapsed_time": "0:53:06", "remaining_time": "0:06:01", "throughput": 2472.39, "total_tokens": 7877920}
124
- {"current_steps": 595, "total_steps": 657, "loss": 0.1077, "lr": 2.5522075089290275e-06, "epoch": 2.7069555302166477, "percentage": 90.56, "elapsed_time": "0:53:31", "remaining_time": "0:05:34", "throughput": 2473.68, "total_tokens": 7945376}
125
- {"current_steps": 600, "total_steps": 657, "loss": 0.1024, "lr": 2.1600313409050833e-06, "epoch": 2.7297605473204105, "percentage": 91.32, "elapsed_time": "0:53:57", "remaining_time": "0:05:07", "throughput": 2475.09, "total_tokens": 8012064}
126
- {"current_steps": 600, "total_steps": 657, "eval_loss": 0.1033177301287651, "epoch": 2.7297605473204105, "percentage": 91.32, "elapsed_time": "0:54:33", "remaining_time": "0:05:10", "throughput": 2447.92, "total_tokens": 8012064}
127
- {"current_steps": 605, "total_steps": 657, "loss": 0.0994, "lr": 1.7998904895974056e-06, "epoch": 2.7525655644241733, "percentage": 92.09, "elapsed_time": "0:55:04", "remaining_time": "0:04:44", "throughput": 2444.75, "total_tokens": 8078752}
128
- {"current_steps": 610, "total_steps": 657, "loss": 0.1015, "lr": 1.4720261179197114e-06, "epoch": 2.775370581527936, "percentage": 92.85, "elapsed_time": "0:55:29", "remaining_time": "0:04:16", "throughput": 2446.28, "total_tokens": 8145824}
129
- {"current_steps": 615, "total_steps": 657, "loss": 0.1041, "lr": 1.1766577753257512e-06, "epoch": 2.798175598631699, "percentage": 93.61, "elapsed_time": "0:55:55", "remaining_time": "0:03:49", "throughput": 2447.79, "total_tokens": 8212768}
130
- {"current_steps": 620, "total_steps": 657, "loss": 0.1019, "lr": 9.139832507913171e-07, "epoch": 2.8209806157354618, "percentage": 94.37, "elapsed_time": "0:56:20", "remaining_time": "0:03:21", "throughput": 2449.24, "total_tokens": 8279968}
131
- {"current_steps": 625, "total_steps": 657, "loss": 0.1028, "lr": 6.841784403678275e-07, "epoch": 2.8437856328392246, "percentage": 95.13, "elapsed_time": "0:56:45", "remaining_time": "0:02:54", "throughput": 2450.7, "total_tokens": 8347040}
132
- {"current_steps": 630, "total_steps": 657, "loss": 0.1008, "lr": 4.873972293961581e-07, "epoch": 2.8665906499429874, "percentage": 95.89, "elapsed_time": "0:57:11", "remaining_time": "0:02:27", "throughput": 2452.18, "total_tokens": 8413856}
133
- {"current_steps": 635, "total_steps": 657, "loss": 0.0998, "lr": 3.2377138945964836e-07, "epoch": 2.88939566704675, "percentage": 96.65, "elapsed_time": "0:57:36", "remaining_time": "0:01:59", "throughput": 2453.61, "total_tokens": 8480800}
134
- {"current_steps": 640, "total_steps": 657, "loss": 0.0954, "lr": 1.934104901452405e-07, "epoch": 2.912200684150513, "percentage": 97.41, "elapsed_time": "0:58:01", "remaining_time": "0:01:32", "throughput": 2455.03, "total_tokens": 8547744}
135
- {"current_steps": 645, "total_steps": 657, "loss": 0.0994, "lr": 9.640182567185463e-08, "epoch": 2.935005701254276, "percentage": 98.17, "elapsed_time": "0:58:27", "remaining_time": "0:01:05", "throughput": 2456.39, "total_tokens": 8614816}
136
- {"current_steps": 650, "total_steps": 657, "loss": 0.1057, "lr": 3.281035643511454e-08, "epoch": 2.9578107183580387, "percentage": 98.93, "elapsed_time": "0:58:52", "remaining_time": "0:00:38", "throughput": 2457.82, "total_tokens": 8681504}
137
- {"current_steps": 655, "total_steps": 657, "loss": 0.105, "lr": 2.678665507588329e-09, "epoch": 2.9806157354618015, "percentage": 99.7, "elapsed_time": "0:59:17", "remaining_time": "0:00:10", "throughput": 2459.2, "total_tokens": 8748320}
138
- {"current_steps": 657, "total_steps": 657, "epoch": 2.9897377423033067, "percentage": 100.0, "elapsed_time": "0:59:31", "remaining_time": "0:00:00", "throughput": 2457.29, "total_tokens": 8775072}
 
1
+ {"current_steps": 5, "total_steps": 657, "loss": 13.3488, "lr": 1e-05, "epoch": 0.02280501710376283, "percentage": 0.76, "elapsed_time": "0:00:24", "remaining_time": "0:52:10", "throughput": 2586.26, "total_tokens": 62080}
2
+ {"current_steps": 10, "total_steps": 657, "loss": 10.56, "lr": 2e-05, "epoch": 0.04561003420752566, "percentage": 1.52, "elapsed_time": "0:00:47", "remaining_time": "0:51:26", "throughput": 2613.82, "total_tokens": 124672}
3
+ {"current_steps": 15, "total_steps": 657, "loss": 5.1785, "lr": 3e-05, "epoch": 0.06841505131128849, "percentage": 2.28, "elapsed_time": "0:01:10", "remaining_time": "0:50:32", "throughput": 2615.88, "total_tokens": 185344}
4
+ {"current_steps": 20, "total_steps": 657, "loss": 2.2593, "lr": 4e-05, "epoch": 0.09122006841505131, "percentage": 3.04, "elapsed_time": "0:01:34", "remaining_time": "0:50:01", "throughput": 2620.07, "total_tokens": 246912}
5
+ {"current_steps": 25, "total_steps": 657, "loss": 0.8861, "lr": 5e-05, "epoch": 0.11402508551881414, "percentage": 3.81, "elapsed_time": "0:01:57", "remaining_time": "0:49:26", "throughput": 2622.49, "total_tokens": 307712}
6
+ {"current_steps": 30, "total_steps": 657, "loss": 0.6509, "lr": 6e-05, "epoch": 0.13683010262257697, "percentage": 4.57, "elapsed_time": "0:02:20", "remaining_time": "0:49:03", "throughput": 2623.48, "total_tokens": 369536}
7
+ {"current_steps": 35, "total_steps": 657, "loss": 0.5623, "lr": 7e-05, "epoch": 0.15963511972633979, "percentage": 5.33, "elapsed_time": "0:02:44", "remaining_time": "0:48:38", "throughput": 2624.78, "total_tokens": 431104}
8
+ {"current_steps": 40, "total_steps": 657, "loss": 0.5356, "lr": 8e-05, "epoch": 0.18244013683010263, "percentage": 6.09, "elapsed_time": "0:03:07", "remaining_time": "0:48:13", "throughput": 2626.7, "total_tokens": 492672}
9
+ {"current_steps": 45, "total_steps": 657, "loss": 0.7113, "lr": 9e-05, "epoch": 0.20524515393386544, "percentage": 6.85, "elapsed_time": "0:03:30", "remaining_time": "0:47:48", "throughput": 2628.5, "total_tokens": 554368}
10
+ {"current_steps": 50, "total_steps": 657, "loss": 0.5074, "lr": 0.0001, "epoch": 0.22805017103762829, "percentage": 7.61, "elapsed_time": "0:03:54", "remaining_time": "0:47:24", "throughput": 2627.88, "total_tokens": 615808}
11
+ {"current_steps": 55, "total_steps": 657, "loss": 0.5147, "lr": 9.998325912536413e-05, "epoch": 0.2508551881413911, "percentage": 8.37, "elapsed_time": "0:04:18", "remaining_time": "0:47:04", "throughput": 2627.9, "total_tokens": 678144}
12
+ {"current_steps": 60, "total_steps": 657, "loss": 0.5046, "lr": 9.99330477117318e-05, "epoch": 0.27366020524515394, "percentage": 9.13, "elapsed_time": "0:04:41", "remaining_time": "0:46:42", "throughput": 2627.29, "total_tokens": 739968}
13
+ {"current_steps": 65, "total_steps": 657, "loss": 0.5062, "lr": 9.98493993824223e-05, "epoch": 0.29646522234891676, "percentage": 9.89, "elapsed_time": "0:05:05", "remaining_time": "0:46:18", "throughput": 2627.81, "total_tokens": 801792}
14
+ {"current_steps": 70, "total_steps": 657, "loss": 0.4955, "lr": 9.973237015128338e-05, "epoch": 0.31927023945267957, "percentage": 10.65, "elapsed_time": "0:05:28", "remaining_time": "0:45:56", "throughput": 2628.09, "total_tokens": 863744}
15
+ {"current_steps": 75, "total_steps": 657, "loss": 0.4799, "lr": 9.958203838518255e-05, "epoch": 0.34207525655644244, "percentage": 11.42, "elapsed_time": "0:05:51", "remaining_time": "0:45:29", "throughput": 2627.71, "total_tokens": 924160}
16
+ {"current_steps": 80, "total_steps": 657, "loss": 0.4767, "lr": 9.939850475152978e-05, "epoch": 0.36488027366020526, "percentage": 12.18, "elapsed_time": "0:06:15", "remaining_time": "0:45:05", "throughput": 2628.28, "total_tokens": 985856}
17
+ {"current_steps": 85, "total_steps": 657, "loss": 0.48, "lr": 9.918189215086719e-05, "epoch": 0.38768529076396807, "percentage": 12.94, "elapsed_time": "0:06:38", "remaining_time": "0:44:42", "throughput": 2628.46, "total_tokens": 1047680}
18
+ {"current_steps": 90, "total_steps": 657, "loss": 0.4828, "lr": 9.893234563457049e-05, "epoch": 0.4104903078677309, "percentage": 13.7, "elapsed_time": "0:07:01", "remaining_time": "0:44:16", "throughput": 2628.49, "total_tokens": 1108352}
19
+ {"current_steps": 95, "total_steps": 657, "loss": 0.4636, "lr": 9.865003230771745e-05, "epoch": 0.43329532497149376, "percentage": 14.46, "elapsed_time": "0:07:24", "remaining_time": "0:43:51", "throughput": 2628.95, "total_tokens": 1169536}
20
+ {"current_steps": 100, "total_steps": 657, "loss": 0.4645, "lr": 9.83351412171886e-05, "epoch": 0.45610034207525657, "percentage": 15.22, "elapsed_time": "0:07:47", "remaining_time": "0:43:26", "throughput": 2628.3, "total_tokens": 1229824}
21
+ {"current_steps": 100, "total_steps": 657, "eval_loss": 0.4702179729938507, "epoch": 0.45610034207525657, "percentage": 15.22, "elapsed_time": "0:08:21", "remaining_time": "0:46:30", "throughput": 2454.68, "total_tokens": 1229824}
22
+ {"current_steps": 105, "total_steps": 657, "loss": 0.4831, "lr": 9.798788322507475e-05, "epoch": 0.4789053591790194, "percentage": 15.98, "elapsed_time": "0:08:51", "remaining_time": "0:46:34", "throughput": 2429.71, "total_tokens": 1291648}
23
+ {"current_steps": 110, "total_steps": 657, "loss": 0.4868, "lr": 9.76084908674764e-05, "epoch": 0.5017103762827823, "percentage": 16.74, "elapsed_time": "0:09:14", "remaining_time": "0:45:59", "throughput": 2438.31, "total_tokens": 1353216}
24
+ {"current_steps": 115, "total_steps": 657, "loss": 0.4706, "lr": 9.719721819878942e-05, "epoch": 0.5245153933865451, "percentage": 17.5, "elapsed_time": "0:09:38", "remaining_time": "0:45:25", "throughput": 2445.88, "total_tokens": 1414272}
25
+ {"current_steps": 120, "total_steps": 657, "loss": 0.4744, "lr": 9.67543406215813e-05, "epoch": 0.5473204104903079, "percentage": 18.26, "elapsed_time": "0:10:01", "remaining_time": "0:44:52", "throughput": 2452.92, "total_tokens": 1476096}
26
+ {"current_steps": 125, "total_steps": 657, "loss": 0.4655, "lr": 9.6280154702172e-05, "epoch": 0.5701254275940707, "percentage": 19.03, "elapsed_time": "0:10:24", "remaining_time": "0:44:19", "throughput": 2458.99, "total_tokens": 1536384}
27
+ {"current_steps": 130, "total_steps": 657, "loss": 0.4762, "lr": 9.577497797204275e-05, "epoch": 0.5929304446978335, "percentage": 19.79, "elapsed_time": "0:10:48", "remaining_time": "0:43:47", "throughput": 2465.17, "total_tokens": 1597952}
28
+ {"current_steps": 135, "total_steps": 657, "loss": 0.4607, "lr": 9.523914871520592e-05, "epoch": 0.6157354618015963, "percentage": 20.55, "elapsed_time": "0:11:11", "remaining_time": "0:43:15", "throughput": 2471.24, "total_tokens": 1659008}
29
+ {"current_steps": 140, "total_steps": 657, "loss": 0.4584, "lr": 9.467302574167804e-05, "epoch": 0.6385404789053591, "percentage": 21.31, "elapsed_time": "0:11:34", "remaining_time": "0:42:44", "throughput": 2476.2, "total_tokens": 1719680}
30
+ {"current_steps": 145, "total_steps": 657, "loss": 0.452, "lr": 9.407698814720829e-05, "epoch": 0.661345496009122, "percentage": 22.07, "elapsed_time": "0:11:57", "remaining_time": "0:42:14", "throughput": 2481.78, "total_tokens": 1781376}
31
+ {"current_steps": 150, "total_steps": 657, "loss": 0.4732, "lr": 9.345143505942254e-05, "epoch": 0.6841505131128849, "percentage": 22.83, "elapsed_time": "0:12:21", "remaining_time": "0:41:45", "throughput": 2486.44, "total_tokens": 1843456}
32
+ {"current_steps": 155, "total_steps": 657, "loss": 0.4627, "lr": 9.279678537055364e-05, "epoch": 0.7069555302166477, "percentage": 23.59, "elapsed_time": "0:12:44", "remaining_time": "0:41:16", "throughput": 2490.82, "total_tokens": 1904640}
33
+ {"current_steps": 160, "total_steps": 657, "loss": 0.4568, "lr": 9.211347745693642e-05, "epoch": 0.7297605473204105, "percentage": 24.35, "elapsed_time": "0:13:07", "remaining_time": "0:40:47", "throughput": 2494.6, "total_tokens": 1965312}
34
+ {"current_steps": 165, "total_steps": 657, "loss": 0.4602, "lr": 9.140196888545571e-05, "epoch": 0.7525655644241733, "percentage": 25.11, "elapsed_time": "0:13:31", "remaining_time": "0:40:18", "throughput": 2498.63, "total_tokens": 2026624}
35
+ {"current_steps": 170, "total_steps": 657, "loss": 0.4524, "lr": 9.066273610714337e-05, "epoch": 0.7753705815279361, "percentage": 25.88, "elapsed_time": "0:13:54", "remaining_time": "0:39:49", "throughput": 2502.2, "total_tokens": 2087552}
36
+ {"current_steps": 175, "total_steps": 657, "loss": 0.4593, "lr": 8.989627413813019e-05, "epoch": 0.798175598631699, "percentage": 26.64, "elapsed_time": "0:14:17", "remaining_time": "0:39:22", "throughput": 2505.61, "total_tokens": 2148864}
37
+ {"current_steps": 180, "total_steps": 657, "loss": 0.451, "lr": 8.910309622816558e-05, "epoch": 0.8209806157354618, "percentage": 27.4, "elapsed_time": "0:14:40", "remaining_time": "0:38:54", "throughput": 2509.09, "total_tokens": 2210048}
38
+ {"current_steps": 185, "total_steps": 657, "loss": 0.4539, "lr": 8.828373351692773e-05, "epoch": 0.8437856328392246, "percentage": 28.16, "elapsed_time": "0:15:04", "remaining_time": "0:38:27", "throughput": 2512.36, "total_tokens": 2272000}
39
+ {"current_steps": 190, "total_steps": 657, "loss": 0.4519, "lr": 8.743873467835388e-05, "epoch": 0.8665906499429875, "percentage": 28.92, "elapsed_time": "0:15:27", "remaining_time": "0:38:00", "throughput": 2515.35, "total_tokens": 2333568}
40
+ {"current_steps": 195, "total_steps": 657, "loss": 0.4446, "lr": 8.656866555322896e-05, "epoch": 0.8893956670467503, "percentage": 29.68, "elapsed_time": "0:15:51", "remaining_time": "0:37:33", "throughput": 2518.39, "total_tokens": 2395776}
41
+ {"current_steps": 200, "total_steps": 657, "loss": 0.4295, "lr": 8.567410877027891e-05, "epoch": 0.9122006841505131, "percentage": 30.44, "elapsed_time": "0:16:14", "remaining_time": "0:37:07", "throughput": 2521.09, "total_tokens": 2457344}
42
+ {"current_steps": 200, "total_steps": 657, "eval_loss": 0.4231548607349396, "epoch": 0.9122006841505131, "percentage": 30.44, "elapsed_time": "0:16:47", "remaining_time": "0:38:22", "throughput": 2438.25, "total_tokens": 2457344}
43
+ {"current_steps": 205, "total_steps": 657, "loss": 0.4208, "lr": 8.475566335602205e-05, "epoch": 0.935005701254276, "percentage": 31.2, "elapsed_time": "0:17:14", "remaining_time": "0:38:01", "throughput": 2433.93, "total_tokens": 2518528}
44
+ {"current_steps": 210, "total_steps": 657, "loss": 0.4246, "lr": 8.381394433364e-05, "epoch": 0.9578107183580388, "percentage": 31.96, "elapsed_time": "0:17:38", "remaining_time": "0:37:32", "throughput": 2438.28, "total_tokens": 2580096}
45
+ {"current_steps": 215, "total_steps": 657, "loss": 0.4156, "lr": 8.284958231113656e-05, "epoch": 0.9806157354618016, "percentage": 32.72, "elapsed_time": "0:18:01", "remaining_time": "0:37:03", "throughput": 2442.72, "total_tokens": 2642176}
46
+ {"current_steps": 220, "total_steps": 657, "loss": 0.4234, "lr": 8.186322305906066e-05, "epoch": 1.0, "percentage": 33.49, "elapsed_time": "0:18:21", "remaining_time": "0:36:28", "throughput": 2446.08, "total_tokens": 2694512}
47
+ {"current_steps": 225, "total_steps": 657, "loss": 0.3979, "lr": 8.085552707807567e-05, "epoch": 1.0228050171037628, "percentage": 34.25, "elapsed_time": "0:18:45", "remaining_time": "0:36:00", "throughput": 2449.95, "total_tokens": 2756208}
48
+ {"current_steps": 230, "total_steps": 657, "loss": 0.406, "lr": 7.982716915666515e-05, "epoch": 1.0456100342075256, "percentage": 35.01, "elapsed_time": "0:19:08", "remaining_time": "0:35:31", "throughput": 2453.62, "total_tokens": 2817520}
49
+ {"current_steps": 235, "total_steps": 657, "loss": 0.3999, "lr": 7.877883791927082e-05, "epoch": 1.0684150513112884, "percentage": 35.77, "elapsed_time": "0:19:31", "remaining_time": "0:35:04", "throughput": 2457.08, "total_tokens": 2879216}
50
+ {"current_steps": 240, "total_steps": 657, "loss": 0.394, "lr": 7.771123536516558e-05, "epoch": 1.0912200684150513, "percentage": 36.53, "elapsed_time": "0:19:55", "remaining_time": "0:34:36", "throughput": 2460.5, "total_tokens": 2940528}
51
+ {"current_steps": 245, "total_steps": 657, "loss": 0.4077, "lr": 7.662507639837018e-05, "epoch": 1.114025085518814, "percentage": 37.29, "elapsed_time": "0:20:18", "remaining_time": "0:34:09", "throughput": 2463.81, "total_tokens": 3002096}
52
+ {"current_steps": 250, "total_steps": 657, "loss": 0.384, "lr": 7.552108834892857e-05, "epoch": 1.1368301026225769, "percentage": 38.05, "elapsed_time": "0:20:41", "remaining_time": "0:33:41", "throughput": 2466.97, "total_tokens": 3063408}
53
+ {"current_steps": 255, "total_steps": 657, "loss": 0.3823, "lr": 7.440001048586209e-05, "epoch": 1.1596351197263397, "percentage": 38.81, "elapsed_time": "0:21:05", "remaining_time": "0:33:14", "throughput": 2469.96, "total_tokens": 3124848}
54
+ {"current_steps": 260, "total_steps": 657, "loss": 0.3829, "lr": 7.32625935221293e-05, "epoch": 1.1824401368301025, "percentage": 39.57, "elapsed_time": "0:21:28", "remaining_time": "0:32:47", "throughput": 2473.04, "total_tokens": 3186544}
55
+ {"current_steps": 265, "total_steps": 657, "loss": 0.384, "lr": 7.210959911192214e-05, "epoch": 1.2052451539338653, "percentage": 40.33, "elapsed_time": "0:21:51", "remaining_time": "0:32:20", "throughput": 2475.9, "total_tokens": 3247856}
56
+ {"current_steps": 270, "total_steps": 657, "loss": 0.3824, "lr": 7.094179934063567e-05, "epoch": 1.2280501710376284, "percentage": 41.1, "elapsed_time": "0:22:15", "remaining_time": "0:31:54", "throughput": 2478.83, "total_tokens": 3310320}
57
+ {"current_steps": 275, "total_steps": 657, "loss": 0.3755, "lr": 6.975997620785276e-05, "epoch": 1.2508551881413912, "percentage": 41.86, "elapsed_time": "0:22:38", "remaining_time": "0:31:27", "throughput": 2481.35, "total_tokens": 3371504}
58
+ {"current_steps": 280, "total_steps": 657, "loss": 0.3762, "lr": 6.856492110368969e-05, "epoch": 1.273660205245154, "percentage": 42.62, "elapsed_time": "0:23:02", "remaining_time": "0:31:01", "throughput": 2484.09, "total_tokens": 3434096}
59
+ {"current_steps": 285, "total_steps": 657, "loss": 0.3673, "lr": 6.735743427885375e-05, "epoch": 1.2964652223489168, "percentage": 43.38, "elapsed_time": "0:23:25", "remaining_time": "0:30:34", "throughput": 2486.41, "total_tokens": 3495280}
60
+ {"current_steps": 290, "total_steps": 657, "loss": 0.3779, "lr": 6.613832430876727e-05, "epoch": 1.3192702394526796, "percentage": 44.14, "elapsed_time": "0:23:49", "remaining_time": "0:30:08", "throughput": 2488.83, "total_tokens": 3557616}
61
+ {"current_steps": 295, "total_steps": 657, "loss": 0.3661, "lr": 6.490840755211736e-05, "epoch": 1.3420752565564424, "percentage": 44.9, "elapsed_time": "0:24:12", "remaining_time": "0:29:42", "throughput": 2490.76, "total_tokens": 3617904}
62
+ {"current_steps": 300, "total_steps": 657, "loss": 0.3545, "lr": 6.366850760419341e-05, "epoch": 1.3648802736602053, "percentage": 45.66, "elapsed_time": "0:24:35", "remaining_time": "0:29:16", "throughput": 2493.06, "total_tokens": 3679728}
63
+ {"current_steps": 300, "total_steps": 657, "eval_loss": 0.35477936267852783, "epoch": 1.3648802736602053, "percentage": 45.66, "elapsed_time": "0:25:09", "remaining_time": "0:29:55", "throughput": 2438.28, "total_tokens": 3679728}
64
+ {"current_steps": 305, "total_steps": 657, "loss": 0.3728, "lr": 6.241945474537901e-05, "epoch": 1.387685290763968, "percentage": 46.42, "elapsed_time": "0:25:36", "remaining_time": "0:29:33", "throughput": 2434.46, "total_tokens": 3740528}
65
+ {"current_steps": 310, "total_steps": 657, "loss": 0.3553, "lr": 6.116208538516707e-05, "epoch": 1.4104903078677309, "percentage": 47.18, "elapsed_time": "0:26:00", "remaining_time": "0:29:06", "throughput": 2437.37, "total_tokens": 3802992}
66
+ {"current_steps": 315, "total_steps": 657, "loss": 0.3474, "lr": 5.98972415020708e-05, "epoch": 1.4332953249714937, "percentage": 47.95, "elapsed_time": "0:26:23", "remaining_time": "0:28:39", "throughput": 2440.17, "total_tokens": 3863792}
67
+ {"current_steps": 320, "total_steps": 657, "loss": 0.3493, "lr": 5.862577007980544e-05, "epoch": 1.4561003420752565, "percentage": 48.71, "elapsed_time": "0:26:47", "remaining_time": "0:28:12", "throughput": 2442.71, "total_tokens": 3925616}
68
+ {"current_steps": 325, "total_steps": 657, "loss": 0.3488, "lr": 5.734852254011833e-05, "epoch": 1.4789053591790193, "percentage": 49.47, "elapsed_time": "0:27:10", "remaining_time": "0:27:45", "throughput": 2445.3, "total_tokens": 3986416}
69
+ {"current_steps": 330, "total_steps": 657, "loss": 0.3262, "lr": 5.60663541726471e-05, "epoch": 1.5017103762827824, "percentage": 50.23, "elapsed_time": "0:27:33", "remaining_time": "0:27:18", "throughput": 2447.66, "total_tokens": 4046704}
70
+ {"current_steps": 335, "total_steps": 657, "loss": 0.325, "lr": 5.478012356218779e-05, "epoch": 1.5245153933865452, "percentage": 50.99, "elapsed_time": "0:27:56", "remaining_time": "0:26:51", "throughput": 2450.1, "total_tokens": 4107632}
71
+ {"current_steps": 340, "total_steps": 657, "loss": 0.3249, "lr": 5.349069201375657e-05, "epoch": 1.547320410490308, "percentage": 51.75, "elapsed_time": "0:28:19", "remaining_time": "0:26:24", "throughput": 2452.57, "total_tokens": 4168816}
72
+ {"current_steps": 345, "total_steps": 657, "loss": 0.3295, "lr": 5.2198922975829544e-05, "epoch": 1.5701254275940708, "percentage": 52.51, "elapsed_time": "0:28:43", "remaining_time": "0:25:58", "throughput": 2455.14, "total_tokens": 4230512}
73
+ {"current_steps": 350, "total_steps": 657, "loss": 0.3126, "lr": 5.090568146214764e-05, "epoch": 1.5929304446978336, "percentage": 53.27, "elapsed_time": "0:29:06", "remaining_time": "0:25:31", "throughput": 2457.58, "total_tokens": 4291952}
74
+ {"current_steps": 355, "total_steps": 657, "loss": 0.3301, "lr": 4.961183347247301e-05, "epoch": 1.6157354618015964, "percentage": 54.03, "elapsed_time": "0:29:30", "remaining_time": "0:25:05", "throughput": 2459.84, "total_tokens": 4354032}
75
+ {"current_steps": 360, "total_steps": 657, "loss": 0.3261, "lr": 4.831824541268537e-05, "epoch": 1.6385404789053593, "percentage": 54.79, "elapsed_time": "0:29:53", "remaining_time": "0:24:39", "throughput": 2461.98, "total_tokens": 4415344}
76
+ {"current_steps": 365, "total_steps": 657, "loss": 0.3195, "lr": 4.702578351460633e-05, "epoch": 1.661345496009122, "percentage": 55.56, "elapsed_time": "0:30:16", "remaining_time": "0:24:13", "throughput": 2464.06, "total_tokens": 4477040}
77
+ {"current_steps": 370, "total_steps": 657, "loss": 0.3249, "lr": 4.573531325594017e-05, "epoch": 1.6841505131128849, "percentage": 56.32, "elapsed_time": "0:30:40", "remaining_time": "0:23:47", "throughput": 2466.33, "total_tokens": 4539120}
78
+ {"current_steps": 375, "total_steps": 657, "loss": 0.3053, "lr": 4.444769878071977e-05, "epoch": 1.7069555302166477, "percentage": 57.08, "elapsed_time": "0:31:03", "remaining_time": "0:23:21", "throughput": 2468.4, "total_tokens": 4600304}
79
+ {"current_steps": 380, "total_steps": 657, "loss": 0.3168, "lr": 4.316380232064543e-05, "epoch": 1.7297605473204105, "percentage": 57.84, "elapsed_time": "0:31:27", "remaining_time": "0:22:55", "throughput": 2470.42, "total_tokens": 4661872}
80
+ {"current_steps": 385, "total_steps": 657, "loss": 0.312, "lr": 4.188448361770458e-05, "epoch": 1.7525655644241733, "percentage": 58.6, "elapsed_time": "0:31:50", "remaining_time": "0:22:29", "throughput": 2472.49, "total_tokens": 4723440}
81
+ {"current_steps": 390, "total_steps": 657, "loss": 0.3172, "lr": 4.061059934845818e-05, "epoch": 1.7753705815279361, "percentage": 59.36, "elapsed_time": "0:32:13", "remaining_time": "0:22:03", "throughput": 2474.49, "total_tokens": 4784880}
82
+ {"current_steps": 395, "total_steps": 657, "loss": 0.3087, "lr": 3.93430025503803e-05, "epoch": 1.798175598631699, "percentage": 60.12, "elapsed_time": "0:32:36", "remaining_time": "0:21:38", "throughput": 2476.33, "total_tokens": 4846064}
83
+ {"current_steps": 400, "total_steps": 657, "loss": 0.3391, "lr": 3.8082542050634405e-05, "epoch": 1.8209806157354618, "percentage": 60.88, "elapsed_time": "0:33:00", "remaining_time": "0:21:12", "throughput": 2478.26, "total_tokens": 4908144}
84
+ {"current_steps": 400, "total_steps": 657, "eval_loss": 0.31536775827407837, "epoch": 1.8209806157354618, "percentage": 60.88, "elapsed_time": "0:33:33", "remaining_time": "0:21:33", "throughput": 2437.43, "total_tokens": 4908144}
85
+ {"current_steps": 405, "total_steps": 657, "loss": 0.3113, "lr": 3.6830061897668866e-05, "epoch": 1.8437856328392246, "percentage": 61.64, "elapsed_time": "0:34:00", "remaining_time": "0:21:09", "throughput": 2435.34, "total_tokens": 4969072}
86
+ {"current_steps": 410, "total_steps": 657, "loss": 0.3127, "lr": 3.558640079601265e-05, "epoch": 1.8665906499429874, "percentage": 62.4, "elapsed_time": "0:34:23", "remaining_time": "0:20:43", "throughput": 2437.6, "total_tokens": 5031152}
87
+ {"current_steps": 415, "total_steps": 657, "loss": 0.3087, "lr": 3.435239154464947e-05, "epoch": 1.8893956670467502, "percentage": 63.17, "elapsed_time": "0:34:47", "remaining_time": "0:20:17", "throughput": 2439.69, "total_tokens": 5092848}
88
+ {"current_steps": 420, "total_steps": 657, "loss": 0.2942, "lr": 3.312886047934639e-05, "epoch": 1.912200684150513, "percentage": 63.93, "elapsed_time": "0:35:10", "remaining_time": "0:19:51", "throughput": 2441.82, "total_tokens": 5154032}
89
+ {"current_steps": 425, "total_steps": 657, "loss": 0.3128, "lr": 3.191662691931051e-05, "epoch": 1.9350057012542758, "percentage": 64.69, "elapsed_time": "0:35:34", "remaining_time": "0:19:25", "throughput": 2443.94, "total_tokens": 5216112}
90
+ {"current_steps": 430, "total_steps": 657, "loss": 0.312, "lr": 3.071650261854414e-05, "epoch": 1.9578107183580387, "percentage": 65.45, "elapsed_time": "0:35:57", "remaining_time": "0:18:59", "throughput": 2446.05, "total_tokens": 5278320}
91
+ {"current_steps": 435, "total_steps": 657, "loss": 0.288, "lr": 2.9529291222265922e-05, "epoch": 1.9806157354618015, "percentage": 66.21, "elapsed_time": "0:36:21", "remaining_time": "0:18:33", "throughput": 2448.02, "total_tokens": 5339504}
92
+ {"current_steps": 440, "total_steps": 657, "loss": 0.2862, "lr": 2.8355787728761952e-05, "epoch": 2.0, "percentage": 66.97, "elapsed_time": "0:36:41", "remaining_time": "0:18:05", "throughput": 2449.57, "total_tokens": 5392000}
93
+ {"current_steps": 445, "total_steps": 657, "loss": 0.2784, "lr": 2.7196777957027013e-05, "epoch": 2.022805017103763, "percentage": 67.73, "elapsed_time": "0:37:04", "remaining_time": "0:17:39", "throughput": 2451.54, "total_tokens": 5453568}
94
+ {"current_steps": 450, "total_steps": 657, "loss": 0.279, "lr": 2.6053038020552685e-05, "epoch": 2.0456100342075256, "percentage": 68.49, "elapsed_time": "0:37:27", "remaining_time": "0:17:14", "throughput": 2453.23, "total_tokens": 5514496}
95
+ {"current_steps": 455, "total_steps": 657, "loss": 0.2801, "lr": 2.492533380761466e-05, "epoch": 2.0684150513112884, "percentage": 69.25, "elapsed_time": "0:37:51", "remaining_time": "0:16:48", "throughput": 2455.14, "total_tokens": 5576192}
96
+ {"current_steps": 460, "total_steps": 657, "loss": 0.2691, "lr": 2.3814420468407195e-05, "epoch": 2.0912200684150513, "percentage": 70.02, "elapsed_time": "0:38:14", "remaining_time": "0:16:22", "throughput": 2456.78, "total_tokens": 5638016}
97
+ {"current_steps": 465, "total_steps": 657, "loss": 0.2716, "lr": 2.2721041909367986e-05, "epoch": 2.114025085518814, "percentage": 70.78, "elapsed_time": "0:38:38", "remaining_time": "0:15:57", "throughput": 2458.38, "total_tokens": 5699456}
98
+ {"current_steps": 470, "total_steps": 657, "loss": 0.2785, "lr": 2.164593029503249e-05, "epoch": 2.136830102622577, "percentage": 71.54, "elapsed_time": "0:39:01", "remaining_time": "0:15:31", "throughput": 2460.03, "total_tokens": 5761280}
99
+ {"current_steps": 475, "total_steps": 657, "loss": 0.2692, "lr": 2.0589805557750912e-05, "epoch": 2.1596351197263397, "percentage": 72.3, "elapsed_time": "0:39:25", "remaining_time": "0:15:06", "throughput": 2461.65, "total_tokens": 5823232}
100
+ {"current_steps": 480, "total_steps": 657, "loss": 0.2707, "lr": 1.9553374915596328e-05, "epoch": 2.1824401368301025, "percentage": 73.06, "elapsed_time": "0:39:49", "remaining_time": "0:14:41", "throughput": 2463.29, "total_tokens": 5885184}
101
+ {"current_steps": 485, "total_steps": 657, "loss": 0.2618, "lr": 1.853733239878669e-05, "epoch": 2.2052451539338653, "percentage": 73.82, "elapsed_time": "0:40:12", "remaining_time": "0:14:15", "throughput": 2464.74, "total_tokens": 5946624}
102
+ {"current_steps": 490, "total_steps": 657, "loss": 0.2609, "lr": 1.754235838493795e-05, "epoch": 2.228050171037628, "percentage": 74.58, "elapsed_time": "0:40:36", "remaining_time": "0:13:50", "throughput": 2466.27, "total_tokens": 6007936}
103
+ {"current_steps": 495, "total_steps": 657, "loss": 0.2689, "lr": 1.6569119143459387e-05, "epoch": 2.250855188141391, "percentage": 75.34, "elapsed_time": "0:40:59", "remaining_time": "0:13:24", "throughput": 2467.8, "total_tokens": 6069248}
104
+ {"current_steps": 500, "total_steps": 657, "loss": 0.2775, "lr": 1.561826638939628e-05, "epoch": 2.2736602052451538, "percentage": 76.1, "elapsed_time": "0:41:22", "remaining_time": "0:12:59", "throughput": 2469.32, "total_tokens": 6131072}
105
+ {"current_steps": 500, "total_steps": 657, "eval_loss": 0.27918943762779236, "epoch": 2.2736602052451538, "percentage": 76.1, "elapsed_time": "0:41:56", "remaining_time": "0:13:10", "throughput": 2436.7, "total_tokens": 6131072}
106
+ {"current_steps": 505, "total_steps": 657, "loss": 0.2593, "lr": 1.4690436847018757e-05, "epoch": 2.2964652223489166, "percentage": 76.86, "elapsed_time": "0:42:23", "remaining_time": "0:12:45", "throughput": 2434.87, "total_tokens": 6192640}
107
+ {"current_steps": 510, "total_steps": 657, "loss": 0.2593, "lr": 1.3786251823448909e-05, "epoch": 2.3192702394526794, "percentage": 77.63, "elapsed_time": "0:42:46", "remaining_time": "0:12:19", "throughput": 2436.66, "total_tokens": 6254592}
108
+ {"current_steps": 515, "total_steps": 657, "loss": 0.2585, "lr": 1.2906316792611828e-05, "epoch": 2.342075256556442, "percentage": 78.39, "elapsed_time": "0:43:10", "remaining_time": "0:11:54", "throughput": 2438.35, "total_tokens": 6316288}
109
+ {"current_steps": 520, "total_steps": 657, "loss": 0.2429, "lr": 1.2051220989789075e-05, "epoch": 2.364880273660205, "percentage": 79.15, "elapsed_time": "0:43:33", "remaining_time": "0:11:28", "throughput": 2440.0, "total_tokens": 6377600}
110
+ {"current_steps": 525, "total_steps": 657, "loss": 0.2446, "lr": 1.1221537017046101e-05, "epoch": 2.387685290763968, "percentage": 79.91, "elapsed_time": "0:43:57", "remaining_time": "0:11:03", "throughput": 2441.62, "total_tokens": 6439040}
111
+ {"current_steps": 530, "total_steps": 657, "loss": 0.262, "lr": 1.0417820459797939e-05, "epoch": 2.4104903078677307, "percentage": 80.67, "elapsed_time": "0:44:21", "remaining_time": "0:10:37", "throughput": 2443.2, "total_tokens": 6501376}
112
+ {"current_steps": 535, "total_steps": 657, "loss": 0.2604, "lr": 9.640609514769695e-06, "epoch": 2.433295324971494, "percentage": 81.43, "elapsed_time": "0:44:44", "remaining_time": "0:10:12", "throughput": 2444.8, "total_tokens": 6563840}
113
+ {"current_steps": 540, "total_steps": 657, "loss": 0.2524, "lr": 8.890424629601197e-06, "epoch": 2.4561003420752567, "percentage": 82.19, "elapsed_time": "0:45:08", "remaining_time": "0:09:46", "throughput": 2446.32, "total_tokens": 6624896}
114
+ {"current_steps": 545, "total_steps": 657, "loss": 0.2495, "lr": 8.167768154337102e-06, "epoch": 2.4789053591790196, "percentage": 82.95, "elapsed_time": "0:45:31", "remaining_time": "0:09:21", "throughput": 2447.71, "total_tokens": 6685184}
115
+ {"current_steps": 550, "total_steps": 657, "loss": 0.2391, "lr": 7.47312400503572e-06, "epoch": 2.5017103762827824, "percentage": 83.71, "elapsed_time": "0:45:54", "remaining_time": "0:08:55", "throughput": 2449.08, "total_tokens": 6744960}
116
+ {"current_steps": 555, "total_steps": 657, "loss": 0.2606, "lr": 6.806957339721837e-06, "epoch": 2.524515393386545, "percentage": 84.47, "elapsed_time": "0:46:17", "remaining_time": "0:08:30", "throughput": 2450.55, "total_tokens": 6805888}
117
+ {"current_steps": 560, "total_steps": 657, "loss": 0.2505, "lr": 6.169714246900693e-06, "epoch": 2.547320410490308, "percentage": 85.24, "elapsed_time": "0:46:40", "remaining_time": "0:08:05", "throughput": 2451.9, "total_tokens": 6866816}
118
+ {"current_steps": 565, "total_steps": 657, "loss": 0.2461, "lr": 5.561821446841431e-06, "epoch": 2.570125427594071, "percentage": 86.0, "elapsed_time": "0:47:03", "remaining_time": "0:07:39", "throughput": 2453.31, "total_tokens": 6928128}
119
+ {"current_steps": 570, "total_steps": 657, "loss": 0.2494, "lr": 4.983686005830407e-06, "epoch": 2.5929304446978336, "percentage": 86.76, "elapsed_time": "0:47:27", "remaining_time": "0:07:14", "throughput": 2454.74, "total_tokens": 6989056}
120
+ {"current_steps": 575, "total_steps": 657, "loss": 0.2528, "lr": 4.435695063585221e-06, "epoch": 2.6157354618015964, "percentage": 87.52, "elapsed_time": "0:47:50", "remaining_time": "0:06:49", "throughput": 2456.18, "total_tokens": 7050496}
121
+ {"current_steps": 580, "total_steps": 657, "loss": 0.2488, "lr": 3.918215574012501e-06, "epoch": 2.6385404789053593, "percentage": 88.28, "elapsed_time": "0:48:13", "remaining_time": "0:06:24", "throughput": 2457.55, "total_tokens": 7111680}
122
+ {"current_steps": 585, "total_steps": 657, "loss": 0.2556, "lr": 3.4315940594827233e-06, "epoch": 2.661345496009122, "percentage": 89.04, "elapsed_time": "0:48:37", "remaining_time": "0:05:59", "throughput": 2458.97, "total_tokens": 7173632}
123
+ {"current_steps": 590, "total_steps": 657, "loss": 0.2501, "lr": 2.9761563787866708e-06, "epoch": 2.684150513112885, "percentage": 89.8, "elapsed_time": "0:49:00", "remaining_time": "0:05:33", "throughput": 2460.32, "total_tokens": 7234816}
124
+ {"current_steps": 595, "total_steps": 657, "loss": 0.2573, "lr": 2.5522075089290275e-06, "epoch": 2.7069555302166477, "percentage": 90.56, "elapsed_time": "0:49:24", "remaining_time": "0:05:08", "throughput": 2461.74, "total_tokens": 7297792}
125
+ {"current_steps": 600, "total_steps": 657, "loss": 0.2481, "lr": 2.1600313409050833e-06, "epoch": 2.7297605473204105, "percentage": 91.32, "elapsed_time": "0:49:47", "remaining_time": "0:04:43", "throughput": 2462.9, "total_tokens": 7358336}
126
+ {"current_steps": 600, "total_steps": 657, "eval_loss": 0.26373597979545593, "epoch": 2.7297605473204105, "percentage": 91.32, "elapsed_time": "0:50:20", "remaining_time": "0:04:46", "throughput": 2435.84, "total_tokens": 7358336}
127
+ {"current_steps": 605, "total_steps": 657, "loss": 0.2414, "lr": 1.7998904895974056e-06, "epoch": 2.7525655644241733, "percentage": 92.09, "elapsed_time": "0:50:47", "remaining_time": "0:04:21", "throughput": 2434.54, "total_tokens": 7419520}
128
+ {"current_steps": 610, "total_steps": 657, "loss": 0.2561, "lr": 1.4720261179197114e-06, "epoch": 2.775370581527936, "percentage": 92.85, "elapsed_time": "0:51:10", "remaining_time": "0:03:56", "throughput": 2436.06, "total_tokens": 7480832}
129
+ {"current_steps": 615, "total_steps": 657, "loss": 0.2547, "lr": 1.1766577753257512e-06, "epoch": 2.798175598631699, "percentage": 93.61, "elapsed_time": "0:51:34", "remaining_time": "0:03:31", "throughput": 2437.6, "total_tokens": 7542400}
130
+ {"current_steps": 620, "total_steps": 657, "loss": 0.2412, "lr": 9.139832507913171e-07, "epoch": 2.8209806157354618, "percentage": 94.37, "elapsed_time": "0:51:57", "remaining_time": "0:03:06", "throughput": 2438.99, "total_tokens": 7603456}
131
+ {"current_steps": 625, "total_steps": 657, "loss": 0.2457, "lr": 6.841784403678275e-07, "epoch": 2.8437856328392246, "percentage": 95.13, "elapsed_time": "0:52:20", "remaining_time": "0:02:40", "throughput": 2440.45, "total_tokens": 7664896}
132
+ {"current_steps": 630, "total_steps": 657, "loss": 0.2357, "lr": 4.873972293961581e-07, "epoch": 2.8665906499429874, "percentage": 95.89, "elapsed_time": "0:52:43", "remaining_time": "0:02:15", "throughput": 2441.84, "total_tokens": 7725824}
133
+ {"current_steps": 635, "total_steps": 657, "loss": 0.2432, "lr": 3.2377138945964836e-07, "epoch": 2.88939566704675, "percentage": 96.65, "elapsed_time": "0:53:07", "remaining_time": "0:01:50", "throughput": 2443.17, "total_tokens": 7786752}
134
+ {"current_steps": 640, "total_steps": 657, "loss": 0.2368, "lr": 1.934104901452405e-07, "epoch": 2.912200684150513, "percentage": 97.41, "elapsed_time": "0:53:30", "remaining_time": "0:01:25", "throughput": 2444.51, "total_tokens": 7847680}
135
+ {"current_steps": 645, "total_steps": 657, "loss": 0.2381, "lr": 9.640182567185463e-08, "epoch": 2.935005701254276, "percentage": 98.17, "elapsed_time": "0:53:53", "remaining_time": "0:01:00", "throughput": 2445.86, "total_tokens": 7908992}
136
+ {"current_steps": 650, "total_steps": 657, "loss": 0.2469, "lr": 3.281035643511454e-08, "epoch": 2.9578107183580387, "percentage": 98.93, "elapsed_time": "0:54:17", "remaining_time": "0:00:35", "throughput": 2447.24, "total_tokens": 7970816}
137
+ {"current_steps": 655, "total_steps": 657, "loss": 0.2491, "lr": 2.678665507588329e-09, "epoch": 2.9806157354618015, "percentage": 99.7, "elapsed_time": "0:54:40", "remaining_time": "0:00:10", "throughput": 2448.49, "total_tokens": 8032384}
138
+ {"current_steps": 657, "total_steps": 657, "epoch": 2.9897377423033067, "percentage": 100.0, "elapsed_time": "0:54:53", "remaining_time": "0:00:00", "throughput": 2446.41, "total_tokens": 8057088}
trainer_state.json CHANGED
@@ -11,1120 +11,1120 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.02280501710376283,
14
- "grad_norm": 42.79934310913086,
15
  "learning_rate": 1e-05,
16
- "loss": 13.7981,
17
- "num_input_tokens_seen": 66816,
18
  "step": 5
19
  },
20
  {
21
  "epoch": 0.04561003420752566,
22
- "grad_norm": 50.72537612915039,
23
  "learning_rate": 2e-05,
24
- "loss": 10.0048,
25
- "num_input_tokens_seen": 134016,
26
  "step": 10
27
  },
28
  {
29
  "epoch": 0.06841505131128849,
30
- "grad_norm": 10.08156967163086,
31
  "learning_rate": 3e-05,
32
- "loss": 3.0982,
33
- "num_input_tokens_seen": 200960,
34
  "step": 15
35
  },
36
  {
37
  "epoch": 0.09122006841505131,
38
- "grad_norm": 11.622808456420898,
39
  "learning_rate": 4e-05,
40
- "loss": 1.4691,
41
- "num_input_tokens_seen": 267776,
42
  "step": 20
43
  },
44
  {
45
  "epoch": 0.11402508551881414,
46
- "grad_norm": 4.010005474090576,
47
  "learning_rate": 5e-05,
48
- "loss": 0.3994,
49
- "num_input_tokens_seen": 334464,
50
  "step": 25
51
  },
52
  {
53
  "epoch": 0.13683010262257697,
54
- "grad_norm": 11.507843971252441,
55
  "learning_rate": 6e-05,
56
- "loss": 0.248,
57
- "num_input_tokens_seen": 401792,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.15963511972633979,
62
- "grad_norm": 11.010838508605957,
63
  "learning_rate": 7e-05,
64
- "loss": 0.2903,
65
- "num_input_tokens_seen": 468864,
66
  "step": 35
67
  },
68
  {
69
  "epoch": 0.18244013683010263,
70
- "grad_norm": 4.925116539001465,
71
  "learning_rate": 8e-05,
72
- "loss": 0.2275,
73
- "num_input_tokens_seen": 535808,
74
  "step": 40
75
  },
76
  {
77
  "epoch": 0.20524515393386544,
78
- "grad_norm": 3.4587278366088867,
79
  "learning_rate": 9e-05,
80
- "loss": 0.2009,
81
- "num_input_tokens_seen": 603008,
82
  "step": 45
83
  },
84
  {
85
  "epoch": 0.22805017103762829,
86
- "grad_norm": 8.7875394821167,
87
  "learning_rate": 0.0001,
88
- "loss": 0.2084,
89
- "num_input_tokens_seen": 669696,
90
  "step": 50
91
  },
92
  {
93
  "epoch": 0.2508551881413911,
94
- "grad_norm": 7.699063777923584,
95
  "learning_rate": 9.998325912536413e-05,
96
- "loss": 0.2382,
97
- "num_input_tokens_seen": 736640,
98
  "step": 55
99
  },
100
  {
101
  "epoch": 0.27366020524515394,
102
- "grad_norm": 4.874607563018799,
103
  "learning_rate": 9.99330477117318e-05,
104
- "loss": 0.2569,
105
- "num_input_tokens_seen": 803456,
106
  "step": 60
107
  },
108
  {
109
  "epoch": 0.29646522234891676,
110
- "grad_norm": 5.9076457023620605,
111
  "learning_rate": 9.98493993824223e-05,
112
- "loss": 0.238,
113
- "num_input_tokens_seen": 870784,
114
  "step": 65
115
  },
116
  {
117
  "epoch": 0.31927023945267957,
118
- "grad_norm": 4.501344203948975,
119
  "learning_rate": 9.973237015128338e-05,
120
- "loss": 0.1904,
121
- "num_input_tokens_seen": 937600,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 0.34207525655644244,
126
- "grad_norm": 0.14564673602581024,
127
  "learning_rate": 9.958203838518255e-05,
128
- "loss": 0.1738,
129
- "num_input_tokens_seen": 1004288,
130
  "step": 75
131
  },
132
  {
133
  "epoch": 0.36488027366020526,
134
- "grad_norm": 0.9604123830795288,
135
  "learning_rate": 9.939850475152978e-05,
136
- "loss": 0.1715,
137
- "num_input_tokens_seen": 1071104,
138
  "step": 80
139
  },
140
  {
141
  "epoch": 0.38768529076396807,
142
- "grad_norm": 0.1580754667520523,
143
  "learning_rate": 9.918189215086719e-05,
144
- "loss": 0.1628,
145
- "num_input_tokens_seen": 1137792,
146
  "step": 85
147
  },
148
  {
149
  "epoch": 0.4104903078677309,
150
- "grad_norm": 0.11061199009418488,
151
  "learning_rate": 9.893234563457049e-05,
152
- "loss": 0.1609,
153
- "num_input_tokens_seen": 1204608,
154
  "step": 90
155
  },
156
  {
157
  "epoch": 0.43329532497149376,
158
- "grad_norm": 0.13197912275791168,
159
  "learning_rate": 9.865003230771745e-05,
160
- "loss": 0.1601,
161
- "num_input_tokens_seen": 1271552,
162
  "step": 95
163
  },
164
  {
165
  "epoch": 0.45610034207525657,
166
- "grad_norm": 0.07416187971830368,
167
  "learning_rate": 9.83351412171886e-05,
168
- "loss": 0.1559,
169
- "num_input_tokens_seen": 1338368,
170
  "step": 100
171
  },
172
  {
173
  "epoch": 0.45610034207525657,
174
- "eval_loss": 0.15936881303787231,
175
- "eval_runtime": 35.9315,
176
- "eval_samples_per_second": 86.776,
177
- "eval_steps_per_second": 5.427,
178
- "num_input_tokens_seen": 1338368,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.4789053591790194,
183
- "grad_norm": 2.2470414638519287,
184
  "learning_rate": 9.798788322507475e-05,
185
- "loss": 0.1638,
186
- "num_input_tokens_seen": 1405568,
187
  "step": 105
188
  },
189
  {
190
  "epoch": 0.5017103762827823,
191
- "grad_norm": 9.681774139404297,
192
  "learning_rate": 9.76084908674764e-05,
193
- "loss": 0.2682,
194
- "num_input_tokens_seen": 1472256,
195
  "step": 110
196
  },
197
  {
198
  "epoch": 0.5245153933865451,
199
- "grad_norm": 0.09305865317583084,
200
  "learning_rate": 9.719721819878942e-05,
201
- "loss": 0.1955,
202
- "num_input_tokens_seen": 1539200,
203
  "step": 115
204
  },
205
  {
206
  "epoch": 0.5473204104903079,
207
- "grad_norm": 0.10614204406738281,
208
  "learning_rate": 9.67543406215813e-05,
209
- "loss": 0.1594,
210
- "num_input_tokens_seen": 1606400,
211
  "step": 120
212
  },
213
  {
214
  "epoch": 0.5701254275940707,
215
- "grad_norm": 0.10649927705526352,
216
  "learning_rate": 9.6280154702172e-05,
217
- "loss": 0.1546,
218
- "num_input_tokens_seen": 1673856,
219
  "step": 125
220
  },
221
  {
222
  "epoch": 0.5929304446978335,
223
- "grad_norm": 0.1280696541070938,
224
  "learning_rate": 9.577497797204275e-05,
225
- "loss": 0.163,
226
- "num_input_tokens_seen": 1740800,
227
  "step": 130
228
  },
229
  {
230
  "epoch": 0.6157354618015963,
231
- "grad_norm": 0.08251194655895233,
232
  "learning_rate": 9.523914871520592e-05,
233
- "loss": 0.1565,
234
- "num_input_tokens_seen": 1807744,
235
  "step": 135
236
  },
237
  {
238
  "epoch": 0.6385404789053591,
239
- "grad_norm": 0.07118305563926697,
240
  "learning_rate": 9.467302574167804e-05,
241
- "loss": 0.1545,
242
- "num_input_tokens_seen": 1875072,
243
  "step": 140
244
  },
245
  {
246
  "epoch": 0.661345496009122,
247
- "grad_norm": 0.09398273378610611,
248
  "learning_rate": 9.407698814720829e-05,
249
- "loss": 0.1561,
250
- "num_input_tokens_seen": 1942144,
251
  "step": 145
252
  },
253
  {
254
  "epoch": 0.6841505131128849,
255
- "grad_norm": 0.044905178248882294,
256
  "learning_rate": 9.345143505942254e-05,
257
- "loss": 0.1607,
258
- "num_input_tokens_seen": 2008960,
259
  "step": 150
260
  },
261
  {
262
  "epoch": 0.7069555302166477,
263
- "grad_norm": 0.30501601099967957,
264
  "learning_rate": 9.279678537055364e-05,
265
- "loss": 0.1567,
266
- "num_input_tokens_seen": 2076032,
267
  "step": 155
268
  },
269
  {
270
  "epoch": 0.7297605473204105,
271
- "grad_norm": 0.12230126559734344,
272
  "learning_rate": 9.211347745693642e-05,
273
- "loss": 0.1537,
274
- "num_input_tokens_seen": 2142720,
275
  "step": 160
276
  },
277
  {
278
  "epoch": 0.7525655644241733,
279
- "grad_norm": 0.06303831189870834,
280
  "learning_rate": 9.140196888545571e-05,
281
- "loss": 0.1613,
282
- "num_input_tokens_seen": 2209536,
283
  "step": 165
284
  },
285
  {
286
  "epoch": 0.7753705815279361,
287
- "grad_norm": 0.07087565213441849,
288
  "learning_rate": 9.066273610714337e-05,
289
- "loss": 0.1525,
290
- "num_input_tokens_seen": 2276224,
291
  "step": 170
292
  },
293
  {
294
  "epoch": 0.798175598631699,
295
- "grad_norm": 0.0799873098731041,
296
  "learning_rate": 8.989627413813019e-05,
297
- "loss": 0.1582,
298
- "num_input_tokens_seen": 2343040,
299
  "step": 175
300
  },
301
  {
302
  "epoch": 0.8209806157354618,
303
- "grad_norm": 0.07897079735994339,
304
  "learning_rate": 8.910309622816558e-05,
305
- "loss": 0.1614,
306
- "num_input_tokens_seen": 2409984,
307
  "step": 180
308
  },
309
  {
310
  "epoch": 0.8437856328392246,
311
- "grad_norm": 0.2020573765039444,
312
  "learning_rate": 8.828373351692773e-05,
313
- "loss": 0.1602,
314
- "num_input_tokens_seen": 2476928,
315
  "step": 185
316
  },
317
  {
318
  "epoch": 0.8665906499429875,
319
- "grad_norm": 0.0581236369907856,
320
  "learning_rate": 8.743873467835388e-05,
321
- "loss": 0.1734,
322
- "num_input_tokens_seen": 2543872,
323
  "step": 190
324
  },
325
  {
326
  "epoch": 0.8893956670467503,
327
- "grad_norm": 0.17267735302448273,
328
  "learning_rate": 8.656866555322896e-05,
329
- "loss": 0.1583,
330
- "num_input_tokens_seen": 2610816,
331
  "step": 195
332
  },
333
  {
334
  "epoch": 0.9122006841505131,
335
- "grad_norm": 0.05809967219829559,
336
  "learning_rate": 8.567410877027891e-05,
337
- "loss": 0.1601,
338
- "num_input_tokens_seen": 2677376,
339
  "step": 200
340
  },
341
  {
342
  "epoch": 0.9122006841505131,
343
- "eval_loss": 0.1585174947977066,
344
- "eval_runtime": 35.9369,
345
- "eval_samples_per_second": 86.763,
346
- "eval_steps_per_second": 5.426,
347
- "num_input_tokens_seen": 2677376,
348
  "step": 200
349
  },
350
  {
351
  "epoch": 0.935005701254276,
352
- "grad_norm": 0.16051442921161652,
353
  "learning_rate": 8.475566335602205e-05,
354
- "loss": 0.1547,
355
- "num_input_tokens_seen": 2744320,
356
  "step": 205
357
  },
358
  {
359
  "epoch": 0.9578107183580388,
360
- "grad_norm": 0.0988399088382721,
361
  "learning_rate": 8.381394433364e-05,
362
- "loss": 0.1596,
363
- "num_input_tokens_seen": 2811136,
364
  "step": 210
365
  },
366
  {
367
  "epoch": 0.9806157354618016,
368
- "grad_norm": 0.07488340139389038,
369
  "learning_rate": 8.284958231113656e-05,
370
- "loss": 0.1615,
371
- "num_input_tokens_seen": 2878208,
372
  "step": 215
373
  },
374
  {
375
  "epoch": 1.0,
376
- "grad_norm": 0.1385851949453354,
377
  "learning_rate": 8.186322305906066e-05,
378
- "loss": 0.1663,
379
- "num_input_tokens_seen": 2934672,
380
  "step": 220
381
  },
382
  {
383
  "epoch": 1.0228050171037628,
384
- "grad_norm": 0.04155217856168747,
385
  "learning_rate": 8.085552707807567e-05,
386
- "loss": 0.157,
387
- "num_input_tokens_seen": 3001488,
388
  "step": 225
389
  },
390
  {
391
  "epoch": 1.0456100342075256,
392
- "grad_norm": 0.05447247251868248,
393
  "learning_rate": 7.982716915666515e-05,
394
- "loss": 0.1599,
395
- "num_input_tokens_seen": 3068048,
396
  "step": 230
397
  },
398
  {
399
  "epoch": 1.0684150513112884,
400
- "grad_norm": 0.05512971803545952,
401
  "learning_rate": 7.877883791927082e-05,
402
- "loss": 0.1558,
403
- "num_input_tokens_seen": 3134864,
404
  "step": 235
405
  },
406
  {
407
  "epoch": 1.0912200684150513,
408
- "grad_norm": 0.030602725222706795,
409
  "learning_rate": 7.771123536516558e-05,
410
- "loss": 0.1507,
411
- "num_input_tokens_seen": 3202064,
412
  "step": 240
413
  },
414
  {
415
  "epoch": 1.114025085518814,
416
- "grad_norm": 0.08493039011955261,
417
  "learning_rate": 7.662507639837018e-05,
418
- "loss": 0.1611,
419
- "num_input_tokens_seen": 3268752,
420
  "step": 245
421
  },
422
  {
423
  "epoch": 1.1368301026225769,
424
- "grad_norm": 0.019975917413830757,
425
  "learning_rate": 7.552108834892857e-05,
426
- "loss": 0.1581,
427
- "num_input_tokens_seen": 3335568,
428
  "step": 250
429
  },
430
  {
431
  "epoch": 1.1596351197263397,
432
- "grad_norm": 0.079766646027565,
433
  "learning_rate": 7.440001048586209e-05,
434
- "loss": 0.1543,
435
- "num_input_tokens_seen": 3402640,
436
  "step": 255
437
  },
438
  {
439
  "epoch": 1.1824401368301025,
440
- "grad_norm": 0.05010490119457245,
441
  "learning_rate": 7.32625935221293e-05,
442
- "loss": 0.1608,
443
- "num_input_tokens_seen": 3469712,
444
  "step": 260
445
  },
446
  {
447
  "epoch": 1.2052451539338653,
448
- "grad_norm": 0.030159804970026016,
449
  "learning_rate": 7.210959911192214e-05,
450
- "loss": 0.1569,
451
- "num_input_tokens_seen": 3536528,
452
  "step": 265
453
  },
454
  {
455
  "epoch": 1.2280501710376284,
456
- "grad_norm": 0.04282454401254654,
457
  "learning_rate": 7.094179934063567e-05,
458
- "loss": 0.1608,
459
- "num_input_tokens_seen": 3603728,
460
  "step": 270
461
  },
462
  {
463
  "epoch": 1.2508551881413912,
464
- "grad_norm": 0.028035402297973633,
465
  "learning_rate": 6.975997620785276e-05,
466
- "loss": 0.1562,
467
- "num_input_tokens_seen": 3670544,
468
  "step": 275
469
  },
470
  {
471
  "epoch": 1.273660205245154,
472
- "grad_norm": 0.027284786105155945,
473
  "learning_rate": 6.856492110368969e-05,
474
- "loss": 0.159,
475
- "num_input_tokens_seen": 3737360,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 1.2964652223489168,
480
- "grad_norm": 0.039123497903347015,
481
  "learning_rate": 6.735743427885375e-05,
482
- "loss": 0.1552,
483
- "num_input_tokens_seen": 3804560,
484
  "step": 285
485
  },
486
  {
487
  "epoch": 1.3192702394526796,
488
- "grad_norm": 0.04413946345448494,
489
  "learning_rate": 6.613832430876727e-05,
490
- "loss": 0.1611,
491
- "num_input_tokens_seen": 3871376,
492
  "step": 290
493
  },
494
  {
495
  "epoch": 1.3420752565564424,
496
- "grad_norm": 0.0899345800280571,
497
  "learning_rate": 6.490840755211736e-05,
498
- "loss": 0.1514,
499
- "num_input_tokens_seen": 3938064,
500
  "step": 295
501
  },
502
  {
503
  "epoch": 1.3648802736602053,
504
- "grad_norm": 0.051399193704128265,
505
  "learning_rate": 6.366850760419341e-05,
506
- "loss": 0.1588,
507
- "num_input_tokens_seen": 4005008,
508
  "step": 300
509
  },
510
  {
511
  "epoch": 1.3648802736602053,
512
- "eval_loss": 0.15267899632453918,
513
- "eval_runtime": 35.9268,
514
- "eval_samples_per_second": 86.788,
515
- "eval_steps_per_second": 5.428,
516
- "num_input_tokens_seen": 4005008,
517
  "step": 300
518
  },
519
  {
520
  "epoch": 1.387685290763968,
521
- "grad_norm": 0.07039803266525269,
522
  "learning_rate": 6.241945474537901e-05,
523
- "loss": 0.1552,
524
- "num_input_tokens_seen": 4071696,
525
  "step": 305
526
  },
527
  {
528
  "epoch": 1.4104903078677309,
529
- "grad_norm": 0.09462770074605942,
530
  "learning_rate": 6.116208538516707e-05,
531
- "loss": 0.1506,
532
- "num_input_tokens_seen": 4139024,
533
  "step": 310
534
  },
535
  {
536
  "epoch": 1.4332953249714937,
537
- "grad_norm": 0.05580534040927887,
538
  "learning_rate": 5.98972415020708e-05,
539
- "loss": 0.1516,
540
- "num_input_tokens_seen": 4205712,
541
  "step": 315
542
  },
543
  {
544
  "epoch": 1.4561003420752565,
545
- "grad_norm": 0.04588129371404648,
546
  "learning_rate": 5.862577007980544e-05,
547
- "loss": 0.1537,
548
- "num_input_tokens_seen": 4272912,
549
  "step": 320
550
  },
551
  {
552
  "epoch": 1.4789053591790193,
553
- "grad_norm": 0.1075662299990654,
554
  "learning_rate": 5.734852254011833e-05,
555
- "loss": 0.1526,
556
- "num_input_tokens_seen": 4339856,
557
  "step": 325
558
  },
559
  {
560
  "epoch": 1.5017103762827824,
561
- "grad_norm": 0.07934882491827011,
562
  "learning_rate": 5.60663541726471e-05,
563
- "loss": 0.1465,
564
- "num_input_tokens_seen": 4406928,
565
  "step": 330
566
  },
567
  {
568
  "epoch": 1.5245153933865452,
569
- "grad_norm": 0.06356900930404663,
570
  "learning_rate": 5.478012356218779e-05,
571
- "loss": 0.1467,
572
- "num_input_tokens_seen": 4473872,
573
  "step": 335
574
  },
575
  {
576
  "epoch": 1.547320410490308,
577
- "grad_norm": 0.08979075402021408,
578
  "learning_rate": 5.349069201375657e-05,
579
- "loss": 0.1476,
580
- "num_input_tokens_seen": 4541072,
581
  "step": 340
582
  },
583
  {
584
  "epoch": 1.5701254275940708,
585
- "grad_norm": 0.11577138304710388,
586
  "learning_rate": 5.2198922975829544e-05,
587
- "loss": 0.1486,
588
- "num_input_tokens_seen": 4607632,
589
  "step": 345
590
  },
591
  {
592
  "epoch": 1.5929304446978336,
593
- "grad_norm": 0.15246377885341644,
594
  "learning_rate": 5.090568146214764e-05,
595
- "loss": 0.1413,
596
- "num_input_tokens_seen": 4674704,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 1.6157354618015964,
601
- "grad_norm": 0.18236877024173737,
602
  "learning_rate": 4.961183347247301e-05,
603
- "loss": 0.1474,
604
- "num_input_tokens_seen": 4741648,
605
  "step": 355
606
  },
607
  {
608
  "epoch": 1.6385404789053593,
609
- "grad_norm": 0.163015216588974,
610
  "learning_rate": 4.831824541268537e-05,
611
- "loss": 0.1442,
612
- "num_input_tokens_seen": 4808592,
613
  "step": 360
614
  },
615
  {
616
  "epoch": 1.661345496009122,
617
- "grad_norm": 0.2425057590007782,
618
  "learning_rate": 4.702578351460633e-05,
619
- "loss": 0.14,
620
- "num_input_tokens_seen": 4875536,
621
  "step": 365
622
  },
623
  {
624
  "epoch": 1.6841505131128849,
625
- "grad_norm": 0.10775990039110184,
626
  "learning_rate": 4.573531325594017e-05,
627
- "loss": 0.142,
628
- "num_input_tokens_seen": 4942480,
629
  "step": 370
630
  },
631
  {
632
  "epoch": 1.7069555302166477,
633
- "grad_norm": 0.23868361115455627,
634
  "learning_rate": 4.444769878071977e-05,
635
- "loss": 0.138,
636
- "num_input_tokens_seen": 5009424,
637
  "step": 375
638
  },
639
  {
640
  "epoch": 1.7297605473204105,
641
- "grad_norm": 0.2915964424610138,
642
  "learning_rate": 4.316380232064543e-05,
643
- "loss": 0.1388,
644
- "num_input_tokens_seen": 5076368,
645
  "step": 380
646
  },
647
  {
648
  "epoch": 1.7525655644241733,
649
- "grad_norm": 0.19889657199382782,
650
  "learning_rate": 4.188448361770458e-05,
651
- "loss": 0.1372,
652
- "num_input_tokens_seen": 5143056,
653
  "step": 385
654
  },
655
  {
656
  "epoch": 1.7753705815279361,
657
- "grad_norm": 0.15646949410438538,
658
  "learning_rate": 4.061059934845818e-05,
659
- "loss": 0.1351,
660
- "num_input_tokens_seen": 5209744,
661
  "step": 390
662
  },
663
  {
664
  "epoch": 1.798175598631699,
665
- "grad_norm": 0.18898698687553406,
666
  "learning_rate": 3.93430025503803e-05,
667
- "loss": 0.1327,
668
- "num_input_tokens_seen": 5276688,
669
  "step": 395
670
  },
671
  {
672
  "epoch": 1.8209806157354618,
673
- "grad_norm": 0.28470566868782043,
674
  "learning_rate": 3.8082542050634405e-05,
675
- "loss": 0.1387,
676
- "num_input_tokens_seen": 5343632,
677
  "step": 400
678
  },
679
  {
680
  "epoch": 1.8209806157354618,
681
- "eval_loss": 0.1311902403831482,
682
- "eval_runtime": 35.9475,
683
- "eval_samples_per_second": 86.738,
684
- "eval_steps_per_second": 5.425,
685
- "num_input_tokens_seen": 5343632,
686
  "step": 400
687
  },
688
  {
689
  "epoch": 1.8437856328392246,
690
- "grad_norm": 0.22231805324554443,
691
  "learning_rate": 3.6830061897668866e-05,
692
- "loss": 0.1328,
693
- "num_input_tokens_seen": 5410832,
694
  "step": 405
695
  },
696
  {
697
  "epoch": 1.8665906499429874,
698
- "grad_norm": 0.1955064982175827,
699
  "learning_rate": 3.558640079601265e-05,
700
- "loss": 0.1392,
701
- "num_input_tokens_seen": 5477904,
702
  "step": 410
703
  },
704
  {
705
  "epoch": 1.8893956670467502,
706
- "grad_norm": 0.21471981704235077,
707
  "learning_rate": 3.435239154464947e-05,
708
- "loss": 0.1308,
709
- "num_input_tokens_seen": 5544720,
710
  "step": 415
711
  },
712
  {
713
  "epoch": 1.912200684150513,
714
- "grad_norm": 0.2159951776266098,
715
  "learning_rate": 3.312886047934639e-05,
716
- "loss": 0.1238,
717
- "num_input_tokens_seen": 5611536,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 1.9350057012542758,
722
- "grad_norm": 0.1482582986354828,
723
  "learning_rate": 3.191662691931051e-05,
724
- "loss": 0.131,
725
- "num_input_tokens_seen": 5678480,
726
  "step": 425
727
  },
728
  {
729
  "epoch": 1.9578107183580387,
730
- "grad_norm": 0.2534109950065613,
731
  "learning_rate": 3.071650261854414e-05,
732
- "loss": 0.1292,
733
- "num_input_tokens_seen": 5746192,
734
  "step": 430
735
  },
736
  {
737
  "epoch": 1.9806157354618015,
738
- "grad_norm": 0.2140827178955078,
739
  "learning_rate": 2.9529291222265922e-05,
740
- "loss": 0.1233,
741
- "num_input_tokens_seen": 5813264,
742
  "step": 435
743
  },
744
  {
745
  "epoch": 2.0,
746
- "grad_norm": 0.32933691143989563,
747
  "learning_rate": 2.8355787728761952e-05,
748
- "loss": 0.126,
749
- "num_input_tokens_seen": 5869600,
750
  "step": 440
751
  },
752
  {
753
  "epoch": 2.022805017103763,
754
- "grad_norm": 0.32979491353034973,
755
  "learning_rate": 2.7196777957027013e-05,
756
- "loss": 0.1191,
757
- "num_input_tokens_seen": 5936544,
758
  "step": 445
759
  },
760
  {
761
  "epoch": 2.0456100342075256,
762
- "grad_norm": 0.23468773066997528,
763
  "learning_rate": 2.6053038020552685e-05,
764
- "loss": 0.1202,
765
- "num_input_tokens_seen": 6003232,
766
  "step": 450
767
  },
768
  {
769
  "epoch": 2.0684150513112884,
770
- "grad_norm": 0.2606813609600067,
771
  "learning_rate": 2.492533380761466e-05,
772
- "loss": 0.1235,
773
- "num_input_tokens_seen": 6069920,
774
  "step": 455
775
  },
776
  {
777
  "epoch": 2.0912200684150513,
778
- "grad_norm": 0.3431966006755829,
779
  "learning_rate": 2.3814420468407195e-05,
780
- "loss": 0.1153,
781
- "num_input_tokens_seen": 6136736,
782
  "step": 460
783
  },
784
  {
785
  "epoch": 2.114025085518814,
786
- "grad_norm": 0.2507797181606293,
787
  "learning_rate": 2.2721041909367986e-05,
788
- "loss": 0.1138,
789
- "num_input_tokens_seen": 6203680,
790
  "step": 465
791
  },
792
  {
793
  "epoch": 2.136830102622577,
794
- "grad_norm": 0.20167945325374603,
795
  "learning_rate": 2.164593029503249e-05,
796
- "loss": 0.1198,
797
- "num_input_tokens_seen": 6270880,
798
  "step": 470
799
  },
800
  {
801
  "epoch": 2.1596351197263397,
802
- "grad_norm": 0.2383241206407547,
803
  "learning_rate": 2.0589805557750912e-05,
804
- "loss": 0.1179,
805
- "num_input_tokens_seen": 6337568,
806
  "step": 475
807
  },
808
  {
809
  "epoch": 2.1824401368301025,
810
- "grad_norm": 0.4635178744792938,
811
  "learning_rate": 1.9553374915596328e-05,
812
- "loss": 0.1198,
813
- "num_input_tokens_seen": 6404256,
814
  "step": 480
815
  },
816
  {
817
  "epoch": 2.2052451539338653,
818
- "grad_norm": 0.48052558302879333,
819
  "learning_rate": 1.853733239878669e-05,
820
- "loss": 0.1152,
821
- "num_input_tokens_seen": 6471456,
822
  "step": 485
823
  },
824
  {
825
  "epoch": 2.228050171037628,
826
- "grad_norm": 0.27201253175735474,
827
  "learning_rate": 1.754235838493795e-05,
828
- "loss": 0.115,
829
- "num_input_tokens_seen": 6538272,
830
  "step": 490
831
  },
832
  {
833
  "epoch": 2.250855188141391,
834
- "grad_norm": 0.21040843427181244,
835
  "learning_rate": 1.6569119143459387e-05,
836
- "loss": 0.1195,
837
- "num_input_tokens_seen": 6605216,
838
  "step": 495
839
  },
840
  {
841
  "epoch": 2.2736602052451538,
842
- "grad_norm": 0.20532406866550446,
843
  "learning_rate": 1.561826638939628e-05,
844
- "loss": 0.1181,
845
- "num_input_tokens_seen": 6672288,
846
  "step": 500
847
  },
848
  {
849
  "epoch": 2.2736602052451538,
850
- "eval_loss": 0.11165652424097061,
851
- "eval_runtime": 35.9227,
852
- "eval_samples_per_second": 86.797,
853
- "eval_steps_per_second": 5.428,
854
- "num_input_tokens_seen": 6672288,
855
  "step": 500
856
  },
857
  {
858
  "epoch": 2.2964652223489166,
859
- "grad_norm": 0.24837104976177216,
860
  "learning_rate": 1.4690436847018757e-05,
861
- "loss": 0.1144,
862
- "num_input_tokens_seen": 6739488,
863
  "step": 505
864
  },
865
  {
866
  "epoch": 2.3192702394526794,
867
- "grad_norm": 0.2058333456516266,
868
  "learning_rate": 1.3786251823448909e-05,
869
- "loss": 0.1133,
870
- "num_input_tokens_seen": 6806176,
871
  "step": 510
872
  },
873
  {
874
  "epoch": 2.342075256556442,
875
- "grad_norm": 0.1937752366065979,
876
  "learning_rate": 1.2906316792611828e-05,
877
- "loss": 0.1174,
878
- "num_input_tokens_seen": 6873376,
879
  "step": 515
880
  },
881
  {
882
  "epoch": 2.364880273660205,
883
- "grad_norm": 0.48919156193733215,
884
  "learning_rate": 1.2051220989789075e-05,
885
- "loss": 0.103,
886
- "num_input_tokens_seen": 6940192,
887
  "step": 520
888
  },
889
  {
890
  "epoch": 2.387685290763968,
891
- "grad_norm": 0.21910136938095093,
892
  "learning_rate": 1.1221537017046101e-05,
893
- "loss": 0.1038,
894
- "num_input_tokens_seen": 7007136,
895
  "step": 525
896
  },
897
  {
898
  "epoch": 2.4104903078677307,
899
- "grad_norm": 0.29447972774505615,
900
  "learning_rate": 1.0417820459797939e-05,
901
- "loss": 0.1111,
902
- "num_input_tokens_seen": 7074336,
903
  "step": 530
904
  },
905
  {
906
  "epoch": 2.433295324971494,
907
- "grad_norm": 0.4034893214702606,
908
  "learning_rate": 9.640609514769695e-06,
909
- "loss": 0.1137,
910
- "num_input_tokens_seen": 7141152,
911
  "step": 535
912
  },
913
  {
914
  "epoch": 2.4561003420752567,
915
- "grad_norm": 0.4087333083152771,
916
  "learning_rate": 8.890424629601197e-06,
917
- "loss": 0.1059,
918
- "num_input_tokens_seen": 7208096,
919
  "step": 540
920
  },
921
  {
922
  "epoch": 2.4789053591790196,
923
- "grad_norm": 0.41021662950515747,
924
  "learning_rate": 8.167768154337102e-06,
925
- "loss": 0.1065,
926
- "num_input_tokens_seen": 7274784,
927
  "step": 545
928
  },
929
  {
930
  "epoch": 2.5017103762827824,
931
- "grad_norm": 0.19636331498622894,
932
  "learning_rate": 7.47312400503572e-06,
933
- "loss": 0.0998,
934
- "num_input_tokens_seen": 7341856,
935
  "step": 550
936
  },
937
  {
938
  "epoch": 2.524515393386545,
939
- "grad_norm": 0.22242596745491028,
940
  "learning_rate": 6.806957339721837e-06,
941
- "loss": 0.1117,
942
- "num_input_tokens_seen": 7409056,
943
  "step": 555
944
  },
945
  {
946
  "epoch": 2.547320410490308,
947
- "grad_norm": 0.28343266248703003,
948
  "learning_rate": 6.169714246900693e-06,
949
- "loss": 0.1048,
950
- "num_input_tokens_seen": 7476000,
951
  "step": 560
952
  },
953
  {
954
  "epoch": 2.570125427594071,
955
- "grad_norm": 0.2496432662010193,
956
  "learning_rate": 5.561821446841431e-06,
957
- "loss": 0.1053,
958
- "num_input_tokens_seen": 7542688,
959
  "step": 565
960
  },
961
  {
962
  "epoch": 2.5929304446978336,
963
- "grad_norm": 0.4992646872997284,
964
  "learning_rate": 4.983686005830407e-06,
965
- "loss": 0.1046,
966
- "num_input_tokens_seen": 7609888,
967
  "step": 570
968
  },
969
  {
970
  "epoch": 2.6157354618015964,
971
- "grad_norm": 0.3248825669288635,
972
  "learning_rate": 4.435695063585221e-06,
973
- "loss": 0.1067,
974
- "num_input_tokens_seen": 7677088,
975
  "step": 575
976
  },
977
  {
978
  "epoch": 2.6385404789053593,
979
- "grad_norm": 0.30638033151626587,
980
  "learning_rate": 3.918215574012501e-06,
981
- "loss": 0.1064,
982
- "num_input_tokens_seen": 7744160,
983
  "step": 580
984
  },
985
  {
986
  "epoch": 2.661345496009122,
987
- "grad_norm": 0.2688983380794525,
988
  "learning_rate": 3.4315940594827233e-06,
989
- "loss": 0.1052,
990
- "num_input_tokens_seen": 7811104,
991
  "step": 585
992
  },
993
  {
994
  "epoch": 2.684150513112885,
995
- "grad_norm": 0.2184939980506897,
996
  "learning_rate": 2.9761563787866708e-06,
997
- "loss": 0.107,
998
- "num_input_tokens_seen": 7877920,
999
  "step": 590
1000
  },
1001
  {
1002
  "epoch": 2.7069555302166477,
1003
- "grad_norm": 0.30081209540367126,
1004
  "learning_rate": 2.5522075089290275e-06,
1005
- "loss": 0.1077,
1006
- "num_input_tokens_seen": 7945376,
1007
  "step": 595
1008
  },
1009
  {
1010
  "epoch": 2.7297605473204105,
1011
- "grad_norm": 0.5384100675582886,
1012
  "learning_rate": 2.1600313409050833e-06,
1013
- "loss": 0.1024,
1014
- "num_input_tokens_seen": 8012064,
1015
  "step": 600
1016
  },
1017
  {
1018
  "epoch": 2.7297605473204105,
1019
- "eval_loss": 0.1033177301287651,
1020
- "eval_runtime": 35.9314,
1021
- "eval_samples_per_second": 86.776,
1022
- "eval_steps_per_second": 5.427,
1023
- "num_input_tokens_seen": 8012064,
1024
  "step": 600
1025
  },
1026
  {
1027
  "epoch": 2.7525655644241733,
1028
- "grad_norm": 0.30382928252220154,
1029
  "learning_rate": 1.7998904895974056e-06,
1030
- "loss": 0.0994,
1031
- "num_input_tokens_seen": 8078752,
1032
  "step": 605
1033
  },
1034
  {
1035
  "epoch": 2.775370581527936,
1036
- "grad_norm": 0.24799004197120667,
1037
  "learning_rate": 1.4720261179197114e-06,
1038
- "loss": 0.1015,
1039
- "num_input_tokens_seen": 8145824,
1040
  "step": 610
1041
  },
1042
  {
1043
  "epoch": 2.798175598631699,
1044
- "grad_norm": 0.28183919191360474,
1045
  "learning_rate": 1.1766577753257512e-06,
1046
- "loss": 0.1041,
1047
- "num_input_tokens_seen": 8212768,
1048
  "step": 615
1049
  },
1050
  {
1051
  "epoch": 2.8209806157354618,
1052
- "grad_norm": 0.2579708397388458,
1053
  "learning_rate": 9.139832507913171e-07,
1054
- "loss": 0.1019,
1055
- "num_input_tokens_seen": 8279968,
1056
  "step": 620
1057
  },
1058
  {
1059
  "epoch": 2.8437856328392246,
1060
- "grad_norm": 0.33580321073532104,
1061
  "learning_rate": 6.841784403678275e-07,
1062
- "loss": 0.1028,
1063
- "num_input_tokens_seen": 8347040,
1064
  "step": 625
1065
  },
1066
  {
1067
  "epoch": 2.8665906499429874,
1068
- "grad_norm": 0.3009226322174072,
1069
  "learning_rate": 4.873972293961581e-07,
1070
- "loss": 0.1008,
1071
- "num_input_tokens_seen": 8413856,
1072
  "step": 630
1073
  },
1074
  {
1075
  "epoch": 2.88939566704675,
1076
- "grad_norm": 0.3536825180053711,
1077
  "learning_rate": 3.2377138945964836e-07,
1078
- "loss": 0.0998,
1079
- "num_input_tokens_seen": 8480800,
1080
  "step": 635
1081
  },
1082
  {
1083
  "epoch": 2.912200684150513,
1084
- "grad_norm": 0.22358882427215576,
1085
  "learning_rate": 1.934104901452405e-07,
1086
- "loss": 0.0954,
1087
- "num_input_tokens_seen": 8547744,
1088
  "step": 640
1089
  },
1090
  {
1091
  "epoch": 2.935005701254276,
1092
- "grad_norm": 0.3308273255825043,
1093
  "learning_rate": 9.640182567185463e-08,
1094
- "loss": 0.0994,
1095
- "num_input_tokens_seen": 8614816,
1096
  "step": 645
1097
  },
1098
  {
1099
  "epoch": 2.9578107183580387,
1100
- "grad_norm": 0.36525049805641174,
1101
  "learning_rate": 3.281035643511454e-08,
1102
- "loss": 0.1057,
1103
- "num_input_tokens_seen": 8681504,
1104
  "step": 650
1105
  },
1106
  {
1107
  "epoch": 2.9806157354618015,
1108
- "grad_norm": 0.20705801248550415,
1109
  "learning_rate": 2.678665507588329e-09,
1110
- "loss": 0.105,
1111
- "num_input_tokens_seen": 8748320,
1112
  "step": 655
1113
  },
1114
  {
1115
  "epoch": 2.9897377423033067,
1116
- "num_input_tokens_seen": 8775072,
1117
  "step": 657,
1118
- "total_flos": 3.56298712611029e+17,
1119
- "train_loss": 0.35758140245438347,
1120
- "train_runtime": 3571.0417,
1121
- "train_samples_per_second": 23.571,
1122
- "train_steps_per_second": 0.184
1123
  }
1124
  ],
1125
  "logging_steps": 5,
1126
  "max_steps": 657,
1127
- "num_input_tokens_seen": 8775072,
1128
  "num_train_epochs": 3,
1129
  "save_steps": 100,
1130
  "stateful_callbacks": {
@@ -1139,7 +1139,7 @@
1139
  "attributes": {}
1140
  }
1141
  },
1142
- "total_flos": 3.56298712611029e+17,
1143
  "train_batch_size": 16,
1144
  "trial_name": null,
1145
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.02280501710376283,
14
+ "grad_norm": 41.50532150268555,
15
  "learning_rate": 1e-05,
16
+ "loss": 13.3488,
17
+ "num_input_tokens_seen": 62080,
18
  "step": 5
19
  },
20
  {
21
  "epoch": 0.04561003420752566,
22
+ "grad_norm": 36.72806930541992,
23
  "learning_rate": 2e-05,
24
+ "loss": 10.56,
25
+ "num_input_tokens_seen": 124672,
26
  "step": 10
27
  },
28
  {
29
  "epoch": 0.06841505131128849,
30
+ "grad_norm": 19.752948760986328,
31
  "learning_rate": 3e-05,
32
+ "loss": 5.1785,
33
+ "num_input_tokens_seen": 185344,
34
  "step": 15
35
  },
36
  {
37
  "epoch": 0.09122006841505131,
38
+ "grad_norm": 7.878300189971924,
39
  "learning_rate": 4e-05,
40
+ "loss": 2.2593,
41
+ "num_input_tokens_seen": 246912,
42
  "step": 20
43
  },
44
  {
45
  "epoch": 0.11402508551881414,
46
+ "grad_norm": 2.8654000759124756,
47
  "learning_rate": 5e-05,
48
+ "loss": 0.8861,
49
+ "num_input_tokens_seen": 307712,
50
  "step": 25
51
  },
52
  {
53
  "epoch": 0.13683010262257697,
54
+ "grad_norm": 3.1212615966796875,
55
  "learning_rate": 6e-05,
56
+ "loss": 0.6509,
57
+ "num_input_tokens_seen": 369536,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.15963511972633979,
62
+ "grad_norm": 1.2268868684768677,
63
  "learning_rate": 7e-05,
64
+ "loss": 0.5623,
65
+ "num_input_tokens_seen": 431104,
66
  "step": 35
67
  },
68
  {
69
  "epoch": 0.18244013683010263,
70
+ "grad_norm": 0.8868739604949951,
71
  "learning_rate": 8e-05,
72
+ "loss": 0.5356,
73
+ "num_input_tokens_seen": 492672,
74
  "step": 40
75
  },
76
  {
77
  "epoch": 0.20524515393386544,
78
+ "grad_norm": 0.39010241627693176,
79
  "learning_rate": 9e-05,
80
+ "loss": 0.7113,
81
+ "num_input_tokens_seen": 554368,
82
  "step": 45
83
  },
84
  {
85
  "epoch": 0.22805017103762829,
86
+ "grad_norm": 0.6562630534172058,
87
  "learning_rate": 0.0001,
88
+ "loss": 0.5074,
89
+ "num_input_tokens_seen": 615808,
90
  "step": 50
91
  },
92
  {
93
  "epoch": 0.2508551881413911,
94
+ "grad_norm": 0.3772495687007904,
95
  "learning_rate": 9.998325912536413e-05,
96
+ "loss": 0.5147,
97
+ "num_input_tokens_seen": 678144,
98
  "step": 55
99
  },
100
  {
101
  "epoch": 0.27366020524515394,
102
+ "grad_norm": 0.6829465627670288,
103
  "learning_rate": 9.99330477117318e-05,
104
+ "loss": 0.5046,
105
+ "num_input_tokens_seen": 739968,
106
  "step": 60
107
  },
108
  {
109
  "epoch": 0.29646522234891676,
110
+ "grad_norm": 0.8408652544021606,
111
  "learning_rate": 9.98493993824223e-05,
112
+ "loss": 0.5062,
113
+ "num_input_tokens_seen": 801792,
114
  "step": 65
115
  },
116
  {
117
  "epoch": 0.31927023945267957,
118
+ "grad_norm": 0.6092342138290405,
119
  "learning_rate": 9.973237015128338e-05,
120
+ "loss": 0.4955,
121
+ "num_input_tokens_seen": 863744,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 0.34207525655644244,
126
+ "grad_norm": 0.427351713180542,
127
  "learning_rate": 9.958203838518255e-05,
128
+ "loss": 0.4799,
129
+ "num_input_tokens_seen": 924160,
130
  "step": 75
131
  },
132
  {
133
  "epoch": 0.36488027366020526,
134
+ "grad_norm": 0.7395061254501343,
135
  "learning_rate": 9.939850475152978e-05,
136
+ "loss": 0.4767,
137
+ "num_input_tokens_seen": 985856,
138
  "step": 80
139
  },
140
  {
141
  "epoch": 0.38768529076396807,
142
+ "grad_norm": 1.1108757257461548,
143
  "learning_rate": 9.918189215086719e-05,
144
+ "loss": 0.48,
145
+ "num_input_tokens_seen": 1047680,
146
  "step": 85
147
  },
148
  {
149
  "epoch": 0.4104903078677309,
150
+ "grad_norm": 0.4783574640750885,
151
  "learning_rate": 9.893234563457049e-05,
152
+ "loss": 0.4828,
153
+ "num_input_tokens_seen": 1108352,
154
  "step": 90
155
  },
156
  {
157
  "epoch": 0.43329532497149376,
158
+ "grad_norm": 2.314793825149536,
159
  "learning_rate": 9.865003230771745e-05,
160
+ "loss": 0.4636,
161
+ "num_input_tokens_seen": 1169536,
162
  "step": 95
163
  },
164
  {
165
  "epoch": 0.45610034207525657,
166
+ "grad_norm": 0.5718916654586792,
167
  "learning_rate": 9.83351412171886e-05,
168
+ "loss": 0.4645,
169
+ "num_input_tokens_seen": 1229824,
170
  "step": 100
171
  },
172
  {
173
  "epoch": 0.45610034207525657,
174
+ "eval_loss": 0.4702179729938507,
175
+ "eval_runtime": 33.0943,
176
+ "eval_samples_per_second": 94.216,
177
+ "eval_steps_per_second": 5.892,
178
+ "num_input_tokens_seen": 1229824,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.4789053591790194,
183
+ "grad_norm": 0.2158021181821823,
184
  "learning_rate": 9.798788322507475e-05,
185
+ "loss": 0.4831,
186
+ "num_input_tokens_seen": 1291648,
187
  "step": 105
188
  },
189
  {
190
  "epoch": 0.5017103762827823,
191
+ "grad_norm": 0.36765217781066895,
192
  "learning_rate": 9.76084908674764e-05,
193
+ "loss": 0.4868,
194
+ "num_input_tokens_seen": 1353216,
195
  "step": 110
196
  },
197
  {
198
  "epoch": 0.5245153933865451,
199
+ "grad_norm": 0.4673890173435211,
200
  "learning_rate": 9.719721819878942e-05,
201
+ "loss": 0.4706,
202
+ "num_input_tokens_seen": 1414272,
203
  "step": 115
204
  },
205
  {
206
  "epoch": 0.5473204104903079,
207
+ "grad_norm": 0.26251327991485596,
208
  "learning_rate": 9.67543406215813e-05,
209
+ "loss": 0.4744,
210
+ "num_input_tokens_seen": 1476096,
211
  "step": 120
212
  },
213
  {
214
  "epoch": 0.5701254275940707,
215
+ "grad_norm": 0.2340364009141922,
216
  "learning_rate": 9.6280154702172e-05,
217
+ "loss": 0.4655,
218
+ "num_input_tokens_seen": 1536384,
219
  "step": 125
220
  },
221
  {
222
  "epoch": 0.5929304446978335,
223
+ "grad_norm": 0.3004833161830902,
224
  "learning_rate": 9.577497797204275e-05,
225
+ "loss": 0.4762,
226
+ "num_input_tokens_seen": 1597952,
227
  "step": 130
228
  },
229
  {
230
  "epoch": 0.6157354618015963,
231
+ "grad_norm": 0.22171618044376373,
232
  "learning_rate": 9.523914871520592e-05,
233
+ "loss": 0.4607,
234
+ "num_input_tokens_seen": 1659008,
235
  "step": 135
236
  },
237
  {
238
  "epoch": 0.6385404789053591,
239
+ "grad_norm": 0.34244486689567566,
240
  "learning_rate": 9.467302574167804e-05,
241
+ "loss": 0.4584,
242
+ "num_input_tokens_seen": 1719680,
243
  "step": 140
244
  },
245
  {
246
  "epoch": 0.661345496009122,
247
+ "grad_norm": 0.2041647583246231,
248
  "learning_rate": 9.407698814720829e-05,
249
+ "loss": 0.452,
250
+ "num_input_tokens_seen": 1781376,
251
  "step": 145
252
  },
253
  {
254
  "epoch": 0.6841505131128849,
255
+ "grad_norm": 0.39491522312164307,
256
  "learning_rate": 9.345143505942254e-05,
257
+ "loss": 0.4732,
258
+ "num_input_tokens_seen": 1843456,
259
  "step": 150
260
  },
261
  {
262
  "epoch": 0.7069555302166477,
263
+ "grad_norm": 0.16998454928398132,
264
  "learning_rate": 9.279678537055364e-05,
265
+ "loss": 0.4627,
266
+ "num_input_tokens_seen": 1904640,
267
  "step": 155
268
  },
269
  {
270
  "epoch": 0.7297605473204105,
271
+ "grad_norm": 0.3184824287891388,
272
  "learning_rate": 9.211347745693642e-05,
273
+ "loss": 0.4568,
274
+ "num_input_tokens_seen": 1965312,
275
  "step": 160
276
  },
277
  {
278
  "epoch": 0.7525655644241733,
279
+ "grad_norm": 0.2547973096370697,
280
  "learning_rate": 9.140196888545571e-05,
281
+ "loss": 0.4602,
282
+ "num_input_tokens_seen": 2026624,
283
  "step": 165
284
  },
285
  {
286
  "epoch": 0.7753705815279361,
287
+ "grad_norm": 0.2677474319934845,
288
  "learning_rate": 9.066273610714337e-05,
289
+ "loss": 0.4524,
290
+ "num_input_tokens_seen": 2087552,
291
  "step": 170
292
  },
293
  {
294
  "epoch": 0.798175598631699,
295
+ "grad_norm": 0.26873910427093506,
296
  "learning_rate": 8.989627413813019e-05,
297
+ "loss": 0.4593,
298
+ "num_input_tokens_seen": 2148864,
299
  "step": 175
300
  },
301
  {
302
  "epoch": 0.8209806157354618,
303
+ "grad_norm": 0.2679295539855957,
304
  "learning_rate": 8.910309622816558e-05,
305
+ "loss": 0.451,
306
+ "num_input_tokens_seen": 2210048,
307
  "step": 180
308
  },
309
  {
310
  "epoch": 0.8437856328392246,
311
+ "grad_norm": 0.2806735634803772,
312
  "learning_rate": 8.828373351692773e-05,
313
+ "loss": 0.4539,
314
+ "num_input_tokens_seen": 2272000,
315
  "step": 185
316
  },
317
  {
318
  "epoch": 0.8665906499429875,
319
+ "grad_norm": 0.324919193983078,
320
  "learning_rate": 8.743873467835388e-05,
321
+ "loss": 0.4519,
322
+ "num_input_tokens_seen": 2333568,
323
  "step": 190
324
  },
325
  {
326
  "epoch": 0.8893956670467503,
327
+ "grad_norm": 0.3897643983364105,
328
  "learning_rate": 8.656866555322896e-05,
329
+ "loss": 0.4446,
330
+ "num_input_tokens_seen": 2395776,
331
  "step": 195
332
  },
333
  {
334
  "epoch": 0.9122006841505131,
335
+ "grad_norm": 0.5318993330001831,
336
  "learning_rate": 8.567410877027891e-05,
337
+ "loss": 0.4295,
338
+ "num_input_tokens_seen": 2457344,
339
  "step": 200
340
  },
341
  {
342
  "epoch": 0.9122006841505131,
343
+ "eval_loss": 0.4231548607349396,
344
+ "eval_runtime": 33.1145,
345
+ "eval_samples_per_second": 94.158,
346
+ "eval_steps_per_second": 5.889,
347
+ "num_input_tokens_seen": 2457344,
348
  "step": 200
349
  },
350
  {
351
  "epoch": 0.935005701254276,
352
+ "grad_norm": 0.35744914412498474,
353
  "learning_rate": 8.475566335602205e-05,
354
+ "loss": 0.4208,
355
+ "num_input_tokens_seen": 2518528,
356
  "step": 205
357
  },
358
  {
359
  "epoch": 0.9578107183580388,
360
+ "grad_norm": 0.6458001732826233,
361
  "learning_rate": 8.381394433364e-05,
362
+ "loss": 0.4246,
363
+ "num_input_tokens_seen": 2580096,
364
  "step": 210
365
  },
366
  {
367
  "epoch": 0.9806157354618016,
368
+ "grad_norm": 0.40250498056411743,
369
  "learning_rate": 8.284958231113656e-05,
370
+ "loss": 0.4156,
371
+ "num_input_tokens_seen": 2642176,
372
  "step": 215
373
  },
374
  {
375
  "epoch": 1.0,
376
+ "grad_norm": 0.5407205820083618,
377
  "learning_rate": 8.186322305906066e-05,
378
+ "loss": 0.4234,
379
+ "num_input_tokens_seen": 2694512,
380
  "step": 220
381
  },
382
  {
383
  "epoch": 1.0228050171037628,
384
+ "grad_norm": 0.4826781451702118,
385
  "learning_rate": 8.085552707807567e-05,
386
+ "loss": 0.3979,
387
+ "num_input_tokens_seen": 2756208,
388
  "step": 225
389
  },
390
  {
391
  "epoch": 1.0456100342075256,
392
+ "grad_norm": 0.8148782253265381,
393
  "learning_rate": 7.982716915666515e-05,
394
+ "loss": 0.406,
395
+ "num_input_tokens_seen": 2817520,
396
  "step": 230
397
  },
398
  {
399
  "epoch": 1.0684150513112884,
400
+ "grad_norm": 0.44806551933288574,
401
  "learning_rate": 7.877883791927082e-05,
402
+ "loss": 0.3999,
403
+ "num_input_tokens_seen": 2879216,
404
  "step": 235
405
  },
406
  {
407
  "epoch": 1.0912200684150513,
408
+ "grad_norm": 0.3314334750175476,
409
  "learning_rate": 7.771123536516558e-05,
410
+ "loss": 0.394,
411
+ "num_input_tokens_seen": 2940528,
412
  "step": 240
413
  },
414
  {
415
  "epoch": 1.114025085518814,
416
+ "grad_norm": 0.4039021134376526,
417
  "learning_rate": 7.662507639837018e-05,
418
+ "loss": 0.4077,
419
+ "num_input_tokens_seen": 3002096,
420
  "step": 245
421
  },
422
  {
423
  "epoch": 1.1368301026225769,
424
+ "grad_norm": 0.40102317929267883,
425
  "learning_rate": 7.552108834892857e-05,
426
+ "loss": 0.384,
427
+ "num_input_tokens_seen": 3063408,
428
  "step": 250
429
  },
430
  {
431
  "epoch": 1.1596351197263397,
432
+ "grad_norm": 0.6299117207527161,
433
  "learning_rate": 7.440001048586209e-05,
434
+ "loss": 0.3823,
435
+ "num_input_tokens_seen": 3124848,
436
  "step": 255
437
  },
438
  {
439
  "epoch": 1.1824401368301025,
440
+ "grad_norm": 0.3817499279975891,
441
  "learning_rate": 7.32625935221293e-05,
442
+ "loss": 0.3829,
443
+ "num_input_tokens_seen": 3186544,
444
  "step": 260
445
  },
446
  {
447
  "epoch": 1.2052451539338653,
448
+ "grad_norm": 0.6043562293052673,
449
  "learning_rate": 7.210959911192214e-05,
450
+ "loss": 0.384,
451
+ "num_input_tokens_seen": 3247856,
452
  "step": 265
453
  },
454
  {
455
  "epoch": 1.2280501710376284,
456
+ "grad_norm": 0.4529384970664978,
457
  "learning_rate": 7.094179934063567e-05,
458
+ "loss": 0.3824,
459
+ "num_input_tokens_seen": 3310320,
460
  "step": 270
461
  },
462
  {
463
  "epoch": 1.2508551881413912,
464
+ "grad_norm": 0.43355974555015564,
465
  "learning_rate": 6.975997620785276e-05,
466
+ "loss": 0.3755,
467
+ "num_input_tokens_seen": 3371504,
468
  "step": 275
469
  },
470
  {
471
  "epoch": 1.273660205245154,
472
+ "grad_norm": 0.4386458098888397,
473
  "learning_rate": 6.856492110368969e-05,
474
+ "loss": 0.3762,
475
+ "num_input_tokens_seen": 3434096,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 1.2964652223489168,
480
+ "grad_norm": 0.7101587653160095,
481
  "learning_rate": 6.735743427885375e-05,
482
+ "loss": 0.3673,
483
+ "num_input_tokens_seen": 3495280,
484
  "step": 285
485
  },
486
  {
487
  "epoch": 1.3192702394526796,
488
+ "grad_norm": 0.41383257508277893,
489
  "learning_rate": 6.613832430876727e-05,
490
+ "loss": 0.3779,
491
+ "num_input_tokens_seen": 3557616,
492
  "step": 290
493
  },
494
  {
495
  "epoch": 1.3420752565564424,
496
+ "grad_norm": 0.37136590480804443,
497
  "learning_rate": 6.490840755211736e-05,
498
+ "loss": 0.3661,
499
+ "num_input_tokens_seen": 3617904,
500
  "step": 295
501
  },
502
  {
503
  "epoch": 1.3648802736602053,
504
+ "grad_norm": 0.45618635416030884,
505
  "learning_rate": 6.366850760419341e-05,
506
+ "loss": 0.3545,
507
+ "num_input_tokens_seen": 3679728,
508
  "step": 300
509
  },
510
  {
511
  "epoch": 1.3648802736602053,
512
+ "eval_loss": 0.35477936267852783,
513
+ "eval_runtime": 33.1621,
514
+ "eval_samples_per_second": 94.023,
515
+ "eval_steps_per_second": 5.88,
516
+ "num_input_tokens_seen": 3679728,
517
  "step": 300
518
  },
519
  {
520
  "epoch": 1.387685290763968,
521
+ "grad_norm": 0.5724700689315796,
522
  "learning_rate": 6.241945474537901e-05,
523
+ "loss": 0.3728,
524
+ "num_input_tokens_seen": 3740528,
525
  "step": 305
526
  },
527
  {
528
  "epoch": 1.4104903078677309,
529
+ "grad_norm": 0.4069710075855255,
530
  "learning_rate": 6.116208538516707e-05,
531
+ "loss": 0.3553,
532
+ "num_input_tokens_seen": 3802992,
533
  "step": 310
534
  },
535
  {
536
  "epoch": 1.4332953249714937,
537
+ "grad_norm": 0.42018574476242065,
538
  "learning_rate": 5.98972415020708e-05,
539
+ "loss": 0.3474,
540
+ "num_input_tokens_seen": 3863792,
541
  "step": 315
542
  },
543
  {
544
  "epoch": 1.4561003420752565,
545
+ "grad_norm": 0.321939080953598,
546
  "learning_rate": 5.862577007980544e-05,
547
+ "loss": 0.3493,
548
+ "num_input_tokens_seen": 3925616,
549
  "step": 320
550
  },
551
  {
552
  "epoch": 1.4789053591790193,
553
+ "grad_norm": 0.40397486090660095,
554
  "learning_rate": 5.734852254011833e-05,
555
+ "loss": 0.3488,
556
+ "num_input_tokens_seen": 3986416,
557
  "step": 325
558
  },
559
  {
560
  "epoch": 1.5017103762827824,
561
+ "grad_norm": 0.4368876814842224,
562
  "learning_rate": 5.60663541726471e-05,
563
+ "loss": 0.3262,
564
+ "num_input_tokens_seen": 4046704,
565
  "step": 330
566
  },
567
  {
568
  "epoch": 1.5245153933865452,
569
+ "grad_norm": 0.4872387945652008,
570
  "learning_rate": 5.478012356218779e-05,
571
+ "loss": 0.325,
572
+ "num_input_tokens_seen": 4107632,
573
  "step": 335
574
  },
575
  {
576
  "epoch": 1.547320410490308,
577
+ "grad_norm": 0.6040840744972229,
578
  "learning_rate": 5.349069201375657e-05,
579
+ "loss": 0.3249,
580
+ "num_input_tokens_seen": 4168816,
581
  "step": 340
582
  },
583
  {
584
  "epoch": 1.5701254275940708,
585
+ "grad_norm": 0.4146290421485901,
586
  "learning_rate": 5.2198922975829544e-05,
587
+ "loss": 0.3295,
588
+ "num_input_tokens_seen": 4230512,
589
  "step": 345
590
  },
591
  {
592
  "epoch": 1.5929304446978336,
593
+ "grad_norm": 0.5217750072479248,
594
  "learning_rate": 5.090568146214764e-05,
595
+ "loss": 0.3126,
596
+ "num_input_tokens_seen": 4291952,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 1.6157354618015964,
601
+ "grad_norm": 0.4398638904094696,
602
  "learning_rate": 4.961183347247301e-05,
603
+ "loss": 0.3301,
604
+ "num_input_tokens_seen": 4354032,
605
  "step": 355
606
  },
607
  {
608
  "epoch": 1.6385404789053593,
609
+ "grad_norm": 0.5228092074394226,
610
  "learning_rate": 4.831824541268537e-05,
611
+ "loss": 0.3261,
612
+ "num_input_tokens_seen": 4415344,
613
  "step": 360
614
  },
615
  {
616
  "epoch": 1.661345496009122,
617
+ "grad_norm": 0.5148546099662781,
618
  "learning_rate": 4.702578351460633e-05,
619
+ "loss": 0.3195,
620
+ "num_input_tokens_seen": 4477040,
621
  "step": 365
622
  },
623
  {
624
  "epoch": 1.6841505131128849,
625
+ "grad_norm": 0.5496350526809692,
626
  "learning_rate": 4.573531325594017e-05,
627
+ "loss": 0.3249,
628
+ "num_input_tokens_seen": 4539120,
629
  "step": 370
630
  },
631
  {
632
  "epoch": 1.7069555302166477,
633
+ "grad_norm": 0.6102330088615417,
634
  "learning_rate": 4.444769878071977e-05,
635
+ "loss": 0.3053,
636
+ "num_input_tokens_seen": 4600304,
637
  "step": 375
638
  },
639
  {
640
  "epoch": 1.7297605473204105,
641
+ "grad_norm": 0.5015097260475159,
642
  "learning_rate": 4.316380232064543e-05,
643
+ "loss": 0.3168,
644
+ "num_input_tokens_seen": 4661872,
645
  "step": 380
646
  },
647
  {
648
  "epoch": 1.7525655644241733,
649
+ "grad_norm": 0.5772258043289185,
650
  "learning_rate": 4.188448361770458e-05,
651
+ "loss": 0.312,
652
+ "num_input_tokens_seen": 4723440,
653
  "step": 385
654
  },
655
  {
656
  "epoch": 1.7753705815279361,
657
+ "grad_norm": 0.6678168773651123,
658
  "learning_rate": 4.061059934845818e-05,
659
+ "loss": 0.3172,
660
+ "num_input_tokens_seen": 4784880,
661
  "step": 390
662
  },
663
  {
664
  "epoch": 1.798175598631699,
665
+ "grad_norm": 0.8324864506721497,
666
  "learning_rate": 3.93430025503803e-05,
667
+ "loss": 0.3087,
668
+ "num_input_tokens_seen": 4846064,
669
  "step": 395
670
  },
671
  {
672
  "epoch": 1.8209806157354618,
673
+ "grad_norm": 0.5448362231254578,
674
  "learning_rate": 3.8082542050634405e-05,
675
+ "loss": 0.3391,
676
+ "num_input_tokens_seen": 4908144,
677
  "step": 400
678
  },
679
  {
680
  "epoch": 1.8209806157354618,
681
+ "eval_loss": 0.31536775827407837,
682
+ "eval_runtime": 33.171,
683
+ "eval_samples_per_second": 93.998,
684
+ "eval_steps_per_second": 5.879,
685
+ "num_input_tokens_seen": 4908144,
686
  "step": 400
687
  },
688
  {
689
  "epoch": 1.8437856328392246,
690
+ "grad_norm": 0.6546306014060974,
691
  "learning_rate": 3.6830061897668866e-05,
692
+ "loss": 0.3113,
693
+ "num_input_tokens_seen": 4969072,
694
  "step": 405
695
  },
696
  {
697
  "epoch": 1.8665906499429874,
698
+ "grad_norm": 0.593716561794281,
699
  "learning_rate": 3.558640079601265e-05,
700
+ "loss": 0.3127,
701
+ "num_input_tokens_seen": 5031152,
702
  "step": 410
703
  },
704
  {
705
  "epoch": 1.8893956670467502,
706
+ "grad_norm": 0.5102043747901917,
707
  "learning_rate": 3.435239154464947e-05,
708
+ "loss": 0.3087,
709
+ "num_input_tokens_seen": 5092848,
710
  "step": 415
711
  },
712
  {
713
  "epoch": 1.912200684150513,
714
+ "grad_norm": 0.4818083345890045,
715
  "learning_rate": 3.312886047934639e-05,
716
+ "loss": 0.2942,
717
+ "num_input_tokens_seen": 5154032,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 1.9350057012542758,
722
+ "grad_norm": 0.6003326177597046,
723
  "learning_rate": 3.191662691931051e-05,
724
+ "loss": 0.3128,
725
+ "num_input_tokens_seen": 5216112,
726
  "step": 425
727
  },
728
  {
729
  "epoch": 1.9578107183580387,
730
+ "grad_norm": 1.0734755992889404,
731
  "learning_rate": 3.071650261854414e-05,
732
+ "loss": 0.312,
733
+ "num_input_tokens_seen": 5278320,
734
  "step": 430
735
  },
736
  {
737
  "epoch": 1.9806157354618015,
738
+ "grad_norm": 0.6759859919548035,
739
  "learning_rate": 2.9529291222265922e-05,
740
+ "loss": 0.288,
741
+ "num_input_tokens_seen": 5339504,
742
  "step": 435
743
  },
744
  {
745
  "epoch": 2.0,
746
+ "grad_norm": 1.2327771186828613,
747
  "learning_rate": 2.8355787728761952e-05,
748
+ "loss": 0.2862,
749
+ "num_input_tokens_seen": 5392000,
750
  "step": 440
751
  },
752
  {
753
  "epoch": 2.022805017103763,
754
+ "grad_norm": 0.8787758946418762,
755
  "learning_rate": 2.7196777957027013e-05,
756
+ "loss": 0.2784,
757
+ "num_input_tokens_seen": 5453568,
758
  "step": 445
759
  },
760
  {
761
  "epoch": 2.0456100342075256,
762
+ "grad_norm": 0.5148344039916992,
763
  "learning_rate": 2.6053038020552685e-05,
764
+ "loss": 0.279,
765
+ "num_input_tokens_seen": 5514496,
766
  "step": 450
767
  },
768
  {
769
  "epoch": 2.0684150513112884,
770
+ "grad_norm": 0.4274550676345825,
771
  "learning_rate": 2.492533380761466e-05,
772
+ "loss": 0.2801,
773
+ "num_input_tokens_seen": 5576192,
774
  "step": 455
775
  },
776
  {
777
  "epoch": 2.0912200684150513,
778
+ "grad_norm": 0.7621641159057617,
779
  "learning_rate": 2.3814420468407195e-05,
780
+ "loss": 0.2691,
781
+ "num_input_tokens_seen": 5638016,
782
  "step": 460
783
  },
784
  {
785
  "epoch": 2.114025085518814,
786
+ "grad_norm": 0.9495856165885925,
787
  "learning_rate": 2.2721041909367986e-05,
788
+ "loss": 0.2716,
789
+ "num_input_tokens_seen": 5699456,
790
  "step": 465
791
  },
792
  {
793
  "epoch": 2.136830102622577,
794
+ "grad_norm": 0.6201320886611938,
795
  "learning_rate": 2.164593029503249e-05,
796
+ "loss": 0.2785,
797
+ "num_input_tokens_seen": 5761280,
798
  "step": 470
799
  },
800
  {
801
  "epoch": 2.1596351197263397,
802
+ "grad_norm": 0.7156559824943542,
803
  "learning_rate": 2.0589805557750912e-05,
804
+ "loss": 0.2692,
805
+ "num_input_tokens_seen": 5823232,
806
  "step": 475
807
  },
808
  {
809
  "epoch": 2.1824401368301025,
810
+ "grad_norm": 0.5316115021705627,
811
  "learning_rate": 1.9553374915596328e-05,
812
+ "loss": 0.2707,
813
+ "num_input_tokens_seen": 5885184,
814
  "step": 480
815
  },
816
  {
817
  "epoch": 2.2052451539338653,
818
+ "grad_norm": 0.8920192122459412,
819
  "learning_rate": 1.853733239878669e-05,
820
+ "loss": 0.2618,
821
+ "num_input_tokens_seen": 5946624,
822
  "step": 485
823
  },
824
  {
825
  "epoch": 2.228050171037628,
826
+ "grad_norm": 0.4170345664024353,
827
  "learning_rate": 1.754235838493795e-05,
828
+ "loss": 0.2609,
829
+ "num_input_tokens_seen": 6007936,
830
  "step": 490
831
  },
832
  {
833
  "epoch": 2.250855188141391,
834
+ "grad_norm": 0.6393370628356934,
835
  "learning_rate": 1.6569119143459387e-05,
836
+ "loss": 0.2689,
837
+ "num_input_tokens_seen": 6069248,
838
  "step": 495
839
  },
840
  {
841
  "epoch": 2.2736602052451538,
842
+ "grad_norm": 0.5972685813903809,
843
  "learning_rate": 1.561826638939628e-05,
844
+ "loss": 0.2775,
845
+ "num_input_tokens_seen": 6131072,
846
  "step": 500
847
  },
848
  {
849
  "epoch": 2.2736602052451538,
850
+ "eval_loss": 0.27918943762779236,
851
+ "eval_runtime": 33.2439,
852
+ "eval_samples_per_second": 93.792,
853
+ "eval_steps_per_second": 5.866,
854
+ "num_input_tokens_seen": 6131072,
855
  "step": 500
856
  },
857
  {
858
  "epoch": 2.2964652223489166,
859
+ "grad_norm": 0.8352625370025635,
860
  "learning_rate": 1.4690436847018757e-05,
861
+ "loss": 0.2593,
862
+ "num_input_tokens_seen": 6192640,
863
  "step": 505
864
  },
865
  {
866
  "epoch": 2.3192702394526794,
867
+ "grad_norm": 0.5714385509490967,
868
  "learning_rate": 1.3786251823448909e-05,
869
+ "loss": 0.2593,
870
+ "num_input_tokens_seen": 6254592,
871
  "step": 510
872
  },
873
  {
874
  "epoch": 2.342075256556442,
875
+ "grad_norm": 0.7097648978233337,
876
  "learning_rate": 1.2906316792611828e-05,
877
+ "loss": 0.2585,
878
+ "num_input_tokens_seen": 6316288,
879
  "step": 515
880
  },
881
  {
882
  "epoch": 2.364880273660205,
883
+ "grad_norm": 0.5773028135299683,
884
  "learning_rate": 1.2051220989789075e-05,
885
+ "loss": 0.2429,
886
+ "num_input_tokens_seen": 6377600,
887
  "step": 520
888
  },
889
  {
890
  "epoch": 2.387685290763968,
891
+ "grad_norm": 0.5538610219955444,
892
  "learning_rate": 1.1221537017046101e-05,
893
+ "loss": 0.2446,
894
+ "num_input_tokens_seen": 6439040,
895
  "step": 525
896
  },
897
  {
898
  "epoch": 2.4104903078677307,
899
+ "grad_norm": 0.626125693321228,
900
  "learning_rate": 1.0417820459797939e-05,
901
+ "loss": 0.262,
902
+ "num_input_tokens_seen": 6501376,
903
  "step": 530
904
  },
905
  {
906
  "epoch": 2.433295324971494,
907
+ "grad_norm": 0.5278010964393616,
908
  "learning_rate": 9.640609514769695e-06,
909
+ "loss": 0.2604,
910
+ "num_input_tokens_seen": 6563840,
911
  "step": 535
912
  },
913
  {
914
  "epoch": 2.4561003420752567,
915
+ "grad_norm": 0.5445008873939514,
916
  "learning_rate": 8.890424629601197e-06,
917
+ "loss": 0.2524,
918
+ "num_input_tokens_seen": 6624896,
919
  "step": 540
920
  },
921
  {
922
  "epoch": 2.4789053591790196,
923
+ "grad_norm": 0.5429355502128601,
924
  "learning_rate": 8.167768154337102e-06,
925
+ "loss": 0.2495,
926
+ "num_input_tokens_seen": 6685184,
927
  "step": 545
928
  },
929
  {
930
  "epoch": 2.5017103762827824,
931
+ "grad_norm": 1.100938320159912,
932
  "learning_rate": 7.47312400503572e-06,
933
+ "loss": 0.2391,
934
+ "num_input_tokens_seen": 6744960,
935
  "step": 550
936
  },
937
  {
938
  "epoch": 2.524515393386545,
939
+ "grad_norm": 0.5086662769317627,
940
  "learning_rate": 6.806957339721837e-06,
941
+ "loss": 0.2606,
942
+ "num_input_tokens_seen": 6805888,
943
  "step": 555
944
  },
945
  {
946
  "epoch": 2.547320410490308,
947
+ "grad_norm": 0.4862842857837677,
948
  "learning_rate": 6.169714246900693e-06,
949
+ "loss": 0.2505,
950
+ "num_input_tokens_seen": 6866816,
951
  "step": 560
952
  },
953
  {
954
  "epoch": 2.570125427594071,
955
+ "grad_norm": 0.5588433146476746,
956
  "learning_rate": 5.561821446841431e-06,
957
+ "loss": 0.2461,
958
+ "num_input_tokens_seen": 6928128,
959
  "step": 565
960
  },
961
  {
962
  "epoch": 2.5929304446978336,
963
+ "grad_norm": 0.5050959587097168,
964
  "learning_rate": 4.983686005830407e-06,
965
+ "loss": 0.2494,
966
+ "num_input_tokens_seen": 6989056,
967
  "step": 570
968
  },
969
  {
970
  "epoch": 2.6157354618015964,
971
+ "grad_norm": 0.6401058435440063,
972
  "learning_rate": 4.435695063585221e-06,
973
+ "loss": 0.2528,
974
+ "num_input_tokens_seen": 7050496,
975
  "step": 575
976
  },
977
  {
978
  "epoch": 2.6385404789053593,
979
+ "grad_norm": 0.5814698934555054,
980
  "learning_rate": 3.918215574012501e-06,
981
+ "loss": 0.2488,
982
+ "num_input_tokens_seen": 7111680,
983
  "step": 580
984
  },
985
  {
986
  "epoch": 2.661345496009122,
987
+ "grad_norm": 0.580531895160675,
988
  "learning_rate": 3.4315940594827233e-06,
989
+ "loss": 0.2556,
990
+ "num_input_tokens_seen": 7173632,
991
  "step": 585
992
  },
993
  {
994
  "epoch": 2.684150513112885,
995
+ "grad_norm": 0.6502493619918823,
996
  "learning_rate": 2.9761563787866708e-06,
997
+ "loss": 0.2501,
998
+ "num_input_tokens_seen": 7234816,
999
  "step": 590
1000
  },
1001
  {
1002
  "epoch": 2.7069555302166477,
1003
+ "grad_norm": 0.5822915434837341,
1004
  "learning_rate": 2.5522075089290275e-06,
1005
+ "loss": 0.2573,
1006
+ "num_input_tokens_seen": 7297792,
1007
  "step": 595
1008
  },
1009
  {
1010
  "epoch": 2.7297605473204105,
1011
+ "grad_norm": 0.6461504697799683,
1012
  "learning_rate": 2.1600313409050833e-06,
1013
+ "loss": 0.2481,
1014
+ "num_input_tokens_seen": 7358336,
1015
  "step": 600
1016
  },
1017
  {
1018
  "epoch": 2.7297605473204105,
1019
+ "eval_loss": 0.26373597979545593,
1020
+ "eval_runtime": 33.1861,
1021
+ "eval_samples_per_second": 93.955,
1022
+ "eval_steps_per_second": 5.876,
1023
+ "num_input_tokens_seen": 7358336,
1024
  "step": 600
1025
  },
1026
  {
1027
  "epoch": 2.7525655644241733,
1028
+ "grad_norm": 0.5656365156173706,
1029
  "learning_rate": 1.7998904895974056e-06,
1030
+ "loss": 0.2414,
1031
+ "num_input_tokens_seen": 7419520,
1032
  "step": 605
1033
  },
1034
  {
1035
  "epoch": 2.775370581527936,
1036
+ "grad_norm": 0.5760141611099243,
1037
  "learning_rate": 1.4720261179197114e-06,
1038
+ "loss": 0.2561,
1039
+ "num_input_tokens_seen": 7480832,
1040
  "step": 610
1041
  },
1042
  {
1043
  "epoch": 2.798175598631699,
1044
+ "grad_norm": 0.540861964225769,
1045
  "learning_rate": 1.1766577753257512e-06,
1046
+ "loss": 0.2547,
1047
+ "num_input_tokens_seen": 7542400,
1048
  "step": 615
1049
  },
1050
  {
1051
  "epoch": 2.8209806157354618,
1052
+ "grad_norm": 0.598671019077301,
1053
  "learning_rate": 9.139832507913171e-07,
1054
+ "loss": 0.2412,
1055
+ "num_input_tokens_seen": 7603456,
1056
  "step": 620
1057
  },
1058
  {
1059
  "epoch": 2.8437856328392246,
1060
+ "grad_norm": 0.5341574549674988,
1061
  "learning_rate": 6.841784403678275e-07,
1062
+ "loss": 0.2457,
1063
+ "num_input_tokens_seen": 7664896,
1064
  "step": 625
1065
  },
1066
  {
1067
  "epoch": 2.8665906499429874,
1068
+ "grad_norm": 0.5464954376220703,
1069
  "learning_rate": 4.873972293961581e-07,
1070
+ "loss": 0.2357,
1071
+ "num_input_tokens_seen": 7725824,
1072
  "step": 630
1073
  },
1074
  {
1075
  "epoch": 2.88939566704675,
1076
+ "grad_norm": 0.4676554203033447,
1077
  "learning_rate": 3.2377138945964836e-07,
1078
+ "loss": 0.2432,
1079
+ "num_input_tokens_seen": 7786752,
1080
  "step": 635
1081
  },
1082
  {
1083
  "epoch": 2.912200684150513,
1084
+ "grad_norm": 0.530468225479126,
1085
  "learning_rate": 1.934104901452405e-07,
1086
+ "loss": 0.2368,
1087
+ "num_input_tokens_seen": 7847680,
1088
  "step": 640
1089
  },
1090
  {
1091
  "epoch": 2.935005701254276,
1092
+ "grad_norm": 0.5965820550918579,
1093
  "learning_rate": 9.640182567185463e-08,
1094
+ "loss": 0.2381,
1095
+ "num_input_tokens_seen": 7908992,
1096
  "step": 645
1097
  },
1098
  {
1099
  "epoch": 2.9578107183580387,
1100
+ "grad_norm": 0.5796935558319092,
1101
  "learning_rate": 3.281035643511454e-08,
1102
+ "loss": 0.2469,
1103
+ "num_input_tokens_seen": 7970816,
1104
  "step": 650
1105
  },
1106
  {
1107
  "epoch": 2.9806157354618015,
1108
+ "grad_norm": 0.48861977458000183,
1109
  "learning_rate": 2.678665507588329e-09,
1110
+ "loss": 0.2491,
1111
+ "num_input_tokens_seen": 8032384,
1112
  "step": 655
1113
  },
1114
  {
1115
  "epoch": 2.9897377423033067,
1116
+ "num_input_tokens_seen": 8057088,
1117
  "step": 657,
1118
+ "total_flos": 3.271460429947208e+17,
1119
+ "train_loss": 0.5877154775678295,
1120
+ "train_runtime": 3293.4355,
1121
+ "train_samples_per_second": 25.558,
1122
+ "train_steps_per_second": 0.199
1123
  }
1124
  ],
1125
  "logging_steps": 5,
1126
  "max_steps": 657,
1127
+ "num_input_tokens_seen": 8057088,
1128
  "num_train_epochs": 3,
1129
  "save_steps": 100,
1130
  "stateful_callbacks": {
 
1139
  "attributes": {}
1140
  }
1141
  },
1142
+ "total_flos": 3.271460429947208e+17,
1143
  "train_batch_size": 16,
1144
  "trial_name": null,
1145
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c77df9dc589cdfc4e3d3556f76ad596f2cc18984ef62e734c845f050fa93d443
3
  size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb54f888989d15335b96140be2e070cee9af939169c8a8f2c47d913d69cc2343
3
  size 5688
training_eval_loss.png CHANGED

Git LFS Details

  • SHA256: bbd795ddcd22915e937734cfd7240c4e6352aa599c007c2f23bb76d6ec0f9fc0
  • Pointer size: 130 Bytes
  • Size of remote file: 37.3 kB

Git LFS Details

  • SHA256: 4a47d327bab2c2ea4d93cead124b43353fbf817f045332cb29c28e5bad063fdd
  • Pointer size: 130 Bytes
  • Size of remote file: 36.8 kB
training_loss.png CHANGED

Git LFS Details

  • SHA256: 1ce6e60fe81e6fcdf88326d65c7fbd27d3612b4c5fc1fa7cb2832c8edc9e761a
  • Pointer size: 130 Bytes
  • Size of remote file: 29.9 kB

Git LFS Details

  • SHA256: bcb5697abf7a9342a0944b34ed84b3b359a8ad73a14785c755a810ed3d8e42e2
  • Pointer size: 130 Bytes
  • Size of remote file: 29.7 kB