unakar commited on
Commit
c7223d3
·
verified ·
1 Parent(s): 8b3b732

Upload 10 files

Browse files
adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "baichuan-inc/Baichuan2-7B-Chat",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 512,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 256,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "W_pack"
24
+ ],
25
+ "task_type": "CAUSAL_LM",
26
+ "use_dora": false,
27
+ "use_rslora": false
28
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f5c1e1ee625ef1ed02b92e803cb8b3e8870ca71fbaed94ce5aaf1d5786d3ca0
3
+ size 536879768
all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 0.24519601464271545,
4
+ "eval_runtime": 16.144,
5
+ "eval_samples_per_second": 0.681,
6
+ "eval_steps_per_second": 0.681,
7
+ "train_loss": 0.32009275511924523,
8
+ "train_runtime": 49821.5539,
9
+ "train_samples_per_second": 6.399,
10
+ "train_steps_per_second": 0.04
11
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_loss": 0.24519601464271545,
4
+ "eval_runtime": 16.144,
5
+ "eval_samples_per_second": 0.681,
6
+ "eval_steps_per_second": 0.681
7
+ }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.32009275511924523,
4
+ "train_runtime": 49821.5539,
5
+ "train_samples_per_second": 6.399,
6
+ "train_steps_per_second": 0.04
7
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 1992, "loss": 0.6539, "learning_rate": 9.999384369486675e-05, "epoch": 0.02, "percentage": 0.5, "elapsed_time": "0:04:10", "remaining_time": "13:45:52"}
2
+ {"current_steps": 20, "total_steps": 1992, "loss": 0.4242, "learning_rate": 9.997525241303441e-05, "epoch": 0.03, "percentage": 1.0, "elapsed_time": "0:08:23", "remaining_time": "13:47:34"}
3
+ {"current_steps": 30, "total_steps": 1992, "loss": 0.4085, "learning_rate": 9.994423062331178e-05, "epoch": 0.05, "percentage": 1.51, "elapsed_time": "0:12:34", "remaining_time": "13:41:53"}
4
+ {"current_steps": 40, "total_steps": 1992, "loss": 0.3843, "learning_rate": 9.990078604185e-05, "epoch": 0.06, "percentage": 2.01, "elapsed_time": "0:16:47", "remaining_time": "13:39:45"}
5
+ {"current_steps": 50, "total_steps": 1992, "loss": 0.3814, "learning_rate": 9.984492947476183e-05, "epoch": 0.08, "percentage": 2.51, "elapsed_time": "0:20:59", "remaining_time": "13:35:01"}
6
+ {"current_steps": 60, "total_steps": 1992, "loss": 0.3806, "learning_rate": 9.977667481543383e-05, "epoch": 0.09, "percentage": 3.01, "elapsed_time": "0:25:12", "remaining_time": "13:31:46"}
7
+ {"current_steps": 70, "total_steps": 1992, "loss": 0.3823, "learning_rate": 9.969603904107045e-05, "epoch": 0.11, "percentage": 3.51, "elapsed_time": "0:29:23", "remaining_time": "13:26:56"}
8
+ {"current_steps": 80, "total_steps": 1992, "loss": 0.3717, "learning_rate": 9.960304220847147e-05, "epoch": 0.12, "percentage": 4.02, "elapsed_time": "0:33:44", "remaining_time": "13:26:31"}
9
+ {"current_steps": 90, "total_steps": 1992, "loss": 0.3761, "learning_rate": 9.949770744904306e-05, "epoch": 0.14, "percentage": 4.52, "elapsed_time": "0:37:47", "remaining_time": "13:18:44"}
10
+ {"current_steps": 100, "total_steps": 1992, "loss": 0.3766, "learning_rate": 9.938006096304422e-05, "epoch": 0.15, "percentage": 5.02, "elapsed_time": "0:42:02", "remaining_time": "13:15:27"}
11
+ {"current_steps": 110, "total_steps": 1992, "loss": 0.3815, "learning_rate": 9.925013201306999e-05, "epoch": 0.17, "percentage": 5.52, "elapsed_time": "0:46:16", "remaining_time": "13:11:50"}
12
+ {"current_steps": 120, "total_steps": 1992, "loss": 0.3793, "learning_rate": 9.910795291677279e-05, "epoch": 0.18, "percentage": 6.02, "elapsed_time": "0:50:31", "remaining_time": "13:08:16"}
13
+ {"current_steps": 130, "total_steps": 1992, "loss": 0.3746, "learning_rate": 9.8953559038824e-05, "epoch": 0.2, "percentage": 6.53, "elapsed_time": "0:54:25", "remaining_time": "12:59:27"}
14
+ {"current_steps": 140, "total_steps": 1992, "loss": 0.3627, "learning_rate": 9.878698878211756e-05, "epoch": 0.21, "percentage": 7.03, "elapsed_time": "0:58:35", "remaining_time": "12:55:02"}
15
+ {"current_steps": 150, "total_steps": 1992, "loss": 0.3585, "learning_rate": 9.86082835782179e-05, "epoch": 0.23, "percentage": 7.53, "elapsed_time": "1:02:30", "remaining_time": "12:47:40"}
16
+ {"current_steps": 160, "total_steps": 1992, "loss": 0.3648, "learning_rate": 9.841748787705453e-05, "epoch": 0.24, "percentage": 8.03, "elapsed_time": "1:06:36", "remaining_time": "12:42:39"}
17
+ {"current_steps": 170, "total_steps": 1992, "loss": 0.3714, "learning_rate": 9.821464913586586e-05, "epoch": 0.26, "percentage": 8.53, "elapsed_time": "1:10:43", "remaining_time": "12:38:04"}
18
+ {"current_steps": 180, "total_steps": 1992, "loss": 0.3691, "learning_rate": 9.799981780739504e-05, "epoch": 0.27, "percentage": 9.04, "elapsed_time": "1:14:50", "remaining_time": "12:33:22"}
19
+ {"current_steps": 190, "total_steps": 1992, "loss": 0.3621, "learning_rate": 9.777304732734063e-05, "epoch": 0.29, "percentage": 9.54, "elapsed_time": "1:19:07", "remaining_time": "12:30:27"}
20
+ {"current_steps": 200, "total_steps": 1992, "loss": 0.3627, "learning_rate": 9.753439410106537e-05, "epoch": 0.3, "percentage": 10.04, "elapsed_time": "1:23:15", "remaining_time": "12:25:57"}
21
+ {"current_steps": 210, "total_steps": 1992, "loss": 0.358, "learning_rate": 9.728391748956637e-05, "epoch": 0.32, "percentage": 10.54, "elapsed_time": "1:27:26", "remaining_time": "12:22:00"}
22
+ {"current_steps": 220, "total_steps": 1992, "loss": 0.3587, "learning_rate": 9.702167979470994e-05, "epoch": 0.33, "percentage": 11.04, "elapsed_time": "1:31:18", "remaining_time": "12:15:28"}
23
+ {"current_steps": 230, "total_steps": 1992, "loss": 0.3593, "learning_rate": 9.67477462437351e-05, "epoch": 0.35, "percentage": 11.55, "elapsed_time": "1:35:42", "remaining_time": "12:13:11"}
24
+ {"current_steps": 240, "total_steps": 1992, "loss": 0.3619, "learning_rate": 9.646218497302945e-05, "epoch": 0.36, "percentage": 12.05, "elapsed_time": "1:39:45", "remaining_time": "12:08:12"}
25
+ {"current_steps": 250, "total_steps": 1992, "loss": 0.3592, "learning_rate": 9.616506701118124e-05, "epoch": 0.38, "percentage": 12.55, "elapsed_time": "1:43:50", "remaining_time": "12:03:31"}
26
+ {"current_steps": 260, "total_steps": 1992, "loss": 0.3572, "learning_rate": 9.585646626131237e-05, "epoch": 0.39, "percentage": 13.05, "elapsed_time": "1:48:01", "remaining_time": "11:59:35"}
27
+ {"current_steps": 270, "total_steps": 1992, "loss": 0.3584, "learning_rate": 9.553645948269607e-05, "epoch": 0.41, "percentage": 13.55, "elapsed_time": "1:52:11", "remaining_time": "11:55:31"}
28
+ {"current_steps": 280, "total_steps": 1992, "loss": 0.3569, "learning_rate": 9.520512627166445e-05, "epoch": 0.42, "percentage": 14.06, "elapsed_time": "1:56:19", "remaining_time": "11:51:13"}
29
+ {"current_steps": 290, "total_steps": 1992, "loss": 0.353, "learning_rate": 9.48625490418101e-05, "epoch": 0.44, "percentage": 14.56, "elapsed_time": "2:00:34", "remaining_time": "11:47:38"}
30
+ {"current_steps": 300, "total_steps": 1992, "loss": 0.3492, "learning_rate": 9.450881300348724e-05, "epoch": 0.45, "percentage": 15.06, "elapsed_time": "2:04:44", "remaining_time": "11:43:32"}
31
+ {"current_steps": 310, "total_steps": 1992, "loss": 0.3618, "learning_rate": 9.414400614261693e-05, "epoch": 0.47, "percentage": 15.56, "elapsed_time": "2:08:54", "remaining_time": "11:39:23"}
32
+ {"current_steps": 320, "total_steps": 1992, "loss": 0.3668, "learning_rate": 9.376821919880219e-05, "epoch": 0.48, "percentage": 16.06, "elapsed_time": "2:12:56", "remaining_time": "11:34:36"}
33
+ {"current_steps": 330, "total_steps": 1992, "loss": 0.3445, "learning_rate": 9.338154564275788e-05, "epoch": 0.5, "percentage": 16.57, "elapsed_time": "2:17:08", "remaining_time": "11:30:42"}
34
+ {"current_steps": 340, "total_steps": 1992, "loss": 0.3459, "learning_rate": 9.298408165306157e-05, "epoch": 0.51, "percentage": 17.07, "elapsed_time": "2:21:16", "remaining_time": "11:26:24"}
35
+ {"current_steps": 350, "total_steps": 1992, "loss": 0.3532, "learning_rate": 9.257592609223059e-05, "epoch": 0.53, "percentage": 17.57, "elapsed_time": "2:25:26", "remaining_time": "11:22:18"}
36
+ {"current_steps": 360, "total_steps": 1992, "loss": 0.3542, "learning_rate": 9.21571804821318e-05, "epoch": 0.54, "percentage": 18.07, "elapsed_time": "2:29:41", "remaining_time": "11:18:33"}
37
+ {"current_steps": 370, "total_steps": 1992, "loss": 0.3424, "learning_rate": 9.172794897872957e-05, "epoch": 0.56, "percentage": 18.57, "elapsed_time": "2:33:52", "remaining_time": "11:14:33"}
38
+ {"current_steps": 380, "total_steps": 1992, "loss": 0.35, "learning_rate": 9.128833834617876e-05, "epoch": 0.57, "percentage": 19.08, "elapsed_time": "2:38:04", "remaining_time": "11:10:35"}
39
+ {"current_steps": 390, "total_steps": 1992, "loss": 0.3461, "learning_rate": 9.083845793026905e-05, "epoch": 0.59, "percentage": 19.58, "elapsed_time": "2:42:14", "remaining_time": "11:06:25"}
40
+ {"current_steps": 400, "total_steps": 1992, "loss": 0.3367, "learning_rate": 9.037841963122682e-05, "epoch": 0.6, "percentage": 20.08, "elapsed_time": "2:46:19", "remaining_time": "11:01:57"}
41
+ {"current_steps": 410, "total_steps": 1992, "loss": 0.3504, "learning_rate": 8.990833787588194e-05, "epoch": 0.62, "percentage": 20.58, "elapsed_time": "2:50:42", "remaining_time": "10:58:42"}
42
+ {"current_steps": 420, "total_steps": 1992, "loss": 0.3496, "learning_rate": 8.942832958920602e-05, "epoch": 0.63, "percentage": 21.08, "elapsed_time": "2:55:02", "remaining_time": "10:55:11"}
43
+ {"current_steps": 430, "total_steps": 1992, "loss": 0.3513, "learning_rate": 8.893851416522925e-05, "epoch": 0.65, "percentage": 21.59, "elapsed_time": "2:59:27", "remaining_time": "10:51:54"}
44
+ {"current_steps": 440, "total_steps": 1992, "loss": 0.3409, "learning_rate": 8.843901343734309e-05, "epoch": 0.66, "percentage": 22.09, "elapsed_time": "3:03:43", "remaining_time": "10:48:01"}
45
+ {"current_steps": 450, "total_steps": 1992, "loss": 0.3446, "learning_rate": 8.792995164799637e-05, "epoch": 0.68, "percentage": 22.59, "elapsed_time": "3:08:05", "remaining_time": "10:44:30"}
46
+ {"current_steps": 460, "total_steps": 1992, "loss": 0.3442, "learning_rate": 8.741145541779199e-05, "epoch": 0.69, "percentage": 23.09, "elapsed_time": "3:12:05", "remaining_time": "10:39:44"}
47
+ {"current_steps": 470, "total_steps": 1992, "loss": 0.3444, "learning_rate": 8.688365371399208e-05, "epoch": 0.71, "percentage": 23.59, "elapsed_time": "3:16:16", "remaining_time": "10:35:36"}
48
+ {"current_steps": 480, "total_steps": 1992, "loss": 0.3402, "learning_rate": 8.63466778184397e-05, "epoch": 0.72, "percentage": 24.1, "elapsed_time": "3:20:26", "remaining_time": "10:31:23"}
49
+ {"current_steps": 490, "total_steps": 1992, "loss": 0.3424, "learning_rate": 8.580066129490462e-05, "epoch": 0.74, "percentage": 24.6, "elapsed_time": "3:24:38", "remaining_time": "10:27:18"}
50
+ {"current_steps": 500, "total_steps": 1992, "loss": 0.358, "learning_rate": 8.524573995586153e-05, "epoch": 0.75, "percentage": 25.1, "elapsed_time": "3:29:03", "remaining_time": "10:23:48"}
51
+ {"current_steps": 500, "total_steps": 1992, "eval_loss": 0.26840201020240784, "epoch": 0.75, "percentage": 25.1, "elapsed_time": "3:29:24", "remaining_time": "10:24:51"}
trainer_state.json ADDED
@@ -0,0 +1,1447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.998870907038013,
5
+ "eval_steps": 500,
6
+ "global_step": 1992,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "grad_norm": 0.2066432386636734,
14
+ "learning_rate": 9.999384369486675e-05,
15
+ "loss": 0.6539,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.03,
20
+ "grad_norm": 0.1350802630186081,
21
+ "learning_rate": 9.997525241303441e-05,
22
+ "loss": 0.4242,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.05,
27
+ "grad_norm": 0.12290512770414352,
28
+ "learning_rate": 9.994423062331178e-05,
29
+ "loss": 0.4085,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.06,
34
+ "grad_norm": 0.10319065302610397,
35
+ "learning_rate": 9.990078604185e-05,
36
+ "loss": 0.3843,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.08,
41
+ "grad_norm": 0.1067107692360878,
42
+ "learning_rate": 9.984492947476183e-05,
43
+ "loss": 0.3814,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.09,
48
+ "grad_norm": 0.10464764386415482,
49
+ "learning_rate": 9.977667481543383e-05,
50
+ "loss": 0.3806,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.11,
55
+ "grad_norm": 0.10096515715122223,
56
+ "learning_rate": 9.969603904107045e-05,
57
+ "loss": 0.3823,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.12,
62
+ "grad_norm": 0.10181506723165512,
63
+ "learning_rate": 9.960304220847147e-05,
64
+ "loss": 0.3717,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.14,
69
+ "grad_norm": 0.10328900068998337,
70
+ "learning_rate": 9.949770744904306e-05,
71
+ "loss": 0.3761,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.15,
76
+ "grad_norm": 0.10094640403985977,
77
+ "learning_rate": 9.938006096304422e-05,
78
+ "loss": 0.3766,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.17,
83
+ "grad_norm": 0.10436520725488663,
84
+ "learning_rate": 9.925013201306999e-05,
85
+ "loss": 0.3815,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.18,
90
+ "grad_norm": 0.09791135042905807,
91
+ "learning_rate": 9.910795291677279e-05,
92
+ "loss": 0.3793,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.2,
97
+ "grad_norm": 0.10530658811330795,
98
+ "learning_rate": 9.8953559038824e-05,
99
+ "loss": 0.3746,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.21,
104
+ "grad_norm": 0.10106656700372696,
105
+ "learning_rate": 9.878698878211756e-05,
106
+ "loss": 0.3627,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.23,
111
+ "grad_norm": 0.10589198768138885,
112
+ "learning_rate": 9.86082835782179e-05,
113
+ "loss": 0.3585,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.24,
118
+ "grad_norm": 0.09950920939445496,
119
+ "learning_rate": 9.841748787705453e-05,
120
+ "loss": 0.3648,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.26,
125
+ "grad_norm": 0.09534649550914764,
126
+ "learning_rate": 9.821464913586586e-05,
127
+ "loss": 0.3714,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.27,
132
+ "grad_norm": 0.1069357767701149,
133
+ "learning_rate": 9.799981780739504e-05,
134
+ "loss": 0.3691,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.29,
139
+ "grad_norm": 0.0956304594874382,
140
+ "learning_rate": 9.777304732734063e-05,
141
+ "loss": 0.3621,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.3,
146
+ "grad_norm": 0.1080741360783577,
147
+ "learning_rate": 9.753439410106537e-05,
148
+ "loss": 0.3627,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.32,
153
+ "grad_norm": 0.11919151246547699,
154
+ "learning_rate": 9.728391748956637e-05,
155
+ "loss": 0.358,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.33,
160
+ "grad_norm": 0.09667570888996124,
161
+ "learning_rate": 9.702167979470994e-05,
162
+ "loss": 0.3587,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.35,
167
+ "grad_norm": 0.09914813190698624,
168
+ "learning_rate": 9.67477462437351e-05,
169
+ "loss": 0.3593,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.36,
174
+ "grad_norm": 0.09963402152061462,
175
+ "learning_rate": 9.646218497302945e-05,
176
+ "loss": 0.3619,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.38,
181
+ "grad_norm": 0.0956585481762886,
182
+ "learning_rate": 9.616506701118124e-05,
183
+ "loss": 0.3592,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.39,
188
+ "grad_norm": 0.08918160200119019,
189
+ "learning_rate": 9.585646626131237e-05,
190
+ "loss": 0.3572,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.41,
195
+ "grad_norm": 0.10680545121431351,
196
+ "learning_rate": 9.553645948269607e-05,
197
+ "loss": 0.3584,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.42,
202
+ "grad_norm": 0.09338073432445526,
203
+ "learning_rate": 9.520512627166445e-05,
204
+ "loss": 0.3569,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.44,
209
+ "grad_norm": 0.09849222749471664,
210
+ "learning_rate": 9.48625490418101e-05,
211
+ "loss": 0.353,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.45,
216
+ "grad_norm": 0.09319322556257248,
217
+ "learning_rate": 9.450881300348724e-05,
218
+ "loss": 0.3492,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.47,
223
+ "grad_norm": 0.09313003718852997,
224
+ "learning_rate": 9.414400614261693e-05,
225
+ "loss": 0.3618,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.48,
230
+ "grad_norm": 0.09641731530427933,
231
+ "learning_rate": 9.376821919880219e-05,
232
+ "loss": 0.3668,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.5,
237
+ "grad_norm": 0.10052476078271866,
238
+ "learning_rate": 9.338154564275788e-05,
239
+ "loss": 0.3445,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.51,
244
+ "grad_norm": 0.09882521629333496,
245
+ "learning_rate": 9.298408165306157e-05,
246
+ "loss": 0.3459,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.53,
251
+ "grad_norm": 0.0998958945274353,
252
+ "learning_rate": 9.257592609223059e-05,
253
+ "loss": 0.3532,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.54,
258
+ "grad_norm": 0.0996759906411171,
259
+ "learning_rate": 9.21571804821318e-05,
260
+ "loss": 0.3542,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.56,
265
+ "grad_norm": 0.09207039326429367,
266
+ "learning_rate": 9.172794897872957e-05,
267
+ "loss": 0.3424,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.57,
272
+ "grad_norm": 0.09934650361537933,
273
+ "learning_rate": 9.128833834617876e-05,
274
+ "loss": 0.35,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.59,
279
+ "grad_norm": 0.09678385406732559,
280
+ "learning_rate": 9.083845793026905e-05,
281
+ "loss": 0.3461,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.6,
286
+ "grad_norm": 0.09645042568445206,
287
+ "learning_rate": 9.037841963122682e-05,
288
+ "loss": 0.3367,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.62,
293
+ "grad_norm": 0.09238140285015106,
294
+ "learning_rate": 8.990833787588194e-05,
295
+ "loss": 0.3504,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.63,
300
+ "grad_norm": 0.09661918878555298,
301
+ "learning_rate": 8.942832958920602e-05,
302
+ "loss": 0.3496,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.65,
307
+ "grad_norm": 0.09576547890901566,
308
+ "learning_rate": 8.893851416522925e-05,
309
+ "loss": 0.3513,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.66,
314
+ "grad_norm": 0.091176837682724,
315
+ "learning_rate": 8.843901343734309e-05,
316
+ "loss": 0.3409,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.68,
321
+ "grad_norm": 0.09654372185468674,
322
+ "learning_rate": 8.792995164799637e-05,
323
+ "loss": 0.3446,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.69,
328
+ "grad_norm": 0.08725057542324066,
329
+ "learning_rate": 8.741145541779199e-05,
330
+ "loss": 0.3442,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.71,
335
+ "grad_norm": 0.09062401205301285,
336
+ "learning_rate": 8.688365371399208e-05,
337
+ "loss": 0.3444,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.72,
342
+ "grad_norm": 0.09887181222438812,
343
+ "learning_rate": 8.63466778184397e-05,
344
+ "loss": 0.3402,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.74,
349
+ "grad_norm": 0.09595289826393127,
350
+ "learning_rate": 8.580066129490462e-05,
351
+ "loss": 0.3424,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.75,
356
+ "grad_norm": 0.0977388396859169,
357
+ "learning_rate": 8.524573995586153e-05,
358
+ "loss": 0.358,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.75,
363
+ "eval_loss": 0.26840201020240784,
364
+ "eval_runtime": 20.7882,
365
+ "eval_samples_per_second": 0.529,
366
+ "eval_steps_per_second": 0.529,
367
+ "step": 500
368
+ },
369
+ {
370
+ "epoch": 0.77,
371
+ "grad_norm": 0.08854895085096359,
372
+ "learning_rate": 8.468205182870901e-05,
373
+ "loss": 0.3328,
374
+ "step": 510
375
+ },
376
+ {
377
+ "epoch": 0.78,
378
+ "grad_norm": 0.09385243058204651,
379
+ "learning_rate": 8.410973712143747e-05,
380
+ "loss": 0.3441,
381
+ "step": 520
382
+ },
383
+ {
384
+ "epoch": 0.8,
385
+ "grad_norm": 0.09778619557619095,
386
+ "learning_rate": 8.352893818775484e-05,
387
+ "loss": 0.3451,
388
+ "step": 530
389
+ },
390
+ {
391
+ "epoch": 0.81,
392
+ "grad_norm": 0.09615397453308105,
393
+ "learning_rate": 8.293979949167839e-05,
394
+ "loss": 0.3441,
395
+ "step": 540
396
+ },
397
+ {
398
+ "epoch": 0.83,
399
+ "grad_norm": 0.10019069164991379,
400
+ "learning_rate": 8.234246757160174e-05,
401
+ "loss": 0.3309,
402
+ "step": 550
403
+ },
404
+ {
405
+ "epoch": 0.84,
406
+ "grad_norm": 0.09099920839071274,
407
+ "learning_rate": 8.17370910038459e-05,
408
+ "loss": 0.3319,
409
+ "step": 560
410
+ },
411
+ {
412
+ "epoch": 0.86,
413
+ "grad_norm": 0.10040932148694992,
414
+ "learning_rate": 8.112382036570344e-05,
415
+ "loss": 0.342,
416
+ "step": 570
417
+ },
418
+ {
419
+ "epoch": 0.87,
420
+ "grad_norm": 0.09268151968717575,
421
+ "learning_rate": 8.050280819798481e-05,
422
+ "loss": 0.334,
423
+ "step": 580
424
+ },
425
+ {
426
+ "epoch": 0.89,
427
+ "grad_norm": 0.09127725660800934,
428
+ "learning_rate": 7.987420896707645e-05,
429
+ "loss": 0.3476,
430
+ "step": 590
431
+ },
432
+ {
433
+ "epoch": 0.9,
434
+ "grad_norm": 0.09117837995290756,
435
+ "learning_rate": 7.923817902651978e-05,
436
+ "loss": 0.3351,
437
+ "step": 600
438
+ },
439
+ {
440
+ "epoch": 0.92,
441
+ "grad_norm": 0.09493087977170944,
442
+ "learning_rate": 7.859487657812095e-05,
443
+ "loss": 0.3408,
444
+ "step": 610
445
+ },
446
+ {
447
+ "epoch": 0.93,
448
+ "grad_norm": 0.09857796877622604,
449
+ "learning_rate": 7.794446163260077e-05,
450
+ "loss": 0.3416,
451
+ "step": 620
452
+ },
453
+ {
454
+ "epoch": 0.95,
455
+ "grad_norm": 0.09530830383300781,
456
+ "learning_rate": 7.728709596979471e-05,
457
+ "loss": 0.3403,
458
+ "step": 630
459
+ },
460
+ {
461
+ "epoch": 0.96,
462
+ "grad_norm": 0.10255115479230881,
463
+ "learning_rate": 7.662294309841283e-05,
464
+ "loss": 0.3349,
465
+ "step": 640
466
+ },
467
+ {
468
+ "epoch": 0.98,
469
+ "grad_norm": 0.09442761540412903,
470
+ "learning_rate": 7.595216821536981e-05,
471
+ "loss": 0.3469,
472
+ "step": 650
473
+ },
474
+ {
475
+ "epoch": 0.99,
476
+ "grad_norm": 0.09131593257188797,
477
+ "learning_rate": 7.527493816469492e-05,
478
+ "loss": 0.3232,
479
+ "step": 660
480
+ },
481
+ {
482
+ "epoch": 1.01,
483
+ "grad_norm": 0.10782450437545776,
484
+ "learning_rate": 7.459142139603236e-05,
485
+ "loss": 0.3275,
486
+ "step": 670
487
+ },
488
+ {
489
+ "epoch": 1.02,
490
+ "grad_norm": 0.10051246732473373,
491
+ "learning_rate": 7.390178792274227e-05,
492
+ "loss": 0.3168,
493
+ "step": 680
494
+ },
495
+ {
496
+ "epoch": 1.04,
497
+ "grad_norm": 0.09489897638559341,
498
+ "learning_rate": 7.32062092796127e-05,
499
+ "loss": 0.3205,
500
+ "step": 690
501
+ },
502
+ {
503
+ "epoch": 1.05,
504
+ "grad_norm": 0.10021229833364487,
505
+ "learning_rate": 7.250485848019326e-05,
506
+ "loss": 0.314,
507
+ "step": 700
508
+ },
509
+ {
510
+ "epoch": 1.07,
511
+ "grad_norm": 0.09837724268436432,
512
+ "learning_rate": 7.179790997376083e-05,
513
+ "loss": 0.3131,
514
+ "step": 710
515
+ },
516
+ {
517
+ "epoch": 1.08,
518
+ "grad_norm": 0.11071084439754486,
519
+ "learning_rate": 7.108553960192827e-05,
520
+ "loss": 0.3141,
521
+ "step": 720
522
+ },
523
+ {
524
+ "epoch": 1.1,
525
+ "grad_norm": 0.09489303082227707,
526
+ "learning_rate": 7.036792455490675e-05,
527
+ "loss": 0.3124,
528
+ "step": 730
529
+ },
530
+ {
531
+ "epoch": 1.11,
532
+ "grad_norm": 0.10039713978767395,
533
+ "learning_rate": 6.964524332743263e-05,
534
+ "loss": 0.3258,
535
+ "step": 740
536
+ },
537
+ {
538
+ "epoch": 1.13,
539
+ "grad_norm": 0.09435191005468369,
540
+ "learning_rate": 6.891767567436988e-05,
541
+ "loss": 0.318,
542
+ "step": 750
543
+ },
544
+ {
545
+ "epoch": 1.14,
546
+ "grad_norm": 0.10240574926137924,
547
+ "learning_rate": 6.818540256599913e-05,
548
+ "loss": 0.3286,
549
+ "step": 760
550
+ },
551
+ {
552
+ "epoch": 1.16,
553
+ "grad_norm": 0.10050353407859802,
554
+ "learning_rate": 6.744860614300426e-05,
555
+ "loss": 0.3096,
556
+ "step": 770
557
+ },
558
+ {
559
+ "epoch": 1.17,
560
+ "grad_norm": 0.09765986353158951,
561
+ "learning_rate": 6.670746967116793e-05,
562
+ "loss": 0.318,
563
+ "step": 780
564
+ },
565
+ {
566
+ "epoch": 1.19,
567
+ "grad_norm": 0.10131178051233292,
568
+ "learning_rate": 6.596217749578743e-05,
569
+ "loss": 0.3199,
570
+ "step": 790
571
+ },
572
+ {
573
+ "epoch": 1.2,
574
+ "grad_norm": 0.0985412448644638,
575
+ "learning_rate": 6.521291499582172e-05,
576
+ "loss": 0.3173,
577
+ "step": 800
578
+ },
579
+ {
580
+ "epoch": 1.22,
581
+ "grad_norm": 0.10045495629310608,
582
+ "learning_rate": 6.445986853778156e-05,
583
+ "loss": 0.304,
584
+ "step": 810
585
+ },
586
+ {
587
+ "epoch": 1.23,
588
+ "grad_norm": 0.10255653411149979,
589
+ "learning_rate": 6.370322542937403e-05,
590
+ "loss": 0.3215,
591
+ "step": 820
592
+ },
593
+ {
594
+ "epoch": 1.25,
595
+ "grad_norm": 0.10014262050390244,
596
+ "learning_rate": 6.294317387291276e-05,
597
+ "loss": 0.3185,
598
+ "step": 830
599
+ },
600
+ {
601
+ "epoch": 1.26,
602
+ "grad_norm": 0.10973095148801804,
603
+ "learning_rate": 6.217990291850581e-05,
604
+ "loss": 0.3128,
605
+ "step": 840
606
+ },
607
+ {
608
+ "epoch": 1.28,
609
+ "grad_norm": 0.1075139194726944,
610
+ "learning_rate": 6.141360241703264e-05,
611
+ "loss": 0.3117,
612
+ "step": 850
613
+ },
614
+ {
615
+ "epoch": 1.29,
616
+ "grad_norm": 0.10679470747709274,
617
+ "learning_rate": 6.0644462972921845e-05,
618
+ "loss": 0.314,
619
+ "step": 860
620
+ },
621
+ {
622
+ "epoch": 1.31,
623
+ "grad_norm": 0.10646017640829086,
624
+ "learning_rate": 5.98726758967415e-05,
625
+ "loss": 0.3166,
626
+ "step": 870
627
+ },
628
+ {
629
+ "epoch": 1.32,
630
+ "grad_norm": 0.10854795575141907,
631
+ "learning_rate": 5.909843315761385e-05,
632
+ "loss": 0.3104,
633
+ "step": 880
634
+ },
635
+ {
636
+ "epoch": 1.34,
637
+ "grad_norm": 0.09923075139522552,
638
+ "learning_rate": 5.832192733546621e-05,
639
+ "loss": 0.3085,
640
+ "step": 890
641
+ },
642
+ {
643
+ "epoch": 1.35,
644
+ "grad_norm": 0.10540423542261124,
645
+ "learning_rate": 5.7543351573129964e-05,
646
+ "loss": 0.3035,
647
+ "step": 900
648
+ },
649
+ {
650
+ "epoch": 1.37,
651
+ "grad_norm": 0.10513672232627869,
652
+ "learning_rate": 5.676289952829945e-05,
653
+ "loss": 0.3069,
654
+ "step": 910
655
+ },
656
+ {
657
+ "epoch": 1.39,
658
+ "grad_norm": 0.10602447390556335,
659
+ "learning_rate": 5.598076532536291e-05,
660
+ "loss": 0.3126,
661
+ "step": 920
662
+ },
663
+ {
664
+ "epoch": 1.4,
665
+ "grad_norm": 0.10258585214614868,
666
+ "learning_rate": 5.5197143507117234e-05,
667
+ "loss": 0.3148,
668
+ "step": 930
669
+ },
670
+ {
671
+ "epoch": 1.42,
672
+ "grad_norm": 0.10304014384746552,
673
+ "learning_rate": 5.441222898637877e-05,
674
+ "loss": 0.3138,
675
+ "step": 940
676
+ },
677
+ {
678
+ "epoch": 1.43,
679
+ "grad_norm": 0.10201530903577805,
680
+ "learning_rate": 5.362621699750196e-05,
681
+ "loss": 0.3104,
682
+ "step": 950
683
+ },
684
+ {
685
+ "epoch": 1.45,
686
+ "grad_norm": 0.10214639455080032,
687
+ "learning_rate": 5.28393030478181e-05,
688
+ "loss": 0.3081,
689
+ "step": 960
690
+ },
691
+ {
692
+ "epoch": 1.46,
693
+ "grad_norm": 0.09911152720451355,
694
+ "learning_rate": 5.2051682869006126e-05,
695
+ "loss": 0.3081,
696
+ "step": 970
697
+ },
698
+ {
699
+ "epoch": 1.48,
700
+ "grad_norm": 0.10031607747077942,
701
+ "learning_rate": 5.126355236840764e-05,
702
+ "loss": 0.3134,
703
+ "step": 980
704
+ },
705
+ {
706
+ "epoch": 1.49,
707
+ "grad_norm": 0.1077575534582138,
708
+ "learning_rate": 5.047510758029832e-05,
709
+ "loss": 0.3272,
710
+ "step": 990
711
+ },
712
+ {
713
+ "epoch": 1.51,
714
+ "grad_norm": 0.11312615871429443,
715
+ "learning_rate": 4.968654461712753e-05,
716
+ "loss": 0.3167,
717
+ "step": 1000
718
+ },
719
+ {
720
+ "epoch": 1.51,
721
+ "eval_loss": 0.2502507269382477,
722
+ "eval_runtime": 14.5906,
723
+ "eval_samples_per_second": 0.754,
724
+ "eval_steps_per_second": 0.754,
725
+ "step": 1000
726
+ },
727
+ {
728
+ "epoch": 1.52,
729
+ "grad_norm": 0.10545619577169418,
730
+ "learning_rate": 4.889805962073874e-05,
731
+ "loss": 0.3142,
732
+ "step": 1010
733
+ },
734
+ {
735
+ "epoch": 1.54,
736
+ "grad_norm": 0.11502473056316376,
737
+ "learning_rate": 4.8109848713582475e-05,
738
+ "loss": 0.3164,
739
+ "step": 1020
740
+ },
741
+ {
742
+ "epoch": 1.55,
743
+ "grad_norm": 0.11116404086351395,
744
+ "learning_rate": 4.7322107949934146e-05,
745
+ "loss": 0.3191,
746
+ "step": 1030
747
+ },
748
+ {
749
+ "epoch": 1.57,
750
+ "grad_norm": 0.11101904511451721,
751
+ "learning_rate": 4.653503326712886e-05,
752
+ "loss": 0.3223,
753
+ "step": 1040
754
+ },
755
+ {
756
+ "epoch": 1.58,
757
+ "grad_norm": 0.10275658220052719,
758
+ "learning_rate": 4.5748820436825204e-05,
759
+ "loss": 0.3127,
760
+ "step": 1050
761
+ },
762
+ {
763
+ "epoch": 1.6,
764
+ "grad_norm": 0.10536840558052063,
765
+ "learning_rate": 4.496366501631043e-05,
766
+ "loss": 0.3104,
767
+ "step": 1060
768
+ },
769
+ {
770
+ "epoch": 1.61,
771
+ "grad_norm": 0.11228681355714798,
772
+ "learning_rate": 4.417976229985876e-05,
773
+ "loss": 0.3181,
774
+ "step": 1070
775
+ },
776
+ {
777
+ "epoch": 1.63,
778
+ "grad_norm": 0.10409337282180786,
779
+ "learning_rate": 4.339730727015527e-05,
780
+ "loss": 0.3085,
781
+ "step": 1080
782
+ },
783
+ {
784
+ "epoch": 1.64,
785
+ "grad_norm": 0.11756953597068787,
786
+ "learning_rate": 4.261649454979714e-05,
787
+ "loss": 0.3105,
788
+ "step": 1090
789
+ },
790
+ {
791
+ "epoch": 1.66,
792
+ "grad_norm": 0.10340123623609543,
793
+ "learning_rate": 4.183751835288463e-05,
794
+ "loss": 0.3168,
795
+ "step": 1100
796
+ },
797
+ {
798
+ "epoch": 1.67,
799
+ "grad_norm": 0.11595764011144638,
800
+ "learning_rate": 4.10605724367135e-05,
801
+ "loss": 0.3172,
802
+ "step": 1110
803
+ },
804
+ {
805
+ "epoch": 1.69,
806
+ "grad_norm": 0.1063317060470581,
807
+ "learning_rate": 4.0285850053581105e-05,
808
+ "loss": 0.319,
809
+ "step": 1120
810
+ },
811
+ {
812
+ "epoch": 1.7,
813
+ "grad_norm": 0.10760544240474701,
814
+ "learning_rate": 3.9513543902718206e-05,
815
+ "loss": 0.3096,
816
+ "step": 1130
817
+ },
818
+ {
819
+ "epoch": 1.72,
820
+ "grad_norm": 0.11430344730615616,
821
+ "learning_rate": 3.87438460823582e-05,
822
+ "loss": 0.3119,
823
+ "step": 1140
824
+ },
825
+ {
826
+ "epoch": 1.73,
827
+ "grad_norm": 0.10582321882247925,
828
+ "learning_rate": 3.7976948041955904e-05,
829
+ "loss": 0.3179,
830
+ "step": 1150
831
+ },
832
+ {
833
+ "epoch": 1.75,
834
+ "grad_norm": 0.11124568432569504,
835
+ "learning_rate": 3.7213040534567725e-05,
836
+ "loss": 0.3099,
837
+ "step": 1160
838
+ },
839
+ {
840
+ "epoch": 1.76,
841
+ "grad_norm": 0.10317942500114441,
842
+ "learning_rate": 3.645231356940501e-05,
843
+ "loss": 0.3081,
844
+ "step": 1170
845
+ },
846
+ {
847
+ "epoch": 1.78,
848
+ "grad_norm": 0.10943736135959625,
849
+ "learning_rate": 3.569495636457244e-05,
850
+ "loss": 0.3103,
851
+ "step": 1180
852
+ },
853
+ {
854
+ "epoch": 1.79,
855
+ "grad_norm": 0.10795300453901291,
856
+ "learning_rate": 3.494115730000321e-05,
857
+ "loss": 0.3123,
858
+ "step": 1190
859
+ },
860
+ {
861
+ "epoch": 1.81,
862
+ "grad_norm": 0.1120479553937912,
863
+ "learning_rate": 3.4191103870602656e-05,
864
+ "loss": 0.3072,
865
+ "step": 1200
866
+ },
867
+ {
868
+ "epoch": 1.82,
869
+ "grad_norm": 0.10475881397724152,
870
+ "learning_rate": 3.344498263961201e-05,
871
+ "loss": 0.3107,
872
+ "step": 1210
873
+ },
874
+ {
875
+ "epoch": 1.84,
876
+ "grad_norm": 0.10638121515512466,
877
+ "learning_rate": 3.270297919220395e-05,
878
+ "loss": 0.3101,
879
+ "step": 1220
880
+ },
881
+ {
882
+ "epoch": 1.85,
883
+ "grad_norm": 0.10646604746580124,
884
+ "learning_rate": 3.1965278089321396e-05,
885
+ "loss": 0.3201,
886
+ "step": 1230
887
+ },
888
+ {
889
+ "epoch": 1.87,
890
+ "grad_norm": 0.10810079425573349,
891
+ "learning_rate": 3.123206282177105e-05,
892
+ "loss": 0.3129,
893
+ "step": 1240
894
+ },
895
+ {
896
+ "epoch": 1.88,
897
+ "grad_norm": 0.10837584733963013,
898
+ "learning_rate": 3.05035157645831e-05,
899
+ "loss": 0.3138,
900
+ "step": 1250
901
+ },
902
+ {
903
+ "epoch": 1.9,
904
+ "grad_norm": 0.11282283812761307,
905
+ "learning_rate": 2.9779818131648563e-05,
906
+ "loss": 0.3048,
907
+ "step": 1260
908
+ },
909
+ {
910
+ "epoch": 1.91,
911
+ "grad_norm": 0.10869032144546509,
912
+ "learning_rate": 2.9061149930645243e-05,
913
+ "loss": 0.3163,
914
+ "step": 1270
915
+ },
916
+ {
917
+ "epoch": 1.93,
918
+ "grad_norm": 0.11098149418830872,
919
+ "learning_rate": 2.8347689918263976e-05,
920
+ "loss": 0.3083,
921
+ "step": 1280
922
+ },
923
+ {
924
+ "epoch": 1.94,
925
+ "grad_norm": 0.10902676731348038,
926
+ "learning_rate": 2.763961555574575e-05,
927
+ "loss": 0.3008,
928
+ "step": 1290
929
+ },
930
+ {
931
+ "epoch": 1.96,
932
+ "grad_norm": 0.10680384933948517,
933
+ "learning_rate": 2.69371029647413e-05,
934
+ "loss": 0.3022,
935
+ "step": 1300
936
+ },
937
+ {
938
+ "epoch": 1.97,
939
+ "grad_norm": 0.10289633274078369,
940
+ "learning_rate": 2.624032688350374e-05,
941
+ "loss": 0.3045,
942
+ "step": 1310
943
+ },
944
+ {
945
+ "epoch": 1.99,
946
+ "grad_norm": 0.11078934371471405,
947
+ "learning_rate": 2.5549460623425354e-05,
948
+ "loss": 0.3065,
949
+ "step": 1320
950
+ },
951
+ {
952
+ "epoch": 2.0,
953
+ "grad_norm": 0.10170484334230423,
954
+ "learning_rate": 2.486467602592929e-05,
955
+ "loss": 0.2956,
956
+ "step": 1330
957
+ },
958
+ {
959
+ "epoch": 2.02,
960
+ "grad_norm": 0.1094578206539154,
961
+ "learning_rate": 2.4186143419726885e-05,
962
+ "loss": 0.2938,
963
+ "step": 1340
964
+ },
965
+ {
966
+ "epoch": 2.03,
967
+ "grad_norm": 0.10822325944900513,
968
+ "learning_rate": 2.351403157845125e-05,
969
+ "loss": 0.2863,
970
+ "step": 1350
971
+ },
972
+ {
973
+ "epoch": 2.05,
974
+ "grad_norm": 0.11142423003911972,
975
+ "learning_rate": 2.2848507678677633e-05,
976
+ "loss": 0.2846,
977
+ "step": 1360
978
+ },
979
+ {
980
+ "epoch": 2.06,
981
+ "grad_norm": 0.11497773230075836,
982
+ "learning_rate": 2.218973725834109e-05,
983
+ "loss": 0.2936,
984
+ "step": 1370
985
+ },
986
+ {
987
+ "epoch": 2.08,
988
+ "grad_norm": 0.10550795495510101,
989
+ "learning_rate": 2.153788417556164e-05,
990
+ "loss": 0.2888,
991
+ "step": 1380
992
+ },
993
+ {
994
+ "epoch": 2.09,
995
+ "grad_norm": 0.10813359171152115,
996
+ "learning_rate": 2.089311056788731e-05,
997
+ "loss": 0.2889,
998
+ "step": 1390
999
+ },
1000
+ {
1001
+ "epoch": 2.11,
1002
+ "grad_norm": 0.10650567710399628,
1003
+ "learning_rate": 2.0255576811965154e-05,
1004
+ "loss": 0.2925,
1005
+ "step": 1400
1006
+ },
1007
+ {
1008
+ "epoch": 2.12,
1009
+ "grad_norm": 0.11273263394832611,
1010
+ "learning_rate": 1.9625441483650235e-05,
1011
+ "loss": 0.2856,
1012
+ "step": 1410
1013
+ },
1014
+ {
1015
+ "epoch": 2.14,
1016
+ "grad_norm": 0.11437318474054337,
1017
+ "learning_rate": 1.9002861318562536e-05,
1018
+ "loss": 0.2845,
1019
+ "step": 1420
1020
+ },
1021
+ {
1022
+ "epoch": 2.15,
1023
+ "grad_norm": 0.11178340762853622,
1024
+ "learning_rate": 1.8387991173101587e-05,
1025
+ "loss": 0.2904,
1026
+ "step": 1430
1027
+ },
1028
+ {
1029
+ "epoch": 2.17,
1030
+ "grad_norm": 0.11630496382713318,
1031
+ "learning_rate": 1.7780983985928534e-05,
1032
+ "loss": 0.2851,
1033
+ "step": 1440
1034
+ },
1035
+ {
1036
+ "epoch": 2.18,
1037
+ "grad_norm": 0.11532077938318253,
1038
+ "learning_rate": 1.7181990739925213e-05,
1039
+ "loss": 0.2797,
1040
+ "step": 1450
1041
+ },
1042
+ {
1043
+ "epoch": 2.2,
1044
+ "grad_norm": 0.11311406642198563,
1045
+ "learning_rate": 1.6591160424639675e-05,
1046
+ "loss": 0.288,
1047
+ "step": 1460
1048
+ },
1049
+ {
1050
+ "epoch": 2.21,
1051
+ "grad_norm": 0.11721379309892654,
1052
+ "learning_rate": 1.6008639999227527e-05,
1053
+ "loss": 0.2926,
1054
+ "step": 1470
1055
+ },
1056
+ {
1057
+ "epoch": 2.23,
1058
+ "grad_norm": 0.11629457771778107,
1059
+ "learning_rate": 1.5434574355898306e-05,
1060
+ "loss": 0.2883,
1061
+ "step": 1480
1062
+ },
1063
+ {
1064
+ "epoch": 2.24,
1065
+ "grad_norm": 0.11253120750188828,
1066
+ "learning_rate": 1.4869106283875972e-05,
1067
+ "loss": 0.2878,
1068
+ "step": 1490
1069
+ },
1070
+ {
1071
+ "epoch": 2.26,
1072
+ "grad_norm": 0.1093481034040451,
1073
+ "learning_rate": 1.4312376433882457e-05,
1074
+ "loss": 0.2893,
1075
+ "step": 1500
1076
+ },
1077
+ {
1078
+ "epoch": 2.26,
1079
+ "eval_loss": 0.24623431265354156,
1080
+ "eval_runtime": 15.3708,
1081
+ "eval_samples_per_second": 0.716,
1082
+ "eval_steps_per_second": 0.716,
1083
+ "step": 1500
1084
+ },
1085
+ {
1086
+ "epoch": 2.27,
1087
+ "grad_norm": 0.114622101187706,
1088
+ "learning_rate": 1.376452328315318e-05,
1089
+ "loss": 0.2905,
1090
+ "step": 1510
1091
+ },
1092
+ {
1093
+ "epoch": 2.29,
1094
+ "grad_norm": 0.11465822905302048,
1095
+ "learning_rate": 1.3225683100993113e-05,
1096
+ "loss": 0.2886,
1097
+ "step": 1520
1098
+ },
1099
+ {
1100
+ "epoch": 2.3,
1101
+ "grad_norm": 0.11255551874637604,
1102
+ "learning_rate": 1.2695989914882128e-05,
1103
+ "loss": 0.2873,
1104
+ "step": 1530
1105
+ },
1106
+ {
1107
+ "epoch": 2.32,
1108
+ "grad_norm": 0.11598910391330719,
1109
+ "learning_rate": 1.2175575477137824e-05,
1110
+ "loss": 0.2853,
1111
+ "step": 1540
1112
+ },
1113
+ {
1114
+ "epoch": 2.33,
1115
+ "grad_norm": 0.11426587402820587,
1116
+ "learning_rate": 1.1664569232144445e-05,
1117
+ "loss": 0.2934,
1118
+ "step": 1550
1119
+ },
1120
+ {
1121
+ "epoch": 2.35,
1122
+ "grad_norm": 0.11229728907346725,
1123
+ "learning_rate": 1.1163098284155665e-05,
1124
+ "loss": 0.2878,
1125
+ "step": 1560
1126
+ },
1127
+ {
1128
+ "epoch": 2.36,
1129
+ "grad_norm": 0.10811661183834076,
1130
+ "learning_rate": 1.0671287365679567e-05,
1131
+ "loss": 0.2818,
1132
+ "step": 1570
1133
+ },
1134
+ {
1135
+ "epoch": 2.38,
1136
+ "grad_norm": 0.109583280980587,
1137
+ "learning_rate": 1.018925880645351e-05,
1138
+ "loss": 0.2915,
1139
+ "step": 1580
1140
+ },
1141
+ {
1142
+ "epoch": 2.39,
1143
+ "grad_norm": 0.1194503903388977,
1144
+ "learning_rate": 9.717132503016685e-06,
1145
+ "loss": 0.2922,
1146
+ "step": 1590
1147
+ },
1148
+ {
1149
+ "epoch": 2.41,
1150
+ "grad_norm": 0.11107660830020905,
1151
+ "learning_rate": 9.255025888887814e-06,
1152
+ "loss": 0.2843,
1153
+ "step": 1600
1154
+ },
1155
+ {
1156
+ "epoch": 2.42,
1157
+ "grad_norm": 0.12213042378425598,
1158
+ "learning_rate": 8.80305390535554e-06,
1159
+ "loss": 0.2867,
1160
+ "step": 1610
1161
+ },
1162
+ {
1163
+ "epoch": 2.44,
1164
+ "grad_norm": 0.11774149537086487,
1165
+ "learning_rate": 8.361328972888732e-06,
1166
+ "loss": 0.2838,
1167
+ "step": 1620
1168
+ },
1169
+ {
1170
+ "epoch": 2.45,
1171
+ "grad_norm": 0.11726924777030945,
1172
+ "learning_rate": 7.929960963173727e-06,
1173
+ "loss": 0.288,
1174
+ "step": 1630
1175
+ },
1176
+ {
1177
+ "epoch": 2.47,
1178
+ "grad_norm": 0.12220928817987442,
1179
+ "learning_rate": 7.509057171785639e-06,
1180
+ "loss": 0.2844,
1181
+ "step": 1640
1182
+ },
1183
+ {
1184
+ "epoch": 2.48,
1185
+ "grad_norm": 0.11534737050533295,
1186
+ "learning_rate": 7.098722291500331e-06,
1187
+ "loss": 0.2842,
1188
+ "step": 1650
1189
+ },
1190
+ {
1191
+ "epoch": 2.5,
1192
+ "grad_norm": 0.11947780847549438,
1193
+ "learning_rate": 6.699058386253865e-06,
1194
+ "loss": 0.2827,
1195
+ "step": 1660
1196
+ },
1197
+ {
1198
+ "epoch": 2.51,
1199
+ "grad_norm": 0.1192295104265213,
1200
+ "learning_rate": 6.310164865755808e-06,
1201
+ "loss": 0.2907,
1202
+ "step": 1670
1203
+ },
1204
+ {
1205
+ "epoch": 2.53,
1206
+ "grad_norm": 0.11454175412654877,
1207
+ "learning_rate": 5.93213846076271e-06,
1208
+ "loss": 0.2833,
1209
+ "step": 1680
1210
+ },
1211
+ {
1212
+ "epoch": 2.54,
1213
+ "grad_norm": 0.11072956025600433,
1214
+ "learning_rate": 5.5650731990179674e-06,
1215
+ "loss": 0.2869,
1216
+ "step": 1690
1217
+ },
1218
+ {
1219
+ "epoch": 2.56,
1220
+ "grad_norm": 0.10620978474617004,
1221
+ "learning_rate": 5.20906038186399e-06,
1222
+ "loss": 0.2775,
1223
+ "step": 1700
1224
+ },
1225
+ {
1226
+ "epoch": 2.57,
1227
+ "grad_norm": 0.1189485564827919,
1228
+ "learning_rate": 4.864188561532507e-06,
1229
+ "loss": 0.2842,
1230
+ "step": 1710
1231
+ },
1232
+ {
1233
+ "epoch": 2.59,
1234
+ "grad_norm": 0.1264795958995819,
1235
+ "learning_rate": 4.530543519118702e-06,
1236
+ "loss": 0.2944,
1237
+ "step": 1720
1238
+ },
1239
+ {
1240
+ "epoch": 2.6,
1241
+ "grad_norm": 0.10882660746574402,
1242
+ "learning_rate": 4.208208243244577e-06,
1243
+ "loss": 0.2903,
1244
+ "step": 1730
1245
+ },
1246
+ {
1247
+ "epoch": 2.62,
1248
+ "grad_norm": 0.1184026375412941,
1249
+ "learning_rate": 3.8972629094169485e-06,
1250
+ "loss": 0.2934,
1251
+ "step": 1740
1252
+ },
1253
+ {
1254
+ "epoch": 2.63,
1255
+ "grad_norm": 0.115041583776474,
1256
+ "learning_rate": 3.5977848600851016e-06,
1257
+ "loss": 0.2831,
1258
+ "step": 1750
1259
+ },
1260
+ {
1261
+ "epoch": 2.65,
1262
+ "grad_norm": 0.1151689738035202,
1263
+ "learning_rate": 3.309848585403169e-06,
1264
+ "loss": 0.2828,
1265
+ "step": 1760
1266
+ },
1267
+ {
1268
+ "epoch": 2.66,
1269
+ "grad_norm": 0.11265784502029419,
1270
+ "learning_rate": 3.033525704701956e-06,
1271
+ "loss": 0.2868,
1272
+ "step": 1770
1273
+ },
1274
+ {
1275
+ "epoch": 2.68,
1276
+ "grad_norm": 0.10989955067634583,
1277
+ "learning_rate": 2.768884948674816e-06,
1278
+ "loss": 0.2798,
1279
+ "step": 1780
1280
+ },
1281
+ {
1282
+ "epoch": 2.69,
1283
+ "grad_norm": 0.12034480273723602,
1284
+ "learning_rate": 2.515992142282042e-06,
1285
+ "loss": 0.2932,
1286
+ "step": 1790
1287
+ },
1288
+ {
1289
+ "epoch": 2.71,
1290
+ "grad_norm": 0.11389750242233276,
1291
+ "learning_rate": 2.2749101883780157e-06,
1292
+ "loss": 0.2877,
1293
+ "step": 1800
1294
+ },
1295
+ {
1296
+ "epoch": 2.72,
1297
+ "grad_norm": 0.12224046885967255,
1298
+ "learning_rate": 2.0456990520651696e-06,
1299
+ "loss": 0.29,
1300
+ "step": 1810
1301
+ },
1302
+ {
1303
+ "epoch": 2.74,
1304
+ "grad_norm": 0.11018862575292587,
1305
+ "learning_rate": 1.8284157457786833e-06,
1306
+ "loss": 0.2903,
1307
+ "step": 1820
1308
+ },
1309
+ {
1310
+ "epoch": 2.75,
1311
+ "grad_norm": 0.11373750865459442,
1312
+ "learning_rate": 1.6231143151055838e-06,
1313
+ "loss": 0.2823,
1314
+ "step": 1830
1315
+ },
1316
+ {
1317
+ "epoch": 2.77,
1318
+ "grad_norm": 0.11346758902072906,
1319
+ "learning_rate": 1.4298458253417968e-06,
1320
+ "loss": 0.2918,
1321
+ "step": 1840
1322
+ },
1323
+ {
1324
+ "epoch": 2.79,
1325
+ "grad_norm": 0.12030266970396042,
1326
+ "learning_rate": 1.2486583487905324e-06,
1327
+ "loss": 0.2793,
1328
+ "step": 1850
1329
+ },
1330
+ {
1331
+ "epoch": 2.8,
1332
+ "grad_norm": 0.11971867829561234,
1333
+ "learning_rate": 1.079596952805101e-06,
1334
+ "loss": 0.2835,
1335
+ "step": 1860
1336
+ },
1337
+ {
1338
+ "epoch": 2.82,
1339
+ "grad_norm": 0.11345378309488297,
1340
+ "learning_rate": 9.227036885791352e-07,
1341
+ "loss": 0.287,
1342
+ "step": 1870
1343
+ },
1344
+ {
1345
+ "epoch": 2.83,
1346
+ "grad_norm": 0.12087972462177277,
1347
+ "learning_rate": 7.78017580687107e-07,
1348
+ "loss": 0.2834,
1349
+ "step": 1880
1350
+ },
1351
+ {
1352
+ "epoch": 2.85,
1353
+ "grad_norm": 0.11702313274145126,
1354
+ "learning_rate": 6.455746173775701e-07,
1355
+ "loss": 0.2828,
1356
+ "step": 1890
1357
+ },
1358
+ {
1359
+ "epoch": 2.86,
1360
+ "grad_norm": 0.11680703610181808,
1361
+ "learning_rate": 5.25407741621714e-07,
1362
+ "loss": 0.2821,
1363
+ "step": 1900
1364
+ },
1365
+ {
1366
+ "epoch": 2.88,
1367
+ "grad_norm": 0.11461573839187622,
1368
+ "learning_rate": 4.1754684291934744e-07,
1369
+ "loss": 0.2784,
1370
+ "step": 1910
1371
+ },
1372
+ {
1373
+ "epoch": 2.89,
1374
+ "grad_norm": 0.12143554538488388,
1375
+ "learning_rate": 3.2201874986437784e-07,
1376
+ "loss": 0.278,
1377
+ "step": 1920
1378
+ },
1379
+ {
1380
+ "epoch": 2.91,
1381
+ "grad_norm": 0.11503802239894867,
1382
+ "learning_rate": 2.3884722347164434e-07,
1383
+ "loss": 0.2881,
1384
+ "step": 1930
1385
+ },
1386
+ {
1387
+ "epoch": 2.92,
1388
+ "grad_norm": 0.11443481594324112,
1389
+ "learning_rate": 1.6805295126677833e-07,
1390
+ "loss": 0.2952,
1391
+ "step": 1940
1392
+ },
1393
+ {
1394
+ "epoch": 2.94,
1395
+ "grad_norm": 0.1119546890258789,
1396
+ "learning_rate": 1.0965354214051982e-07,
1397
+ "loss": 0.2833,
1398
+ "step": 1950
1399
+ },
1400
+ {
1401
+ "epoch": 2.95,
1402
+ "grad_norm": 0.11658598482608795,
1403
+ "learning_rate": 6.366352196878756e-08,
1404
+ "loss": 0.2954,
1405
+ "step": 1960
1406
+ },
1407
+ {
1408
+ "epoch": 2.97,
1409
+ "grad_norm": 0.11841295659542084,
1410
+ "learning_rate": 3.0094329999635906e-08,
1411
+ "loss": 0.2792,
1412
+ "step": 1970
1413
+ },
1414
+ {
1415
+ "epoch": 2.98,
1416
+ "grad_norm": 0.11211774498224258,
1417
+ "learning_rate": 8.954316007908636e-09,
1418
+ "loss": 0.2914,
1419
+ "step": 1980
1420
+ },
1421
+ {
1422
+ "epoch": 3.0,
1423
+ "grad_norm": 0.12004334479570389,
1424
+ "learning_rate": 2.4873821838911073e-10,
1425
+ "loss": 0.2895,
1426
+ "step": 1990
1427
+ },
1428
+ {
1429
+ "epoch": 3.0,
1430
+ "step": 1992,
1431
+ "total_flos": 7.377073297607885e+18,
1432
+ "train_loss": 0.32009275511924523,
1433
+ "train_runtime": 49821.5539,
1434
+ "train_samples_per_second": 6.399,
1435
+ "train_steps_per_second": 0.04
1436
+ }
1437
+ ],
1438
+ "logging_steps": 10,
1439
+ "max_steps": 1992,
1440
+ "num_input_tokens_seen": 0,
1441
+ "num_train_epochs": 3,
1442
+ "save_steps": 250,
1443
+ "total_flos": 7.377073297607885e+18,
1444
+ "train_batch_size": 20,
1445
+ "trial_name": null,
1446
+ "trial_params": null
1447
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85f10b5751a98b3548e4b442f055cd1873b10a4262d8c0879383d6791fb9e828
3
+ size 5112
training_eval_loss.png ADDED
training_loss.png ADDED