hzy commited on
Commit
ff8f7ee
·
1 Parent(s): 690a471
README.md DELETED
@@ -1,3 +0,0 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
all_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.19669926248992042,
4
- "train_runtime": 618.138,
5
- "train_samples_per_second": 14.531,
6
- "train_steps_per_second": 1.456
7
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.05660845875740051,
4
+ "train_runtime": 609.6094,
5
+ "train_samples_per_second": 19.685,
6
+ "train_steps_per_second": 0.984
7
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "/home/huangziyang/ckpt-after-ft/pretrain2-1epoch",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "/home/huangziyang/ckpt-after-ft/pretrain3-2epoch",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d638e58828f469036c052849059619068edb57ea522920c41b5d6ad2fd7b2137
3
  size 9976620122
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ed9454157368d7b01a4f3aa00404d7727133aa35ae090d750f5fbb813446d20
3
  size 9976620122
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78f504d9b3c6f31483cf457d40324bf0f8227b6c61c2d0f4cc0bdc5580c9663a
3
  size 3500310787
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7255670b2c1c9b0cd67c4e1a69f74818655475c4a006383c89a13c7aab494daf
3
  size 3500310787
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.19669926248992042,
4
- "train_runtime": 618.138,
5
- "train_samples_per_second": 14.531,
6
- "train_steps_per_second": 1.456
7
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.05660845875740051,
4
+ "train_runtime": 609.6094,
5
+ "train_samples_per_second": 19.685,
6
+ "train_steps_per_second": 0.984
7
  }
trainer_log.jsonl CHANGED
@@ -1,91 +1,61 @@
1
- {"current_steps": 10, "total_steps": 900, "loss": 6.3978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0, "epoch": 0.02, "percentage": 1.11, "elapsed_time": "0:00:03", "remaining_time": "0:05:45"}
2
- {"current_steps": 20, "total_steps": 900, "loss": 6.4106, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0, "epoch": 0.04, "percentage": 2.22, "elapsed_time": "0:00:07", "remaining_time": "0:05:48"}
3
- {"current_steps": 30, "total_steps": 900, "loss": 4.0676, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.555555555555556e-06, "epoch": 0.07, "percentage": 3.33, "elapsed_time": "0:00:14", "remaining_time": "0:06:52"}
4
- {"current_steps": 40, "total_steps": 900, "loss": 0.0992, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1111111111111112e-05, "epoch": 0.09, "percentage": 4.44, "elapsed_time": "0:00:20", "remaining_time": "0:07:22"}
5
- {"current_steps": 50, "total_steps": 900, "loss": 0.0295, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6666666666666667e-05, "epoch": 0.11, "percentage": 5.56, "elapsed_time": "0:00:26", "remaining_time": "0:07:34"}
6
- {"current_steps": 60, "total_steps": 900, "loss": 0.0452, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2222222222222223e-05, "epoch": 0.13, "percentage": 6.67, "elapsed_time": "0:00:33", "remaining_time": "0:07:43"}
7
- {"current_steps": 70, "total_steps": 900, "loss": 0.026, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.777777777777778e-05, "epoch": 0.16, "percentage": 7.78, "elapsed_time": "0:00:39", "remaining_time": "0:07:46"}
8
- {"current_steps": 80, "total_steps": 900, "loss": 0.024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3333333333333335e-05, "epoch": 0.18, "percentage": 8.89, "elapsed_time": "0:00:45", "remaining_time": "0:07:48"}
9
- {"current_steps": 90, "total_steps": 900, "loss": 0.0235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.888888888888889e-05, "epoch": 0.2, "percentage": 10.0, "elapsed_time": "0:00:51", "remaining_time": "0:07:47"}
10
- {"current_steps": 100, "total_steps": 900, "loss": 0.0151, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4444444444444447e-05, "epoch": 0.22, "percentage": 11.11, "elapsed_time": "0:00:58", "remaining_time": "0:07:46"}
11
- {"current_steps": 110, "total_steps": 900, "loss": 0.0188, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5e-05, "epoch": 0.24, "percentage": 12.22, "elapsed_time": "0:01:04", "remaining_time": "0:07:41"}
12
- {"current_steps": 120, "total_steps": 900, "loss": 0.0109, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.938271604938271e-05, "epoch": 0.27, "percentage": 13.33, "elapsed_time": "0:01:10", "remaining_time": "0:07:40"}
13
- {"current_steps": 130, "total_steps": 900, "loss": 0.0223, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.876543209876544e-05, "epoch": 0.29, "percentage": 14.44, "elapsed_time": "0:01:17", "remaining_time": "0:07:36"}
14
- {"current_steps": 140, "total_steps": 900, "loss": 0.0157, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.814814814814815e-05, "epoch": 0.31, "percentage": 15.56, "elapsed_time": "0:01:23", "remaining_time": "0:07:32"}
15
- {"current_steps": 150, "total_steps": 900, "loss": 0.0186, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7530864197530866e-05, "epoch": 0.33, "percentage": 16.67, "elapsed_time": "0:01:29", "remaining_time": "0:07:27"}
16
- {"current_steps": 160, "total_steps": 900, "loss": 0.0182, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.691358024691358e-05, "epoch": 0.36, "percentage": 17.78, "elapsed_time": "0:01:35", "remaining_time": "0:07:21"}
17
- {"current_steps": 170, "total_steps": 900, "loss": 0.0287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.62962962962963e-05, "epoch": 0.38, "percentage": 18.89, "elapsed_time": "0:01:41", "remaining_time": "0:07:17"}
18
- {"current_steps": 180, "total_steps": 900, "loss": 0.0103, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.567901234567901e-05, "epoch": 0.4, "percentage": 20.0, "elapsed_time": "0:01:48", "remaining_time": "0:07:12"}
19
- {"current_steps": 190, "total_steps": 900, "loss": 0.0114, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.506172839506173e-05, "epoch": 0.42, "percentage": 21.11, "elapsed_time": "0:01:54", "remaining_time": "0:07:06"}
20
- {"current_steps": 200, "total_steps": 900, "loss": 0.0009, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4444444444444447e-05, "epoch": 0.44, "percentage": 22.22, "elapsed_time": "0:02:00", "remaining_time": "0:07:01"}
21
- {"current_steps": 210, "total_steps": 900, "loss": 0.0164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3827160493827164e-05, "epoch": 0.47, "percentage": 23.33, "elapsed_time": "0:02:06", "remaining_time": "0:06:55"}
22
- {"current_steps": 220, "total_steps": 900, "loss": 0.0227, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3209876543209875e-05, "epoch": 0.49, "percentage": 24.44, "elapsed_time": "0:02:12", "remaining_time": "0:06:50"}
23
- {"current_steps": 230, "total_steps": 900, "loss": 0.0225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.259259259259259e-05, "epoch": 0.51, "percentage": 25.56, "elapsed_time": "0:02:19", "remaining_time": "0:06:45"}
24
- {"current_steps": 240, "total_steps": 900, "loss": 0.0069, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.197530864197531e-05, "epoch": 0.53, "percentage": 26.67, "elapsed_time": "0:02:25", "remaining_time": "0:06:39"}
25
- {"current_steps": 250, "total_steps": 900, "loss": 0.017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.135802469135803e-05, "epoch": 0.56, "percentage": 27.78, "elapsed_time": "0:02:31", "remaining_time": "0:06:33"}
26
- {"current_steps": 260, "total_steps": 900, "loss": 0.008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.074074074074074e-05, "epoch": 0.58, "percentage": 28.89, "elapsed_time": "0:02:38", "remaining_time": "0:06:29"}
27
- {"current_steps": 270, "total_steps": 900, "loss": 0.0403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.012345679012346e-05, "epoch": 0.6, "percentage": 30.0, "elapsed_time": "0:02:44", "remaining_time": "0:06:23"}
28
- {"current_steps": 280, "total_steps": 900, "loss": 0.0105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.950617283950617e-05, "epoch": 0.62, "percentage": 31.11, "elapsed_time": "0:02:50", "remaining_time": "0:06:18"}
29
- {"current_steps": 290, "total_steps": 900, "loss": 0.0105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.888888888888889e-05, "epoch": 0.64, "percentage": 32.22, "elapsed_time": "0:02:56", "remaining_time": "0:06:11"}
30
- {"current_steps": 300, "total_steps": 900, "loss": 0.0284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.82716049382716e-05, "epoch": 0.67, "percentage": 33.33, "elapsed_time": "0:03:03", "remaining_time": "0:06:06"}
31
- {"current_steps": 310, "total_steps": 900, "loss": 0.0129, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7654320987654326e-05, "epoch": 0.69, "percentage": 34.44, "elapsed_time": "0:03:09", "remaining_time": "0:06:00"}
32
- {"current_steps": 320, "total_steps": 900, "loss": 0.0105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7037037037037037e-05, "epoch": 0.71, "percentage": 35.56, "elapsed_time": "0:03:15", "remaining_time": "0:05:54"}
33
- {"current_steps": 330, "total_steps": 900, "loss": 0.027, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6419753086419754e-05, "epoch": 0.73, "percentage": 36.67, "elapsed_time": "0:03:21", "remaining_time": "0:05:48"}
34
- {"current_steps": 340, "total_steps": 900, "loss": 0.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.580246913580247e-05, "epoch": 0.76, "percentage": 37.78, "elapsed_time": "0:03:28", "remaining_time": "0:05:42"}
35
- {"current_steps": 350, "total_steps": 900, "loss": 0.0032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.518518518518519e-05, "epoch": 0.78, "percentage": 38.89, "elapsed_time": "0:03:34", "remaining_time": "0:05:36"}
36
- {"current_steps": 360, "total_steps": 900, "loss": 0.0167, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.45679012345679e-05, "epoch": 0.8, "percentage": 40.0, "elapsed_time": "0:03:40", "remaining_time": "0:05:30"}
37
- {"current_steps": 370, "total_steps": 900, "loss": 0.006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.395061728395062e-05, "epoch": 0.82, "percentage": 41.11, "elapsed_time": "0:03:46", "remaining_time": "0:05:24"}
38
- {"current_steps": 380, "total_steps": 900, "loss": 0.0057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3333333333333335e-05, "epoch": 0.84, "percentage": 42.22, "elapsed_time": "0:03:53", "remaining_time": "0:05:18"}
39
- {"current_steps": 390, "total_steps": 900, "loss": 0.0037, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.271604938271605e-05, "epoch": 0.87, "percentage": 43.33, "elapsed_time": "0:03:59", "remaining_time": "0:05:12"}
40
- {"current_steps": 400, "total_steps": 900, "loss": 0.0044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.209876543209876e-05, "epoch": 0.89, "percentage": 44.44, "elapsed_time": "0:04:05", "remaining_time": "0:05:06"}
41
- {"current_steps": 410, "total_steps": 900, "loss": 0.0045, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.148148148148148e-05, "epoch": 0.91, "percentage": 45.56, "elapsed_time": "0:04:11", "remaining_time": "0:05:00"}
42
- {"current_steps": 420, "total_steps": 900, "loss": 0.0197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.08641975308642e-05, "epoch": 0.93, "percentage": 46.67, "elapsed_time": "0:04:18", "remaining_time": "0:04:55"}
43
- {"current_steps": 430, "total_steps": 900, "loss": 0.0057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0246913580246916e-05, "epoch": 0.96, "percentage": 47.78, "elapsed_time": "0:04:24", "remaining_time": "0:04:49"}
44
- {"current_steps": 440, "total_steps": 900, "loss": 0.0047, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.962962962962963e-05, "epoch": 0.98, "percentage": 48.89, "elapsed_time": "0:04:30", "remaining_time": "0:04:43"}
45
- {"current_steps": 450, "total_steps": 900, "loss": 0.0067, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9012345679012347e-05, "epoch": 1.0, "percentage": 50.0, "elapsed_time": "0:04:37", "remaining_time": "0:04:37"}
46
- {"current_steps": 460, "total_steps": 900, "loss": 0.0048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.839506172839506e-05, "epoch": 1.02, "percentage": 51.11, "elapsed_time": "0:04:43", "remaining_time": "0:04:30"}
47
- {"current_steps": 470, "total_steps": 900, "loss": 0.0042, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.777777777777778e-05, "epoch": 1.04, "percentage": 52.22, "elapsed_time": "0:04:49", "remaining_time": "0:04:25"}
48
- {"current_steps": 480, "total_steps": 900, "loss": 0.0054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7160493827160493e-05, "epoch": 1.07, "percentage": 53.33, "elapsed_time": "0:04:56", "remaining_time": "0:04:19"}
49
- {"current_steps": 490, "total_steps": 900, "loss": 0.0003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.654320987654321e-05, "epoch": 1.09, "percentage": 54.44, "elapsed_time": "0:05:02", "remaining_time": "0:04:13"}
50
- {"current_steps": 500, "total_steps": 900, "loss": 0.0003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5925925925925925e-05, "epoch": 1.11, "percentage": 55.56, "elapsed_time": "0:05:08", "remaining_time": "0:04:07"}
51
- {"current_steps": 510, "total_steps": 900, "loss": 0.0024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5308641975308646e-05, "epoch": 1.13, "percentage": 56.67, "elapsed_time": "0:06:15", "remaining_time": "0:04:46"}
52
- {"current_steps": 520, "total_steps": 900, "loss": 0.0055, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4691358024691357e-05, "epoch": 1.16, "percentage": 57.78, "elapsed_time": "0:06:21", "remaining_time": "0:04:38"}
53
- {"current_steps": 530, "total_steps": 900, "loss": 0.0009, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4074074074074074e-05, "epoch": 1.18, "percentage": 58.89, "elapsed_time": "0:06:27", "remaining_time": "0:04:30"}
54
- {"current_steps": 540, "total_steps": 900, "loss": 0.0006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.345679012345679e-05, "epoch": 1.2, "percentage": 60.0, "elapsed_time": "0:06:34", "remaining_time": "0:04:22"}
55
- {"current_steps": 550, "total_steps": 900, "loss": 0.0008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2839506172839506e-05, "epoch": 1.22, "percentage": 61.11, "elapsed_time": "0:06:40", "remaining_time": "0:04:14"}
56
- {"current_steps": 560, "total_steps": 900, "loss": 0.0051, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2222222222222223e-05, "epoch": 1.24, "percentage": 62.22, "elapsed_time": "0:06:46", "remaining_time": "0:04:06"}
57
- {"current_steps": 570, "total_steps": 900, "loss": 0.0129, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1604938271604937e-05, "epoch": 1.27, "percentage": 63.33, "elapsed_time": "0:06:53", "remaining_time": "0:03:59"}
58
- {"current_steps": 580, "total_steps": 900, "loss": 0.0008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0987654320987655e-05, "epoch": 1.29, "percentage": 64.44, "elapsed_time": "0:06:59", "remaining_time": "0:03:51"}
59
- {"current_steps": 590, "total_steps": 900, "loss": 0.0042, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.037037037037037e-05, "epoch": 1.31, "percentage": 65.56, "elapsed_time": "0:07:05", "remaining_time": "0:03:43"}
60
- {"current_steps": 600, "total_steps": 900, "loss": 0.0003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9753086419753087e-05, "epoch": 1.33, "percentage": 66.67, "elapsed_time": "0:07:11", "remaining_time": "0:03:35"}
61
- {"current_steps": 610, "total_steps": 900, "loss": 0.0049, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.91358024691358e-05, "epoch": 1.36, "percentage": 67.78, "elapsed_time": "0:07:17", "remaining_time": "0:03:27"}
62
- {"current_steps": 620, "total_steps": 900, "loss": 0.0032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8518518518518518e-05, "epoch": 1.38, "percentage": 68.89, "elapsed_time": "0:07:23", "remaining_time": "0:03:20"}
63
- {"current_steps": 630, "total_steps": 900, "loss": 0.0019, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7901234567901236e-05, "epoch": 1.4, "percentage": 70.0, "elapsed_time": "0:07:30", "remaining_time": "0:03:12"}
64
- {"current_steps": 640, "total_steps": 900, "loss": 0.002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.728395061728395e-05, "epoch": 1.42, "percentage": 71.11, "elapsed_time": "0:07:36", "remaining_time": "0:03:05"}
65
- {"current_steps": 650, "total_steps": 900, "loss": 0.0015, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6666666666666667e-05, "epoch": 1.44, "percentage": 72.22, "elapsed_time": "0:07:42", "remaining_time": "0:02:57"}
66
- {"current_steps": 660, "total_steps": 900, "loss": 0.0021, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.604938271604938e-05, "epoch": 1.47, "percentage": 73.33, "elapsed_time": "0:07:48", "remaining_time": "0:02:50"}
67
- {"current_steps": 670, "total_steps": 900, "loss": 0.0028, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.54320987654321e-05, "epoch": 1.49, "percentage": 74.44, "elapsed_time": "0:07:54", "remaining_time": "0:02:42"}
68
- {"current_steps": 680, "total_steps": 900, "loss": 0.0004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4814814814814815e-05, "epoch": 1.51, "percentage": 75.56, "elapsed_time": "0:08:01", "remaining_time": "0:02:35"}
69
- {"current_steps": 690, "total_steps": 900, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.419753086419753e-05, "epoch": 1.53, "percentage": 76.67, "elapsed_time": "0:08:07", "remaining_time": "0:02:28"}
70
- {"current_steps": 700, "total_steps": 900, "loss": 0.0003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3580246913580247e-05, "epoch": 1.56, "percentage": 77.78, "elapsed_time": "0:08:13", "remaining_time": "0:02:20"}
71
- {"current_steps": 710, "total_steps": 900, "loss": 0.0007, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2962962962962962e-05, "epoch": 1.58, "percentage": 78.89, "elapsed_time": "0:08:19", "remaining_time": "0:02:13"}
72
- {"current_steps": 720, "total_steps": 900, "loss": 0.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2345679012345678e-05, "epoch": 1.6, "percentage": 80.0, "elapsed_time": "0:08:25", "remaining_time": "0:02:06"}
73
- {"current_steps": 730, "total_steps": 900, "loss": 0.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1728395061728396e-05, "epoch": 1.62, "percentage": 81.11, "elapsed_time": "0:08:31", "remaining_time": "0:01:59"}
74
- {"current_steps": 740, "total_steps": 900, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1111111111111112e-05, "epoch": 1.64, "percentage": 82.22, "elapsed_time": "0:08:38", "remaining_time": "0:01:52"}
75
- {"current_steps": 750, "total_steps": 900, "loss": 0.0009, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0493827160493827e-05, "epoch": 1.67, "percentage": 83.33, "elapsed_time": "0:08:44", "remaining_time": "0:01:44"}
76
- {"current_steps": 760, "total_steps": 900, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.876543209876543e-06, "epoch": 1.69, "percentage": 84.44, "elapsed_time": "0:08:50", "remaining_time": "0:01:37"}
77
- {"current_steps": 770, "total_steps": 900, "loss": 0.0005, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.259259259259259e-06, "epoch": 1.71, "percentage": 85.56, "elapsed_time": "0:08:56", "remaining_time": "0:01:30"}
78
- {"current_steps": 780, "total_steps": 900, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.641975308641975e-06, "epoch": 1.73, "percentage": 86.67, "elapsed_time": "0:09:03", "remaining_time": "0:01:23"}
79
- {"current_steps": 790, "total_steps": 900, "loss": 0.0004, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.02469135802469e-06, "epoch": 1.76, "percentage": 87.78, "elapsed_time": "0:09:09", "remaining_time": "0:01:16"}
80
- {"current_steps": 800, "total_steps": 900, "loss": 0.0006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.4074074074074075e-06, "epoch": 1.78, "percentage": 88.89, "elapsed_time": "0:09:15", "remaining_time": "0:01:09"}
81
- {"current_steps": 810, "total_steps": 900, "loss": 0.0018, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.790123456790123e-06, "epoch": 1.8, "percentage": 90.0, "elapsed_time": "0:09:21", "remaining_time": "0:01:02"}
82
- {"current_steps": 820, "total_steps": 900, "loss": 0.0022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.172839506172839e-06, "epoch": 1.82, "percentage": 91.11, "elapsed_time": "0:09:28", "remaining_time": "0:00:55"}
83
- {"current_steps": 830, "total_steps": 900, "loss": 0.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.555555555555556e-06, "epoch": 1.84, "percentage": 92.22, "elapsed_time": "0:09:34", "remaining_time": "0:00:48"}
84
- {"current_steps": 840, "total_steps": 900, "loss": 0.0052, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.938271604938272e-06, "epoch": 1.87, "percentage": 93.33, "elapsed_time": "0:09:40", "remaining_time": "0:00:41"}
85
- {"current_steps": 850, "total_steps": 900, "loss": 0.0011, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3209876543209875e-06, "epoch": 1.89, "percentage": 94.44, "elapsed_time": "0:09:46", "remaining_time": "0:00:34"}
86
- {"current_steps": 860, "total_steps": 900, "loss": 0.0014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7037037037037037e-06, "epoch": 1.91, "percentage": 95.56, "elapsed_time": "0:09:53", "remaining_time": "0:00:27"}
87
- {"current_steps": 870, "total_steps": 900, "loss": 0.0056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0864197530864196e-06, "epoch": 1.93, "percentage": 96.67, "elapsed_time": "0:09:59", "remaining_time": "0:00:20"}
88
- {"current_steps": 880, "total_steps": 900, "loss": 0.0, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.469135802469136e-06, "epoch": 1.96, "percentage": 97.78, "elapsed_time": "0:10:05", "remaining_time": "0:00:13"}
89
- {"current_steps": 890, "total_steps": 900, "loss": 0.0001, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8518518518518519e-06, "epoch": 1.98, "percentage": 98.89, "elapsed_time": "0:10:12", "remaining_time": "0:00:06"}
90
- {"current_steps": 900, "total_steps": 900, "loss": 0.0002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.234567901234568e-06, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "0:10:18", "remaining_time": "0:00:00"}
91
- {"current_steps": 900, "total_steps": 900, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "0:10:18", "remaining_time": "0:00:00"}
 
1
+ {"current_steps": 10, "total_steps": 600, "loss": 0.7227, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0, "epoch": 0.03, "percentage": 1.67, "elapsed_time": "0:00:06", "remaining_time": "0:06:09"}
2
+ {"current_steps": 20, "total_steps": 600, "loss": 0.7667, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6666666666666667e-06, "epoch": 0.07, "percentage": 3.33, "elapsed_time": "0:00:14", "remaining_time": "0:06:58"}
3
+ {"current_steps": 30, "total_steps": 600, "loss": 0.2428, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1e-05, "epoch": 0.1, "percentage": 5.0, "elapsed_time": "0:00:23", "remaining_time": "0:07:17"}
4
+ {"current_steps": 40, "total_steps": 600, "loss": 0.0194, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8333333333333333e-05, "epoch": 0.13, "percentage": 6.67, "elapsed_time": "0:00:32", "remaining_time": "0:07:33"}
5
+ {"current_steps": 50, "total_steps": 600, "loss": 0.0071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6666666666666667e-05, "epoch": 0.17, "percentage": 8.33, "elapsed_time": "0:00:41", "remaining_time": "0:07:37"}
6
+ {"current_steps": 60, "total_steps": 600, "loss": 0.0131, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5e-05, "epoch": 0.2, "percentage": 10.0, "elapsed_time": "0:00:50", "remaining_time": "0:07:36"}
7
+ {"current_steps": 70, "total_steps": 600, "loss": 0.0079, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3333333333333334e-05, "epoch": 0.23, "percentage": 11.67, "elapsed_time": "0:01:00", "remaining_time": "0:07:39"}
8
+ {"current_steps": 80, "total_steps": 600, "loss": 0.0094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.981481481481482e-05, "epoch": 0.27, "percentage": 13.33, "elapsed_time": "0:01:10", "remaining_time": "0:07:40"}
9
+ {"current_steps": 90, "total_steps": 600, "loss": 0.0177, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.888888888888889e-05, "epoch": 0.3, "percentage": 15.0, "elapsed_time": "0:01:20", "remaining_time": "0:07:35"}
10
+ {"current_steps": 100, "total_steps": 600, "loss": 0.0083, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.796296296296296e-05, "epoch": 0.33, "percentage": 16.67, "elapsed_time": "0:01:30", "remaining_time": "0:07:30"}
11
+ {"current_steps": 110, "total_steps": 600, "loss": 0.0226, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.703703703703704e-05, "epoch": 0.37, "percentage": 18.33, "elapsed_time": "0:01:39", "remaining_time": "0:07:21"}
12
+ {"current_steps": 120, "total_steps": 600, "loss": 0.0102, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6111111111111115e-05, "epoch": 0.4, "percentage": 20.0, "elapsed_time": "0:01:48", "remaining_time": "0:07:14"}
13
+ {"current_steps": 130, "total_steps": 600, "loss": 0.0197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.518518518518519e-05, "epoch": 0.43, "percentage": 21.67, "elapsed_time": "0:01:57", "remaining_time": "0:07:05"}
14
+ {"current_steps": 140, "total_steps": 600, "loss": 0.0312, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.425925925925926e-05, "epoch": 0.47, "percentage": 23.33, "elapsed_time": "0:02:06", "remaining_time": "0:06:56"}
15
+ {"current_steps": 150, "total_steps": 600, "loss": 0.0282, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3333333333333334e-05, "epoch": 0.5, "percentage": 25.0, "elapsed_time": "0:02:16", "remaining_time": "0:06:48"}
16
+ {"current_steps": 160, "total_steps": 600, "loss": 0.0231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.240740740740741e-05, "epoch": 0.53, "percentage": 26.67, "elapsed_time": "0:02:25", "remaining_time": "0:06:39"}
17
+ {"current_steps": 170, "total_steps": 600, "loss": 0.0376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.148148148148148e-05, "epoch": 0.57, "percentage": 28.33, "elapsed_time": "0:02:34", "remaining_time": "0:06:31"}
18
+ {"current_steps": 180, "total_steps": 600, "loss": 0.0209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.055555555555556e-05, "epoch": 0.6, "percentage": 30.0, "elapsed_time": "0:02:44", "remaining_time": "0:06:22"}
19
+ {"current_steps": 190, "total_steps": 600, "loss": 0.058, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.981481481481482e-05, "epoch": 0.63, "percentage": 31.67, "elapsed_time": "0:02:53", "remaining_time": "0:06:13"}
20
+ {"current_steps": 200, "total_steps": 600, "loss": 0.0302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.888888888888889e-05, "epoch": 0.67, "percentage": 33.33, "elapsed_time": "0:03:02", "remaining_time": "0:06:04"}
21
+ {"current_steps": 210, "total_steps": 600, "loss": 0.0434, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7962962962962964e-05, "epoch": 0.7, "percentage": 35.0, "elapsed_time": "0:03:12", "remaining_time": "0:05:56"}
22
+ {"current_steps": 220, "total_steps": 600, "loss": 0.0432, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7037037037037037e-05, "epoch": 0.73, "percentage": 36.67, "elapsed_time": "0:03:21", "remaining_time": "0:05:47"}
23
+ {"current_steps": 230, "total_steps": 600, "loss": 0.0493, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.611111111111111e-05, "epoch": 0.77, "percentage": 38.33, "elapsed_time": "0:03:30", "remaining_time": "0:05:37"}
24
+ {"current_steps": 240, "total_steps": 600, "loss": 0.0588, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.527777777777778e-05, "epoch": 0.8, "percentage": 40.0, "elapsed_time": "0:03:39", "remaining_time": "0:05:28"}
25
+ {"current_steps": 250, "total_steps": 600, "loss": 0.0725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.435185185185185e-05, "epoch": 0.83, "percentage": 41.67, "elapsed_time": "0:03:48", "remaining_time": "0:05:20"}
26
+ {"current_steps": 260, "total_steps": 600, "loss": 0.0859, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3425925925925924e-05, "epoch": 0.87, "percentage": 43.33, "elapsed_time": "0:03:57", "remaining_time": "0:05:10"}
27
+ {"current_steps": 270, "total_steps": 600, "loss": 0.0535, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2500000000000004e-05, "epoch": 0.9, "percentage": 45.0, "elapsed_time": "0:04:06", "remaining_time": "0:05:01"}
28
+ {"current_steps": 280, "total_steps": 600, "loss": 0.0475, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.157407407407408e-05, "epoch": 0.93, "percentage": 46.67, "elapsed_time": "0:04:15", "remaining_time": "0:04:51"}
29
+ {"current_steps": 290, "total_steps": 600, "loss": 0.0291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.064814814814815e-05, "epoch": 0.97, "percentage": 48.33, "elapsed_time": "0:04:25", "remaining_time": "0:04:43"}
30
+ {"current_steps": 300, "total_steps": 600, "loss": 0.0327, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9722222222222223e-05, "epoch": 1.0, "percentage": 50.0, "elapsed_time": "0:04:34", "remaining_time": "0:04:34"}
31
+ {"current_steps": 310, "total_steps": 600, "loss": 0.0254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.87962962962963e-05, "epoch": 1.03, "percentage": 51.67, "elapsed_time": "0:04:43", "remaining_time": "0:04:24"}
32
+ {"current_steps": 320, "total_steps": 600, "loss": 0.0486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7870370370370375e-05, "epoch": 1.07, "percentage": 53.33, "elapsed_time": "0:04:52", "remaining_time": "0:04:15"}
33
+ {"current_steps": 330, "total_steps": 600, "loss": 0.0413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6944444444444445e-05, "epoch": 1.1, "percentage": 55.0, "elapsed_time": "0:05:01", "remaining_time": "0:04:06"}
34
+ {"current_steps": 340, "total_steps": 600, "loss": 0.0229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.601851851851852e-05, "epoch": 1.13, "percentage": 56.67, "elapsed_time": "0:05:11", "remaining_time": "0:03:57"}
35
+ {"current_steps": 350, "total_steps": 600, "loss": 0.0191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5092592592592594e-05, "epoch": 1.17, "percentage": 58.33, "elapsed_time": "0:05:21", "remaining_time": "0:03:49"}
36
+ {"current_steps": 360, "total_steps": 600, "loss": 0.0378, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4166666666666667e-05, "epoch": 1.2, "percentage": 60.0, "elapsed_time": "0:05:29", "remaining_time": "0:03:39"}
37
+ {"current_steps": 370, "total_steps": 600, "loss": 0.0244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.324074074074074e-05, "epoch": 1.23, "percentage": 61.67, "elapsed_time": "0:05:38", "remaining_time": "0:03:30"}
38
+ {"current_steps": 380, "total_steps": 600, "loss": 0.014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2314814814814816e-05, "epoch": 1.27, "percentage": 63.33, "elapsed_time": "0:05:50", "remaining_time": "0:03:23"}
39
+ {"current_steps": 390, "total_steps": 600, "loss": 0.0264, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.138888888888889e-05, "epoch": 1.3, "percentage": 65.0, "elapsed_time": "0:05:59", "remaining_time": "0:03:13"}
40
+ {"current_steps": 400, "total_steps": 600, "loss": 0.043, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0462962962962965e-05, "epoch": 1.33, "percentage": 66.67, "elapsed_time": "0:06:08", "remaining_time": "0:03:04"}
41
+ {"current_steps": 410, "total_steps": 600, "loss": 0.0356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9537037037037038e-05, "epoch": 1.37, "percentage": 68.33, "elapsed_time": "0:06:16", "remaining_time": "0:02:54"}
42
+ {"current_steps": 420, "total_steps": 600, "loss": 0.023, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.861111111111111e-05, "epoch": 1.4, "percentage": 70.0, "elapsed_time": "0:06:26", "remaining_time": "0:02:45"}
43
+ {"current_steps": 430, "total_steps": 600, "loss": 0.0285, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7685185185185184e-05, "epoch": 1.43, "percentage": 71.67, "elapsed_time": "0:06:36", "remaining_time": "0:02:36"}
44
+ {"current_steps": 440, "total_steps": 600, "loss": 0.0189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.675925925925926e-05, "epoch": 1.47, "percentage": 73.33, "elapsed_time": "0:06:46", "remaining_time": "0:02:27"}
45
+ {"current_steps": 450, "total_steps": 600, "loss": 0.0296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5833333333333333e-05, "epoch": 1.5, "percentage": 75.0, "elapsed_time": "0:06:55", "remaining_time": "0:02:18"}
46
+ {"current_steps": 460, "total_steps": 600, "loss": 0.0189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.490740740740741e-05, "epoch": 1.53, "percentage": 76.67, "elapsed_time": "0:07:05", "remaining_time": "0:02:09"}
47
+ {"current_steps": 470, "total_steps": 600, "loss": 0.0357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3981481481481482e-05, "epoch": 1.57, "percentage": 78.33, "elapsed_time": "0:07:14", "remaining_time": "0:02:00"}
48
+ {"current_steps": 480, "total_steps": 600, "loss": 0.0235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3055555555555557e-05, "epoch": 1.6, "percentage": 80.0, "elapsed_time": "0:07:23", "remaining_time": "0:01:50"}
49
+ {"current_steps": 490, "total_steps": 600, "loss": 0.0396, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.212962962962963e-05, "epoch": 1.63, "percentage": 81.67, "elapsed_time": "0:07:33", "remaining_time": "0:01:41"}
50
+ {"current_steps": 500, "total_steps": 600, "loss": 0.0236, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1203703703703704e-05, "epoch": 1.67, "percentage": 83.33, "elapsed_time": "0:07:42", "remaining_time": "0:01:32"}
51
+ {"current_steps": 510, "total_steps": 600, "loss": 0.0147, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0277777777777777e-05, "epoch": 1.7, "percentage": 85.0, "elapsed_time": "0:08:47", "remaining_time": "0:01:33"}
52
+ {"current_steps": 520, "total_steps": 600, "loss": 0.0135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.351851851851852e-06, "epoch": 1.73, "percentage": 86.67, "elapsed_time": "0:08:56", "remaining_time": "0:01:22"}
53
+ {"current_steps": 530, "total_steps": 600, "loss": 0.0138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.425925925925926e-06, "epoch": 1.77, "percentage": 88.33, "elapsed_time": "0:09:05", "remaining_time": "0:01:12"}
54
+ {"current_steps": 540, "total_steps": 600, "loss": 0.024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.5e-06, "epoch": 1.8, "percentage": 90.0, "elapsed_time": "0:09:14", "remaining_time": "0:01:01"}
55
+ {"current_steps": 550, "total_steps": 600, "loss": 0.0181, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.574074074074074e-06, "epoch": 1.83, "percentage": 91.67, "elapsed_time": "0:09:23", "remaining_time": "0:00:51"}
56
+ {"current_steps": 560, "total_steps": 600, "loss": 0.0201, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.6481481481481485e-06, "epoch": 1.87, "percentage": 93.33, "elapsed_time": "0:09:32", "remaining_time": "0:00:40"}
57
+ {"current_steps": 570, "total_steps": 600, "loss": 0.0276, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.722222222222222e-06, "epoch": 1.9, "percentage": 95.0, "elapsed_time": "0:09:41", "remaining_time": "0:00:30"}
58
+ {"current_steps": 580, "total_steps": 600, "loss": 0.0287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7962962962962964e-06, "epoch": 1.93, "percentage": 96.67, "elapsed_time": "0:09:50", "remaining_time": "0:00:20"}
59
+ {"current_steps": 590, "total_steps": 600, "loss": 0.0256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8703703703703706e-06, "epoch": 1.97, "percentage": 98.33, "elapsed_time": "0:09:59", "remaining_time": "0:00:10"}
60
+ {"current_steps": 600, "total_steps": 600, "loss": 0.018, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9444444444444444e-06, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "0:10:09", "remaining_time": "0:00:00"}
61
+ {"current_steps": 600, "total_steps": 600, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "0:10:09", "remaining_time": "0:00:00"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
trainer_state.json CHANGED
@@ -3,566 +3,386 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 2.0,
5
  "eval_steps": 500,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
  "learning_rate": 0.0,
14
- "loss": 6.3978,
15
  "step": 10
16
  },
17
  {
18
- "epoch": 0.04,
19
- "learning_rate": 0.0,
20
- "loss": 6.4106,
21
  "step": 20
22
  },
23
  {
24
- "epoch": 0.07,
25
- "learning_rate": 5.555555555555556e-06,
26
- "loss": 4.0676,
27
  "step": 30
28
  },
29
  {
30
- "epoch": 0.09,
31
- "learning_rate": 1.1111111111111112e-05,
32
- "loss": 0.0992,
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.11,
37
- "learning_rate": 1.6666666666666667e-05,
38
- "loss": 0.0295,
39
  "step": 50
40
  },
41
  {
42
- "epoch": 0.13,
43
- "learning_rate": 2.2222222222222223e-05,
44
- "loss": 0.0452,
45
  "step": 60
46
  },
47
  {
48
- "epoch": 0.16,
49
- "learning_rate": 2.777777777777778e-05,
50
- "loss": 0.026,
51
  "step": 70
52
  },
53
  {
54
- "epoch": 0.18,
55
- "learning_rate": 3.3333333333333335e-05,
56
- "loss": 0.024,
57
  "step": 80
58
  },
59
  {
60
- "epoch": 0.2,
61
- "learning_rate": 3.888888888888889e-05,
62
- "loss": 0.0235,
63
  "step": 90
64
  },
65
  {
66
- "epoch": 0.22,
67
- "learning_rate": 4.4444444444444447e-05,
68
- "loss": 0.0151,
69
  "step": 100
70
  },
71
  {
72
- "epoch": 0.24,
73
- "learning_rate": 5e-05,
74
- "loss": 0.0188,
75
  "step": 110
76
  },
77
  {
78
- "epoch": 0.27,
79
- "learning_rate": 4.938271604938271e-05,
80
- "loss": 0.0109,
81
  "step": 120
82
  },
83
  {
84
- "epoch": 0.29,
85
- "learning_rate": 4.876543209876544e-05,
86
- "loss": 0.0223,
87
  "step": 130
88
  },
89
  {
90
- "epoch": 0.31,
91
- "learning_rate": 4.814814814814815e-05,
92
- "loss": 0.0157,
93
  "step": 140
94
  },
95
  {
96
- "epoch": 0.33,
97
- "learning_rate": 4.7530864197530866e-05,
98
- "loss": 0.0186,
99
  "step": 150
100
  },
101
  {
102
- "epoch": 0.36,
103
- "learning_rate": 4.691358024691358e-05,
104
- "loss": 0.0182,
105
  "step": 160
106
  },
107
  {
108
- "epoch": 0.38,
109
- "learning_rate": 4.62962962962963e-05,
110
- "loss": 0.0287,
111
  "step": 170
112
  },
113
  {
114
- "epoch": 0.4,
115
- "learning_rate": 4.567901234567901e-05,
116
- "loss": 0.0103,
117
  "step": 180
118
  },
119
  {
120
- "epoch": 0.42,
121
- "learning_rate": 4.506172839506173e-05,
122
- "loss": 0.0114,
123
  "step": 190
124
  },
125
  {
126
- "epoch": 0.44,
127
- "learning_rate": 4.4444444444444447e-05,
128
- "loss": 0.0009,
129
  "step": 200
130
  },
131
  {
132
- "epoch": 0.47,
133
- "learning_rate": 4.3827160493827164e-05,
134
- "loss": 0.0164,
135
  "step": 210
136
  },
137
  {
138
- "epoch": 0.49,
139
- "learning_rate": 4.3209876543209875e-05,
140
- "loss": 0.0227,
141
  "step": 220
142
  },
143
  {
144
- "epoch": 0.51,
145
- "learning_rate": 4.259259259259259e-05,
146
- "loss": 0.0225,
147
  "step": 230
148
  },
149
  {
150
- "epoch": 0.53,
151
- "learning_rate": 4.197530864197531e-05,
152
- "loss": 0.0069,
153
  "step": 240
154
  },
155
  {
156
- "epoch": 0.56,
157
- "learning_rate": 4.135802469135803e-05,
158
- "loss": 0.017,
159
  "step": 250
160
  },
161
  {
162
- "epoch": 0.58,
163
- "learning_rate": 4.074074074074074e-05,
164
- "loss": 0.008,
165
  "step": 260
166
  },
167
  {
168
- "epoch": 0.6,
169
- "learning_rate": 4.012345679012346e-05,
170
- "loss": 0.0403,
171
  "step": 270
172
  },
173
  {
174
- "epoch": 0.62,
175
- "learning_rate": 3.950617283950617e-05,
176
- "loss": 0.0105,
177
  "step": 280
178
  },
179
  {
180
- "epoch": 0.64,
181
- "learning_rate": 3.888888888888889e-05,
182
- "loss": 0.0105,
183
  "step": 290
184
  },
185
  {
186
- "epoch": 0.67,
187
- "learning_rate": 3.82716049382716e-05,
188
- "loss": 0.0284,
189
  "step": 300
190
  },
191
  {
192
- "epoch": 0.69,
193
- "learning_rate": 3.7654320987654326e-05,
194
- "loss": 0.0129,
195
  "step": 310
196
  },
197
  {
198
- "epoch": 0.71,
199
- "learning_rate": 3.7037037037037037e-05,
200
- "loss": 0.0105,
201
  "step": 320
202
  },
203
  {
204
- "epoch": 0.73,
205
- "learning_rate": 3.6419753086419754e-05,
206
- "loss": 0.027,
207
  "step": 330
208
  },
209
  {
210
- "epoch": 0.76,
211
- "learning_rate": 3.580246913580247e-05,
212
- "loss": 0.0017,
213
  "step": 340
214
  },
215
  {
216
- "epoch": 0.78,
217
- "learning_rate": 3.518518518518519e-05,
218
- "loss": 0.0032,
219
  "step": 350
220
  },
221
  {
222
- "epoch": 0.8,
223
- "learning_rate": 3.45679012345679e-05,
224
- "loss": 0.0167,
225
  "step": 360
226
  },
227
  {
228
- "epoch": 0.82,
229
- "learning_rate": 3.395061728395062e-05,
230
- "loss": 0.006,
231
  "step": 370
232
  },
233
  {
234
- "epoch": 0.84,
235
- "learning_rate": 3.3333333333333335e-05,
236
- "loss": 0.0057,
237
  "step": 380
238
  },
239
  {
240
- "epoch": 0.87,
241
- "learning_rate": 3.271604938271605e-05,
242
- "loss": 0.0037,
243
  "step": 390
244
  },
245
  {
246
- "epoch": 0.89,
247
- "learning_rate": 3.209876543209876e-05,
248
- "loss": 0.0044,
249
  "step": 400
250
  },
251
  {
252
- "epoch": 0.91,
253
- "learning_rate": 3.148148148148148e-05,
254
- "loss": 0.0045,
255
  "step": 410
256
  },
257
  {
258
- "epoch": 0.93,
259
- "learning_rate": 3.08641975308642e-05,
260
- "loss": 0.0197,
261
  "step": 420
262
  },
263
  {
264
- "epoch": 0.96,
265
- "learning_rate": 3.0246913580246916e-05,
266
- "loss": 0.0057,
267
  "step": 430
268
  },
269
  {
270
- "epoch": 0.98,
271
- "learning_rate": 2.962962962962963e-05,
272
- "loss": 0.0047,
273
  "step": 440
274
  },
275
  {
276
- "epoch": 1.0,
277
- "learning_rate": 2.9012345679012347e-05,
278
- "loss": 0.0067,
279
  "step": 450
280
  },
281
  {
282
- "epoch": 1.02,
283
- "learning_rate": 2.839506172839506e-05,
284
- "loss": 0.0048,
285
  "step": 460
286
  },
287
  {
288
- "epoch": 1.04,
289
- "learning_rate": 2.777777777777778e-05,
290
- "loss": 0.0042,
291
  "step": 470
292
  },
293
  {
294
- "epoch": 1.07,
295
- "learning_rate": 2.7160493827160493e-05,
296
- "loss": 0.0054,
297
  "step": 480
298
  },
299
  {
300
- "epoch": 1.09,
301
- "learning_rate": 2.654320987654321e-05,
302
- "loss": 0.0003,
303
  "step": 490
304
  },
305
  {
306
- "epoch": 1.11,
307
- "learning_rate": 2.5925925925925925e-05,
308
- "loss": 0.0003,
309
  "step": 500
310
  },
311
  {
312
- "epoch": 1.13,
313
- "learning_rate": 2.5308641975308646e-05,
314
- "loss": 0.0024,
315
  "step": 510
316
  },
317
  {
318
- "epoch": 1.16,
319
- "learning_rate": 2.4691358024691357e-05,
320
- "loss": 0.0055,
321
  "step": 520
322
  },
323
  {
324
- "epoch": 1.18,
325
- "learning_rate": 2.4074074074074074e-05,
326
- "loss": 0.0009,
327
  "step": 530
328
  },
329
  {
330
- "epoch": 1.2,
331
- "learning_rate": 2.345679012345679e-05,
332
- "loss": 0.0006,
333
  "step": 540
334
  },
335
  {
336
- "epoch": 1.22,
337
- "learning_rate": 2.2839506172839506e-05,
338
- "loss": 0.0008,
339
  "step": 550
340
  },
341
  {
342
- "epoch": 1.24,
343
- "learning_rate": 2.2222222222222223e-05,
344
- "loss": 0.0051,
345
  "step": 560
346
  },
347
  {
348
- "epoch": 1.27,
349
- "learning_rate": 2.1604938271604937e-05,
350
- "loss": 0.0129,
351
  "step": 570
352
  },
353
  {
354
- "epoch": 1.29,
355
- "learning_rate": 2.0987654320987655e-05,
356
- "loss": 0.0008,
357
  "step": 580
358
  },
359
  {
360
- "epoch": 1.31,
361
- "learning_rate": 2.037037037037037e-05,
362
- "loss": 0.0042,
363
  "step": 590
364
  },
365
- {
366
- "epoch": 1.33,
367
- "learning_rate": 1.9753086419753087e-05,
368
- "loss": 0.0003,
369
- "step": 600
370
- },
371
- {
372
- "epoch": 1.36,
373
- "learning_rate": 1.91358024691358e-05,
374
- "loss": 0.0049,
375
- "step": 610
376
- },
377
- {
378
- "epoch": 1.38,
379
- "learning_rate": 1.8518518518518518e-05,
380
- "loss": 0.0032,
381
- "step": 620
382
- },
383
- {
384
- "epoch": 1.4,
385
- "learning_rate": 1.7901234567901236e-05,
386
- "loss": 0.0019,
387
- "step": 630
388
- },
389
- {
390
- "epoch": 1.42,
391
- "learning_rate": 1.728395061728395e-05,
392
- "loss": 0.002,
393
- "step": 640
394
- },
395
- {
396
- "epoch": 1.44,
397
- "learning_rate": 1.6666666666666667e-05,
398
- "loss": 0.0015,
399
- "step": 650
400
- },
401
- {
402
- "epoch": 1.47,
403
- "learning_rate": 1.604938271604938e-05,
404
- "loss": 0.0021,
405
- "step": 660
406
- },
407
- {
408
- "epoch": 1.49,
409
- "learning_rate": 1.54320987654321e-05,
410
- "loss": 0.0028,
411
- "step": 670
412
- },
413
- {
414
- "epoch": 1.51,
415
- "learning_rate": 1.4814814814814815e-05,
416
- "loss": 0.0004,
417
- "step": 680
418
- },
419
- {
420
- "epoch": 1.53,
421
- "learning_rate": 1.419753086419753e-05,
422
- "loss": 0.0001,
423
- "step": 690
424
- },
425
- {
426
- "epoch": 1.56,
427
- "learning_rate": 1.3580246913580247e-05,
428
- "loss": 0.0003,
429
- "step": 700
430
- },
431
- {
432
- "epoch": 1.58,
433
- "learning_rate": 1.2962962962962962e-05,
434
- "loss": 0.0007,
435
- "step": 710
436
- },
437
- {
438
- "epoch": 1.6,
439
- "learning_rate": 1.2345679012345678e-05,
440
- "loss": 0.0017,
441
- "step": 720
442
- },
443
- {
444
- "epoch": 1.62,
445
- "learning_rate": 1.1728395061728396e-05,
446
- "loss": 0.0002,
447
- "step": 730
448
- },
449
- {
450
- "epoch": 1.64,
451
- "learning_rate": 1.1111111111111112e-05,
452
- "loss": 0.0001,
453
- "step": 740
454
- },
455
- {
456
- "epoch": 1.67,
457
- "learning_rate": 1.0493827160493827e-05,
458
- "loss": 0.0009,
459
- "step": 750
460
- },
461
- {
462
- "epoch": 1.69,
463
- "learning_rate": 9.876543209876543e-06,
464
- "loss": 0.0001,
465
- "step": 760
466
- },
467
- {
468
- "epoch": 1.71,
469
- "learning_rate": 9.259259259259259e-06,
470
- "loss": 0.0005,
471
- "step": 770
472
- },
473
- {
474
- "epoch": 1.73,
475
- "learning_rate": 8.641975308641975e-06,
476
- "loss": 0.0001,
477
- "step": 780
478
- },
479
- {
480
- "epoch": 1.76,
481
- "learning_rate": 8.02469135802469e-06,
482
- "loss": 0.0004,
483
- "step": 790
484
- },
485
- {
486
- "epoch": 1.78,
487
- "learning_rate": 7.4074074074074075e-06,
488
- "loss": 0.0006,
489
- "step": 800
490
- },
491
- {
492
- "epoch": 1.8,
493
- "learning_rate": 6.790123456790123e-06,
494
- "loss": 0.0018,
495
- "step": 810
496
- },
497
- {
498
- "epoch": 1.82,
499
- "learning_rate": 6.172839506172839e-06,
500
- "loss": 0.0022,
501
- "step": 820
502
- },
503
- {
504
- "epoch": 1.84,
505
- "learning_rate": 5.555555555555556e-06,
506
- "loss": 0.0017,
507
- "step": 830
508
- },
509
- {
510
- "epoch": 1.87,
511
- "learning_rate": 4.938271604938272e-06,
512
- "loss": 0.0052,
513
- "step": 840
514
- },
515
- {
516
- "epoch": 1.89,
517
- "learning_rate": 4.3209876543209875e-06,
518
- "loss": 0.0011,
519
- "step": 850
520
- },
521
- {
522
- "epoch": 1.91,
523
- "learning_rate": 3.7037037037037037e-06,
524
- "loss": 0.0014,
525
- "step": 860
526
- },
527
- {
528
- "epoch": 1.93,
529
- "learning_rate": 3.0864197530864196e-06,
530
- "loss": 0.0056,
531
- "step": 870
532
- },
533
- {
534
- "epoch": 1.96,
535
- "learning_rate": 2.469135802469136e-06,
536
- "loss": 0.0,
537
- "step": 880
538
- },
539
- {
540
- "epoch": 1.98,
541
- "learning_rate": 1.8518518518518519e-06,
542
- "loss": 0.0001,
543
- "step": 890
544
- },
545
  {
546
  "epoch": 2.0,
547
- "learning_rate": 1.234567901234568e-06,
548
- "loss": 0.0002,
549
- "step": 900
550
  },
551
  {
552
  "epoch": 2.0,
553
- "step": 900,
554
- "total_flos": 1.0906858404236493e+17,
555
- "train_loss": 0.19669926248992042,
556
- "train_runtime": 618.138,
557
- "train_samples_per_second": 14.531,
558
- "train_steps_per_second": 1.456
559
  }
560
  ],
561
  "logging_steps": 10,
562
- "max_steps": 900,
563
  "num_train_epochs": 2,
564
  "save_steps": 500,
565
- "total_flos": 1.0906858404236493e+17,
566
  "trial_name": null,
567
  "trial_params": null
568
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 2.0,
5
  "eval_steps": 500,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
  "learning_rate": 0.0,
14
+ "loss": 0.7227,
15
  "step": 10
16
  },
17
  {
18
+ "epoch": 0.07,
19
+ "learning_rate": 1.6666666666666667e-06,
20
+ "loss": 0.7667,
21
  "step": 20
22
  },
23
  {
24
+ "epoch": 0.1,
25
+ "learning_rate": 1e-05,
26
+ "loss": 0.2428,
27
  "step": 30
28
  },
29
  {
30
+ "epoch": 0.13,
31
+ "learning_rate": 1.8333333333333333e-05,
32
+ "loss": 0.0194,
33
  "step": 40
34
  },
35
  {
36
+ "epoch": 0.17,
37
+ "learning_rate": 2.6666666666666667e-05,
38
+ "loss": 0.0071,
39
  "step": 50
40
  },
41
  {
42
+ "epoch": 0.2,
43
+ "learning_rate": 3.5e-05,
44
+ "loss": 0.0131,
45
  "step": 60
46
  },
47
  {
48
+ "epoch": 0.23,
49
+ "learning_rate": 4.3333333333333334e-05,
50
+ "loss": 0.0079,
51
  "step": 70
52
  },
53
  {
54
+ "epoch": 0.27,
55
+ "learning_rate": 4.981481481481482e-05,
56
+ "loss": 0.0094,
57
  "step": 80
58
  },
59
  {
60
+ "epoch": 0.3,
61
+ "learning_rate": 4.888888888888889e-05,
62
+ "loss": 0.0177,
63
  "step": 90
64
  },
65
  {
66
+ "epoch": 0.33,
67
+ "learning_rate": 4.796296296296296e-05,
68
+ "loss": 0.0083,
69
  "step": 100
70
  },
71
  {
72
+ "epoch": 0.37,
73
+ "learning_rate": 4.703703703703704e-05,
74
+ "loss": 0.0226,
75
  "step": 110
76
  },
77
  {
78
+ "epoch": 0.4,
79
+ "learning_rate": 4.6111111111111115e-05,
80
+ "loss": 0.0102,
81
  "step": 120
82
  },
83
  {
84
+ "epoch": 0.43,
85
+ "learning_rate": 4.518518518518519e-05,
86
+ "loss": 0.0197,
87
  "step": 130
88
  },
89
  {
90
+ "epoch": 0.47,
91
+ "learning_rate": 4.425925925925926e-05,
92
+ "loss": 0.0312,
93
  "step": 140
94
  },
95
  {
96
+ "epoch": 0.5,
97
+ "learning_rate": 4.3333333333333334e-05,
98
+ "loss": 0.0282,
99
  "step": 150
100
  },
101
  {
102
+ "epoch": 0.53,
103
+ "learning_rate": 4.240740740740741e-05,
104
+ "loss": 0.0231,
105
  "step": 160
106
  },
107
  {
108
+ "epoch": 0.57,
109
+ "learning_rate": 4.148148148148148e-05,
110
+ "loss": 0.0376,
111
  "step": 170
112
  },
113
  {
114
+ "epoch": 0.6,
115
+ "learning_rate": 4.055555555555556e-05,
116
+ "loss": 0.0209,
117
  "step": 180
118
  },
119
  {
120
+ "epoch": 0.63,
121
+ "learning_rate": 3.981481481481482e-05,
122
+ "loss": 0.058,
123
  "step": 190
124
  },
125
  {
126
+ "epoch": 0.67,
127
+ "learning_rate": 3.888888888888889e-05,
128
+ "loss": 0.0302,
129
  "step": 200
130
  },
131
  {
132
+ "epoch": 0.7,
133
+ "learning_rate": 3.7962962962962964e-05,
134
+ "loss": 0.0434,
135
  "step": 210
136
  },
137
  {
138
+ "epoch": 0.73,
139
+ "learning_rate": 3.7037037037037037e-05,
140
+ "loss": 0.0432,
141
  "step": 220
142
  },
143
  {
144
+ "epoch": 0.77,
145
+ "learning_rate": 3.611111111111111e-05,
146
+ "loss": 0.0493,
147
  "step": 230
148
  },
149
  {
150
+ "epoch": 0.8,
151
+ "learning_rate": 3.527777777777778e-05,
152
+ "loss": 0.0588,
153
  "step": 240
154
  },
155
  {
156
+ "epoch": 0.83,
157
+ "learning_rate": 3.435185185185185e-05,
158
+ "loss": 0.0725,
159
  "step": 250
160
  },
161
  {
162
+ "epoch": 0.87,
163
+ "learning_rate": 3.3425925925925924e-05,
164
+ "loss": 0.0859,
165
  "step": 260
166
  },
167
  {
168
+ "epoch": 0.9,
169
+ "learning_rate": 3.2500000000000004e-05,
170
+ "loss": 0.0535,
171
  "step": 270
172
  },
173
  {
174
+ "epoch": 0.93,
175
+ "learning_rate": 3.157407407407408e-05,
176
+ "loss": 0.0475,
177
  "step": 280
178
  },
179
  {
180
+ "epoch": 0.97,
181
+ "learning_rate": 3.064814814814815e-05,
182
+ "loss": 0.0291,
183
  "step": 290
184
  },
185
  {
186
+ "epoch": 1.0,
187
+ "learning_rate": 2.9722222222222223e-05,
188
+ "loss": 0.0327,
189
  "step": 300
190
  },
191
  {
192
+ "epoch": 1.03,
193
+ "learning_rate": 2.87962962962963e-05,
194
+ "loss": 0.0254,
195
  "step": 310
196
  },
197
  {
198
+ "epoch": 1.07,
199
+ "learning_rate": 2.7870370370370375e-05,
200
+ "loss": 0.0486,
201
  "step": 320
202
  },
203
  {
204
+ "epoch": 1.1,
205
+ "learning_rate": 2.6944444444444445e-05,
206
+ "loss": 0.0413,
207
  "step": 330
208
  },
209
  {
210
+ "epoch": 1.13,
211
+ "learning_rate": 2.601851851851852e-05,
212
+ "loss": 0.0229,
213
  "step": 340
214
  },
215
  {
216
+ "epoch": 1.17,
217
+ "learning_rate": 2.5092592592592594e-05,
218
+ "loss": 0.0191,
219
  "step": 350
220
  },
221
  {
222
+ "epoch": 1.2,
223
+ "learning_rate": 2.4166666666666667e-05,
224
+ "loss": 0.0378,
225
  "step": 360
226
  },
227
  {
228
+ "epoch": 1.23,
229
+ "learning_rate": 2.324074074074074e-05,
230
+ "loss": 0.0244,
231
  "step": 370
232
  },
233
  {
234
+ "epoch": 1.27,
235
+ "learning_rate": 2.2314814814814816e-05,
236
+ "loss": 0.014,
237
  "step": 380
238
  },
239
  {
240
+ "epoch": 1.3,
241
+ "learning_rate": 2.138888888888889e-05,
242
+ "loss": 0.0264,
243
  "step": 390
244
  },
245
  {
246
+ "epoch": 1.33,
247
+ "learning_rate": 2.0462962962962965e-05,
248
+ "loss": 0.043,
249
  "step": 400
250
  },
251
  {
252
+ "epoch": 1.37,
253
+ "learning_rate": 1.9537037037037038e-05,
254
+ "loss": 0.0356,
255
  "step": 410
256
  },
257
  {
258
+ "epoch": 1.4,
259
+ "learning_rate": 1.861111111111111e-05,
260
+ "loss": 0.023,
261
  "step": 420
262
  },
263
  {
264
+ "epoch": 1.43,
265
+ "learning_rate": 1.7685185185185184e-05,
266
+ "loss": 0.0285,
267
  "step": 430
268
  },
269
  {
270
+ "epoch": 1.47,
271
+ "learning_rate": 1.675925925925926e-05,
272
+ "loss": 0.0189,
273
  "step": 440
274
  },
275
  {
276
+ "epoch": 1.5,
277
+ "learning_rate": 1.5833333333333333e-05,
278
+ "loss": 0.0296,
279
  "step": 450
280
  },
281
  {
282
+ "epoch": 1.53,
283
+ "learning_rate": 1.490740740740741e-05,
284
+ "loss": 0.0189,
285
  "step": 460
286
  },
287
  {
288
+ "epoch": 1.57,
289
+ "learning_rate": 1.3981481481481482e-05,
290
+ "loss": 0.0357,
291
  "step": 470
292
  },
293
  {
294
+ "epoch": 1.6,
295
+ "learning_rate": 1.3055555555555557e-05,
296
+ "loss": 0.0235,
297
  "step": 480
298
  },
299
  {
300
+ "epoch": 1.63,
301
+ "learning_rate": 1.212962962962963e-05,
302
+ "loss": 0.0396,
303
  "step": 490
304
  },
305
  {
306
+ "epoch": 1.67,
307
+ "learning_rate": 1.1203703703703704e-05,
308
+ "loss": 0.0236,
309
  "step": 500
310
  },
311
  {
312
+ "epoch": 1.7,
313
+ "learning_rate": 1.0277777777777777e-05,
314
+ "loss": 0.0147,
315
  "step": 510
316
  },
317
  {
318
+ "epoch": 1.73,
319
+ "learning_rate": 9.351851851851852e-06,
320
+ "loss": 0.0135,
321
  "step": 520
322
  },
323
  {
324
+ "epoch": 1.77,
325
+ "learning_rate": 8.425925925925926e-06,
326
+ "loss": 0.0138,
327
  "step": 530
328
  },
329
  {
330
+ "epoch": 1.8,
331
+ "learning_rate": 7.5e-06,
332
+ "loss": 0.024,
333
  "step": 540
334
  },
335
  {
336
+ "epoch": 1.83,
337
+ "learning_rate": 6.574074074074074e-06,
338
+ "loss": 0.0181,
339
  "step": 550
340
  },
341
  {
342
+ "epoch": 1.87,
343
+ "learning_rate": 5.6481481481481485e-06,
344
+ "loss": 0.0201,
345
  "step": 560
346
  },
347
  {
348
+ "epoch": 1.9,
349
+ "learning_rate": 4.722222222222222e-06,
350
+ "loss": 0.0276,
351
  "step": 570
352
  },
353
  {
354
+ "epoch": 1.93,
355
+ "learning_rate": 3.7962962962962964e-06,
356
+ "loss": 0.0287,
357
  "step": 580
358
  },
359
  {
360
+ "epoch": 1.97,
361
+ "learning_rate": 2.8703703703703706e-06,
362
+ "loss": 0.0256,
363
  "step": 590
364
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  {
366
  "epoch": 2.0,
367
+ "learning_rate": 1.9444444444444444e-06,
368
+ "loss": 0.018,
369
+ "step": 600
370
  },
371
  {
372
  "epoch": 2.0,
373
+ "step": 600,
374
+ "total_flos": 1.5734632955772928e+17,
375
+ "train_loss": 0.05660845875740051,
376
+ "train_runtime": 609.6094,
377
+ "train_samples_per_second": 19.685,
378
+ "train_steps_per_second": 0.984
379
  }
380
  ],
381
  "logging_steps": 10,
382
+ "max_steps": 600,
383
  "num_train_epochs": 2,
384
  "save_steps": 500,
385
+ "total_flos": 1.5734632955772928e+17,
386
  "trial_name": null,
387
  "trial_params": null
388
  }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2517f3e3366d7fcfe464be71f2274d49b5ae7bc0ee93635cc4846116f60c5a8
3
+ size 5179
training_loss.png CHANGED