File size: 28,439 Bytes
c55fa17
 
 
 
41b9d47
 
52c7389
 
 
21a7084
 
16b8c4b
 
 
6639e41
 
4dbe18f
 
8476238
 
 
07f6263
 
7a161c8
 
 
a0c7aad
 
4176ec8
 
3e79840
 
 
9a73399
 
1ffe941
 
 
4423a3f
 
dde38f7
 
380854f
 
 
5bb8b82
 
f9bc200
 
 
59085b7
 
8eff469
 
4908a07
 
 
1e33c3e
 
847e23c
 
 
9f0eb7a
 
7497987
 
83acd59
 
 
b141490
 
b27058e
 
 
acc17b3
 
4af804a
 
120cdc9
 
 
ee44091
 
6721c21
 
 
61a7065
 
65112e8
 
1982096
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
{"eval_loss": 4.51287508, "eval_runtime": 86.5945, "eval_samples_per_second": 2.113, "eval_steps_per_second": 0.531, "eval_token_acc": 0.74476581, "epoch": 0, "global_step/max_steps": "0/756", "percentage": "0.00%", "elapsed_time": "1m 26s", "memory(GiB)": 21.73, "train_speed(iter/s)": 0.0}
{"loss": 1.0916611, "grad_norm": 0.55941969, "learning_rate": 2.63e-06, "token_acc": 0.74447185, "epoch": 0.00397911, "global_step/max_steps": "1/756", "percentage": "0.13%", "elapsed_time": "1m 50s", "remaining_time": "23h 10m 25s", "memory(GiB)": 28.08, "train_speed(iter/s)": 0.00905}
{"loss": 1.10507329, "grad_norm": 0.50046724, "learning_rate": 2.632e-05, "token_acc": 0.74080157, "epoch": 0.0397911, "global_step/max_steps": "10/756", "percentage": "1.32%", "elapsed_time": "5m 16s", "remaining_time": "6h 34m 5s", "memory(GiB)": 31.13, "train_speed(iter/s)": 0.031549}
{"loss": 1.00932264, "grad_norm": 0.22522864, "learning_rate": 5.263e-05, "token_acc": 0.74884435, "epoch": 0.07958219, "global_step/max_steps": "20/756", "percentage": "2.65%", "elapsed_time": "9m 6s", "remaining_time": "5h 35m 11s", "memory(GiB)": 31.16, "train_speed(iter/s)": 0.036596}
{"loss": 0.93268452, "grad_norm": 0.25404572, "learning_rate": 7.895e-05, "token_acc": 0.75676336, "epoch": 0.11937329, "global_step/max_steps": "30/756", "percentage": "3.97%", "elapsed_time": "13m 12s", "remaining_time": "5h 19m 30s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03787}
{"loss": 0.87882118, "grad_norm": 0.21832059, "learning_rate": 0.0001, "token_acc": 0.76511042, "epoch": 0.15916439, "global_step/max_steps": "40/756", "percentage": "5.29%", "elapsed_time": "17m 8s", "remaining_time": "5h 6m 52s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.038887}
{"loss": 0.8168808, "grad_norm": 0.20378394, "learning_rate": 9.993e-05, "token_acc": 0.7749847, "epoch": 0.19895548, "global_step/max_steps": "50/756", "percentage": "6.61%", "elapsed_time": "20m 54s", "remaining_time": "4h 55m 11s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039861}
{"eval_loss": 3.2664814, "eval_runtime": 83.0261, "eval_samples_per_second": 2.204, "eval_steps_per_second": 0.554, "eval_token_acc": 0.78074018, "epoch": 0.19895548, "global_step/max_steps": "50/756", "percentage": "6.61%", "elapsed_time": "22m 17s", "remaining_time": "5h 14m 43s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.037386}
{"loss": 0.81045494, "grad_norm": 0.21387973, "learning_rate": 9.977e-05, "token_acc": 0.77483389, "epoch": 0.23874658, "global_step/max_steps": "60/756", "percentage": "7.94%", "elapsed_time": "26m 8s", "remaining_time": "5h 3m 11s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03826}
{"loss": 0.78286715, "grad_norm": 0.25447285, "learning_rate": 9.951e-05, "token_acc": 0.78042773, "epoch": 0.27853768, "global_step/max_steps": "70/756", "percentage": "9.26%", "elapsed_time": "30m 5s", "remaining_time": "4h 54m 52s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.038774}
{"loss": 0.75977039, "grad_norm": 0.24767879, "learning_rate": 9.916e-05, "token_acc": 0.78458958, "epoch": 0.31832877, "global_step/max_steps": "80/756", "percentage": "10.58%", "elapsed_time": "33m 55s", "remaining_time": "4h 46m 36s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03931}
{"loss": 0.73083143, "grad_norm": 0.37413245, "learning_rate": 9.871e-05, "token_acc": 0.79073194, "epoch": 0.35811987, "global_step/max_steps": "90/756", "percentage": "11.90%", "elapsed_time": "37m 45s", "remaining_time": "4h 39m 25s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039724}
{"loss": 0.73291955, "grad_norm": 0.30059874, "learning_rate": 9.817e-05, "token_acc": 0.79032498, "epoch": 0.39791097, "global_step/max_steps": "100/756", "percentage": "13.23%", "elapsed_time": "41m 37s", "remaining_time": "4h 33m 2s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040042}
{"eval_loss": 2.91791534, "eval_runtime": 82.5349, "eval_samples_per_second": 2.217, "eval_steps_per_second": 0.557, "eval_token_acc": 0.79485684, "epoch": 0.39791097, "global_step/max_steps": "100/756", "percentage": "13.23%", "elapsed_time": "42m 59s", "remaining_time": "4h 42m 4s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03876}
{"loss": 0.71492791, "grad_norm": 0.29148239, "learning_rate": 9.754e-05, "token_acc": 0.79306438, "epoch": 0.43770206, "global_step/max_steps": "110/756", "percentage": "14.55%", "elapsed_time": "46m 54s", "remaining_time": "4h 35m 28s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039085}
{"loss": 0.71300731, "grad_norm": 0.27155069, "learning_rate": 9.682e-05, "token_acc": 0.79392519, "epoch": 0.47749316, "global_step/max_steps": "120/756", "percentage": "15.87%", "elapsed_time": "51m 4s", "remaining_time": "4h 30m 42s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039158}
{"loss": 0.71306467, "grad_norm": 0.29438251, "learning_rate": 9.6e-05, "token_acc": 0.79304077, "epoch": 0.51728426, "global_step/max_steps": "130/756", "percentage": "17.20%", "elapsed_time": "54m 55s", "remaining_time": "4h 24m 27s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039453}
{"loss": 0.70194082, "grad_norm": 0.32997555, "learning_rate": 9.51e-05, "token_acc": 0.79521872, "epoch": 0.55707535, "global_step/max_steps": "140/756", "percentage": "18.52%", "elapsed_time": "58m 43s", "remaining_time": "4h 18m 24s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039731}
{"loss": 0.70811157, "grad_norm": 0.29847297, "learning_rate": 9.412e-05, "token_acc": 0.79354036, "epoch": 0.59686645, "global_step/max_steps": "150/756", "percentage": "19.84%", "elapsed_time": "1h 2m 32s", "remaining_time": "4h 12m 39s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039976}
{"eval_loss": 2.7713089, "eval_runtime": 82.4754, "eval_samples_per_second": 2.219, "eval_steps_per_second": 0.558, "eval_token_acc": 0.80116134, "epoch": 0.59686645, "global_step/max_steps": "150/756", "percentage": "19.84%", "elapsed_time": "1h 3m 54s", "remaining_time": "4h 18m 12s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039116}
{"loss": 0.70022655, "grad_norm": 0.34647727, "learning_rate": 9.304e-05, "token_acc": 0.796757, "epoch": 0.63665755, "global_step/max_steps": "160/756", "percentage": "21.16%", "elapsed_time": "1h 7m 44s", "remaining_time": "4h 12m 18s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03937}
{"loss": 0.68182106, "grad_norm": 0.34159285, "learning_rate": 9.189e-05, "token_acc": 0.80100522, "epoch": 0.67644864, "global_step/max_steps": "170/756", "percentage": "22.49%", "elapsed_time": "1h 11m 33s", "remaining_time": "4h 6m 41s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03959}
{"loss": 0.69354277, "grad_norm": 0.35957012, "learning_rate": 9.066e-05, "token_acc": 0.7954857, "epoch": 0.71623974, "global_step/max_steps": "180/756", "percentage": "23.81%", "elapsed_time": "1h 15m 30s", "remaining_time": "4h 1m 38s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039728}
{"loss": 0.68614893, "grad_norm": 0.36396566, "learning_rate": 8.934e-05, "token_acc": 0.79892365, "epoch": 0.75603084, "global_step/max_steps": "190/756", "percentage": "25.13%", "elapsed_time": "1h 19m 18s", "remaining_time": "3h 56m 16s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039925}
{"loss": 0.67535744, "grad_norm": 0.3679336, "learning_rate": 8.796e-05, "token_acc": 0.80184291, "epoch": 0.79582193, "global_step/max_steps": "200/756", "percentage": "26.46%", "elapsed_time": "1h 23m 11s", "remaining_time": "3h 51m 15s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040071}
{"eval_loss": 2.68263507, "eval_runtime": 83.6535, "eval_samples_per_second": 2.188, "eval_steps_per_second": 0.55, "eval_token_acc": 0.80542834, "epoch": 0.79582193, "global_step/max_steps": "200/756", "percentage": "26.46%", "elapsed_time": "1h 24m 34s", "remaining_time": "3h 55m 7s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039411}
{"loss": 0.66844773, "grad_norm": 0.38341215, "learning_rate": 8.65e-05, "token_acc": 0.80330412, "epoch": 0.83561303, "global_step/max_steps": "210/756", "percentage": "27.78%", "elapsed_time": "1h 28m 25s", "remaining_time": "3h 49m 54s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.03958}
{"loss": 0.6761816, "grad_norm": 0.3342804, "learning_rate": 8.497e-05, "token_acc": 0.80166258, "epoch": 0.87540413, "global_step/max_steps": "220/756", "percentage": "29.10%", "elapsed_time": "1h 32m 18s", "remaining_time": "3h 44m 53s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039722}
{"loss": 0.67615294, "grad_norm": 0.37265298, "learning_rate": 8.337e-05, "token_acc": 0.80118919, "epoch": 0.91519523, "global_step/max_steps": "230/756", "percentage": "30.42%", "elapsed_time": "1h 36m 8s", "remaining_time": "3h 39m 51s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039873}
{"loss": 0.67424359, "grad_norm": 0.37488362, "learning_rate": 8.171e-05, "token_acc": 0.7999509, "epoch": 0.95498632, "global_step/max_steps": "240/756", "percentage": "31.75%", "elapsed_time": "1h 39m 52s", "remaining_time": "3h 34m 43s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040051}
{"loss": 0.66487207, "grad_norm": 0.33123785, "learning_rate": 7.999e-05, "token_acc": 0.80308648, "epoch": 0.99477742, "global_step/max_steps": "250/756", "percentage": "33.07%", "elapsed_time": "1h 43m 41s", "remaining_time": "3h 29m 52s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040183}
{"eval_loss": 2.62072086, "eval_runtime": 81.3597, "eval_samples_per_second": 2.249, "eval_steps_per_second": 0.565, "eval_token_acc": 0.80856458, "epoch": 0.99477742, "global_step/max_steps": "250/756", "percentage": "33.07%", "elapsed_time": "1h 45m 2s", "remaining_time": "3h 32m 36s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039665}
{"loss": 0.64975939, "grad_norm": 0.47815546, "learning_rate": 7.821e-05, "token_acc": 0.80659412, "epoch": 1.03183288, "global_step/max_steps": "260/756", "percentage": "34.39%", "elapsed_time": "1h 48m 30s", "remaining_time": "3h 27m 0s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039934}
{"loss": 0.65147877, "grad_norm": 0.36382401, "learning_rate": 7.638e-05, "token_acc": 0.80339263, "epoch": 1.07162397, "global_step/max_steps": "270/756", "percentage": "35.71%", "elapsed_time": "1h 52m 14s", "remaining_time": "3h 22m 2s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040092}
{"loss": 0.65088706, "grad_norm": 0.34569094, "learning_rate": 7.449e-05, "token_acc": 0.80499139, "epoch": 1.11141507, "global_step/max_steps": "280/756", "percentage": "37.04%", "elapsed_time": "1h 56m 9s", "remaining_time": "3h 17m 27s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040177}
{"loss": 0.64894333, "grad_norm": 0.38046846, "learning_rate": 7.256e-05, "token_acc": 0.80559137, "epoch": 1.15120617, "global_step/max_steps": "290/756", "percentage": "38.36%", "elapsed_time": "1h 59m 57s", "remaining_time": "3h 12m 45s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040292}
{"loss": 0.63442192, "grad_norm": 0.3717086, "learning_rate": 7.059e-05, "token_acc": 0.81019007, "epoch": 1.19099726, "global_step/max_steps": "300/756", "percentage": "39.68%", "elapsed_time": "2h 3m 52s", "remaining_time": "3h 8m 17s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040364}
{"eval_loss": 2.57562494, "eval_runtime": 83.7636, "eval_samples_per_second": 2.185, "eval_steps_per_second": 0.549, "eval_token_acc": 0.81049895, "epoch": 1.19099726, "global_step/max_steps": "300/756", "percentage": "39.68%", "elapsed_time": "2h 5m 16s", "remaining_time": "3h 10m 24s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.039914}
{"loss": 0.64846001, "grad_norm": 0.41845098, "learning_rate": 6.858e-05, "token_acc": 0.80573442, "epoch": 1.23078836, "global_step/max_steps": "310/756", "percentage": "41.01%", "elapsed_time": "2h 9m 6s", "remaining_time": "3h 5m 44s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.04002}
{"loss": 0.63730922, "grad_norm": 0.41076234, "learning_rate": 6.653e-05, "token_acc": 0.8094828, "epoch": 1.27057946, "global_step/max_steps": "320/756", "percentage": "42.33%", "elapsed_time": "2h 12m 56s", "remaining_time": "3h 1m 7s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040118}
{"loss": 0.64370165, "grad_norm": 0.37233034, "learning_rate": 6.445e-05, "token_acc": 0.80564955, "epoch": 1.31037055, "global_step/max_steps": "330/756", "percentage": "43.65%", "elapsed_time": "2h 16m 41s", "remaining_time": "2h 56m 26s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040238}
{"loss": 0.62285523, "grad_norm": 0.42613682, "learning_rate": 6.234e-05, "token_acc": 0.81161872, "epoch": 1.35016165, "global_step/max_steps": "340/756", "percentage": "44.97%", "elapsed_time": "2h 20m 25s", "remaining_time": "2h 51m 48s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040353}
{"loss": 0.63389444, "grad_norm": 0.37893876, "learning_rate": 6.021e-05, "token_acc": 0.80897209, "epoch": 1.38995275, "global_step/max_steps": "350/756", "percentage": "46.30%", "elapsed_time": "2h 24m 12s", "remaining_time": "2h 47m 17s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040448}
{"eval_loss": 2.5464642, "eval_runtime": 81.8498, "eval_samples_per_second": 2.236, "eval_steps_per_second": 0.562, "eval_token_acc": 0.81189995, "epoch": 1.38995275, "global_step/max_steps": "350/756", "percentage": "46.30%", "elapsed_time": "2h 25m 34s", "remaining_time": "2h 48m 52s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040069}
{"loss": 0.63284993, "grad_norm": 0.47583783, "learning_rate": 5.806e-05, "token_acc": 0.80906272, "epoch": 1.42974384, "global_step/max_steps": "360/756", "percentage": "47.62%", "elapsed_time": "2h 29m 21s", "remaining_time": "2h 44m 17s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040174}
{"loss": 0.63130598, "grad_norm": 0.40477979, "learning_rate": 5.589e-05, "token_acc": 0.80867286, "epoch": 1.46953494, "global_step/max_steps": "370/756", "percentage": "48.94%", "elapsed_time": "2h 33m 18s", "remaining_time": "2h 39m 56s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040222}
{"loss": 0.64159236, "grad_norm": 0.42596149, "learning_rate": 5.372e-05, "token_acc": 0.80728027, "epoch": 1.50932604, "global_step/max_steps": "380/756", "percentage": "50.26%", "elapsed_time": "2h 37m 12s", "remaining_time": "2h 35m 32s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040287}
{"loss": 0.64312091, "grad_norm": 0.42579779, "learning_rate": 5.153e-05, "token_acc": 0.80699175, "epoch": 1.54911714, "global_step/max_steps": "390/756", "percentage": "51.59%", "elapsed_time": "2h 41m 0s", "remaining_time": "2h 31m 6s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040369}
{"loss": 0.63438926, "grad_norm": 0.39897516, "learning_rate": 4.934e-05, "token_acc": 0.80871207, "epoch": 1.58890823, "global_step/max_steps": "400/756", "percentage": "52.91%", "elapsed_time": "2h 44m 42s", "remaining_time": "2h 26m 35s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040477}
{"eval_loss": 2.51271057, "eval_runtime": 82.0433, "eval_samples_per_second": 2.231, "eval_steps_per_second": 0.561, "eval_token_acc": 0.81330806, "epoch": 1.58890823, "global_step/max_steps": "400/756", "percentage": "52.91%", "elapsed_time": "2h 46m 4s", "remaining_time": "2h 27m 48s", "memory(GiB)": 31.41, "train_speed(iter/s)": 0.040143}
{"loss": 0.63062172, "grad_norm": 0.42983574, "learning_rate": 4.716e-05, "token_acc": 0.81136857, "epoch": 1.62869933, "global_step/max_steps": "410/756", "percentage": "54.23%", "elapsed_time": "2h 49m 56s", "remaining_time": "2h 23m 25s", "memory(GiB)": 31.45, "train_speed(iter/s)": 0.040208}
{"loss": 0.63269787, "grad_norm": 0.42424557, "learning_rate": 4.498e-05, "token_acc": 0.80917716, "epoch": 1.66849043, "global_step/max_steps": "420/756", "percentage": "55.56%", "elapsed_time": "2h 53m 47s", "remaining_time": "2h 19m 2s", "memory(GiB)": 31.45, "train_speed(iter/s)": 0.040278}
{"loss": 0.63158178, "grad_norm": 0.3841342, "learning_rate": 4.281e-05, "token_acc": 0.80976307, "epoch": 1.70828152, "global_step/max_steps": "430/756", "percentage": "56.88%", "elapsed_time": "2h 57m 35s", "remaining_time": "2h 14m 38s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.040355}
{"loss": 0.64060121, "grad_norm": 0.40401438, "learning_rate": 4.065e-05, "token_acc": 0.80762307, "epoch": 1.74807262, "global_step/max_steps": "440/756", "percentage": "58.20%", "elapsed_time": "3h 1m 23s", "remaining_time": "2h 10m 16s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.040429}
{"loss": 0.62615027, "grad_norm": 0.395248, "learning_rate": 3.851e-05, "token_acc": 0.810926, "epoch": 1.78786372, "global_step/max_steps": "450/756", "percentage": "59.52%", "elapsed_time": "3h 5m 2s", "remaining_time": "2h 5m 50s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.04053}
{"eval_loss": 2.489187, "eval_runtime": 82.0407, "eval_samples_per_second": 2.231, "eval_steps_per_second": 0.561, "eval_token_acc": 0.81433214, "epoch": 1.78786372, "global_step/max_steps": "450/756", "percentage": "59.52%", "elapsed_time": "3h 6m 25s", "remaining_time": "2h 6m 45s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.040232}
{"loss": 0.6332561, "grad_norm": 0.42475322, "learning_rate": 3.639e-05, "token_acc": 0.80931666, "epoch": 1.82765481, "global_step/max_steps": "460/756", "percentage": "60.85%", "elapsed_time": "3h 10m 12s", "remaining_time": "2h 2m 23s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.040307}
{"loss": 0.62653365, "grad_norm": 0.40236804, "learning_rate": 3.43e-05, "token_acc": 0.81024547, "epoch": 1.86744591, "global_step/max_steps": "470/756", "percentage": "62.17%", "elapsed_time": "3h 13m 54s", "remaining_time": "1h 57m 59s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.040398}
{"loss": 0.63048306, "grad_norm": 0.42044917, "learning_rate": 3.224e-05, "token_acc": 0.80941748, "epoch": 1.90723701, "global_step/max_steps": "480/756", "percentage": "63.49%", "elapsed_time": "3h 17m 38s", "remaining_time": "1h 53m 38s", "memory(GiB)": 31.47, "train_speed(iter/s)": 0.040478}
{"loss": 0.63408885, "grad_norm": 0.46262404, "learning_rate": 3.021e-05, "token_acc": 0.80842041, "epoch": 1.9470281, "global_step/max_steps": "490/756", "percentage": "64.81%", "elapsed_time": "3h 21m 23s", "remaining_time": "1h 49m 19s", "memory(GiB)": 31.66, "train_speed(iter/s)": 0.040551}
{"loss": 0.62491407, "grad_norm": 0.40705633, "learning_rate": 2.822e-05, "token_acc": 0.8104753, "epoch": 1.9868192, "global_step/max_steps": "500/756", "percentage": "66.14%", "elapsed_time": "3h 25m 14s", "remaining_time": "1h 45m 4s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040604}
{"eval_loss": 2.46459222, "eval_runtime": 81.9508, "eval_samples_per_second": 2.233, "eval_steps_per_second": 0.561, "eval_token_acc": 0.81585048, "epoch": 1.9868192, "global_step/max_steps": "500/756", "percentage": "66.14%", "elapsed_time": "3h 26m 36s", "remaining_time": "1h 45m 46s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040335}
{"loss": 0.60653248, "grad_norm": 0.40968722, "learning_rate": 2.627e-05, "token_acc": 0.81415899, "epoch": 2.02387466, "global_step/max_steps": "510/756", "percentage": "67.46%", "elapsed_time": "3h 30m 10s", "remaining_time": "1h 41m 22s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040441}
{"loss": 0.60644536, "grad_norm": 0.38200235, "learning_rate": 2.437e-05, "token_acc": 0.81476939, "epoch": 2.06366575, "global_step/max_steps": "520/756", "percentage": "68.78%", "elapsed_time": "3h 34m 2s", "remaining_time": "1h 37m 8s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.04049}
{"loss": 0.60643401, "grad_norm": 0.44535932, "learning_rate": 2.252e-05, "token_acc": 0.81422568, "epoch": 2.10345685, "global_step/max_steps": "530/756", "percentage": "70.11%", "elapsed_time": "3h 37m 54s", "remaining_time": "1h 32m 55s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040538}
{"loss": 0.61101432, "grad_norm": 0.39274883, "learning_rate": 2.072e-05, "token_acc": 0.81452859, "epoch": 2.14324795, "global_step/max_steps": "540/756", "percentage": "71.43%", "elapsed_time": "3h 41m 46s", "remaining_time": "1h 28m 42s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040581}
{"loss": 0.61000729, "grad_norm": 0.46830264, "learning_rate": 1.897e-05, "token_acc": 0.81341713, "epoch": 2.18303905, "global_step/max_steps": "550/756", "percentage": "72.75%", "elapsed_time": "3h 45m 37s", "remaining_time": "1h 24m 30s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040628}
{"eval_loss": 2.45355225, "eval_runtime": 83.0503, "eval_samples_per_second": 2.203, "eval_steps_per_second": 0.554, "eval_token_acc": 0.81649409, "epoch": 2.18303905, "global_step/max_steps": "550/756", "percentage": "72.75%", "elapsed_time": "3h 47m 0s", "remaining_time": "1h 25m 1s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040379}
{"loss": 0.59827032, "grad_norm": 0.45767462, "learning_rate": 1.729e-05, "token_acc": 0.81744438, "epoch": 2.22283014, "global_step/max_steps": "560/756", "percentage": "74.07%", "elapsed_time": "3h 50m 44s", "remaining_time": "1h 20m 45s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040448}
{"loss": 0.61155815, "grad_norm": 0.41861716, "learning_rate": 1.566e-05, "token_acc": 0.81309193, "epoch": 2.26262124, "global_step/max_steps": "570/756", "percentage": "75.40%", "elapsed_time": "3h 54m 41s", "remaining_time": "1h 16m 35s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040478}
{"loss": 0.59858131, "grad_norm": 0.48302349, "learning_rate": 1.411e-05, "token_acc": 0.81787965, "epoch": 2.30241234, "global_step/max_steps": "580/756", "percentage": "76.72%", "elapsed_time": "3h 58m 28s", "remaining_time": "1h 12m 21s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040535}
{"loss": 0.59993834, "grad_norm": 0.45890245, "learning_rate": 1.262e-05, "token_acc": 0.81582084, "epoch": 2.34220343, "global_step/max_steps": "590/756", "percentage": "78.04%", "elapsed_time": "4h 2m 14s", "remaining_time": "1h 8m 9s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040592}
{"loss": 0.60076218, "grad_norm": 0.41991904, "learning_rate": 1.12e-05, "token_acc": 0.81506615, "epoch": 2.38199453, "global_step/max_steps": "600/756", "percentage": "79.37%", "elapsed_time": "4h 6m 8s", "remaining_time": "1h 3m 59s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040626}
{"eval_loss": 2.44298482, "eval_runtime": 81.8948, "eval_samples_per_second": 2.235, "eval_steps_per_second": 0.562, "eval_token_acc": 0.81703102, "epoch": 2.38199453, "global_step/max_steps": "600/756", "percentage": "79.37%", "elapsed_time": "4h 7m 30s", "remaining_time": "1h 4m 21s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040402}
{"loss": 0.60592523, "grad_norm": 0.46010175, "learning_rate": 9.86e-06, "token_acc": 0.81451129, "epoch": 2.42178563, "global_step/max_steps": "610/756", "percentage": "80.69%", "elapsed_time": "4h 11m 20s", "remaining_time": "1h 0m 9s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.04045}
{"loss": 0.60055327, "grad_norm": 0.44221529, "learning_rate": 8.59e-06, "token_acc": 0.81496379, "epoch": 2.46157672, "global_step/max_steps": "620/756", "percentage": "82.01%", "elapsed_time": "4h 15m 3s", "remaining_time": "55m 56s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040513}
{"loss": 0.60854549, "grad_norm": 2.7957778, "learning_rate": 7.41e-06, "token_acc": 0.81355507, "epoch": 2.50136782, "global_step/max_steps": "630/756", "percentage": "83.33%", "elapsed_time": "4h 18m 49s", "remaining_time": "51m 45s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040569}
{"loss": 0.6125968, "grad_norm": 0.5052405, "learning_rate": 6.3e-06, "token_acc": 0.81358937, "epoch": 2.54115892, "global_step/max_steps": "640/756", "percentage": "84.66%", "elapsed_time": "4h 22m 32s", "remaining_time": "47m 35s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040629}
{"loss": 0.61151357, "grad_norm": 0.43676135, "learning_rate": 5.28e-06, "token_acc": 0.81315275, "epoch": 2.58095001, "global_step/max_steps": "650/756", "percentage": "85.98%", "elapsed_time": "4h 26m 12s", "remaining_time": "43m 24s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040695}
{"eval_loss": 2.43642974, "eval_runtime": 81.9213, "eval_samples_per_second": 2.234, "eval_steps_per_second": 0.562, "eval_token_acc": 0.8173546, "epoch": 2.58095001, "global_step/max_steps": "650/756", "percentage": "85.98%", "elapsed_time": "4h 27m 34s", "remaining_time": "43m 38s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040487}
{"loss": 0.60613508, "grad_norm": 0.44147998, "learning_rate": 4.35e-06, "token_acc": 0.81491268, "epoch": 2.62074111, "global_step/max_steps": "660/756", "percentage": "87.30%", "elapsed_time": "4h 31m 20s", "remaining_time": "39m 28s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040539}
{"loss": 0.6118052, "grad_norm": 0.47108141, "learning_rate": 3.5e-06, "token_acc": 0.81243315, "epoch": 2.66053221, "global_step/max_steps": "670/756", "percentage": "88.62%", "elapsed_time": "4h 35m 11s", "remaining_time": "35m 19s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040577}
{"loss": 0.59714527, "grad_norm": 0.43485948, "learning_rate": 2.74e-06, "token_acc": 0.81635524, "epoch": 2.7003233, "global_step/max_steps": "680/756", "percentage": "89.95%", "elapsed_time": "4h 39m 5s", "remaining_time": "31m 11s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040607}
{"loss": 0.60821619, "grad_norm": 0.45727718, "learning_rate": 2.07e-06, "token_acc": 0.81383335, "epoch": 2.7401144, "global_step/max_steps": "690/756", "percentage": "91.27%", "elapsed_time": "4h 42m 47s", "remaining_time": "27m 2s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040666}
{"loss": 0.61759448, "grad_norm": 0.46361193, "learning_rate": 1.49e-06, "token_acc": 0.81136959, "epoch": 2.7799055, "global_step/max_steps": "700/756", "percentage": "92.59%", "elapsed_time": "4h 46m 36s", "remaining_time": "22m 55s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040706}
{"eval_loss": 2.43304062, "eval_runtime": 81.9424, "eval_samples_per_second": 2.233, "eval_steps_per_second": 0.561, "eval_token_acc": 0.81754306, "epoch": 2.7799055, "global_step/max_steps": "700/756", "percentage": "92.59%", "elapsed_time": "4h 47m 58s", "remaining_time": "23m 2s", "memory(GiB)": 31.67, "train_speed(iter/s)": 0.040513}
{"loss": 0.61013942, "grad_norm": 0.46191198, "learning_rate": 1.01e-06, "token_acc": 0.81316501, "epoch": 2.81969659, "global_step/max_steps": "710/756", "percentage": "93.92%", "elapsed_time": "4h 51m 50s", "remaining_time": "18m 54s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040546}
{"loss": 0.59742956, "grad_norm": 0.46738109, "learning_rate": 6.2e-07, "token_acc": 0.81762888, "epoch": 2.85948769, "global_step/max_steps": "720/756", "percentage": "95.24%", "elapsed_time": "4h 55m 31s", "remaining_time": "14m 46s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040605}
{"loss": 0.61027303, "grad_norm": 0.47740796, "learning_rate": 3.2e-07, "token_acc": 0.81291669, "epoch": 2.89927879, "global_step/max_steps": "730/756", "percentage": "96.56%", "elapsed_time": "4h 59m 17s", "remaining_time": "10m 39s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040652}
{"loss": 0.60262928, "grad_norm": 0.39351776, "learning_rate": 1.2e-07, "token_acc": 0.81533332, "epoch": 2.93906988, "global_step/max_steps": "740/756", "percentage": "97.88%", "elapsed_time": "5h 2m 56s", "remaining_time": "6m 33s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040711}
{"loss": 0.60620799, "grad_norm": 0.43030033, "learning_rate": 2e-08, "token_acc": 0.81408437, "epoch": 2.97886098, "global_step/max_steps": "750/756", "percentage": "99.21%", "elapsed_time": "5h 6m 43s", "remaining_time": "2m 27s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040754}
{"eval_loss": 2.43168187, "eval_runtime": 81.8657, "eval_samples_per_second": 2.235, "eval_steps_per_second": 0.562, "eval_token_acc": 0.81765685, "epoch": 2.97886098, "global_step/max_steps": "750/756", "percentage": "99.21%", "elapsed_time": "5h 8m 5s", "remaining_time": "2m 27s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040573}
{"eval_loss": 2.43212795, "eval_runtime": 81.3299, "eval_samples_per_second": 2.25, "eval_steps_per_second": 0.566, "eval_token_acc": 0.81747906, "epoch": 3.0, "global_step/max_steps": "756/756", "percentage": "100.00%", "elapsed_time": "5h 11m 27s", "remaining_time": "0s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040455}
{"train_runtime": 18692.3405, "train_samples_per_second": 0.645, "train_steps_per_second": 0.04, "total_flos": 3.421626729187456e+19, "train_loss": 0.66581451, "epoch": 3.0, "global_step/max_steps": "756/756", "percentage": "100.00%", "elapsed_time": "5h 11m 29s", "remaining_time": "0s", "memory(GiB)": 31.68, "train_speed(iter/s)": 0.040451}