Sidharthkr commited on
Commit
2f6b212
·
1 Parent(s): bb1f7f3

Upload trainer_log_history.jsonl with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_log_history.jsonl +293 -0
trainer_log_history.jsonl ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"loss": 1.5861, "learning_rate": 2.9999999999999997e-05, "epoch": 0.01, "step": 10}
2
+ {"eval_loss": 1.5546112060546875, "eval_runtime": 23.1798, "eval_samples_per_second": 4.314, "eval_steps_per_second": 0.561, "epoch": 0.01, "step": 10}
3
+ {"loss": 1.5346, "learning_rate": 5.9999999999999995e-05, "epoch": 0.03, "step": 20}
4
+ {"eval_loss": 1.4962812662124634, "eval_runtime": 23.5783, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.551, "epoch": 0.03, "step": 20}
5
+ {"loss": 1.4226, "learning_rate": 8.999999999999999e-05, "epoch": 0.04, "step": 30}
6
+ {"eval_loss": 1.3235948085784912, "eval_runtime": 23.7142, "eval_samples_per_second": 4.217, "eval_steps_per_second": 0.548, "epoch": 0.04, "step": 30}
7
+ {"loss": 1.1968, "learning_rate": 0.00011999999999999999, "epoch": 0.06, "step": 40}
8
+ {"eval_loss": 1.0622987747192383, "eval_runtime": 23.6224, "eval_samples_per_second": 4.233, "eval_steps_per_second": 0.55, "epoch": 0.06, "step": 40}
9
+ {"loss": 1.0149, "learning_rate": 0.00015, "epoch": 0.07, "step": 50}
10
+ {"eval_loss": 0.9303178191184998, "eval_runtime": 23.6914, "eval_samples_per_second": 4.221, "eval_steps_per_second": 0.549, "epoch": 0.07, "step": 50}
11
+ {"loss": 0.8012, "learning_rate": 0.00017999999999999998, "epoch": 0.08, "step": 60}
12
+ {"eval_loss": 0.884680986404419, "eval_runtime": 23.5854, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 0.08, "step": 60}
13
+ {"loss": 0.9176, "learning_rate": 0.00020999999999999998, "epoch": 0.1, "step": 70}
14
+ {"eval_loss": 0.865105152130127, "eval_runtime": 23.5816, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.551, "epoch": 0.1, "step": 70}
15
+ {"loss": 0.8454, "learning_rate": 0.00023999999999999998, "epoch": 0.11, "step": 80}
16
+ {"eval_loss": 0.8409528136253357, "eval_runtime": 23.6657, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.549, "epoch": 0.11, "step": 80}
17
+ {"loss": 0.7406, "learning_rate": 0.00027, "epoch": 0.12, "step": 90}
18
+ {"eval_loss": 0.8041796088218689, "eval_runtime": 23.705, "eval_samples_per_second": 4.219, "eval_steps_per_second": 0.548, "epoch": 0.12, "step": 90}
19
+ {"loss": 0.8262, "learning_rate": 0.0003, "epoch": 0.14, "step": 100}
20
+ {"eval_loss": 0.7901164293289185, "eval_runtime": 23.5789, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.551, "epoch": 0.14, "step": 100}
21
+ {"loss": 0.7947, "learning_rate": 0.00029958041958041954, "epoch": 0.15, "step": 110}
22
+ {"eval_loss": 0.7940295338630676, "eval_runtime": 23.6534, "eval_samples_per_second": 4.228, "eval_steps_per_second": 0.55, "epoch": 0.15, "step": 110}
23
+ {"loss": 0.8331, "learning_rate": 0.00029916083916083915, "epoch": 0.17, "step": 120}
24
+ {"eval_loss": 0.7761766314506531, "eval_runtime": 23.5587, "eval_samples_per_second": 4.245, "eval_steps_per_second": 0.552, "epoch": 0.17, "step": 120}
25
+ {"loss": 0.7844, "learning_rate": 0.0002987412587412587, "epoch": 0.18, "step": 130}
26
+ {"eval_loss": 0.764004647731781, "eval_runtime": 23.6807, "eval_samples_per_second": 4.223, "eval_steps_per_second": 0.549, "epoch": 0.18, "step": 130}
27
+ {"loss": 0.7601, "learning_rate": 0.0002983216783216783, "epoch": 0.19, "step": 140}
28
+ {"eval_loss": 0.7403784990310669, "eval_runtime": 23.5213, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.553, "epoch": 0.19, "step": 140}
29
+ {"loss": 0.7325, "learning_rate": 0.0002979020979020979, "epoch": 0.21, "step": 150}
30
+ {"eval_loss": 0.7203081250190735, "eval_runtime": 23.5515, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 0.21, "step": 150}
31
+ {"loss": 0.7265, "learning_rate": 0.00029748251748251746, "epoch": 0.22, "step": 160}
32
+ {"eval_loss": 0.7069114446640015, "eval_runtime": 23.5863, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 0.22, "step": 160}
33
+ {"loss": 0.7468, "learning_rate": 0.000297062937062937, "epoch": 0.23, "step": 170}
34
+ {"eval_loss": 0.7000756859779358, "eval_runtime": 23.5962, "eval_samples_per_second": 4.238, "eval_steps_per_second": 0.551, "epoch": 0.23, "step": 170}
35
+ {"loss": 0.7092, "learning_rate": 0.00029664335664335664, "epoch": 0.25, "step": 180}
36
+ {"eval_loss": 0.6928163766860962, "eval_runtime": 23.5955, "eval_samples_per_second": 4.238, "eval_steps_per_second": 0.551, "epoch": 0.25, "step": 180}
37
+ {"loss": 0.7378, "learning_rate": 0.0002962237762237762, "epoch": 0.26, "step": 190}
38
+ {"eval_loss": 0.6906119585037231, "eval_runtime": 23.5746, "eval_samples_per_second": 4.242, "eval_steps_per_second": 0.551, "epoch": 0.26, "step": 190}
39
+ {"loss": 0.7679, "learning_rate": 0.00029580419580419576, "epoch": 0.28, "step": 200}
40
+ {"eval_loss": 0.6861391663551331, "eval_runtime": 23.6069, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.551, "epoch": 0.28, "step": 200}
41
+ {"loss": 0.7512, "learning_rate": 0.0002953846153846154, "epoch": 0.29, "step": 210}
42
+ {"eval_loss": 0.6841049194335938, "eval_runtime": 23.5512, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 0.29, "step": 210}
43
+ {"loss": 0.6518, "learning_rate": 0.00029496503496503494, "epoch": 0.3, "step": 220}
44
+ {"eval_loss": 0.6800382137298584, "eval_runtime": 23.5815, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.551, "epoch": 0.3, "step": 220}
45
+ {"loss": 0.7361, "learning_rate": 0.0002945454545454545, "epoch": 0.32, "step": 230}
46
+ {"eval_loss": 0.6794602870941162, "eval_runtime": 23.6023, "eval_samples_per_second": 4.237, "eval_steps_per_second": 0.551, "epoch": 0.32, "step": 230}
47
+ {"loss": 0.7661, "learning_rate": 0.0002941258741258741, "epoch": 0.33, "step": 240}
48
+ {"eval_loss": 0.6810237169265747, "eval_runtime": 23.55, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 0.33, "step": 240}
49
+ {"loss": 0.5877, "learning_rate": 0.0002937062937062937, "epoch": 0.34, "step": 250}
50
+ {"eval_loss": 0.6788613796234131, "eval_runtime": 23.5943, "eval_samples_per_second": 4.238, "eval_steps_per_second": 0.551, "epoch": 0.34, "step": 250}
51
+ {"loss": 0.655, "learning_rate": 0.00029328671328671325, "epoch": 0.36, "step": 260}
52
+ {"eval_loss": 0.671405553817749, "eval_runtime": 23.6778, "eval_samples_per_second": 4.223, "eval_steps_per_second": 0.549, "epoch": 0.36, "step": 260}
53
+ {"loss": 0.6352, "learning_rate": 0.0002928671328671328, "epoch": 0.37, "step": 270}
54
+ {"eval_loss": 0.6693644523620605, "eval_runtime": 23.5528, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 0.37, "step": 270}
55
+ {"loss": 0.6746, "learning_rate": 0.0002924475524475524, "epoch": 0.39, "step": 280}
56
+ {"eval_loss": 0.6700127124786377, "eval_runtime": 23.5688, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "epoch": 0.39, "step": 280}
57
+ {"loss": 0.7004, "learning_rate": 0.000292027972027972, "epoch": 0.4, "step": 290}
58
+ {"eval_loss": 0.6715095639228821, "eval_runtime": 23.6593, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.549, "epoch": 0.4, "step": 290}
59
+ {"loss": 0.6779, "learning_rate": 0.00029160839160839155, "epoch": 0.41, "step": 300}
60
+ {"eval_loss": 0.6686553359031677, "eval_runtime": 23.5853, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 0.41, "step": 300}
61
+ {"loss": 0.621, "learning_rate": 0.00029118881118881117, "epoch": 0.43, "step": 310}
62
+ {"eval_loss": 0.6673153042793274, "eval_runtime": 23.5281, "eval_samples_per_second": 4.25, "eval_steps_per_second": 0.553, "epoch": 0.43, "step": 310}
63
+ {"loss": 0.6755, "learning_rate": 0.00029076923076923073, "epoch": 0.44, "step": 320}
64
+ {"eval_loss": 0.6658429503440857, "eval_runtime": 23.5721, "eval_samples_per_second": 4.242, "eval_steps_per_second": 0.552, "epoch": 0.44, "step": 320}
65
+ {"loss": 0.6725, "learning_rate": 0.00029034965034965035, "epoch": 0.46, "step": 330}
66
+ {"eval_loss": 0.6701672077178955, "eval_runtime": 23.6198, "eval_samples_per_second": 4.234, "eval_steps_per_second": 0.55, "epoch": 0.46, "step": 330}
67
+ {"loss": 0.7197, "learning_rate": 0.0002899300699300699, "epoch": 0.47, "step": 340}
68
+ {"eval_loss": 0.6637719869613647, "eval_runtime": 23.5613, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 0.47, "step": 340}
69
+ {"loss": 0.7827, "learning_rate": 0.0002895104895104895, "epoch": 0.48, "step": 350}
70
+ {"eval_loss": 0.665269672870636, "eval_runtime": 23.7009, "eval_samples_per_second": 4.219, "eval_steps_per_second": 0.549, "epoch": 0.48, "step": 350}
71
+ {"loss": 0.5587, "learning_rate": 0.00028909090909090904, "epoch": 0.5, "step": 360}
72
+ {"eval_loss": 0.6614734530448914, "eval_runtime": 23.5885, "eval_samples_per_second": 4.239, "eval_steps_per_second": 0.551, "epoch": 0.5, "step": 360}
73
+ {"loss": 0.6846, "learning_rate": 0.00028867132867132865, "epoch": 0.51, "step": 370}
74
+ {"eval_loss": 0.6604605317115784, "eval_runtime": 23.5867, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 0.51, "step": 370}
75
+ {"loss": 0.5939, "learning_rate": 0.0002882517482517482, "epoch": 0.52, "step": 380}
76
+ {"eval_loss": 0.6580032110214233, "eval_runtime": 23.6794, "eval_samples_per_second": 4.223, "eval_steps_per_second": 0.549, "epoch": 0.52, "step": 380}
77
+ {"loss": 0.804, "learning_rate": 0.0002878321678321678, "epoch": 0.54, "step": 390}
78
+ {"eval_loss": 0.6574468016624451, "eval_runtime": 23.5623, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 0.54, "step": 390}
79
+ {"loss": 0.6946, "learning_rate": 0.0002874125874125874, "epoch": 0.55, "step": 400}
80
+ {"eval_loss": 0.6552902460098267, "eval_runtime": 23.5759, "eval_samples_per_second": 4.242, "eval_steps_per_second": 0.551, "epoch": 0.55, "step": 400}
81
+ {"loss": 0.6129, "learning_rate": 0.00028699300699300696, "epoch": 0.57, "step": 410}
82
+ {"eval_loss": 0.6540884375572205, "eval_runtime": 23.6844, "eval_samples_per_second": 4.222, "eval_steps_per_second": 0.549, "epoch": 0.57, "step": 410}
83
+ {"loss": 0.6499, "learning_rate": 0.0002865734265734266, "epoch": 0.58, "step": 420}
84
+ {"eval_loss": 0.6544709205627441, "eval_runtime": 23.486, "eval_samples_per_second": 4.258, "eval_steps_per_second": 0.554, "epoch": 0.58, "step": 420}
85
+ {"loss": 0.6199, "learning_rate": 0.00028615384615384614, "epoch": 0.59, "step": 430}
86
+ {"eval_loss": 0.6521677374839783, "eval_runtime": 23.5787, "eval_samples_per_second": 4.241, "eval_steps_per_second": 0.551, "epoch": 0.59, "step": 430}
87
+ {"loss": 0.5753, "learning_rate": 0.0002857342657342657, "epoch": 0.61, "step": 440}
88
+ {"eval_loss": 0.6501143574714661, "eval_runtime": 23.5895, "eval_samples_per_second": 4.239, "eval_steps_per_second": 0.551, "epoch": 0.61, "step": 440}
89
+ {"loss": 0.6175, "learning_rate": 0.00028531468531468526, "epoch": 0.62, "step": 450}
90
+ {"eval_loss": 0.6489108204841614, "eval_runtime": 23.5249, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.553, "epoch": 0.62, "step": 450}
91
+ {"loss": 0.7238, "learning_rate": 0.0002848951048951049, "epoch": 0.63, "step": 460}
92
+ {"eval_loss": 0.6477003693580627, "eval_runtime": 23.6246, "eval_samples_per_second": 4.233, "eval_steps_per_second": 0.55, "epoch": 0.63, "step": 460}
93
+ {"loss": 0.7032, "learning_rate": 0.00028447552447552444, "epoch": 0.65, "step": 470}
94
+ {"eval_loss": 0.6449102759361267, "eval_runtime": 23.69, "eval_samples_per_second": 4.221, "eval_steps_per_second": 0.549, "epoch": 0.65, "step": 470}
95
+ {"loss": 0.6022, "learning_rate": 0.000284055944055944, "epoch": 0.66, "step": 480}
96
+ {"eval_loss": 0.6427639126777649, "eval_runtime": 23.568, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "epoch": 0.66, "step": 480}
97
+ {"loss": 0.6425, "learning_rate": 0.0002836363636363636, "epoch": 0.68, "step": 490}
98
+ {"eval_loss": 0.6416438221931458, "eval_runtime": 23.5869, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 0.68, "step": 490}
99
+ {"loss": 0.6723, "learning_rate": 0.0002832167832167832, "epoch": 0.69, "step": 500}
100
+ {"eval_loss": 0.6422706842422485, "eval_runtime": 23.7408, "eval_samples_per_second": 4.212, "eval_steps_per_second": 0.548, "epoch": 0.69, "step": 500}
101
+ {"loss": 0.65, "learning_rate": 0.0002827972027972028, "epoch": 0.7, "step": 510}
102
+ {"eval_loss": 0.640707790851593, "eval_runtime": 23.6305, "eval_samples_per_second": 4.232, "eval_steps_per_second": 0.55, "epoch": 0.7, "step": 510}
103
+ {"loss": 0.5461, "learning_rate": 0.00028237762237762236, "epoch": 0.72, "step": 520}
104
+ {"eval_loss": 0.6413621306419373, "eval_runtime": 23.5561, "eval_samples_per_second": 4.245, "eval_steps_per_second": 0.552, "epoch": 0.72, "step": 520}
105
+ {"loss": 0.5654, "learning_rate": 0.0002819580419580419, "epoch": 0.73, "step": 530}
106
+ {"eval_loss": 0.6443601250648499, "eval_runtime": 23.6625, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.549, "epoch": 0.73, "step": 530}
107
+ {"loss": 0.685, "learning_rate": 0.0002815384615384615, "epoch": 0.74, "step": 540}
108
+ {"eval_loss": 0.6444653868675232, "eval_runtime": 23.6821, "eval_samples_per_second": 4.223, "eval_steps_per_second": 0.549, "epoch": 0.74, "step": 540}
109
+ {"loss": 0.6196, "learning_rate": 0.0002811188811188811, "epoch": 0.76, "step": 550}
110
+ {"eval_loss": 0.6420193314552307, "eval_runtime": 23.6122, "eval_samples_per_second": 4.235, "eval_steps_per_second": 0.551, "epoch": 0.76, "step": 550}
111
+ {"loss": 0.787, "learning_rate": 0.00028069930069930067, "epoch": 0.77, "step": 560}
112
+ {"eval_loss": 0.6415860652923584, "eval_runtime": 23.6089, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.551, "epoch": 0.77, "step": 560}
113
+ {"loss": 0.6576, "learning_rate": 0.00028027972027972023, "epoch": 0.79, "step": 570}
114
+ {"eval_loss": 0.643482506275177, "eval_runtime": 23.6156, "eval_samples_per_second": 4.234, "eval_steps_per_second": 0.55, "epoch": 0.79, "step": 570}
115
+ {"loss": 0.6749, "learning_rate": 0.00027986013986013985, "epoch": 0.8, "step": 580}
116
+ {"eval_loss": 0.6405051350593567, "eval_runtime": 23.628, "eval_samples_per_second": 4.232, "eval_steps_per_second": 0.55, "epoch": 0.8, "step": 580}
117
+ {"loss": 0.63, "learning_rate": 0.0002794405594405594, "epoch": 0.81, "step": 590}
118
+ {"eval_loss": 0.6396690011024475, "eval_runtime": 23.6168, "eval_samples_per_second": 4.234, "eval_steps_per_second": 0.55, "epoch": 0.81, "step": 590}
119
+ {"loss": 0.6797, "learning_rate": 0.00027902097902097903, "epoch": 0.83, "step": 600}
120
+ {"eval_loss": 0.6393585801124573, "eval_runtime": 23.5842, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 0.83, "step": 600}
121
+ {"loss": 0.6553, "learning_rate": 0.00027860139860139854, "epoch": 0.84, "step": 610}
122
+ {"eval_loss": 0.6381799578666687, "eval_runtime": 23.5984, "eval_samples_per_second": 4.238, "eval_steps_per_second": 0.551, "epoch": 0.84, "step": 610}
123
+ {"loss": 0.5907, "learning_rate": 0.00027818181818181815, "epoch": 0.86, "step": 620}
124
+ {"eval_loss": 0.638024091720581, "eval_runtime": 23.6565, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.55, "epoch": 0.86, "step": 620}
125
+ {"loss": 0.6526, "learning_rate": 0.0002777622377622377, "epoch": 0.87, "step": 630}
126
+ {"eval_loss": 0.6353902816772461, "eval_runtime": 23.6865, "eval_samples_per_second": 4.222, "eval_steps_per_second": 0.549, "epoch": 0.87, "step": 630}
127
+ {"loss": 0.6027, "learning_rate": 0.00027734265734265733, "epoch": 0.88, "step": 640}
128
+ {"eval_loss": 0.6318895816802979, "eval_runtime": 23.544, "eval_samples_per_second": 4.247, "eval_steps_per_second": 0.552, "epoch": 0.88, "step": 640}
129
+ {"loss": 0.5499, "learning_rate": 0.0002769230769230769, "epoch": 0.9, "step": 650}
130
+ {"eval_loss": 0.6283926963806152, "eval_runtime": 23.5591, "eval_samples_per_second": 4.245, "eval_steps_per_second": 0.552, "epoch": 0.9, "step": 650}
131
+ {"loss": 0.5983, "learning_rate": 0.00027650349650349646, "epoch": 0.91, "step": 660}
132
+ {"eval_loss": 0.6258216500282288, "eval_runtime": 23.6975, "eval_samples_per_second": 4.22, "eval_steps_per_second": 0.549, "epoch": 0.91, "step": 660}
133
+ {"loss": 0.6189, "learning_rate": 0.0002760839160839161, "epoch": 0.92, "step": 670}
134
+ {"eval_loss": 0.624355673789978, "eval_runtime": 23.5901, "eval_samples_per_second": 4.239, "eval_steps_per_second": 0.551, "epoch": 0.92, "step": 670}
135
+ {"loss": 0.6977, "learning_rate": 0.00027566433566433564, "epoch": 0.94, "step": 680}
136
+ {"eval_loss": 0.6230638027191162, "eval_runtime": 23.5414, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.552, "epoch": 0.94, "step": 680}
137
+ {"loss": 0.6097, "learning_rate": 0.00027524475524475525, "epoch": 0.95, "step": 690}
138
+ {"eval_loss": 0.6207563281059265, "eval_runtime": 23.5276, "eval_samples_per_second": 4.25, "eval_steps_per_second": 0.553, "epoch": 0.95, "step": 690}
139
+ {"loss": 0.5457, "learning_rate": 0.00027482517482517476, "epoch": 0.97, "step": 700}
140
+ {"eval_loss": 0.6207029223442078, "eval_runtime": 23.5497, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 0.97, "step": 700}
141
+ {"loss": 0.5926, "learning_rate": 0.0002744055944055944, "epoch": 0.98, "step": 710}
142
+ {"eval_loss": 0.6202435493469238, "eval_runtime": 23.685, "eval_samples_per_second": 4.222, "eval_steps_per_second": 0.549, "epoch": 0.98, "step": 710}
143
+ {"loss": 0.6451, "learning_rate": 0.00027398601398601394, "epoch": 0.99, "step": 720}
144
+ {"eval_loss": 0.6203433871269226, "eval_runtime": 23.5624, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 0.99, "step": 720}
145
+ {"loss": 0.5877, "learning_rate": 0.00027356643356643356, "epoch": 1.01, "step": 730}
146
+ {"eval_loss": 0.6191022396087646, "eval_runtime": 23.5692, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "epoch": 1.01, "step": 730}
147
+ {"loss": 0.7144, "learning_rate": 0.0002731468531468531, "epoch": 1.02, "step": 740}
148
+ {"eval_loss": 0.6193973422050476, "eval_runtime": 23.712, "eval_samples_per_second": 4.217, "eval_steps_per_second": 0.548, "epoch": 1.02, "step": 740}
149
+ {"loss": 0.5671, "learning_rate": 0.0002727272727272727, "epoch": 1.03, "step": 750}
150
+ {"eval_loss": 0.6172534227371216, "eval_runtime": 23.541, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.552, "epoch": 1.03, "step": 750}
151
+ {"loss": 0.6423, "learning_rate": 0.0002723076923076923, "epoch": 1.05, "step": 760}
152
+ {"eval_loss": 0.6179550290107727, "eval_runtime": 23.5459, "eval_samples_per_second": 4.247, "eval_steps_per_second": 0.552, "epoch": 1.05, "step": 760}
153
+ {"loss": 0.657, "learning_rate": 0.00027188811188811186, "epoch": 1.06, "step": 770}
154
+ {"eval_loss": 0.6190667152404785, "eval_runtime": 23.6657, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.549, "epoch": 1.06, "step": 770}
155
+ {"loss": 0.6178, "learning_rate": 0.0002714685314685315, "epoch": 1.08, "step": 780}
156
+ {"eval_loss": 0.6176871657371521, "eval_runtime": 23.6599, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.549, "epoch": 1.08, "step": 780}
157
+ {"loss": 0.6659, "learning_rate": 0.000271048951048951, "epoch": 1.09, "step": 790}
158
+ {"eval_loss": 0.6174372434616089, "eval_runtime": 23.5345, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.09, "step": 790}
159
+ {"loss": 0.6216, "learning_rate": 0.0002706293706293706, "epoch": 1.1, "step": 800}
160
+ {"eval_loss": 0.6179863214492798, "eval_runtime": 23.5446, "eval_samples_per_second": 4.247, "eval_steps_per_second": 0.552, "epoch": 1.1, "step": 800}
161
+ {"loss": 0.5623, "learning_rate": 0.00027020979020979017, "epoch": 1.12, "step": 810}
162
+ {"eval_loss": 0.6160795092582703, "eval_runtime": 23.6019, "eval_samples_per_second": 4.237, "eval_steps_per_second": 0.551, "epoch": 1.12, "step": 810}
163
+ {"loss": 0.544, "learning_rate": 0.0002697902097902098, "epoch": 1.13, "step": 820}
164
+ {"eval_loss": 0.6154199838638306, "eval_runtime": 23.5405, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.552, "epoch": 1.13, "step": 820}
165
+ {"loss": 0.5405, "learning_rate": 0.00026937062937062935, "epoch": 1.14, "step": 830}
166
+ {"eval_loss": 0.6137506365776062, "eval_runtime": 23.6357, "eval_samples_per_second": 4.231, "eval_steps_per_second": 0.55, "epoch": 1.14, "step": 830}
167
+ {"loss": 0.5871, "learning_rate": 0.0002689510489510489, "epoch": 1.16, "step": 840}
168
+ {"eval_loss": 0.6168184876441956, "eval_runtime": 23.587, "eval_samples_per_second": 4.24, "eval_steps_per_second": 0.551, "epoch": 1.16, "step": 840}
169
+ {"loss": 0.688, "learning_rate": 0.0002685314685314685, "epoch": 1.17, "step": 850}
170
+ {"eval_loss": 0.6159818768501282, "eval_runtime": 23.6104, "eval_samples_per_second": 4.235, "eval_steps_per_second": 0.551, "epoch": 1.17, "step": 850}
171
+ {"loss": 0.6124, "learning_rate": 0.0002681118881118881, "epoch": 1.19, "step": 860}
172
+ {"eval_loss": 0.611853837966919, "eval_runtime": 23.5647, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.19, "step": 860}
173
+ {"loss": 0.629, "learning_rate": 0.0002676923076923077, "epoch": 1.2, "step": 870}
174
+ {"eval_loss": 0.6117254495620728, "eval_runtime": 23.5692, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "epoch": 1.2, "step": 870}
175
+ {"loss": 0.5151, "learning_rate": 0.0002672727272727272, "epoch": 1.21, "step": 880}
176
+ {"eval_loss": 0.6104335784912109, "eval_runtime": 23.6021, "eval_samples_per_second": 4.237, "eval_steps_per_second": 0.551, "epoch": 1.21, "step": 880}
177
+ {"loss": 0.5627, "learning_rate": 0.00026685314685314683, "epoch": 1.23, "step": 890}
178
+ {"eval_loss": 0.6086432933807373, "eval_runtime": 23.6595, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.549, "epoch": 1.23, "step": 890}
179
+ {"loss": 0.5814, "learning_rate": 0.0002664335664335664, "epoch": 1.24, "step": 900}
180
+ {"eval_loss": 0.6092746257781982, "eval_runtime": 23.5251, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.553, "epoch": 1.24, "step": 900}
181
+ {"loss": 0.5602, "learning_rate": 0.000266013986013986, "epoch": 1.26, "step": 910}
182
+ {"eval_loss": 0.608718752861023, "eval_runtime": 23.5334, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.26, "step": 910}
183
+ {"loss": 0.6004, "learning_rate": 0.0002655944055944056, "epoch": 1.27, "step": 920}
184
+ {"eval_loss": 0.6084469556808472, "eval_runtime": 23.5616, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.27, "step": 920}
185
+ {"loss": 0.5979, "learning_rate": 0.00026517482517482514, "epoch": 1.28, "step": 930}
186
+ {"eval_loss": 0.6070427298545837, "eval_runtime": 23.5628, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.28, "step": 930}
187
+ {"loss": 0.6662, "learning_rate": 0.00026475524475524475, "epoch": 1.3, "step": 940}
188
+ {"eval_loss": 0.607434868812561, "eval_runtime": 23.6199, "eval_samples_per_second": 4.234, "eval_steps_per_second": 0.55, "epoch": 1.3, "step": 940}
189
+ {"loss": 0.6447, "learning_rate": 0.0002643356643356643, "epoch": 1.31, "step": 950}
190
+ {"eval_loss": 0.6066017746925354, "eval_runtime": 23.5508, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 1.31, "step": 950}
191
+ {"loss": 0.588, "learning_rate": 0.00026391608391608393, "epoch": 1.32, "step": 960}
192
+ {"eval_loss": 0.605922281742096, "eval_runtime": 23.5334, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.32, "step": 960}
193
+ {"loss": 0.6808, "learning_rate": 0.00026349650349650344, "epoch": 1.34, "step": 970}
194
+ {"eval_loss": 0.6060763597488403, "eval_runtime": 23.6283, "eval_samples_per_second": 4.232, "eval_steps_per_second": 0.55, "epoch": 1.34, "step": 970}
195
+ {"loss": 0.7089, "learning_rate": 0.00026307692307692306, "epoch": 1.35, "step": 980}
196
+ {"eval_loss": 0.6056197881698608, "eval_runtime": 23.5199, "eval_samples_per_second": 4.252, "eval_steps_per_second": 0.553, "epoch": 1.35, "step": 980}
197
+ {"loss": 0.6435, "learning_rate": 0.0002626573426573426, "epoch": 1.37, "step": 990}
198
+ {"eval_loss": 0.6043457388877869, "eval_runtime": 23.4979, "eval_samples_per_second": 4.256, "eval_steps_per_second": 0.553, "epoch": 1.37, "step": 990}
199
+ {"loss": 0.5691, "learning_rate": 0.00026223776223776224, "epoch": 1.38, "step": 1000}
200
+ {"eval_loss": 0.6017763018608093, "eval_runtime": 23.6214, "eval_samples_per_second": 4.233, "eval_steps_per_second": 0.55, "epoch": 1.38, "step": 1000}
201
+ {"loss": 0.4584, "learning_rate": 0.0002618181818181818, "epoch": 1.39, "step": 1010}
202
+ {"eval_loss": 0.6021450757980347, "eval_runtime": 23.5098, "eval_samples_per_second": 4.254, "eval_steps_per_second": 0.553, "epoch": 1.39, "step": 1010}
203
+ {"loss": 0.6848, "learning_rate": 0.00026139860139860136, "epoch": 1.41, "step": 1020}
204
+ {"eval_loss": 0.6020896434783936, "eval_runtime": 23.5, "eval_samples_per_second": 4.255, "eval_steps_per_second": 0.553, "epoch": 1.41, "step": 1020}
205
+ {"loss": 0.5807, "learning_rate": 0.000260979020979021, "epoch": 1.42, "step": 1030}
206
+ {"eval_loss": 0.6019191145896912, "eval_runtime": 23.6199, "eval_samples_per_second": 4.234, "eval_steps_per_second": 0.55, "epoch": 1.42, "step": 1030}
207
+ {"loss": 0.5409, "learning_rate": 0.00026055944055944054, "epoch": 1.43, "step": 1040}
208
+ {"eval_loss": 0.6014266014099121, "eval_runtime": 23.4902, "eval_samples_per_second": 4.257, "eval_steps_per_second": 0.553, "epoch": 1.43, "step": 1040}
209
+ {"loss": 0.5266, "learning_rate": 0.0002601398601398601, "epoch": 1.45, "step": 1050}
210
+ {"eval_loss": 0.6019847989082336, "eval_runtime": 23.5346, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.45, "step": 1050}
211
+ {"loss": 0.6526, "learning_rate": 0.00025972027972027967, "epoch": 1.46, "step": 1060}
212
+ {"eval_loss": 0.6054437756538391, "eval_runtime": 23.5086, "eval_samples_per_second": 4.254, "eval_steps_per_second": 0.553, "epoch": 1.46, "step": 1060}
213
+ {"loss": 0.6598, "learning_rate": 0.0002593006993006993, "epoch": 1.48, "step": 1070}
214
+ {"eval_loss": 0.6089524626731873, "eval_runtime": 23.5442, "eval_samples_per_second": 4.247, "eval_steps_per_second": 0.552, "epoch": 1.48, "step": 1070}
215
+ {"loss": 0.4933, "learning_rate": 0.00025888111888111885, "epoch": 1.49, "step": 1080}
216
+ {"eval_loss": 0.601382315158844, "eval_runtime": 23.5524, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 1.49, "step": 1080}
217
+ {"loss": 0.5707, "learning_rate": 0.00025846153846153846, "epoch": 1.5, "step": 1090}
218
+ {"eval_loss": 0.5991722345352173, "eval_runtime": 23.5666, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "epoch": 1.5, "step": 1090}
219
+ {"loss": 0.7365, "learning_rate": 0.000258041958041958, "epoch": 1.52, "step": 1100}
220
+ {"eval_loss": 0.6007959246635437, "eval_runtime": 23.659, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.549, "epoch": 1.52, "step": 1100}
221
+ {"loss": 0.5684, "learning_rate": 0.0002576223776223776, "epoch": 1.53, "step": 1110}
222
+ {"eval_loss": 0.5978883504867554, "eval_runtime": 23.557, "eval_samples_per_second": 4.245, "eval_steps_per_second": 0.552, "epoch": 1.53, "step": 1110}
223
+ {"loss": 0.6895, "learning_rate": 0.0002572027972027972, "epoch": 1.54, "step": 1120}
224
+ {"eval_loss": 0.5960245728492737, "eval_runtime": 23.5344, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.54, "step": 1120}
225
+ {"loss": 0.5413, "learning_rate": 0.00025678321678321677, "epoch": 1.56, "step": 1130}
226
+ {"eval_loss": 0.5944367051124573, "eval_runtime": 23.6758, "eval_samples_per_second": 4.224, "eval_steps_per_second": 0.549, "epoch": 1.56, "step": 1130}
227
+ {"loss": 0.6234, "learning_rate": 0.00025636363636363633, "epoch": 1.57, "step": 1140}
228
+ {"eval_loss": 0.5943716764450073, "eval_runtime": 23.5334, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.57, "step": 1140}
229
+ {"loss": 0.4974, "learning_rate": 0.0002559440559440559, "epoch": 1.59, "step": 1150}
230
+ {"eval_loss": 0.5943745970726013, "eval_runtime": 23.6205, "eval_samples_per_second": 4.234, "eval_steps_per_second": 0.55, "epoch": 1.59, "step": 1150}
231
+ {"loss": 0.5585, "learning_rate": 0.0002555244755244755, "epoch": 1.6, "step": 1160}
232
+ {"eval_loss": 0.5932533740997314, "eval_runtime": 23.5191, "eval_samples_per_second": 4.252, "eval_steps_per_second": 0.553, "epoch": 1.6, "step": 1160}
233
+ {"loss": 0.6533, "learning_rate": 0.0002551048951048951, "epoch": 1.61, "step": 1170}
234
+ {"eval_loss": 0.5927255749702454, "eval_runtime": 23.4764, "eval_samples_per_second": 4.26, "eval_steps_per_second": 0.554, "epoch": 1.61, "step": 1170}
235
+ {"loss": 0.5602, "learning_rate": 0.0002546853146853147, "epoch": 1.63, "step": 1180}
236
+ {"eval_loss": 0.5937183499336243, "eval_runtime": 23.6869, "eval_samples_per_second": 4.222, "eval_steps_per_second": 0.549, "epoch": 1.63, "step": 1180}
237
+ {"loss": 0.658, "learning_rate": 0.00025426573426573425, "epoch": 1.64, "step": 1190}
238
+ {"eval_loss": 0.5941335558891296, "eval_runtime": 23.6582, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.549, "epoch": 1.64, "step": 1190}
239
+ {"loss": 0.5749, "learning_rate": 0.0002538461538461538, "epoch": 1.66, "step": 1200}
240
+ {"eval_loss": 0.5928318500518799, "eval_runtime": 23.5647, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.66, "step": 1200}
241
+ {"loss": 0.6214, "learning_rate": 0.00025342657342657343, "epoch": 1.67, "step": 1210}
242
+ {"eval_loss": 0.5921968221664429, "eval_runtime": 23.5253, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.553, "epoch": 1.67, "step": 1210}
243
+ {"loss": 0.5356, "learning_rate": 0.000253006993006993, "epoch": 1.68, "step": 1220}
244
+ {"eval_loss": 0.5913729667663574, "eval_runtime": 23.5345, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.68, "step": 1220}
245
+ {"loss": 0.616, "learning_rate": 0.00025258741258741256, "epoch": 1.7, "step": 1230}
246
+ {"eval_loss": 0.5925624370574951, "eval_runtime": 23.56, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.7, "step": 1230}
247
+ {"loss": 0.6622, "learning_rate": 0.0002521678321678321, "epoch": 1.71, "step": 1240}
248
+ {"eval_loss": 0.591957688331604, "eval_runtime": 23.6371, "eval_samples_per_second": 4.231, "eval_steps_per_second": 0.55, "epoch": 1.71, "step": 1240}
249
+ {"loss": 0.5844, "learning_rate": 0.00025174825174825174, "epoch": 1.72, "step": 1250}
250
+ {"eval_loss": 0.5911493897438049, "eval_runtime": 23.6677, "eval_samples_per_second": 4.225, "eval_steps_per_second": 0.549, "epoch": 1.72, "step": 1250}
251
+ {"loss": 0.5539, "learning_rate": 0.0002513286713286713, "epoch": 1.74, "step": 1260}
252
+ {"eval_loss": 0.5910014510154724, "eval_runtime": 23.552, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 1.74, "step": 1260}
253
+ {"loss": 0.5968, "learning_rate": 0.00025090909090909086, "epoch": 1.75, "step": 1270}
254
+ {"eval_loss": 0.5909925699234009, "eval_runtime": 23.5627, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.75, "step": 1270}
255
+ {"loss": 0.4834, "learning_rate": 0.0002504895104895105, "epoch": 1.77, "step": 1280}
256
+ {"eval_loss": 0.5910032987594604, "eval_runtime": 23.5516, "eval_samples_per_second": 4.246, "eval_steps_per_second": 0.552, "epoch": 1.77, "step": 1280}
257
+ {"loss": 0.6222, "learning_rate": 0.00025006993006993004, "epoch": 1.78, "step": 1290}
258
+ {"eval_loss": 0.5898649096488953, "eval_runtime": 23.6624, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.549, "epoch": 1.78, "step": 1290}
259
+ {"loss": 0.5424, "learning_rate": 0.00024965034965034966, "epoch": 1.79, "step": 1300}
260
+ {"eval_loss": 0.590370774269104, "eval_runtime": 23.5622, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.552, "epoch": 1.79, "step": 1300}
261
+ {"loss": 0.6267, "learning_rate": 0.0002492307692307692, "epoch": 1.81, "step": 1310}
262
+ {"eval_loss": 0.5873252749443054, "eval_runtime": 23.5328, "eval_samples_per_second": 4.249, "eval_steps_per_second": 0.552, "epoch": 1.81, "step": 1310}
263
+ {"loss": 0.6605, "learning_rate": 0.0002488111888111888, "epoch": 1.82, "step": 1320}
264
+ {"eval_loss": 0.5867363214492798, "eval_runtime": 23.6272, "eval_samples_per_second": 4.232, "eval_steps_per_second": 0.55, "epoch": 1.82, "step": 1320}
265
+ {"loss": 0.5647, "learning_rate": 0.00024839160839160835, "epoch": 1.83, "step": 1330}
266
+ {"eval_loss": 0.5863688588142395, "eval_runtime": 23.5295, "eval_samples_per_second": 4.25, "eval_steps_per_second": 0.552, "epoch": 1.83, "step": 1330}
267
+ {"loss": 0.5607, "learning_rate": 0.00024797202797202796, "epoch": 1.85, "step": 1340}
268
+ {"eval_loss": 0.5849428176879883, "eval_runtime": 23.5386, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.552, "epoch": 1.85, "step": 1340}
269
+ {"loss": 0.6948, "learning_rate": 0.0002475524475524475, "epoch": 1.86, "step": 1350}
270
+ {"eval_loss": 0.585557222366333, "eval_runtime": 23.6955, "eval_samples_per_second": 4.22, "eval_steps_per_second": 0.549, "epoch": 1.86, "step": 1350}
271
+ {"loss": 0.6667, "learning_rate": 0.0002471328671328671, "epoch": 1.88, "step": 1360}
272
+ {"eval_loss": 0.5850853323936462, "eval_runtime": 23.5655, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.552, "epoch": 1.88, "step": 1360}
273
+ {"loss": 0.6335, "learning_rate": 0.0002467132867132867, "epoch": 1.89, "step": 1370}
274
+ {"eval_loss": 0.5850026607513428, "eval_runtime": 23.5045, "eval_samples_per_second": 4.255, "eval_steps_per_second": 0.553, "epoch": 1.89, "step": 1370}
275
+ {"loss": 0.601, "learning_rate": 0.00024629370629370627, "epoch": 1.9, "step": 1380}
276
+ {"eval_loss": 0.5849950909614563, "eval_runtime": 23.6286, "eval_samples_per_second": 4.232, "eval_steps_per_second": 0.55, "epoch": 1.9, "step": 1380}
277
+ {"loss": 0.4668, "learning_rate": 0.0002458741258741259, "epoch": 1.92, "step": 1390}
278
+ {"eval_loss": 0.5844566226005554, "eval_runtime": 23.5099, "eval_samples_per_second": 4.254, "eval_steps_per_second": 0.553, "epoch": 1.92, "step": 1390}
279
+ {"loss": 0.5218, "learning_rate": 0.00024545454545454545, "epoch": 1.93, "step": 1400}
280
+ {"eval_loss": 0.583265483379364, "eval_runtime": 23.5127, "eval_samples_per_second": 4.253, "eval_steps_per_second": 0.553, "epoch": 1.93, "step": 1400}
281
+ {"loss": 0.5104, "learning_rate": 0.000245034965034965, "epoch": 1.94, "step": 1410}
282
+ {"eval_loss": 0.5836408734321594, "eval_runtime": 23.7146, "eval_samples_per_second": 4.217, "eval_steps_per_second": 0.548, "epoch": 1.94, "step": 1410}
283
+ {"loss": 0.7134, "learning_rate": 0.0002446153846153846, "epoch": 1.96, "step": 1420}
284
+ {"eval_loss": 0.5841034650802612, "eval_runtime": 23.6571, "eval_samples_per_second": 4.227, "eval_steps_per_second": 0.55, "epoch": 1.96, "step": 1420}
285
+ {"loss": 0.5728, "learning_rate": 0.0002441958041958042, "epoch": 1.97, "step": 1430}
286
+ {"eval_loss": 0.5834821462631226, "eval_runtime": 23.4792, "eval_samples_per_second": 4.259, "eval_steps_per_second": 0.554, "epoch": 1.97, "step": 1430}
287
+ {"loss": 0.5703, "learning_rate": 0.00024377622377622378, "epoch": 1.99, "step": 1440}
288
+ {"eval_loss": 0.5817570686340332, "eval_runtime": 23.5093, "eval_samples_per_second": 4.254, "eval_steps_per_second": 0.553, "epoch": 1.99, "step": 1440}
289
+ {"loss": 0.5527, "learning_rate": 0.00024335664335664332, "epoch": 2.0, "step": 1450}
290
+ {"eval_loss": 0.5805172920227051, "eval_runtime": 23.5889, "eval_samples_per_second": 4.239, "eval_steps_per_second": 0.551, "epoch": 2.0, "step": 1450}
291
+ {"loss": 0.6111, "learning_rate": 0.0002429370629370629, "epoch": 2.01, "step": 1460}
292
+ {"eval_loss": 0.57992023229599, "eval_runtime": 23.5146, "eval_samples_per_second": 4.253, "eval_steps_per_second": 0.553, "epoch": 2.01, "step": 1460}
293
+ {"train_runtime": 7921.6312, "train_samples_per_second": 3.661, "train_steps_per_second": 0.915, "total_flos": 3.015155661857096e+17, "train_loss": 0.6625166618660705, "epoch": 2.01, "step": 1460}