| { |
| "best_global_step": 23000, |
| "best_metric": 3.3681890964508057, |
| "best_model_checkpoint": "models/plausigpt/checkpoint-20000", |
| "epoch": 80.18872375560274, |
| "eval_steps": 1000, |
| "global_step": 85000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.09436187780136825, |
| "grad_norm": 2.9101743698120117, |
| "learning_rate": 9.9e-07, |
| "loss": 10.3242, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1887237556027365, |
| "grad_norm": 2.162980556488037, |
| "learning_rate": 1.99e-06, |
| "loss": 9.296, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28308563340410475, |
| "grad_norm": 2.0969018936157227, |
| "learning_rate": 2.99e-06, |
| "loss": 8.7898, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.377447511205473, |
| "grad_norm": 1.897030234336853, |
| "learning_rate": 3.99e-06, |
| "loss": 8.2757, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4718093890068412, |
| "grad_norm": 1.666489601135254, |
| "learning_rate": 4.9900000000000005e-06, |
| "loss": 7.8218, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5661712668082095, |
| "grad_norm": 1.5309317111968994, |
| "learning_rate": 5.99e-06, |
| "loss": 7.3494, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6605331446095777, |
| "grad_norm": 1.2604295015335083, |
| "learning_rate": 6.990000000000001e-06, |
| "loss": 6.8932, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.754895022410946, |
| "grad_norm": 1.1508737802505493, |
| "learning_rate": 7.99e-06, |
| "loss": 6.5159, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.8492569002123143, |
| "grad_norm": 1.1227755546569824, |
| "learning_rate": 8.99e-06, |
| "loss": 6.2373, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.9436187780136824, |
| "grad_norm": 1.1233224868774414, |
| "learning_rate": 9.990000000000001e-06, |
| "loss": 6.053, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.9436187780136824, |
| "eval_loss": 5.930422782897949, |
| "eval_runtime": 89.9185, |
| "eval_samples_per_second": 167.607, |
| "eval_steps_per_second": 5.238, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.0377447511205473, |
| "grad_norm": 1.1704257726669312, |
| "learning_rate": 1.099e-05, |
| "loss": 5.9047, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.1321066289219155, |
| "grad_norm": 1.0163617134094238, |
| "learning_rate": 1.199e-05, |
| "loss": 5.7717, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.226468506723284, |
| "grad_norm": 0.9280975461006165, |
| "learning_rate": 1.299e-05, |
| "loss": 5.7047, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.320830384524652, |
| "grad_norm": 1.1254018545150757, |
| "learning_rate": 1.399e-05, |
| "loss": 5.6121, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.4151922623260202, |
| "grad_norm": 1.5115385055541992, |
| "learning_rate": 1.499e-05, |
| "loss": 5.5445, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.5095541401273884, |
| "grad_norm": 1.1451691389083862, |
| "learning_rate": 1.599e-05, |
| "loss": 5.4659, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.6039160179287568, |
| "grad_norm": 1.0520662069320679, |
| "learning_rate": 1.699e-05, |
| "loss": 5.4362, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.698277895730125, |
| "grad_norm": 1.2363018989562988, |
| "learning_rate": 1.7990000000000002e-05, |
| "loss": 5.3637, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.7926397735314934, |
| "grad_norm": 1.2105119228363037, |
| "learning_rate": 1.8990000000000003e-05, |
| "loss": 5.3063, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.8870016513328616, |
| "grad_norm": 1.5523535013198853, |
| "learning_rate": 1.999e-05, |
| "loss": 5.2543, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.8870016513328616, |
| "eval_loss": 5.177983283996582, |
| "eval_runtime": 89.9532, |
| "eval_samples_per_second": 167.543, |
| "eval_steps_per_second": 5.236, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.9813635291342298, |
| "grad_norm": 1.5004462003707886, |
| "learning_rate": 2.099e-05, |
| "loss": 5.1994, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.0754895022410946, |
| "grad_norm": 1.4016139507293701, |
| "learning_rate": 2.199e-05, |
| "loss": 5.1422, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.169851380042463, |
| "grad_norm": 1.5630654096603394, |
| "learning_rate": 2.2990000000000002e-05, |
| "loss": 5.0975, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.264213257843831, |
| "grad_norm": 1.373801827430725, |
| "learning_rate": 2.3990000000000002e-05, |
| "loss": 5.0522, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.358575135645199, |
| "grad_norm": 1.3656255006790161, |
| "learning_rate": 2.4990000000000003e-05, |
| "loss": 5.0166, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.452937013446568, |
| "grad_norm": 1.4151180982589722, |
| "learning_rate": 2.5990000000000004e-05, |
| "loss": 4.9653, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.547298891247936, |
| "grad_norm": 1.5235018730163574, |
| "learning_rate": 2.6989999999999997e-05, |
| "loss": 4.9166, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.641660769049304, |
| "grad_norm": 1.2987467050552368, |
| "learning_rate": 2.7989999999999998e-05, |
| "loss": 4.877, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.7360226468506723, |
| "grad_norm": 1.3841923475265503, |
| "learning_rate": 2.8990000000000002e-05, |
| "loss": 4.8477, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.8303845246520405, |
| "grad_norm": 1.5736286640167236, |
| "learning_rate": 2.9990000000000003e-05, |
| "loss": 4.7941, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.8303845246520405, |
| "eval_loss": 4.716542720794678, |
| "eval_runtime": 89.9497, |
| "eval_samples_per_second": 167.549, |
| "eval_steps_per_second": 5.236, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.9247464024534087, |
| "grad_norm": 1.404842734336853, |
| "learning_rate": 3.099e-05, |
| "loss": 4.7567, |
| "step": 3100 |
| }, |
| { |
| "epoch": 3.0188723755602735, |
| "grad_norm": 1.2254925966262817, |
| "learning_rate": 3.1990000000000004e-05, |
| "loss": 4.7165, |
| "step": 3200 |
| }, |
| { |
| "epoch": 3.1132342533616417, |
| "grad_norm": 1.2496081590652466, |
| "learning_rate": 3.299e-05, |
| "loss": 4.6894, |
| "step": 3300 |
| }, |
| { |
| "epoch": 3.2075961311630103, |
| "grad_norm": 1.3087129592895508, |
| "learning_rate": 3.399e-05, |
| "loss": 4.6382, |
| "step": 3400 |
| }, |
| { |
| "epoch": 3.3019580089643785, |
| "grad_norm": 1.2795251607894897, |
| "learning_rate": 3.499e-05, |
| "loss": 4.6091, |
| "step": 3500 |
| }, |
| { |
| "epoch": 3.3963198867657467, |
| "grad_norm": 1.2816106081008911, |
| "learning_rate": 3.599e-05, |
| "loss": 4.5872, |
| "step": 3600 |
| }, |
| { |
| "epoch": 3.490681764567115, |
| "grad_norm": 1.2217532396316528, |
| "learning_rate": 3.699e-05, |
| "loss": 4.5528, |
| "step": 3700 |
| }, |
| { |
| "epoch": 3.585043642368483, |
| "grad_norm": 1.2160422801971436, |
| "learning_rate": 3.799e-05, |
| "loss": 4.5129, |
| "step": 3800 |
| }, |
| { |
| "epoch": 3.6794055201698512, |
| "grad_norm": 1.2504174709320068, |
| "learning_rate": 3.8990000000000004e-05, |
| "loss": 4.4912, |
| "step": 3900 |
| }, |
| { |
| "epoch": 3.77376739797122, |
| "grad_norm": 1.3709900379180908, |
| "learning_rate": 3.999e-05, |
| "loss": 4.4662, |
| "step": 4000 |
| }, |
| { |
| "epoch": 3.77376739797122, |
| "eval_loss": 4.381489276885986, |
| "eval_runtime": 89.9497, |
| "eval_samples_per_second": 167.549, |
| "eval_steps_per_second": 5.236, |
| "step": 4000 |
| }, |
| { |
| "epoch": 3.868129275772588, |
| "grad_norm": 1.2039287090301514, |
| "learning_rate": 4.099e-05, |
| "loss": 4.4265, |
| "step": 4100 |
| }, |
| { |
| "epoch": 3.962491153573956, |
| "grad_norm": 1.2931344509124756, |
| "learning_rate": 4.199e-05, |
| "loss": 4.4062, |
| "step": 4200 |
| }, |
| { |
| "epoch": 4.056617126680821, |
| "grad_norm": 1.1824957132339478, |
| "learning_rate": 4.299e-05, |
| "loss": 4.3693, |
| "step": 4300 |
| }, |
| { |
| "epoch": 4.150979004482189, |
| "grad_norm": 1.1529172658920288, |
| "learning_rate": 4.3990000000000004e-05, |
| "loss": 4.3337, |
| "step": 4400 |
| }, |
| { |
| "epoch": 4.245340882283557, |
| "grad_norm": 1.076854944229126, |
| "learning_rate": 4.499e-05, |
| "loss": 4.3073, |
| "step": 4500 |
| }, |
| { |
| "epoch": 4.339702760084926, |
| "grad_norm": 1.2523971796035767, |
| "learning_rate": 4.599e-05, |
| "loss": 4.2877, |
| "step": 4600 |
| }, |
| { |
| "epoch": 4.434064637886294, |
| "grad_norm": 1.19026780128479, |
| "learning_rate": 4.699e-05, |
| "loss": 4.2684, |
| "step": 4700 |
| }, |
| { |
| "epoch": 4.528426515687662, |
| "grad_norm": 1.1093727350234985, |
| "learning_rate": 4.799e-05, |
| "loss": 4.2301, |
| "step": 4800 |
| }, |
| { |
| "epoch": 4.622788393489031, |
| "grad_norm": 1.170032262802124, |
| "learning_rate": 4.8990000000000004e-05, |
| "loss": 4.2172, |
| "step": 4900 |
| }, |
| { |
| "epoch": 4.717150271290398, |
| "grad_norm": 1.1602752208709717, |
| "learning_rate": 4.999e-05, |
| "loss": 4.1815, |
| "step": 5000 |
| }, |
| { |
| "epoch": 4.717150271290398, |
| "eval_loss": 4.115845203399658, |
| "eval_runtime": 89.9622, |
| "eval_samples_per_second": 167.526, |
| "eval_steps_per_second": 5.236, |
| "step": 5000 |
| }, |
| { |
| "epoch": 4.811512149091767, |
| "grad_norm": 1.1583232879638672, |
| "learning_rate": 5.0990000000000005e-05, |
| "loss": 4.1705, |
| "step": 5100 |
| }, |
| { |
| "epoch": 4.905874026893136, |
| "grad_norm": 1.1284886598587036, |
| "learning_rate": 5.199000000000001e-05, |
| "loss": 4.1492, |
| "step": 5200 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.2270270586013794, |
| "learning_rate": 5.2990000000000006e-05, |
| "loss": 4.1234, |
| "step": 5300 |
| }, |
| { |
| "epoch": 5.094361877801369, |
| "grad_norm": 1.1221809387207031, |
| "learning_rate": 5.399000000000001e-05, |
| "loss": 4.0913, |
| "step": 5400 |
| }, |
| { |
| "epoch": 5.188723755602736, |
| "grad_norm": 1.1447559595108032, |
| "learning_rate": 5.499000000000001e-05, |
| "loss": 4.0568, |
| "step": 5500 |
| }, |
| { |
| "epoch": 5.283085633404105, |
| "grad_norm": 1.109061598777771, |
| "learning_rate": 5.599e-05, |
| "loss": 4.0514, |
| "step": 5600 |
| }, |
| { |
| "epoch": 5.377447511205473, |
| "grad_norm": 1.2101478576660156, |
| "learning_rate": 5.699e-05, |
| "loss": 4.0345, |
| "step": 5700 |
| }, |
| { |
| "epoch": 5.471809389006841, |
| "grad_norm": 1.0513982772827148, |
| "learning_rate": 5.799e-05, |
| "loss": 4.0028, |
| "step": 5800 |
| }, |
| { |
| "epoch": 5.56617126680821, |
| "grad_norm": 1.1381795406341553, |
| "learning_rate": 5.899e-05, |
| "loss": 4.0061, |
| "step": 5900 |
| }, |
| { |
| "epoch": 5.660533144609578, |
| "grad_norm": 1.102358102798462, |
| "learning_rate": 5.999e-05, |
| "loss": 3.981, |
| "step": 6000 |
| }, |
| { |
| "epoch": 5.660533144609578, |
| "eval_loss": 3.9213197231292725, |
| "eval_runtime": 89.9286, |
| "eval_samples_per_second": 167.589, |
| "eval_steps_per_second": 5.237, |
| "step": 6000 |
| }, |
| { |
| "epoch": 5.754895022410946, |
| "grad_norm": 1.0048632621765137, |
| "learning_rate": 6.0990000000000004e-05, |
| "loss": 3.9619, |
| "step": 6100 |
| }, |
| { |
| "epoch": 5.849256900212314, |
| "grad_norm": 1.0199745893478394, |
| "learning_rate": 6.199000000000001e-05, |
| "loss": 3.96, |
| "step": 6200 |
| }, |
| { |
| "epoch": 5.943618778013683, |
| "grad_norm": 1.052060842514038, |
| "learning_rate": 6.299e-05, |
| "loss": 3.9441, |
| "step": 6300 |
| }, |
| { |
| "epoch": 6.037744751120547, |
| "grad_norm": 1.1077136993408203, |
| "learning_rate": 6.399e-05, |
| "loss": 3.9129, |
| "step": 6400 |
| }, |
| { |
| "epoch": 6.132106628921916, |
| "grad_norm": 0.9797239303588867, |
| "learning_rate": 6.499000000000001e-05, |
| "loss": 3.8972, |
| "step": 6500 |
| }, |
| { |
| "epoch": 6.2264685067232834, |
| "grad_norm": 0.9934578537940979, |
| "learning_rate": 6.599000000000001e-05, |
| "loss": 3.8777, |
| "step": 6600 |
| }, |
| { |
| "epoch": 6.320830384524652, |
| "grad_norm": 0.9989880323410034, |
| "learning_rate": 6.699000000000001e-05, |
| "loss": 3.859, |
| "step": 6700 |
| }, |
| { |
| "epoch": 6.415192262326021, |
| "grad_norm": 0.9644502997398376, |
| "learning_rate": 6.799e-05, |
| "loss": 3.8596, |
| "step": 6800 |
| }, |
| { |
| "epoch": 6.509554140127388, |
| "grad_norm": 1.004499912261963, |
| "learning_rate": 6.899e-05, |
| "loss": 3.8494, |
| "step": 6900 |
| }, |
| { |
| "epoch": 6.603916017928757, |
| "grad_norm": 0.9108039736747742, |
| "learning_rate": 6.999e-05, |
| "loss": 3.8294, |
| "step": 7000 |
| }, |
| { |
| "epoch": 6.603916017928757, |
| "eval_loss": 3.7933058738708496, |
| "eval_runtime": 89.9278, |
| "eval_samples_per_second": 167.59, |
| "eval_steps_per_second": 5.238, |
| "step": 7000 |
| }, |
| { |
| "epoch": 6.698277895730125, |
| "grad_norm": 0.9396700859069824, |
| "learning_rate": 7.099e-05, |
| "loss": 3.8365, |
| "step": 7100 |
| }, |
| { |
| "epoch": 6.792639773531493, |
| "grad_norm": 0.9836630821228027, |
| "learning_rate": 7.199000000000001e-05, |
| "loss": 3.8102, |
| "step": 7200 |
| }, |
| { |
| "epoch": 6.887001651332861, |
| "grad_norm": 0.9225268959999084, |
| "learning_rate": 7.299e-05, |
| "loss": 3.8053, |
| "step": 7300 |
| }, |
| { |
| "epoch": 6.98136352913423, |
| "grad_norm": 0.9103354215621948, |
| "learning_rate": 7.399e-05, |
| "loss": 3.8067, |
| "step": 7400 |
| }, |
| { |
| "epoch": 7.075489502241094, |
| "grad_norm": 0.9512243270874023, |
| "learning_rate": 7.499e-05, |
| "loss": 3.7489, |
| "step": 7500 |
| }, |
| { |
| "epoch": 7.169851380042463, |
| "grad_norm": 0.9671022295951843, |
| "learning_rate": 7.599000000000001e-05, |
| "loss": 3.7524, |
| "step": 7600 |
| }, |
| { |
| "epoch": 7.264213257843831, |
| "grad_norm": 0.9643733501434326, |
| "learning_rate": 7.699e-05, |
| "loss": 3.7309, |
| "step": 7700 |
| }, |
| { |
| "epoch": 7.358575135645199, |
| "grad_norm": 0.94767165184021, |
| "learning_rate": 7.799e-05, |
| "loss": 3.7428, |
| "step": 7800 |
| }, |
| { |
| "epoch": 7.452937013446568, |
| "grad_norm": 0.9278003573417664, |
| "learning_rate": 7.899000000000001e-05, |
| "loss": 3.7256, |
| "step": 7900 |
| }, |
| { |
| "epoch": 7.5472988912479355, |
| "grad_norm": 0.8693475127220154, |
| "learning_rate": 7.999000000000001e-05, |
| "loss": 3.7299, |
| "step": 8000 |
| }, |
| { |
| "epoch": 7.5472988912479355, |
| "eval_loss": 3.701340675354004, |
| "eval_runtime": 89.9644, |
| "eval_samples_per_second": 167.522, |
| "eval_steps_per_second": 5.235, |
| "step": 8000 |
| }, |
| { |
| "epoch": 7.641660769049304, |
| "grad_norm": 0.8692898750305176, |
| "learning_rate": 8.099e-05, |
| "loss": 3.7201, |
| "step": 8100 |
| }, |
| { |
| "epoch": 7.736022646850673, |
| "grad_norm": 0.86644047498703, |
| "learning_rate": 8.199e-05, |
| "loss": 3.7185, |
| "step": 8200 |
| }, |
| { |
| "epoch": 7.8303845246520405, |
| "grad_norm": 0.9059743285179138, |
| "learning_rate": 8.299e-05, |
| "loss": 3.712, |
| "step": 8300 |
| }, |
| { |
| "epoch": 7.924746402453409, |
| "grad_norm": 0.8368768692016602, |
| "learning_rate": 8.399e-05, |
| "loss": 3.7064, |
| "step": 8400 |
| }, |
| { |
| "epoch": 8.018872375560274, |
| "grad_norm": 0.8828296065330505, |
| "learning_rate": 8.499e-05, |
| "loss": 3.6786, |
| "step": 8500 |
| }, |
| { |
| "epoch": 8.113234253361641, |
| "grad_norm": 0.8559228181838989, |
| "learning_rate": 8.599000000000001e-05, |
| "loss": 3.643, |
| "step": 8600 |
| }, |
| { |
| "epoch": 8.20759613116301, |
| "grad_norm": 0.8702303171157837, |
| "learning_rate": 8.699e-05, |
| "loss": 3.6411, |
| "step": 8700 |
| }, |
| { |
| "epoch": 8.301958008964379, |
| "grad_norm": 0.8181409239768982, |
| "learning_rate": 8.799e-05, |
| "loss": 3.6498, |
| "step": 8800 |
| }, |
| { |
| "epoch": 8.396319886765747, |
| "grad_norm": 0.839365541934967, |
| "learning_rate": 8.899e-05, |
| "loss": 3.641, |
| "step": 8900 |
| }, |
| { |
| "epoch": 8.490681764567114, |
| "grad_norm": 0.8675922155380249, |
| "learning_rate": 8.999000000000001e-05, |
| "loss": 3.6283, |
| "step": 9000 |
| }, |
| { |
| "epoch": 8.490681764567114, |
| "eval_loss": 3.6308939456939697, |
| "eval_runtime": 89.9234, |
| "eval_samples_per_second": 167.598, |
| "eval_steps_per_second": 5.238, |
| "step": 9000 |
| }, |
| { |
| "epoch": 8.585043642368483, |
| "grad_norm": 0.8916610479354858, |
| "learning_rate": 9.099000000000001e-05, |
| "loss": 3.6343, |
| "step": 9100 |
| }, |
| { |
| "epoch": 8.679405520169851, |
| "grad_norm": 0.81273353099823, |
| "learning_rate": 9.199e-05, |
| "loss": 3.6309, |
| "step": 9200 |
| }, |
| { |
| "epoch": 8.77376739797122, |
| "grad_norm": 0.8205325603485107, |
| "learning_rate": 9.299e-05, |
| "loss": 3.6322, |
| "step": 9300 |
| }, |
| { |
| "epoch": 8.868129275772588, |
| "grad_norm": 0.8169659972190857, |
| "learning_rate": 9.399e-05, |
| "loss": 3.6216, |
| "step": 9400 |
| }, |
| { |
| "epoch": 8.962491153573955, |
| "grad_norm": 0.8198681473731995, |
| "learning_rate": 9.499e-05, |
| "loss": 3.6054, |
| "step": 9500 |
| }, |
| { |
| "epoch": 9.056617126680822, |
| "grad_norm": 0.8319467902183533, |
| "learning_rate": 9.599000000000001e-05, |
| "loss": 3.5813, |
| "step": 9600 |
| }, |
| { |
| "epoch": 9.150979004482188, |
| "grad_norm": 0.814388632774353, |
| "learning_rate": 9.699e-05, |
| "loss": 3.5636, |
| "step": 9700 |
| }, |
| { |
| "epoch": 9.245340882283557, |
| "grad_norm": 0.8515892624855042, |
| "learning_rate": 9.799e-05, |
| "loss": 3.5551, |
| "step": 9800 |
| }, |
| { |
| "epoch": 9.339702760084926, |
| "grad_norm": 0.8092982769012451, |
| "learning_rate": 9.899e-05, |
| "loss": 3.5585, |
| "step": 9900 |
| }, |
| { |
| "epoch": 9.434064637886294, |
| "grad_norm": 0.8135222792625427, |
| "learning_rate": 9.999000000000001e-05, |
| "loss": 3.5615, |
| "step": 10000 |
| }, |
| { |
| "epoch": 9.434064637886294, |
| "eval_loss": 3.57912540435791, |
| "eval_runtime": 89.9276, |
| "eval_samples_per_second": 167.59, |
| "eval_steps_per_second": 5.238, |
| "step": 10000 |
| }, |
| { |
| "epoch": 9.528426515687663, |
| "grad_norm": 0.7966075539588928, |
| "learning_rate": 9.999970144476398e-05, |
| "loss": 3.5535, |
| "step": 10100 |
| }, |
| { |
| "epoch": 9.62278839348903, |
| "grad_norm": 0.7658608555793762, |
| "learning_rate": 9.999879368940656e-05, |
| "loss": 3.5665, |
| "step": 10200 |
| }, |
| { |
| "epoch": 9.717150271290398, |
| "grad_norm": 0.7524927854537964, |
| "learning_rate": 9.999727671452668e-05, |
| "loss": 3.5502, |
| "step": 10300 |
| }, |
| { |
| "epoch": 9.811512149091767, |
| "grad_norm": 0.743500292301178, |
| "learning_rate": 9.999515053860821e-05, |
| "loss": 3.5497, |
| "step": 10400 |
| }, |
| { |
| "epoch": 9.905874026893136, |
| "grad_norm": 0.7590805292129517, |
| "learning_rate": 9.999241518755793e-05, |
| "loss": 3.5467, |
| "step": 10500 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.8875275254249573, |
| "learning_rate": 9.998907069470524e-05, |
| "loss": 3.545, |
| "step": 10600 |
| }, |
| { |
| "epoch": 10.094361877801369, |
| "grad_norm": 0.7724853157997131, |
| "learning_rate": 9.998511710080171e-05, |
| "loss": 3.4858, |
| "step": 10700 |
| }, |
| { |
| "epoch": 10.188723755602737, |
| "grad_norm": 0.7321507334709167, |
| "learning_rate": 9.998055445402067e-05, |
| "loss": 3.4796, |
| "step": 10800 |
| }, |
| { |
| "epoch": 10.283085633404104, |
| "grad_norm": 0.7564536333084106, |
| "learning_rate": 9.997538280995651e-05, |
| "loss": 3.4922, |
| "step": 10900 |
| }, |
| { |
| "epoch": 10.377447511205473, |
| "grad_norm": 0.7601178288459778, |
| "learning_rate": 9.996960223162406e-05, |
| "loss": 3.4881, |
| "step": 11000 |
| }, |
| { |
| "epoch": 10.377447511205473, |
| "eval_loss": 3.530785322189331, |
| "eval_runtime": 89.9198, |
| "eval_samples_per_second": 167.605, |
| "eval_steps_per_second": 5.238, |
| "step": 11000 |
| }, |
| { |
| "epoch": 10.471809389006841, |
| "grad_norm": 0.7771745324134827, |
| "learning_rate": 9.996321278945788e-05, |
| "loss": 3.4822, |
| "step": 11100 |
| }, |
| { |
| "epoch": 10.56617126680821, |
| "grad_norm": 0.7326973676681519, |
| "learning_rate": 9.995621456131128e-05, |
| "loss": 3.4838, |
| "step": 11200 |
| }, |
| { |
| "epoch": 10.660533144609577, |
| "grad_norm": 0.728434145450592, |
| "learning_rate": 9.994860763245549e-05, |
| "loss": 3.4832, |
| "step": 11300 |
| }, |
| { |
| "epoch": 10.754895022410945, |
| "grad_norm": 0.7702102065086365, |
| "learning_rate": 9.99403920955785e-05, |
| "loss": 3.4962, |
| "step": 11400 |
| }, |
| { |
| "epoch": 10.849256900212314, |
| "grad_norm": 0.718971848487854, |
| "learning_rate": 9.993156805078405e-05, |
| "loss": 3.486, |
| "step": 11500 |
| }, |
| { |
| "epoch": 10.943618778013683, |
| "grad_norm": 0.7548109889030457, |
| "learning_rate": 9.992213560559034e-05, |
| "loss": 3.484, |
| "step": 11600 |
| }, |
| { |
| "epoch": 11.037744751120547, |
| "grad_norm": 0.7601837515830994, |
| "learning_rate": 9.991209487492876e-05, |
| "loss": 3.4513, |
| "step": 11700 |
| }, |
| { |
| "epoch": 11.132106628921916, |
| "grad_norm": 0.7187873721122742, |
| "learning_rate": 9.990144598114242e-05, |
| "loss": 3.4157, |
| "step": 11800 |
| }, |
| { |
| "epoch": 11.226468506723284, |
| "grad_norm": 0.7205685377120972, |
| "learning_rate": 9.989018905398473e-05, |
| "loss": 3.4232, |
| "step": 11900 |
| }, |
| { |
| "epoch": 11.320830384524651, |
| "grad_norm": 0.761542558670044, |
| "learning_rate": 9.98783242306178e-05, |
| "loss": 3.4295, |
| "step": 12000 |
| }, |
| { |
| "epoch": 11.320830384524651, |
| "eval_loss": 3.497931957244873, |
| "eval_runtime": 89.9528, |
| "eval_samples_per_second": 167.543, |
| "eval_steps_per_second": 5.236, |
| "step": 12000 |
| }, |
| { |
| "epoch": 11.41519226232602, |
| "grad_norm": 0.7080798149108887, |
| "learning_rate": 9.986585165561076e-05, |
| "loss": 3.4227, |
| "step": 12100 |
| }, |
| { |
| "epoch": 11.509554140127388, |
| "grad_norm": 0.7278120517730713, |
| "learning_rate": 9.9852771480938e-05, |
| "loss": 3.432, |
| "step": 12200 |
| }, |
| { |
| "epoch": 11.603916017928757, |
| "grad_norm": 0.7304459810256958, |
| "learning_rate": 9.983908386597732e-05, |
| "loss": 3.4355, |
| "step": 12300 |
| }, |
| { |
| "epoch": 11.698277895730126, |
| "grad_norm": 0.7287798523902893, |
| "learning_rate": 9.9824788977508e-05, |
| "loss": 3.4281, |
| "step": 12400 |
| }, |
| { |
| "epoch": 11.792639773531493, |
| "grad_norm": 0.6873247027397156, |
| "learning_rate": 9.980988698970872e-05, |
| "loss": 3.4263, |
| "step": 12500 |
| }, |
| { |
| "epoch": 11.887001651332861, |
| "grad_norm": 0.7197590470314026, |
| "learning_rate": 9.979437808415552e-05, |
| "loss": 3.4261, |
| "step": 12600 |
| }, |
| { |
| "epoch": 11.98136352913423, |
| "grad_norm": 0.691047728061676, |
| "learning_rate": 9.977826244981952e-05, |
| "loss": 3.4262, |
| "step": 12700 |
| }, |
| { |
| "epoch": 12.075489502241094, |
| "grad_norm": 0.7147277593612671, |
| "learning_rate": 9.976154028306461e-05, |
| "loss": 3.3695, |
| "step": 12800 |
| }, |
| { |
| "epoch": 12.169851380042463, |
| "grad_norm": 0.7131621837615967, |
| "learning_rate": 9.974421178764515e-05, |
| "loss": 3.3619, |
| "step": 12900 |
| }, |
| { |
| "epoch": 12.264213257843831, |
| "grad_norm": 0.6843485832214355, |
| "learning_rate": 9.972627717470337e-05, |
| "loss": 3.3786, |
| "step": 13000 |
| }, |
| { |
| "epoch": 12.264213257843831, |
| "eval_loss": 3.4681344032287598, |
| "eval_runtime": 89.9337, |
| "eval_samples_per_second": 167.579, |
| "eval_steps_per_second": 5.237, |
| "step": 13000 |
| }, |
| { |
| "epoch": 12.3585751356452, |
| "grad_norm": 0.7265843152999878, |
| "learning_rate": 9.970773666276686e-05, |
| "loss": 3.3759, |
| "step": 13100 |
| }, |
| { |
| "epoch": 12.452937013446567, |
| "grad_norm": 0.7135173082351685, |
| "learning_rate": 9.968859047774595e-05, |
| "loss": 3.3757, |
| "step": 13200 |
| }, |
| { |
| "epoch": 12.547298891247936, |
| "grad_norm": 0.7075929045677185, |
| "learning_rate": 9.966883885293081e-05, |
| "loss": 3.3868, |
| "step": 13300 |
| }, |
| { |
| "epoch": 12.641660769049304, |
| "grad_norm": 0.6600580811500549, |
| "learning_rate": 9.964848202898879e-05, |
| "loss": 3.3768, |
| "step": 13400 |
| }, |
| { |
| "epoch": 12.736022646850673, |
| "grad_norm": 0.6909327507019043, |
| "learning_rate": 9.962752025396133e-05, |
| "loss": 3.3761, |
| "step": 13500 |
| }, |
| { |
| "epoch": 12.830384524652041, |
| "grad_norm": 0.7116390466690063, |
| "learning_rate": 9.96059537832611e-05, |
| "loss": 3.3696, |
| "step": 13600 |
| }, |
| { |
| "epoch": 12.924746402453408, |
| "grad_norm": 0.6888706088066101, |
| "learning_rate": 9.958378287966868e-05, |
| "loss": 3.3835, |
| "step": 13700 |
| }, |
| { |
| "epoch": 13.018872375560274, |
| "grad_norm": 0.6996840834617615, |
| "learning_rate": 9.956100781332958e-05, |
| "loss": 3.3644, |
| "step": 13800 |
| }, |
| { |
| "epoch": 13.113234253361641, |
| "grad_norm": 0.7074296474456787, |
| "learning_rate": 9.953762886175075e-05, |
| "loss": 3.3085, |
| "step": 13900 |
| }, |
| { |
| "epoch": 13.20759613116301, |
| "grad_norm": 0.7509676218032837, |
| "learning_rate": 9.951364630979738e-05, |
| "loss": 3.324, |
| "step": 14000 |
| }, |
| { |
| "epoch": 13.20759613116301, |
| "eval_loss": 3.446702241897583, |
| "eval_runtime": 89.9626, |
| "eval_samples_per_second": 167.525, |
| "eval_steps_per_second": 5.236, |
| "step": 14000 |
| }, |
| { |
| "epoch": 13.301958008964379, |
| "grad_norm": 0.6905140280723572, |
| "learning_rate": 9.948906044968926e-05, |
| "loss": 3.3204, |
| "step": 14100 |
| }, |
| { |
| "epoch": 13.396319886765747, |
| "grad_norm": 0.6943195462226868, |
| "learning_rate": 9.946387158099738e-05, |
| "loss": 3.3314, |
| "step": 14200 |
| }, |
| { |
| "epoch": 13.490681764567114, |
| "grad_norm": 0.748652994632721, |
| "learning_rate": 9.943808001064013e-05, |
| "loss": 3.3365, |
| "step": 14300 |
| }, |
| { |
| "epoch": 13.585043642368483, |
| "grad_norm": 0.6941584944725037, |
| "learning_rate": 9.941168605287965e-05, |
| "loss": 3.3327, |
| "step": 14400 |
| }, |
| { |
| "epoch": 13.679405520169851, |
| "grad_norm": 0.7011757493019104, |
| "learning_rate": 9.938469002931798e-05, |
| "loss": 3.336, |
| "step": 14500 |
| }, |
| { |
| "epoch": 13.77376739797122, |
| "grad_norm": 0.6881093978881836, |
| "learning_rate": 9.935709226889319e-05, |
| "loss": 3.3441, |
| "step": 14600 |
| }, |
| { |
| "epoch": 13.868129275772588, |
| "grad_norm": 0.6721529960632324, |
| "learning_rate": 9.932889310787522e-05, |
| "loss": 3.3355, |
| "step": 14700 |
| }, |
| { |
| "epoch": 13.962491153573955, |
| "grad_norm": 0.6991400718688965, |
| "learning_rate": 9.9300092889862e-05, |
| "loss": 3.332, |
| "step": 14800 |
| }, |
| { |
| "epoch": 14.056617126680822, |
| "grad_norm": 0.6730444431304932, |
| "learning_rate": 9.927069196577507e-05, |
| "loss": 3.2893, |
| "step": 14900 |
| }, |
| { |
| "epoch": 14.150979004482188, |
| "grad_norm": 0.6822571754455566, |
| "learning_rate": 9.924069069385543e-05, |
| "loss": 3.2673, |
| "step": 15000 |
| }, |
| { |
| "epoch": 14.150979004482188, |
| "eval_loss": 3.428182601928711, |
| "eval_runtime": 89.9364, |
| "eval_samples_per_second": 167.574, |
| "eval_steps_per_second": 5.237, |
| "step": 15000 |
| }, |
| { |
| "epoch": 14.245340882283557, |
| "grad_norm": 0.6944796442985535, |
| "learning_rate": 9.921008943965908e-05, |
| "loss": 3.2846, |
| "step": 15100 |
| }, |
| { |
| "epoch": 14.339702760084926, |
| "grad_norm": 0.6927157044410706, |
| "learning_rate": 9.917888857605268e-05, |
| "loss": 3.2856, |
| "step": 15200 |
| }, |
| { |
| "epoch": 14.434064637886294, |
| "grad_norm": 0.6758902072906494, |
| "learning_rate": 9.91470884832089e-05, |
| "loss": 3.2768, |
| "step": 15300 |
| }, |
| { |
| "epoch": 14.528426515687663, |
| "grad_norm": 0.7083920836448669, |
| "learning_rate": 9.911468954860181e-05, |
| "loss": 3.2846, |
| "step": 15400 |
| }, |
| { |
| "epoch": 14.62278839348903, |
| "grad_norm": 0.6658477783203125, |
| "learning_rate": 9.908169216700223e-05, |
| "loss": 3.3001, |
| "step": 15500 |
| }, |
| { |
| "epoch": 14.717150271290398, |
| "grad_norm": 0.6643409729003906, |
| "learning_rate": 9.904809674047284e-05, |
| "loss": 3.3046, |
| "step": 15600 |
| }, |
| { |
| "epoch": 14.811512149091767, |
| "grad_norm": 0.6668530106544495, |
| "learning_rate": 9.90139036783633e-05, |
| "loss": 3.3031, |
| "step": 15700 |
| }, |
| { |
| "epoch": 14.905874026893136, |
| "grad_norm": 0.6760970950126648, |
| "learning_rate": 9.897911339730527e-05, |
| "loss": 3.3031, |
| "step": 15800 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.8160315155982971, |
| "learning_rate": 9.894372632120738e-05, |
| "loss": 3.3028, |
| "step": 15900 |
| }, |
| { |
| "epoch": 15.094361877801369, |
| "grad_norm": 0.6879032850265503, |
| "learning_rate": 9.890774288124996e-05, |
| "loss": 3.2276, |
| "step": 16000 |
| }, |
| { |
| "epoch": 15.094361877801369, |
| "eval_loss": 3.4133388996124268, |
| "eval_runtime": 89.9633, |
| "eval_samples_per_second": 167.524, |
| "eval_steps_per_second": 5.235, |
| "step": 16000 |
| }, |
| { |
| "epoch": 15.188723755602737, |
| "grad_norm": 0.688949704170227, |
| "learning_rate": 9.887116351587985e-05, |
| "loss": 3.2447, |
| "step": 16100 |
| }, |
| { |
| "epoch": 15.283085633404104, |
| "grad_norm": 0.6961474418640137, |
| "learning_rate": 9.883398867080513e-05, |
| "loss": 3.2392, |
| "step": 16200 |
| }, |
| { |
| "epoch": 15.377447511205473, |
| "grad_norm": 0.681828498840332, |
| "learning_rate": 9.87962187989895e-05, |
| "loss": 3.2465, |
| "step": 16300 |
| }, |
| { |
| "epoch": 15.471809389006841, |
| "grad_norm": 0.6817638874053955, |
| "learning_rate": 9.875785436064697e-05, |
| "loss": 3.2503, |
| "step": 16400 |
| }, |
| { |
| "epoch": 15.56617126680821, |
| "grad_norm": 0.6779124736785889, |
| "learning_rate": 9.871889582323609e-05, |
| "loss": 3.2555, |
| "step": 16500 |
| }, |
| { |
| "epoch": 15.660533144609577, |
| "grad_norm": 0.6662207841873169, |
| "learning_rate": 9.867934366145435e-05, |
| "loss": 3.263, |
| "step": 16600 |
| }, |
| { |
| "epoch": 15.754895022410945, |
| "grad_norm": 0.691040575504303, |
| "learning_rate": 9.863919835723236e-05, |
| "loss": 3.2616, |
| "step": 16700 |
| }, |
| { |
| "epoch": 15.849256900212314, |
| "grad_norm": 0.6542192101478577, |
| "learning_rate": 9.859846039972798e-05, |
| "loss": 3.2662, |
| "step": 16800 |
| }, |
| { |
| "epoch": 15.943618778013683, |
| "grad_norm": 0.6532755494117737, |
| "learning_rate": 9.855713028532036e-05, |
| "loss": 3.2684, |
| "step": 16900 |
| }, |
| { |
| "epoch": 16.03774475112055, |
| "grad_norm": 0.6761746406555176, |
| "learning_rate": 9.851520851760394e-05, |
| "loss": 3.2356, |
| "step": 17000 |
| }, |
| { |
| "epoch": 16.03774475112055, |
| "eval_loss": 3.402015447616577, |
| "eval_runtime": 89.929, |
| "eval_samples_per_second": 167.588, |
| "eval_steps_per_second": 5.237, |
| "step": 17000 |
| }, |
| { |
| "epoch": 16.132106628921914, |
| "grad_norm": 0.6820452809333801, |
| "learning_rate": 9.847269560738218e-05, |
| "loss": 3.1936, |
| "step": 17100 |
| }, |
| { |
| "epoch": 16.226468506723283, |
| "grad_norm": 0.6789988875389099, |
| "learning_rate": 9.842959207266149e-05, |
| "loss": 3.2047, |
| "step": 17200 |
| }, |
| { |
| "epoch": 16.32083038452465, |
| "grad_norm": 0.6698039174079895, |
| "learning_rate": 9.838589843864484e-05, |
| "loss": 3.2103, |
| "step": 17300 |
| }, |
| { |
| "epoch": 16.41519226232602, |
| "grad_norm": 0.6566837430000305, |
| "learning_rate": 9.834161523772539e-05, |
| "loss": 3.2203, |
| "step": 17400 |
| }, |
| { |
| "epoch": 16.50955414012739, |
| "grad_norm": 0.677543044090271, |
| "learning_rate": 9.829674300947993e-05, |
| "loss": 3.222, |
| "step": 17500 |
| }, |
| { |
| "epoch": 16.603916017928757, |
| "grad_norm": 0.679976761341095, |
| "learning_rate": 9.825128230066244e-05, |
| "loss": 3.2282, |
| "step": 17600 |
| }, |
| { |
| "epoch": 16.698277895730126, |
| "grad_norm": 0.670319676399231, |
| "learning_rate": 9.82052336651973e-05, |
| "loss": 3.2225, |
| "step": 17700 |
| }, |
| { |
| "epoch": 16.792639773531494, |
| "grad_norm": 0.6647588610649109, |
| "learning_rate": 9.815859766417257e-05, |
| "loss": 3.2326, |
| "step": 17800 |
| }, |
| { |
| "epoch": 16.887001651332863, |
| "grad_norm": 0.6643775701522827, |
| "learning_rate": 9.811137486583324e-05, |
| "loss": 3.2256, |
| "step": 17900 |
| }, |
| { |
| "epoch": 16.981363529134228, |
| "grad_norm": 0.6705678701400757, |
| "learning_rate": 9.806356584557419e-05, |
| "loss": 3.2403, |
| "step": 18000 |
| }, |
| { |
| "epoch": 16.981363529134228, |
| "eval_loss": 3.387256622314453, |
| "eval_runtime": 89.9338, |
| "eval_samples_per_second": 167.579, |
| "eval_steps_per_second": 5.237, |
| "step": 18000 |
| }, |
| { |
| "epoch": 17.075489502241094, |
| "grad_norm": 0.6900054216384888, |
| "learning_rate": 9.801517118593327e-05, |
| "loss": 3.1775, |
| "step": 18100 |
| }, |
| { |
| "epoch": 17.169851380042463, |
| "grad_norm": 0.6650823950767517, |
| "learning_rate": 9.796619147658408e-05, |
| "loss": 3.1641, |
| "step": 18200 |
| }, |
| { |
| "epoch": 17.26421325784383, |
| "grad_norm": 0.6726897358894348, |
| "learning_rate": 9.791662731432898e-05, |
| "loss": 3.175, |
| "step": 18300 |
| }, |
| { |
| "epoch": 17.3585751356452, |
| "grad_norm": 0.6691387295722961, |
| "learning_rate": 9.78664793030916e-05, |
| "loss": 3.1834, |
| "step": 18400 |
| }, |
| { |
| "epoch": 17.45293701344657, |
| "grad_norm": 0.6631948351860046, |
| "learning_rate": 9.781574805390967e-05, |
| "loss": 3.1814, |
| "step": 18500 |
| }, |
| { |
| "epoch": 17.547298891247937, |
| "grad_norm": 0.6776889562606812, |
| "learning_rate": 9.776443418492744e-05, |
| "loss": 3.1934, |
| "step": 18600 |
| }, |
| { |
| "epoch": 17.641660769049302, |
| "grad_norm": 0.6866058111190796, |
| "learning_rate": 9.771253832138819e-05, |
| "loss": 3.1933, |
| "step": 18700 |
| }, |
| { |
| "epoch": 17.73602264685067, |
| "grad_norm": 0.6719706058502197, |
| "learning_rate": 9.766006109562664e-05, |
| "loss": 3.1993, |
| "step": 18800 |
| }, |
| { |
| "epoch": 17.83038452465204, |
| "grad_norm": 0.6513810753822327, |
| "learning_rate": 9.760700314706125e-05, |
| "loss": 3.21, |
| "step": 18900 |
| }, |
| { |
| "epoch": 17.924746402453408, |
| "grad_norm": 0.6892839074134827, |
| "learning_rate": 9.755336512218638e-05, |
| "loss": 3.2045, |
| "step": 19000 |
| }, |
| { |
| "epoch": 17.924746402453408, |
| "eval_loss": 3.3803834915161133, |
| "eval_runtime": 89.9356, |
| "eval_samples_per_second": 167.575, |
| "eval_steps_per_second": 5.237, |
| "step": 19000 |
| }, |
| { |
| "epoch": 18.018872375560274, |
| "grad_norm": 0.671567440032959, |
| "learning_rate": 9.749914767456441e-05, |
| "loss": 3.1867, |
| "step": 19100 |
| }, |
| { |
| "epoch": 18.113234253361643, |
| "grad_norm": 0.6859995126724243, |
| "learning_rate": 9.744435146481785e-05, |
| "loss": 3.1267, |
| "step": 19200 |
| }, |
| { |
| "epoch": 18.20759613116301, |
| "grad_norm": 0.6942476630210876, |
| "learning_rate": 9.738897716062121e-05, |
| "loss": 3.1458, |
| "step": 19300 |
| }, |
| { |
| "epoch": 18.301958008964377, |
| "grad_norm": 0.6862732768058777, |
| "learning_rate": 9.733302543669291e-05, |
| "loss": 3.151, |
| "step": 19400 |
| }, |
| { |
| "epoch": 18.396319886765745, |
| "grad_norm": 0.6695058941841125, |
| "learning_rate": 9.727649697478708e-05, |
| "loss": 3.1599, |
| "step": 19500 |
| }, |
| { |
| "epoch": 18.490681764567114, |
| "grad_norm": 0.6894610524177551, |
| "learning_rate": 9.721939246368515e-05, |
| "loss": 3.1535, |
| "step": 19600 |
| }, |
| { |
| "epoch": 18.585043642368483, |
| "grad_norm": 0.65924471616745, |
| "learning_rate": 9.716171259918758e-05, |
| "loss": 3.1606, |
| "step": 19700 |
| }, |
| { |
| "epoch": 18.67940552016985, |
| "grad_norm": 0.6839491724967957, |
| "learning_rate": 9.710345808410532e-05, |
| "loss": 3.1706, |
| "step": 19800 |
| }, |
| { |
| "epoch": 18.77376739797122, |
| "grad_norm": 0.6813986897468567, |
| "learning_rate": 9.704462962825124e-05, |
| "loss": 3.1755, |
| "step": 19900 |
| }, |
| { |
| "epoch": 18.86812927577259, |
| "grad_norm": 0.677698016166687, |
| "learning_rate": 9.698522794843154e-05, |
| "loss": 3.1827, |
| "step": 20000 |
| }, |
| { |
| "epoch": 18.86812927577259, |
| "eval_loss": 3.3754522800445557, |
| "eval_runtime": 89.9428, |
| "eval_samples_per_second": 167.562, |
| "eval_steps_per_second": 5.237, |
| "step": 20000 |
| }, |
| { |
| "epoch": 18.962491153573957, |
| "grad_norm": 0.6723212003707886, |
| "learning_rate": 9.692525376843691e-05, |
| "loss": 3.1761, |
| "step": 20100 |
| }, |
| { |
| "epoch": 19.05661712668082, |
| "grad_norm": 0.6973418593406677, |
| "learning_rate": 9.686470781903383e-05, |
| "loss": 3.1261, |
| "step": 20200 |
| }, |
| { |
| "epoch": 19.15097900448219, |
| "grad_norm": 0.6863260865211487, |
| "learning_rate": 9.680359083795557e-05, |
| "loss": 3.108, |
| "step": 20300 |
| }, |
| { |
| "epoch": 19.245340882283557, |
| "grad_norm": 0.6857718825340271, |
| "learning_rate": 9.674190356989325e-05, |
| "loss": 3.1168, |
| "step": 20400 |
| }, |
| { |
| "epoch": 19.339702760084926, |
| "grad_norm": 0.6926584839820862, |
| "learning_rate": 9.66796467664868e-05, |
| "loss": 3.1257, |
| "step": 20500 |
| }, |
| { |
| "epoch": 19.434064637886294, |
| "grad_norm": 0.651599645614624, |
| "learning_rate": 9.661682118631568e-05, |
| "loss": 3.1265, |
| "step": 20600 |
| }, |
| { |
| "epoch": 19.528426515687663, |
| "grad_norm": 0.6708794832229614, |
| "learning_rate": 9.655342759488979e-05, |
| "loss": 3.1367, |
| "step": 20700 |
| }, |
| { |
| "epoch": 19.62278839348903, |
| "grad_norm": 0.6695137619972229, |
| "learning_rate": 9.648946676464002e-05, |
| "loss": 3.1375, |
| "step": 20800 |
| }, |
| { |
| "epoch": 19.7171502712904, |
| "grad_norm": 0.6629143357276917, |
| "learning_rate": 9.642493947490889e-05, |
| "loss": 3.1393, |
| "step": 20900 |
| }, |
| { |
| "epoch": 19.81151214909177, |
| "grad_norm": 0.6744666695594788, |
| "learning_rate": 9.635984651194109e-05, |
| "loss": 3.1504, |
| "step": 21000 |
| }, |
| { |
| "epoch": 19.81151214909177, |
| "eval_loss": 3.3715157508850098, |
| "eval_runtime": 89.9309, |
| "eval_samples_per_second": 167.584, |
| "eval_steps_per_second": 5.237, |
| "step": 21000 |
| }, |
| { |
| "epoch": 19.905874026893134, |
| "grad_norm": 0.6979096531867981, |
| "learning_rate": 9.629418866887381e-05, |
| "loss": 3.1563, |
| "step": 21100 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.7904658317565918, |
| "learning_rate": 9.622796674572716e-05, |
| "loss": 3.1528, |
| "step": 21200 |
| }, |
| { |
| "epoch": 20.09436187780137, |
| "grad_norm": 0.6927621960639954, |
| "learning_rate": 9.616118154939436e-05, |
| "loss": 3.0697, |
| "step": 21300 |
| }, |
| { |
| "epoch": 20.188723755602737, |
| "grad_norm": 0.6882548332214355, |
| "learning_rate": 9.609383389363198e-05, |
| "loss": 3.0805, |
| "step": 21400 |
| }, |
| { |
| "epoch": 20.283085633404106, |
| "grad_norm": 0.6912660002708435, |
| "learning_rate": 9.602592459904993e-05, |
| "loss": 3.0944, |
| "step": 21500 |
| }, |
| { |
| "epoch": 20.377447511205474, |
| "grad_norm": 0.6904003620147705, |
| "learning_rate": 9.595745449310152e-05, |
| "loss": 3.1035, |
| "step": 21600 |
| }, |
| { |
| "epoch": 20.47180938900684, |
| "grad_norm": 0.6675978899002075, |
| "learning_rate": 9.588842441007342e-05, |
| "loss": 3.1113, |
| "step": 21700 |
| }, |
| { |
| "epoch": 20.566171266808208, |
| "grad_norm": 0.6616461873054504, |
| "learning_rate": 9.581883519107538e-05, |
| "loss": 3.1086, |
| "step": 21800 |
| }, |
| { |
| "epoch": 20.660533144609577, |
| "grad_norm": 0.6793836951255798, |
| "learning_rate": 9.574868768403007e-05, |
| "loss": 3.1142, |
| "step": 21900 |
| }, |
| { |
| "epoch": 20.754895022410945, |
| "grad_norm": 0.6753296256065369, |
| "learning_rate": 9.567798274366273e-05, |
| "loss": 3.1291, |
| "step": 22000 |
| }, |
| { |
| "epoch": 20.754895022410945, |
| "eval_loss": 3.3698410987854004, |
| "eval_runtime": 89.921, |
| "eval_samples_per_second": 167.603, |
| "eval_steps_per_second": 5.238, |
| "step": 22000 |
| }, |
| { |
| "epoch": 20.849256900212314, |
| "grad_norm": 0.6524435877799988, |
| "learning_rate": 9.560672123149077e-05, |
| "loss": 3.1228, |
| "step": 22100 |
| }, |
| { |
| "epoch": 20.943618778013683, |
| "grad_norm": 0.6633111238479614, |
| "learning_rate": 9.55349040158132e-05, |
| "loss": 3.1326, |
| "step": 22200 |
| }, |
| { |
| "epoch": 21.03774475112055, |
| "grad_norm": 0.6754229068756104, |
| "learning_rate": 9.546253197170015e-05, |
| "loss": 3.0924, |
| "step": 22300 |
| }, |
| { |
| "epoch": 21.132106628921914, |
| "grad_norm": 0.6778271794319153, |
| "learning_rate": 9.538960598098211e-05, |
| "loss": 3.0552, |
| "step": 22400 |
| }, |
| { |
| "epoch": 21.226468506723283, |
| "grad_norm": 0.688077449798584, |
| "learning_rate": 9.531612693223928e-05, |
| "loss": 3.0623, |
| "step": 22500 |
| }, |
| { |
| "epoch": 21.32083038452465, |
| "grad_norm": 0.7048831582069397, |
| "learning_rate": 9.524209572079068e-05, |
| "loss": 3.0733, |
| "step": 22600 |
| }, |
| { |
| "epoch": 21.41519226232602, |
| "grad_norm": 0.6822944283485413, |
| "learning_rate": 9.516751324868326e-05, |
| "loss": 3.089, |
| "step": 22700 |
| }, |
| { |
| "epoch": 21.50955414012739, |
| "grad_norm": 0.6703271269798279, |
| "learning_rate": 9.509238042468091e-05, |
| "loss": 3.0831, |
| "step": 22800 |
| }, |
| { |
| "epoch": 21.603916017928757, |
| "grad_norm": 0.6696993708610535, |
| "learning_rate": 9.501669816425337e-05, |
| "loss": 3.0977, |
| "step": 22900 |
| }, |
| { |
| "epoch": 21.698277895730126, |
| "grad_norm": 0.673234224319458, |
| "learning_rate": 9.494046738956508e-05, |
| "loss": 3.096, |
| "step": 23000 |
| }, |
| { |
| "epoch": 21.698277895730126, |
| "eval_loss": 3.3681890964508057, |
| "eval_runtime": 89.9353, |
| "eval_samples_per_second": 167.576, |
| "eval_steps_per_second": 5.237, |
| "step": 23000 |
| }, |
| { |
| "epoch": 21.792639773531494, |
| "grad_norm": 0.6924330592155457, |
| "learning_rate": 9.486368902946402e-05, |
| "loss": 3.0936, |
| "step": 23100 |
| }, |
| { |
| "epoch": 21.887001651332863, |
| "grad_norm": 0.6492679715156555, |
| "learning_rate": 9.478636401947026e-05, |
| "loss": 3.0962, |
| "step": 23200 |
| }, |
| { |
| "epoch": 21.981363529134228, |
| "grad_norm": 0.6826934814453125, |
| "learning_rate": 9.47084933017646e-05, |
| "loss": 3.1055, |
| "step": 23300 |
| }, |
| { |
| "epoch": 22.075489502241094, |
| "grad_norm": 0.717924177646637, |
| "learning_rate": 9.463007782517723e-05, |
| "loss": 3.0381, |
| "step": 23400 |
| }, |
| { |
| "epoch": 22.169851380042463, |
| "grad_norm": 0.7275199890136719, |
| "learning_rate": 9.455111854517595e-05, |
| "loss": 3.0324, |
| "step": 23500 |
| }, |
| { |
| "epoch": 22.26421325784383, |
| "grad_norm": 0.7005674839019775, |
| "learning_rate": 9.447161642385467e-05, |
| "loss": 3.0485, |
| "step": 23600 |
| }, |
| { |
| "epoch": 22.3585751356452, |
| "grad_norm": 0.6765129566192627, |
| "learning_rate": 9.439157242992164e-05, |
| "loss": 3.0495, |
| "step": 23700 |
| }, |
| { |
| "epoch": 22.45293701344657, |
| "grad_norm": 0.7040418982505798, |
| "learning_rate": 9.43109875386877e-05, |
| "loss": 3.0574, |
| "step": 23800 |
| }, |
| { |
| "epoch": 22.547298891247937, |
| "grad_norm": 0.6893322467803955, |
| "learning_rate": 9.422986273205429e-05, |
| "loss": 3.0706, |
| "step": 23900 |
| }, |
| { |
| "epoch": 22.641660769049302, |
| "grad_norm": 0.6621236205101013, |
| "learning_rate": 9.414819899850158e-05, |
| "loss": 3.0781, |
| "step": 24000 |
| }, |
| { |
| "epoch": 22.641660769049302, |
| "eval_loss": 3.3693912029266357, |
| "eval_runtime": 89.9188, |
| "eval_samples_per_second": 167.607, |
| "eval_steps_per_second": 5.238, |
| "step": 24000 |
| }, |
| { |
| "epoch": 22.73602264685067, |
| "grad_norm": 0.6813406348228455, |
| "learning_rate": 9.40659973330764e-05, |
| "loss": 3.0719, |
| "step": 24100 |
| }, |
| { |
| "epoch": 22.83038452465204, |
| "grad_norm": 0.7135064601898193, |
| "learning_rate": 9.398325873738007e-05, |
| "loss": 3.0747, |
| "step": 24200 |
| }, |
| { |
| "epoch": 22.924746402453408, |
| "grad_norm": 0.6699959635734558, |
| "learning_rate": 9.389998421955632e-05, |
| "loss": 3.0834, |
| "step": 24300 |
| }, |
| { |
| "epoch": 23.018872375560274, |
| "grad_norm": 0.7042635679244995, |
| "learning_rate": 9.381617479427885e-05, |
| "loss": 3.0699, |
| "step": 24400 |
| }, |
| { |
| "epoch": 23.113234253361643, |
| "grad_norm": 0.6912421584129333, |
| "learning_rate": 9.373183148273905e-05, |
| "loss": 2.998, |
| "step": 24500 |
| }, |
| { |
| "epoch": 23.20759613116301, |
| "grad_norm": 0.6952877640724182, |
| "learning_rate": 9.364695531263354e-05, |
| "loss": 3.013, |
| "step": 24600 |
| }, |
| { |
| "epoch": 23.301958008964377, |
| "grad_norm": 0.697961688041687, |
| "learning_rate": 9.35615473181517e-05, |
| "loss": 3.0282, |
| "step": 24700 |
| }, |
| { |
| "epoch": 23.396319886765745, |
| "grad_norm": 0.6919636130332947, |
| "learning_rate": 9.347560853996298e-05, |
| "loss": 3.0346, |
| "step": 24800 |
| }, |
| { |
| "epoch": 23.490681764567114, |
| "grad_norm": 0.6811705827713013, |
| "learning_rate": 9.338914002520426e-05, |
| "loss": 3.044, |
| "step": 24900 |
| }, |
| { |
| "epoch": 23.585043642368483, |
| "grad_norm": 0.6770613789558411, |
| "learning_rate": 9.330214282746712e-05, |
| "loss": 3.0517, |
| "step": 25000 |
| }, |
| { |
| "epoch": 23.585043642368483, |
| "eval_loss": 3.3697221279144287, |
| "eval_runtime": 89.9693, |
| "eval_samples_per_second": 167.513, |
| "eval_steps_per_second": 5.235, |
| "step": 25000 |
| }, |
| { |
| "epoch": 23.67940552016985, |
| "grad_norm": 0.6813396215438843, |
| "learning_rate": 9.321461800678494e-05, |
| "loss": 3.054, |
| "step": 25100 |
| }, |
| { |
| "epoch": 23.77376739797122, |
| "grad_norm": 0.6816933751106262, |
| "learning_rate": 9.312656662962004e-05, |
| "loss": 3.0563, |
| "step": 25200 |
| }, |
| { |
| "epoch": 23.86812927577259, |
| "grad_norm": 0.6841709017753601, |
| "learning_rate": 9.30379897688507e-05, |
| "loss": 3.0676, |
| "step": 25300 |
| }, |
| { |
| "epoch": 23.962491153573957, |
| "grad_norm": 0.6790122985839844, |
| "learning_rate": 9.294888850375796e-05, |
| "loss": 3.0636, |
| "step": 25400 |
| }, |
| { |
| "epoch": 24.05661712668082, |
| "grad_norm": 0.7041143178939819, |
| "learning_rate": 9.285926392001265e-05, |
| "loss": 3.0099, |
| "step": 25500 |
| }, |
| { |
| "epoch": 24.15097900448219, |
| "grad_norm": 0.7083395719528198, |
| "learning_rate": 9.276911710966205e-05, |
| "loss": 2.9899, |
| "step": 25600 |
| }, |
| { |
| "epoch": 24.245340882283557, |
| "grad_norm": 0.7026656866073608, |
| "learning_rate": 9.267844917111657e-05, |
| "loss": 3.0013, |
| "step": 25700 |
| }, |
| { |
| "epoch": 24.339702760084926, |
| "grad_norm": 0.7285379767417908, |
| "learning_rate": 9.258726120913643e-05, |
| "loss": 3.0126, |
| "step": 25800 |
| }, |
| { |
| "epoch": 24.434064637886294, |
| "grad_norm": 0.6987733244895935, |
| "learning_rate": 9.249555433481819e-05, |
| "loss": 3.0135, |
| "step": 25900 |
| }, |
| { |
| "epoch": 24.528426515687663, |
| "grad_norm": 0.7016812562942505, |
| "learning_rate": 9.240332966558116e-05, |
| "loss": 3.0267, |
| "step": 26000 |
| }, |
| { |
| "epoch": 24.528426515687663, |
| "eval_loss": 3.3704559803009033, |
| "eval_runtime": 89.912, |
| "eval_samples_per_second": 167.619, |
| "eval_steps_per_second": 5.238, |
| "step": 26000 |
| }, |
| { |
| "epoch": 24.62278839348903, |
| "grad_norm": 0.6898572444915771, |
| "learning_rate": 9.231058832515383e-05, |
| "loss": 3.0314, |
| "step": 26100 |
| }, |
| { |
| "epoch": 24.7171502712904, |
| "grad_norm": 0.6981006264686584, |
| "learning_rate": 9.221733144356015e-05, |
| "loss": 3.0389, |
| "step": 26200 |
| }, |
| { |
| "epoch": 24.81151214909177, |
| "grad_norm": 0.6911486983299255, |
| "learning_rate": 9.212356015710581e-05, |
| "loss": 3.0342, |
| "step": 26300 |
| }, |
| { |
| "epoch": 24.905874026893134, |
| "grad_norm": 0.6749563217163086, |
| "learning_rate": 9.202927560836436e-05, |
| "loss": 3.0406, |
| "step": 26400 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.8290687203407288, |
| "learning_rate": 9.193447894616324e-05, |
| "loss": 3.0449, |
| "step": 26500 |
| }, |
| { |
| "epoch": 25.09436187780137, |
| "grad_norm": 0.7203742861747742, |
| "learning_rate": 9.183917132556987e-05, |
| "loss": 2.9604, |
| "step": 26600 |
| }, |
| { |
| "epoch": 25.188723755602737, |
| "grad_norm": 0.7170647978782654, |
| "learning_rate": 9.174335390787754e-05, |
| "loss": 2.9701, |
| "step": 26700 |
| }, |
| { |
| "epoch": 25.283085633404106, |
| "grad_norm": 0.7039710879325867, |
| "learning_rate": 9.164702786059125e-05, |
| "loss": 2.9824, |
| "step": 26800 |
| }, |
| { |
| "epoch": 25.377447511205474, |
| "grad_norm": 0.6996464133262634, |
| "learning_rate": 9.155019435741348e-05, |
| "loss": 2.9852, |
| "step": 26900 |
| }, |
| { |
| "epoch": 25.47180938900684, |
| "grad_norm": 0.7146093845367432, |
| "learning_rate": 9.14528545782299e-05, |
| "loss": 3.0032, |
| "step": 27000 |
| }, |
| { |
| "epoch": 25.47180938900684, |
| "eval_loss": 3.370755195617676, |
| "eval_runtime": 89.9361, |
| "eval_samples_per_second": 167.575, |
| "eval_steps_per_second": 5.237, |
| "step": 27000 |
| }, |
| { |
| "epoch": 25.566171266808208, |
| "grad_norm": 0.7037533521652222, |
| "learning_rate": 9.135500970909501e-05, |
| "loss": 3.0142, |
| "step": 27100 |
| }, |
| { |
| "epoch": 25.660533144609577, |
| "grad_norm": 0.6862325668334961, |
| "learning_rate": 9.125666094221766e-05, |
| "loss": 3.0143, |
| "step": 27200 |
| }, |
| { |
| "epoch": 25.754895022410945, |
| "grad_norm": 0.7106537818908691, |
| "learning_rate": 9.115780947594654e-05, |
| "loss": 3.0167, |
| "step": 27300 |
| }, |
| { |
| "epoch": 25.849256900212314, |
| "grad_norm": 0.684776246547699, |
| "learning_rate": 9.105845651475556e-05, |
| "loss": 3.0252, |
| "step": 27400 |
| }, |
| { |
| "epoch": 25.943618778013683, |
| "grad_norm": 0.6834477782249451, |
| "learning_rate": 9.09586032692292e-05, |
| "loss": 3.0274, |
| "step": 27500 |
| }, |
| { |
| "epoch": 26.03774475112055, |
| "grad_norm": 0.6980359554290771, |
| "learning_rate": 9.085825095604778e-05, |
| "loss": 2.9955, |
| "step": 27600 |
| }, |
| { |
| "epoch": 26.132106628921914, |
| "grad_norm": 0.6987936496734619, |
| "learning_rate": 9.075740079797253e-05, |
| "loss": 2.944, |
| "step": 27700 |
| }, |
| { |
| "epoch": 26.226468506723283, |
| "grad_norm": 0.7081961631774902, |
| "learning_rate": 9.06560540238308e-05, |
| "loss": 2.9606, |
| "step": 27800 |
| }, |
| { |
| "epoch": 26.32083038452465, |
| "grad_norm": 0.7238285541534424, |
| "learning_rate": 9.055421186850104e-05, |
| "loss": 2.9731, |
| "step": 27900 |
| }, |
| { |
| "epoch": 26.41519226232602, |
| "grad_norm": 0.7310410141944885, |
| "learning_rate": 9.045187557289783e-05, |
| "loss": 2.9782, |
| "step": 28000 |
| }, |
| { |
| "epoch": 26.41519226232602, |
| "eval_loss": 3.3737330436706543, |
| "eval_runtime": 89.9068, |
| "eval_samples_per_second": 167.629, |
| "eval_steps_per_second": 5.239, |
| "step": 28000 |
| }, |
| { |
| "epoch": 26.50955414012739, |
| "grad_norm": 0.7020130753517151, |
| "learning_rate": 9.034904638395656e-05, |
| "loss": 2.9869, |
| "step": 28100 |
| }, |
| { |
| "epoch": 26.603916017928757, |
| "grad_norm": 0.6861512660980225, |
| "learning_rate": 9.024572555461852e-05, |
| "loss": 2.986, |
| "step": 28200 |
| }, |
| { |
| "epoch": 26.698277895730126, |
| "grad_norm": 0.6824551224708557, |
| "learning_rate": 9.014191434381535e-05, |
| "loss": 2.9844, |
| "step": 28300 |
| }, |
| { |
| "epoch": 26.792639773531494, |
| "grad_norm": 0.6935917735099792, |
| "learning_rate": 9.003761401645393e-05, |
| "loss": 3.0022, |
| "step": 28400 |
| }, |
| { |
| "epoch": 26.887001651332863, |
| "grad_norm": 0.7133828401565552, |
| "learning_rate": 8.99328258434008e-05, |
| "loss": 3.0163, |
| "step": 28500 |
| }, |
| { |
| "epoch": 26.981363529134228, |
| "grad_norm": 0.6982234120368958, |
| "learning_rate": 8.982755110146681e-05, |
| "loss": 3.0124, |
| "step": 28600 |
| }, |
| { |
| "epoch": 27.075489502241094, |
| "grad_norm": 0.7271915078163147, |
| "learning_rate": 8.972179107339148e-05, |
| "loss": 2.9435, |
| "step": 28700 |
| }, |
| { |
| "epoch": 27.169851380042463, |
| "grad_norm": 0.7148338556289673, |
| "learning_rate": 8.961554704782731e-05, |
| "loss": 2.9278, |
| "step": 28800 |
| }, |
| { |
| "epoch": 27.26421325784383, |
| "grad_norm": 0.714860737323761, |
| "learning_rate": 8.95088203193243e-05, |
| "loss": 2.9465, |
| "step": 28900 |
| }, |
| { |
| "epoch": 27.3585751356452, |
| "grad_norm": 0.7342090606689453, |
| "learning_rate": 8.940161218831391e-05, |
| "loss": 2.95, |
| "step": 29000 |
| }, |
| { |
| "epoch": 27.3585751356452, |
| "eval_loss": 3.375699281692505, |
| "eval_runtime": 89.9233, |
| "eval_samples_per_second": 167.598, |
| "eval_steps_per_second": 5.238, |
| "step": 29000 |
| }, |
| { |
| "epoch": 27.45293701344657, |
| "grad_norm": 0.7332724332809448, |
| "learning_rate": 8.929392396109341e-05, |
| "loss": 2.9628, |
| "step": 29100 |
| }, |
| { |
| "epoch": 27.547298891247937, |
| "grad_norm": 0.6984511017799377, |
| "learning_rate": 8.918575694980983e-05, |
| "loss": 2.9744, |
| "step": 29200 |
| }, |
| { |
| "epoch": 27.641660769049302, |
| "grad_norm": 0.6978433132171631, |
| "learning_rate": 8.90771124724441e-05, |
| "loss": 2.97, |
| "step": 29300 |
| }, |
| { |
| "epoch": 27.73602264685067, |
| "grad_norm": 0.6954489946365356, |
| "learning_rate": 8.896799185279487e-05, |
| "loss": 2.984, |
| "step": 29400 |
| }, |
| { |
| "epoch": 27.83038452465204, |
| "grad_norm": 0.6958970427513123, |
| "learning_rate": 8.885839642046249e-05, |
| "loss": 2.9867, |
| "step": 29500 |
| }, |
| { |
| "epoch": 27.924746402453408, |
| "grad_norm": 0.7199437618255615, |
| "learning_rate": 8.874832751083266e-05, |
| "loss": 2.9967, |
| "step": 29600 |
| }, |
| { |
| "epoch": 28.018872375560274, |
| "grad_norm": 0.7096304893493652, |
| "learning_rate": 8.863778646506035e-05, |
| "loss": 2.9769, |
| "step": 29700 |
| }, |
| { |
| "epoch": 28.113234253361643, |
| "grad_norm": 0.7242410182952881, |
| "learning_rate": 8.85267746300533e-05, |
| "loss": 2.9083, |
| "step": 29800 |
| }, |
| { |
| "epoch": 28.20759613116301, |
| "grad_norm": 0.7416352033615112, |
| "learning_rate": 8.841529335845569e-05, |
| "loss": 2.9245, |
| "step": 29900 |
| }, |
| { |
| "epoch": 28.301958008964377, |
| "grad_norm": 0.7048490047454834, |
| "learning_rate": 8.830334400863164e-05, |
| "loss": 2.9359, |
| "step": 30000 |
| }, |
| { |
| "epoch": 28.301958008964377, |
| "eval_loss": 3.3796088695526123, |
| "eval_runtime": 89.9224, |
| "eval_samples_per_second": 167.6, |
| "eval_steps_per_second": 5.238, |
| "step": 30000 |
| }, |
| { |
| "epoch": 28.396319886765745, |
| "grad_norm": 0.7235086560249329, |
| "learning_rate": 8.819092794464863e-05, |
| "loss": 2.9466, |
| "step": 30100 |
| }, |
| { |
| "epoch": 28.490681764567114, |
| "grad_norm": 0.7137046456336975, |
| "learning_rate": 8.807804653626095e-05, |
| "loss": 2.9498, |
| "step": 30200 |
| }, |
| { |
| "epoch": 28.585043642368483, |
| "grad_norm": 0.7323505282402039, |
| "learning_rate": 8.796470115889292e-05, |
| "loss": 2.959, |
| "step": 30300 |
| }, |
| { |
| "epoch": 28.67940552016985, |
| "grad_norm": 0.7089937329292297, |
| "learning_rate": 8.785089319362221e-05, |
| "loss": 2.9625, |
| "step": 30400 |
| }, |
| { |
| "epoch": 28.77376739797122, |
| "grad_norm": 0.7195263504981995, |
| "learning_rate": 8.773662402716294e-05, |
| "loss": 2.9597, |
| "step": 30500 |
| }, |
| { |
| "epoch": 28.86812927577259, |
| "grad_norm": 0.7019860148429871, |
| "learning_rate": 8.762189505184885e-05, |
| "loss": 2.9728, |
| "step": 30600 |
| }, |
| { |
| "epoch": 28.962491153573957, |
| "grad_norm": 0.7253925204277039, |
| "learning_rate": 8.75067076656163e-05, |
| "loss": 2.9719, |
| "step": 30700 |
| }, |
| { |
| "epoch": 29.05661712668082, |
| "grad_norm": 0.7433711290359497, |
| "learning_rate": 8.739106327198724e-05, |
| "loss": 2.9189, |
| "step": 30800 |
| }, |
| { |
| "epoch": 29.15097900448219, |
| "grad_norm": 0.7175608277320862, |
| "learning_rate": 8.727496328005211e-05, |
| "loss": 2.8938, |
| "step": 30900 |
| }, |
| { |
| "epoch": 29.245340882283557, |
| "grad_norm": 0.7219389081001282, |
| "learning_rate": 8.715840910445267e-05, |
| "loss": 2.9088, |
| "step": 31000 |
| }, |
| { |
| "epoch": 29.245340882283557, |
| "eval_loss": 3.3833043575286865, |
| "eval_runtime": 89.9172, |
| "eval_samples_per_second": 167.61, |
| "eval_steps_per_second": 5.238, |
| "step": 31000 |
| }, |
| { |
| "epoch": 29.339702760084926, |
| "grad_norm": 0.7131245136260986, |
| "learning_rate": 8.704140216536478e-05, |
| "loss": 2.9259, |
| "step": 31100 |
| }, |
| { |
| "epoch": 29.434064637886294, |
| "grad_norm": 0.7141076326370239, |
| "learning_rate": 8.692394388848107e-05, |
| "loss": 2.9384, |
| "step": 31200 |
| }, |
| { |
| "epoch": 29.528426515687663, |
| "grad_norm": 0.729604959487915, |
| "learning_rate": 8.680603570499354e-05, |
| "loss": 2.936, |
| "step": 31300 |
| }, |
| { |
| "epoch": 29.62278839348903, |
| "grad_norm": 0.7165504693984985, |
| "learning_rate": 8.668767905157625e-05, |
| "loss": 2.941, |
| "step": 31400 |
| }, |
| { |
| "epoch": 29.7171502712904, |
| "grad_norm": 0.7350253462791443, |
| "learning_rate": 8.656887537036762e-05, |
| "loss": 2.943, |
| "step": 31500 |
| }, |
| { |
| "epoch": 29.81151214909177, |
| "grad_norm": 0.741100013256073, |
| "learning_rate": 8.644962610895303e-05, |
| "loss": 2.9523, |
| "step": 31600 |
| }, |
| { |
| "epoch": 29.905874026893134, |
| "grad_norm": 0.7068082094192505, |
| "learning_rate": 8.63299327203471e-05, |
| "loss": 2.9574, |
| "step": 31700 |
| }, |
| { |
| "epoch": 30.0, |
| "grad_norm": 0.8827744722366333, |
| "learning_rate": 8.620979666297603e-05, |
| "loss": 2.9659, |
| "step": 31800 |
| }, |
| { |
| "epoch": 30.09436187780137, |
| "grad_norm": 0.7378543615341187, |
| "learning_rate": 8.608921940065973e-05, |
| "loss": 2.8722, |
| "step": 31900 |
| }, |
| { |
| "epoch": 30.188723755602737, |
| "grad_norm": 0.7561068534851074, |
| "learning_rate": 8.596820240259408e-05, |
| "loss": 2.8901, |
| "step": 32000 |
| }, |
| { |
| "epoch": 30.188723755602737, |
| "eval_loss": 3.385913372039795, |
| "eval_runtime": 89.9361, |
| "eval_samples_per_second": 167.574, |
| "eval_steps_per_second": 5.237, |
| "step": 32000 |
| }, |
| { |
| "epoch": 30.283085633404106, |
| "grad_norm": 0.7235729098320007, |
| "learning_rate": 8.584674714333303e-05, |
| "loss": 2.8969, |
| "step": 32100 |
| }, |
| { |
| "epoch": 30.377447511205474, |
| "grad_norm": 0.7277801036834717, |
| "learning_rate": 8.57248551027706e-05, |
| "loss": 2.911, |
| "step": 32200 |
| }, |
| { |
| "epoch": 30.47180938900684, |
| "grad_norm": 0.7331570386886597, |
| "learning_rate": 8.56025277661228e-05, |
| "loss": 2.9078, |
| "step": 32300 |
| }, |
| { |
| "epoch": 30.566171266808208, |
| "grad_norm": 0.7430235147476196, |
| "learning_rate": 8.547976662390964e-05, |
| "loss": 2.923, |
| "step": 32400 |
| }, |
| { |
| "epoch": 30.660533144609577, |
| "grad_norm": 0.7435758113861084, |
| "learning_rate": 8.535657317193692e-05, |
| "loss": 2.929, |
| "step": 32500 |
| }, |
| { |
| "epoch": 30.754895022410945, |
| "grad_norm": 0.7271901965141296, |
| "learning_rate": 8.523294891127794e-05, |
| "loss": 2.9341, |
| "step": 32600 |
| }, |
| { |
| "epoch": 30.849256900212314, |
| "grad_norm": 0.7337944507598877, |
| "learning_rate": 8.510889534825532e-05, |
| "loss": 2.9427, |
| "step": 32700 |
| }, |
| { |
| "epoch": 30.943618778013683, |
| "grad_norm": 0.74055415391922, |
| "learning_rate": 8.498441399442258e-05, |
| "loss": 2.9501, |
| "step": 32800 |
| }, |
| { |
| "epoch": 31.03774475112055, |
| "grad_norm": 0.7217901945114136, |
| "learning_rate": 8.485950636654572e-05, |
| "loss": 2.9139, |
| "step": 32900 |
| }, |
| { |
| "epoch": 31.132106628921914, |
| "grad_norm": 0.7476524114608765, |
| "learning_rate": 8.473417398658476e-05, |
| "loss": 2.862, |
| "step": 33000 |
| }, |
| { |
| "epoch": 31.132106628921914, |
| "eval_loss": 3.3878986835479736, |
| "eval_runtime": 89.9711, |
| "eval_samples_per_second": 167.509, |
| "eval_steps_per_second": 5.235, |
| "step": 33000 |
| }, |
| { |
| "epoch": 31.226468506723283, |
| "grad_norm": 0.7332272529602051, |
| "learning_rate": 8.460841838167523e-05, |
| "loss": 2.8792, |
| "step": 33100 |
| }, |
| { |
| "epoch": 31.32083038452465, |
| "grad_norm": 0.7293978333473206, |
| "learning_rate": 8.448224108410947e-05, |
| "loss": 2.8846, |
| "step": 33200 |
| }, |
| { |
| "epoch": 31.41519226232602, |
| "grad_norm": 0.760312557220459, |
| "learning_rate": 8.435564363131803e-05, |
| "loss": 2.8935, |
| "step": 33300 |
| }, |
| { |
| "epoch": 31.50955414012739, |
| "grad_norm": 0.7319904565811157, |
| "learning_rate": 8.422862756585091e-05, |
| "loss": 2.9072, |
| "step": 33400 |
| }, |
| { |
| "epoch": 31.603916017928757, |
| "grad_norm": 0.7277701497077942, |
| "learning_rate": 8.41011944353588e-05, |
| "loss": 2.9056, |
| "step": 33500 |
| }, |
| { |
| "epoch": 31.698277895730126, |
| "grad_norm": 0.7287102937698364, |
| "learning_rate": 8.397334579257418e-05, |
| "loss": 2.9164, |
| "step": 33600 |
| }, |
| { |
| "epoch": 31.792639773531494, |
| "grad_norm": 0.7181798815727234, |
| "learning_rate": 8.384508319529242e-05, |
| "loss": 2.9264, |
| "step": 33700 |
| }, |
| { |
| "epoch": 31.887001651332863, |
| "grad_norm": 0.7266701459884644, |
| "learning_rate": 8.371640820635278e-05, |
| "loss": 2.9325, |
| "step": 33800 |
| }, |
| { |
| "epoch": 31.981363529134228, |
| "grad_norm": 0.7393918633460999, |
| "learning_rate": 8.358732239361938e-05, |
| "loss": 2.9315, |
| "step": 33900 |
| }, |
| { |
| "epoch": 32.0754895022411, |
| "grad_norm": 0.7530562877655029, |
| "learning_rate": 8.345782732996215e-05, |
| "loss": 2.8642, |
| "step": 34000 |
| }, |
| { |
| "epoch": 32.0754895022411, |
| "eval_loss": 3.388929605484009, |
| "eval_runtime": 89.9448, |
| "eval_samples_per_second": 167.558, |
| "eval_steps_per_second": 5.237, |
| "step": 34000 |
| }, |
| { |
| "epoch": 32.169851380042466, |
| "grad_norm": 0.7409230470657349, |
| "learning_rate": 8.332792459323753e-05, |
| "loss": 2.8584, |
| "step": 34100 |
| }, |
| { |
| "epoch": 32.26421325784383, |
| "grad_norm": 0.7440000176429749, |
| "learning_rate": 8.31976157662694e-05, |
| "loss": 2.8751, |
| "step": 34200 |
| }, |
| { |
| "epoch": 32.3585751356452, |
| "grad_norm": 0.7307605743408203, |
| "learning_rate": 8.30669024368297e-05, |
| "loss": 2.8727, |
| "step": 34300 |
| }, |
| { |
| "epoch": 32.452937013446565, |
| "grad_norm": 0.7464194893836975, |
| "learning_rate": 8.293578619761906e-05, |
| "loss": 2.8911, |
| "step": 34400 |
| }, |
| { |
| "epoch": 32.547298891247934, |
| "grad_norm": 0.7538871169090271, |
| "learning_rate": 8.280426864624753e-05, |
| "loss": 2.8857, |
| "step": 34500 |
| }, |
| { |
| "epoch": 32.6416607690493, |
| "grad_norm": 0.7551910281181335, |
| "learning_rate": 8.267235138521492e-05, |
| "loss": 2.9027, |
| "step": 34600 |
| }, |
| { |
| "epoch": 32.73602264685067, |
| "grad_norm": 0.7386404275894165, |
| "learning_rate": 8.254003602189146e-05, |
| "loss": 2.8945, |
| "step": 34700 |
| }, |
| { |
| "epoch": 32.83038452465204, |
| "grad_norm": 0.7178263068199158, |
| "learning_rate": 8.240732416849807e-05, |
| "loss": 2.9168, |
| "step": 34800 |
| }, |
| { |
| "epoch": 32.92474640245341, |
| "grad_norm": 0.7415292859077454, |
| "learning_rate": 8.227421744208683e-05, |
| "loss": 2.9136, |
| "step": 34900 |
| }, |
| { |
| "epoch": 33.01887237556027, |
| "grad_norm": 0.7464359402656555, |
| "learning_rate": 8.214071746452117e-05, |
| "loss": 2.8994, |
| "step": 35000 |
| }, |
| { |
| "epoch": 33.01887237556027, |
| "eval_loss": 3.3880865573883057, |
| "eval_runtime": 89.9524, |
| "eval_samples_per_second": 167.544, |
| "eval_steps_per_second": 5.236, |
| "step": 35000 |
| }, |
| { |
| "epoch": 33.11323425336164, |
| "grad_norm": 0.7449530959129333, |
| "learning_rate": 8.200682586245621e-05, |
| "loss": 2.838, |
| "step": 35100 |
| }, |
| { |
| "epoch": 33.20759613116301, |
| "grad_norm": 0.7371875643730164, |
| "learning_rate": 8.187254426731884e-05, |
| "loss": 2.8481, |
| "step": 35200 |
| }, |
| { |
| "epoch": 33.30195800896438, |
| "grad_norm": 0.7737680077552795, |
| "learning_rate": 8.173787431528794e-05, |
| "loss": 2.8612, |
| "step": 35300 |
| }, |
| { |
| "epoch": 33.396319886765745, |
| "grad_norm": 0.7652265429496765, |
| "learning_rate": 8.160281764727436e-05, |
| "loss": 2.869, |
| "step": 35400 |
| }, |
| { |
| "epoch": 33.490681764567114, |
| "grad_norm": 0.7546202540397644, |
| "learning_rate": 8.146737590890101e-05, |
| "loss": 2.8735, |
| "step": 35500 |
| }, |
| { |
| "epoch": 33.58504364236848, |
| "grad_norm": 0.7740090489387512, |
| "learning_rate": 8.133155075048269e-05, |
| "loss": 2.8876, |
| "step": 35600 |
| }, |
| { |
| "epoch": 33.67940552016985, |
| "grad_norm": 0.7482042908668518, |
| "learning_rate": 8.119534382700613e-05, |
| "loss": 2.8855, |
| "step": 35700 |
| }, |
| { |
| "epoch": 33.77376739797122, |
| "grad_norm": 0.749400794506073, |
| "learning_rate": 8.105875679810968e-05, |
| "loss": 2.8938, |
| "step": 35800 |
| }, |
| { |
| "epoch": 33.86812927577259, |
| "grad_norm": 0.7444037795066833, |
| "learning_rate": 8.092179132806317e-05, |
| "loss": 2.8955, |
| "step": 35900 |
| }, |
| { |
| "epoch": 33.96249115357396, |
| "grad_norm": 0.7378069162368774, |
| "learning_rate": 8.078444908574767e-05, |
| "loss": 2.9029, |
| "step": 36000 |
| }, |
| { |
| "epoch": 33.96249115357396, |
| "eval_loss": 3.381842851638794, |
| "eval_runtime": 89.9983, |
| "eval_samples_per_second": 167.459, |
| "eval_steps_per_second": 5.233, |
| "step": 36000 |
| }, |
| { |
| "epoch": 34.05661712668082, |
| "grad_norm": 0.7438592910766602, |
| "learning_rate": 8.064673174463505e-05, |
| "loss": 2.8451, |
| "step": 36100 |
| }, |
| { |
| "epoch": 34.15097900448219, |
| "grad_norm": 0.7583552002906799, |
| "learning_rate": 8.050864098276762e-05, |
| "loss": 2.8246, |
| "step": 36200 |
| }, |
| { |
| "epoch": 34.24534088228356, |
| "grad_norm": 0.762986421585083, |
| "learning_rate": 8.037017848273776e-05, |
| "loss": 2.8364, |
| "step": 36300 |
| }, |
| { |
| "epoch": 34.339702760084926, |
| "grad_norm": 0.7692263722419739, |
| "learning_rate": 8.023134593166734e-05, |
| "loss": 2.8472, |
| "step": 36400 |
| }, |
| { |
| "epoch": 34.434064637886294, |
| "grad_norm": 0.759116530418396, |
| "learning_rate": 8.009214502118718e-05, |
| "loss": 2.8519, |
| "step": 36500 |
| }, |
| { |
| "epoch": 34.52842651568766, |
| "grad_norm": 0.7704517245292664, |
| "learning_rate": 7.995257744741642e-05, |
| "loss": 2.87, |
| "step": 36600 |
| }, |
| { |
| "epoch": 34.62278839348903, |
| "grad_norm": 0.7511392831802368, |
| "learning_rate": 7.981264491094192e-05, |
| "loss": 2.8769, |
| "step": 36700 |
| }, |
| { |
| "epoch": 34.7171502712904, |
| "grad_norm": 0.7402914762496948, |
| "learning_rate": 7.967234911679749e-05, |
| "loss": 2.8784, |
| "step": 36800 |
| }, |
| { |
| "epoch": 34.81151214909177, |
| "grad_norm": 0.7662982940673828, |
| "learning_rate": 7.953169177444309e-05, |
| "loss": 2.8838, |
| "step": 36900 |
| }, |
| { |
| "epoch": 34.90587402689314, |
| "grad_norm": 0.778233528137207, |
| "learning_rate": 7.939067459774405e-05, |
| "loss": 2.8886, |
| "step": 37000 |
| }, |
| { |
| "epoch": 34.90587402689314, |
| "eval_loss": 3.3882803916931152, |
| "eval_runtime": 89.9794, |
| "eval_samples_per_second": 167.494, |
| "eval_steps_per_second": 5.235, |
| "step": 37000 |
| }, |
| { |
| "epoch": 35.0, |
| "grad_norm": 0.8936963081359863, |
| "learning_rate": 7.924929930495018e-05, |
| "loss": 2.8903, |
| "step": 37100 |
| }, |
| { |
| "epoch": 35.09436187780137, |
| "grad_norm": 0.7600644826889038, |
| "learning_rate": 7.910756761867479e-05, |
| "loss": 2.8005, |
| "step": 37200 |
| }, |
| { |
| "epoch": 35.18872375560274, |
| "grad_norm": 0.7367326021194458, |
| "learning_rate": 7.896548126587374e-05, |
| "loss": 2.8147, |
| "step": 37300 |
| }, |
| { |
| "epoch": 35.283085633404106, |
| "grad_norm": 0.774819016456604, |
| "learning_rate": 7.882304197782443e-05, |
| "loss": 2.833, |
| "step": 37400 |
| }, |
| { |
| "epoch": 35.377447511205474, |
| "grad_norm": 0.7745602130889893, |
| "learning_rate": 7.86802514901046e-05, |
| "loss": 2.8376, |
| "step": 37500 |
| }, |
| { |
| "epoch": 35.47180938900684, |
| "grad_norm": 0.7619943022727966, |
| "learning_rate": 7.853711154257133e-05, |
| "loss": 2.8444, |
| "step": 37600 |
| }, |
| { |
| "epoch": 35.56617126680821, |
| "grad_norm": 0.7863638997077942, |
| "learning_rate": 7.839362387933965e-05, |
| "loss": 2.8523, |
| "step": 37700 |
| }, |
| { |
| "epoch": 35.66053314460958, |
| "grad_norm": 0.7846052646636963, |
| "learning_rate": 7.824979024876149e-05, |
| "loss": 2.8685, |
| "step": 37800 |
| }, |
| { |
| "epoch": 35.75489502241095, |
| "grad_norm": 0.7603549361228943, |
| "learning_rate": 7.810561240340424e-05, |
| "loss": 2.8682, |
| "step": 37900 |
| }, |
| { |
| "epoch": 35.84925690021232, |
| "grad_norm": 0.7679409384727478, |
| "learning_rate": 7.796109210002945e-05, |
| "loss": 2.8682, |
| "step": 38000 |
| }, |
| { |
| "epoch": 35.84925690021232, |
| "eval_loss": 3.392080068588257, |
| "eval_runtime": 89.9863, |
| "eval_samples_per_second": 167.481, |
| "eval_steps_per_second": 5.234, |
| "step": 38000 |
| }, |
| { |
| "epoch": 35.94361877801368, |
| "grad_norm": 0.7791533470153809, |
| "learning_rate": 7.781623109957139e-05, |
| "loss": 2.8776, |
| "step": 38100 |
| }, |
| { |
| "epoch": 36.03774475112055, |
| "grad_norm": 0.7758477926254272, |
| "learning_rate": 7.767103116711566e-05, |
| "loss": 2.8462, |
| "step": 38200 |
| }, |
| { |
| "epoch": 36.13210662892192, |
| "grad_norm": 0.7758071422576904, |
| "learning_rate": 7.752549407187761e-05, |
| "loss": 2.7986, |
| "step": 38300 |
| }, |
| { |
| "epoch": 36.226468506723286, |
| "grad_norm": 0.7871308922767639, |
| "learning_rate": 7.73796215871808e-05, |
| "loss": 2.8106, |
| "step": 38400 |
| }, |
| { |
| "epoch": 36.320830384524655, |
| "grad_norm": 0.7900902628898621, |
| "learning_rate": 7.723341549043543e-05, |
| "loss": 2.8269, |
| "step": 38500 |
| }, |
| { |
| "epoch": 36.41519226232602, |
| "grad_norm": 0.770078182220459, |
| "learning_rate": 7.708687756311666e-05, |
| "loss": 2.8296, |
| "step": 38600 |
| }, |
| { |
| "epoch": 36.50955414012739, |
| "grad_norm": 0.7602365016937256, |
| "learning_rate": 7.694000959074288e-05, |
| "loss": 2.8369, |
| "step": 38700 |
| }, |
| { |
| "epoch": 36.60391601792875, |
| "grad_norm": 0.7826948165893555, |
| "learning_rate": 7.679281336285398e-05, |
| "loss": 2.8472, |
| "step": 38800 |
| }, |
| { |
| "epoch": 36.69827789573012, |
| "grad_norm": 0.7577803134918213, |
| "learning_rate": 7.664529067298954e-05, |
| "loss": 2.8498, |
| "step": 38900 |
| }, |
| { |
| "epoch": 36.79263977353149, |
| "grad_norm": 0.7747544050216675, |
| "learning_rate": 7.649744331866702e-05, |
| "loss": 2.8566, |
| "step": 39000 |
| }, |
| { |
| "epoch": 36.79263977353149, |
| "eval_loss": 3.398228883743286, |
| "eval_runtime": 89.9885, |
| "eval_samples_per_second": 167.477, |
| "eval_steps_per_second": 5.234, |
| "step": 39000 |
| }, |
| { |
| "epoch": 36.88700165133286, |
| "grad_norm": 0.7758987545967102, |
| "learning_rate": 7.634927310135972e-05, |
| "loss": 2.8555, |
| "step": 39100 |
| }, |
| { |
| "epoch": 36.98136352913423, |
| "grad_norm": 0.7727870941162109, |
| "learning_rate": 7.620078182647502e-05, |
| "loss": 2.8658, |
| "step": 39200 |
| }, |
| { |
| "epoch": 37.0754895022411, |
| "grad_norm": 0.7815468311309814, |
| "learning_rate": 7.605197130333222e-05, |
| "loss": 2.7972, |
| "step": 39300 |
| }, |
| { |
| "epoch": 37.169851380042466, |
| "grad_norm": 0.7656765580177307, |
| "learning_rate": 7.590284334514057e-05, |
| "loss": 2.7851, |
| "step": 39400 |
| }, |
| { |
| "epoch": 37.26421325784383, |
| "grad_norm": 0.7890497446060181, |
| "learning_rate": 7.575339976897722e-05, |
| "loss": 2.8012, |
| "step": 39500 |
| }, |
| { |
| "epoch": 37.3585751356452, |
| "grad_norm": 0.781253457069397, |
| "learning_rate": 7.560364239576496e-05, |
| "loss": 2.8092, |
| "step": 39600 |
| }, |
| { |
| "epoch": 37.452937013446565, |
| "grad_norm": 0.8040027022361755, |
| "learning_rate": 7.545357305025013e-05, |
| "loss": 2.8174, |
| "step": 39700 |
| }, |
| { |
| "epoch": 37.547298891247934, |
| "grad_norm": 0.7832446098327637, |
| "learning_rate": 7.530319356098033e-05, |
| "loss": 2.8258, |
| "step": 39800 |
| }, |
| { |
| "epoch": 37.6416607690493, |
| "grad_norm": 0.7919667363166809, |
| "learning_rate": 7.51525057602822e-05, |
| "loss": 2.8337, |
| "step": 39900 |
| }, |
| { |
| "epoch": 37.73602264685067, |
| "grad_norm": 0.768998920917511, |
| "learning_rate": 7.500151148423902e-05, |
| "loss": 2.8465, |
| "step": 40000 |
| }, |
| { |
| "epoch": 37.73602264685067, |
| "eval_loss": 3.401014804840088, |
| "eval_runtime": 89.9821, |
| "eval_samples_per_second": 167.489, |
| "eval_steps_per_second": 5.234, |
| "step": 40000 |
| }, |
| { |
| "epoch": 37.83038452465204, |
| "grad_norm": 0.7764397859573364, |
| "learning_rate": 7.485021257266841e-05, |
| "loss": 2.8427, |
| "step": 40100 |
| }, |
| { |
| "epoch": 37.92474640245341, |
| "grad_norm": 0.8028204441070557, |
| "learning_rate": 7.469861086909983e-05, |
| "loss": 2.8575, |
| "step": 40200 |
| }, |
| { |
| "epoch": 38.01887237556027, |
| "grad_norm": 0.7660278677940369, |
| "learning_rate": 7.454670822075225e-05, |
| "loss": 2.8456, |
| "step": 40300 |
| }, |
| { |
| "epoch": 38.11323425336164, |
| "grad_norm": 0.7917942404747009, |
| "learning_rate": 7.439450647851145e-05, |
| "loss": 2.7736, |
| "step": 40400 |
| }, |
| { |
| "epoch": 38.20759613116301, |
| "grad_norm": 0.7773132920265198, |
| "learning_rate": 7.424200749690763e-05, |
| "loss": 2.7823, |
| "step": 40500 |
| }, |
| { |
| "epoch": 38.30195800896438, |
| "grad_norm": 0.7813265323638916, |
| "learning_rate": 7.40892131340928e-05, |
| "loss": 2.7882, |
| "step": 40600 |
| }, |
| { |
| "epoch": 38.396319886765745, |
| "grad_norm": 0.7759552001953125, |
| "learning_rate": 7.393612525181801e-05, |
| "loss": 2.8094, |
| "step": 40700 |
| }, |
| { |
| "epoch": 38.490681764567114, |
| "grad_norm": 0.8028907179832458, |
| "learning_rate": 7.37827457154108e-05, |
| "loss": 2.8172, |
| "step": 40800 |
| }, |
| { |
| "epoch": 38.58504364236848, |
| "grad_norm": 0.7959277033805847, |
| "learning_rate": 7.362907639375244e-05, |
| "loss": 2.8127, |
| "step": 40900 |
| }, |
| { |
| "epoch": 38.67940552016985, |
| "grad_norm": 0.803551971912384, |
| "learning_rate": 7.347511915925512e-05, |
| "loss": 2.8238, |
| "step": 41000 |
| }, |
| { |
| "epoch": 38.67940552016985, |
| "eval_loss": 3.407241106033325, |
| "eval_runtime": 89.9361, |
| "eval_samples_per_second": 167.575, |
| "eval_steps_per_second": 5.237, |
| "step": 41000 |
| }, |
| { |
| "epoch": 38.77376739797122, |
| "grad_norm": 0.7966712117195129, |
| "learning_rate": 7.33208758878391e-05, |
| "loss": 2.8334, |
| "step": 41100 |
| }, |
| { |
| "epoch": 38.86812927577259, |
| "grad_norm": 0.7839232683181763, |
| "learning_rate": 7.316634845891003e-05, |
| "loss": 2.8408, |
| "step": 41200 |
| }, |
| { |
| "epoch": 38.96249115357396, |
| "grad_norm": 0.7712773084640503, |
| "learning_rate": 7.301153875533583e-05, |
| "loss": 2.8411, |
| "step": 41300 |
| }, |
| { |
| "epoch": 39.05661712668082, |
| "grad_norm": 0.819406270980835, |
| "learning_rate": 7.28564486634239e-05, |
| "loss": 2.7901, |
| "step": 41400 |
| }, |
| { |
| "epoch": 39.15097900448219, |
| "grad_norm": 0.7960841655731201, |
| "learning_rate": 7.270108007289807e-05, |
| "loss": 2.7619, |
| "step": 41500 |
| }, |
| { |
| "epoch": 39.24534088228356, |
| "grad_norm": 0.7944643497467041, |
| "learning_rate": 7.254543487687558e-05, |
| "loss": 2.7769, |
| "step": 41600 |
| }, |
| { |
| "epoch": 39.339702760084926, |
| "grad_norm": 0.7662415504455566, |
| "learning_rate": 7.2389514971844e-05, |
| "loss": 2.7831, |
| "step": 41700 |
| }, |
| { |
| "epoch": 39.434064637886294, |
| "grad_norm": 0.8135995864868164, |
| "learning_rate": 7.22333222576382e-05, |
| "loss": 2.7926, |
| "step": 41800 |
| }, |
| { |
| "epoch": 39.52842651568766, |
| "grad_norm": 0.7792801856994629, |
| "learning_rate": 7.207685863741711e-05, |
| "loss": 2.8114, |
| "step": 41900 |
| }, |
| { |
| "epoch": 39.62278839348903, |
| "grad_norm": 0.7813442945480347, |
| "learning_rate": 7.192012601764053e-05, |
| "loss": 2.8113, |
| "step": 42000 |
| }, |
| { |
| "epoch": 39.62278839348903, |
| "eval_loss": 3.411085367202759, |
| "eval_runtime": 89.9453, |
| "eval_samples_per_second": 167.557, |
| "eval_steps_per_second": 5.237, |
| "step": 42000 |
| }, |
| { |
| "epoch": 39.7171502712904, |
| "grad_norm": 0.7800647020339966, |
| "learning_rate": 7.1763126308046e-05, |
| "loss": 2.8153, |
| "step": 42100 |
| }, |
| { |
| "epoch": 39.81151214909177, |
| "grad_norm": 0.7858373522758484, |
| "learning_rate": 7.160586142162544e-05, |
| "loss": 2.8193, |
| "step": 42200 |
| }, |
| { |
| "epoch": 39.90587402689314, |
| "grad_norm": 0.7742355465888977, |
| "learning_rate": 7.144833327460186e-05, |
| "loss": 2.8264, |
| "step": 42300 |
| }, |
| { |
| "epoch": 40.0, |
| "grad_norm": 0.9822238087654114, |
| "learning_rate": 7.129054378640599e-05, |
| "loss": 2.8416, |
| "step": 42400 |
| }, |
| { |
| "epoch": 40.09436187780137, |
| "grad_norm": 0.8307941555976868, |
| "learning_rate": 7.1132494879653e-05, |
| "loss": 2.7412, |
| "step": 42500 |
| }, |
| { |
| "epoch": 40.18872375560274, |
| "grad_norm": 0.8017317652702332, |
| "learning_rate": 7.097418848011888e-05, |
| "loss": 2.7688, |
| "step": 42600 |
| }, |
| { |
| "epoch": 40.283085633404106, |
| "grad_norm": 0.7949444651603699, |
| "learning_rate": 7.081562651671719e-05, |
| "loss": 2.7702, |
| "step": 42700 |
| }, |
| { |
| "epoch": 40.377447511205474, |
| "grad_norm": 0.8076364994049072, |
| "learning_rate": 7.065681092147542e-05, |
| "loss": 2.781, |
| "step": 42800 |
| }, |
| { |
| "epoch": 40.47180938900684, |
| "grad_norm": 0.8113875985145569, |
| "learning_rate": 7.049774362951144e-05, |
| "loss": 2.7869, |
| "step": 42900 |
| }, |
| { |
| "epoch": 40.56617126680821, |
| "grad_norm": 0.7861994504928589, |
| "learning_rate": 7.033842657901005e-05, |
| "loss": 2.7959, |
| "step": 43000 |
| }, |
| { |
| "epoch": 40.56617126680821, |
| "eval_loss": 3.4162018299102783, |
| "eval_runtime": 89.9359, |
| "eval_samples_per_second": 167.575, |
| "eval_steps_per_second": 5.237, |
| "step": 43000 |
| }, |
| { |
| "epoch": 40.66053314460958, |
| "grad_norm": 0.7873150706291199, |
| "learning_rate": 7.017886171119917e-05, |
| "loss": 2.7957, |
| "step": 43100 |
| }, |
| { |
| "epoch": 40.75489502241095, |
| "grad_norm": 0.8050914406776428, |
| "learning_rate": 7.001905097032644e-05, |
| "loss": 2.8019, |
| "step": 43200 |
| }, |
| { |
| "epoch": 40.84925690021232, |
| "grad_norm": 0.8050124645233154, |
| "learning_rate": 6.985899630363526e-05, |
| "loss": 2.8164, |
| "step": 43300 |
| }, |
| { |
| "epoch": 40.94361877801368, |
| "grad_norm": 0.8335410952568054, |
| "learning_rate": 6.969869966134123e-05, |
| "loss": 2.8181, |
| "step": 43400 |
| }, |
| { |
| "epoch": 41.03774475112055, |
| "grad_norm": 0.8209425806999207, |
| "learning_rate": 6.953816299660834e-05, |
| "loss": 2.7879, |
| "step": 43500 |
| }, |
| { |
| "epoch": 41.13210662892192, |
| "grad_norm": 0.8106586337089539, |
| "learning_rate": 6.937738826552524e-05, |
| "loss": 2.7338, |
| "step": 43600 |
| }, |
| { |
| "epoch": 41.226468506723286, |
| "grad_norm": 0.8221563696861267, |
| "learning_rate": 6.921637742708123e-05, |
| "loss": 2.7508, |
| "step": 43700 |
| }, |
| { |
| "epoch": 41.320830384524655, |
| "grad_norm": 0.8021454811096191, |
| "learning_rate": 6.905513244314259e-05, |
| "loss": 2.7614, |
| "step": 43800 |
| }, |
| { |
| "epoch": 41.41519226232602, |
| "grad_norm": 0.809982419013977, |
| "learning_rate": 6.889365527842857e-05, |
| "loss": 2.7696, |
| "step": 43900 |
| }, |
| { |
| "epoch": 41.50955414012739, |
| "grad_norm": 0.8207118511199951, |
| "learning_rate": 6.873194790048746e-05, |
| "loss": 2.7854, |
| "step": 44000 |
| }, |
| { |
| "epoch": 41.50955414012739, |
| "eval_loss": 3.42000675201416, |
| "eval_runtime": 89.959, |
| "eval_samples_per_second": 167.532, |
| "eval_steps_per_second": 5.236, |
| "step": 44000 |
| }, |
| { |
| "epoch": 41.60391601792875, |
| "grad_norm": 0.8175562620162964, |
| "learning_rate": 6.857001227967263e-05, |
| "loss": 2.7889, |
| "step": 44100 |
| }, |
| { |
| "epoch": 41.69827789573012, |
| "grad_norm": 0.8078480958938599, |
| "learning_rate": 6.84078503891185e-05, |
| "loss": 2.7947, |
| "step": 44200 |
| }, |
| { |
| "epoch": 41.79263977353149, |
| "grad_norm": 0.8170962333679199, |
| "learning_rate": 6.824546420471653e-05, |
| "loss": 2.8019, |
| "step": 44300 |
| }, |
| { |
| "epoch": 41.88700165133286, |
| "grad_norm": 0.8198286294937134, |
| "learning_rate": 6.808285570509117e-05, |
| "loss": 2.8085, |
| "step": 44400 |
| }, |
| { |
| "epoch": 41.98136352913423, |
| "grad_norm": 0.8155168890953064, |
| "learning_rate": 6.792002687157564e-05, |
| "loss": 2.8083, |
| "step": 44500 |
| }, |
| { |
| "epoch": 42.0754895022411, |
| "grad_norm": 0.8438757061958313, |
| "learning_rate": 6.775697968818788e-05, |
| "loss": 2.7438, |
| "step": 44600 |
| }, |
| { |
| "epoch": 42.169851380042466, |
| "grad_norm": 0.8201486468315125, |
| "learning_rate": 6.759371614160639e-05, |
| "loss": 2.7352, |
| "step": 44700 |
| }, |
| { |
| "epoch": 42.26421325784383, |
| "grad_norm": 0.8068023324012756, |
| "learning_rate": 6.743023822114596e-05, |
| "loss": 2.7432, |
| "step": 44800 |
| }, |
| { |
| "epoch": 42.3585751356452, |
| "grad_norm": 0.8397886157035828, |
| "learning_rate": 6.726654791873343e-05, |
| "loss": 2.7511, |
| "step": 44900 |
| }, |
| { |
| "epoch": 42.452937013446565, |
| "grad_norm": 0.8199644684791565, |
| "learning_rate": 6.710264722888352e-05, |
| "loss": 2.7714, |
| "step": 45000 |
| }, |
| { |
| "epoch": 42.452937013446565, |
| "eval_loss": 3.426100015640259, |
| "eval_runtime": 89.9591, |
| "eval_samples_per_second": 167.532, |
| "eval_steps_per_second": 5.236, |
| "step": 45000 |
| }, |
| { |
| "epoch": 42.547298891247934, |
| "grad_norm": 0.8234971761703491, |
| "learning_rate": 6.693853814867439e-05, |
| "loss": 2.7713, |
| "step": 45100 |
| }, |
| { |
| "epoch": 42.6416607690493, |
| "grad_norm": 0.8292032480239868, |
| "learning_rate": 6.677422267772338e-05, |
| "loss": 2.7787, |
| "step": 45200 |
| }, |
| { |
| "epoch": 42.73602264685067, |
| "grad_norm": 0.8246825933456421, |
| "learning_rate": 6.660970281816269e-05, |
| "loss": 2.7894, |
| "step": 45300 |
| }, |
| { |
| "epoch": 42.83038452465204, |
| "grad_norm": 0.8253381252288818, |
| "learning_rate": 6.644498057461485e-05, |
| "loss": 2.7938, |
| "step": 45400 |
| }, |
| { |
| "epoch": 42.92474640245341, |
| "grad_norm": 0.8081493377685547, |
| "learning_rate": 6.628005795416842e-05, |
| "loss": 2.7931, |
| "step": 45500 |
| }, |
| { |
| "epoch": 43.01887237556027, |
| "grad_norm": 0.8368313908576965, |
| "learning_rate": 6.611493696635351e-05, |
| "loss": 2.7819, |
| "step": 45600 |
| }, |
| { |
| "epoch": 43.11323425336164, |
| "grad_norm": 0.8381558656692505, |
| "learning_rate": 6.594961962311722e-05, |
| "loss": 2.7094, |
| "step": 45700 |
| }, |
| { |
| "epoch": 43.20759613116301, |
| "grad_norm": 0.8160618543624878, |
| "learning_rate": 6.578410793879921e-05, |
| "loss": 2.7349, |
| "step": 45800 |
| }, |
| { |
| "epoch": 43.30195800896438, |
| "grad_norm": 0.8383694291114807, |
| "learning_rate": 6.561840393010713e-05, |
| "loss": 2.7474, |
| "step": 45900 |
| }, |
| { |
| "epoch": 43.396319886765745, |
| "grad_norm": 0.8452011346817017, |
| "learning_rate": 6.545250961609202e-05, |
| "loss": 2.7521, |
| "step": 46000 |
| }, |
| { |
| "epoch": 43.396319886765745, |
| "eval_loss": 3.429819107055664, |
| "eval_runtime": 89.93, |
| "eval_samples_per_second": 167.586, |
| "eval_steps_per_second": 5.237, |
| "step": 46000 |
| }, |
| { |
| "epoch": 43.490681764567114, |
| "grad_norm": 0.8400077223777771, |
| "learning_rate": 6.528642701812378e-05, |
| "loss": 2.7534, |
| "step": 46100 |
| }, |
| { |
| "epoch": 43.58504364236848, |
| "grad_norm": 0.8386735320091248, |
| "learning_rate": 6.51201581598664e-05, |
| "loss": 2.7636, |
| "step": 46200 |
| }, |
| { |
| "epoch": 43.67940552016985, |
| "grad_norm": 0.8181544542312622, |
| "learning_rate": 6.49537050672535e-05, |
| "loss": 2.7754, |
| "step": 46300 |
| }, |
| { |
| "epoch": 43.77376739797122, |
| "grad_norm": 0.8343673944473267, |
| "learning_rate": 6.478706976846344e-05, |
| "loss": 2.7799, |
| "step": 46400 |
| }, |
| { |
| "epoch": 43.86812927577259, |
| "grad_norm": 0.8236196041107178, |
| "learning_rate": 6.462025429389475e-05, |
| "loss": 2.782, |
| "step": 46500 |
| }, |
| { |
| "epoch": 43.96249115357396, |
| "grad_norm": 0.8244067430496216, |
| "learning_rate": 6.445326067614139e-05, |
| "loss": 2.7854, |
| "step": 46600 |
| }, |
| { |
| "epoch": 44.05661712668082, |
| "grad_norm": 0.8476299047470093, |
| "learning_rate": 6.428609094996785e-05, |
| "loss": 2.7343, |
| "step": 46700 |
| }, |
| { |
| "epoch": 44.15097900448219, |
| "grad_norm": 0.8473312854766846, |
| "learning_rate": 6.411874715228447e-05, |
| "loss": 2.7108, |
| "step": 46800 |
| }, |
| { |
| "epoch": 44.24534088228356, |
| "grad_norm": 0.8587284684181213, |
| "learning_rate": 6.395123132212268e-05, |
| "loss": 2.7221, |
| "step": 46900 |
| }, |
| { |
| "epoch": 44.339702760084926, |
| "grad_norm": 0.8349334597587585, |
| "learning_rate": 6.378354550060997e-05, |
| "loss": 2.7337, |
| "step": 47000 |
| }, |
| { |
| "epoch": 44.339702760084926, |
| "eval_loss": 3.434607744216919, |
| "eval_runtime": 89.9582, |
| "eval_samples_per_second": 167.533, |
| "eval_steps_per_second": 5.236, |
| "step": 47000 |
| }, |
| { |
| "epoch": 44.434064637886294, |
| "grad_norm": 0.8338183760643005, |
| "learning_rate": 6.361569173094515e-05, |
| "loss": 2.7416, |
| "step": 47100 |
| }, |
| { |
| "epoch": 44.52842651568766, |
| "grad_norm": 0.827136754989624, |
| "learning_rate": 6.344767205837345e-05, |
| "loss": 2.7536, |
| "step": 47200 |
| }, |
| { |
| "epoch": 44.62278839348903, |
| "grad_norm": 0.8238788843154907, |
| "learning_rate": 6.327948853016153e-05, |
| "loss": 2.7555, |
| "step": 47300 |
| }, |
| { |
| "epoch": 44.7171502712904, |
| "grad_norm": 0.8395233154296875, |
| "learning_rate": 6.311114319557261e-05, |
| "loss": 2.767, |
| "step": 47400 |
| }, |
| { |
| "epoch": 44.81151214909177, |
| "grad_norm": 0.8498242497444153, |
| "learning_rate": 6.29426381058415e-05, |
| "loss": 2.7715, |
| "step": 47500 |
| }, |
| { |
| "epoch": 44.90587402689314, |
| "grad_norm": 0.8152209520339966, |
| "learning_rate": 6.277397531414951e-05, |
| "loss": 2.7745, |
| "step": 47600 |
| }, |
| { |
| "epoch": 45.0, |
| "grad_norm": 1.0275278091430664, |
| "learning_rate": 6.260515687559953e-05, |
| "loss": 2.7807, |
| "step": 47700 |
| }, |
| { |
| "epoch": 45.09436187780137, |
| "grad_norm": 0.8403949737548828, |
| "learning_rate": 6.243618484719098e-05, |
| "loss": 2.6994, |
| "step": 47800 |
| }, |
| { |
| "epoch": 45.18872375560274, |
| "grad_norm": 0.8385687470436096, |
| "learning_rate": 6.226706128779468e-05, |
| "loss": 2.7075, |
| "step": 47900 |
| }, |
| { |
| "epoch": 45.283085633404106, |
| "grad_norm": 0.8589709401130676, |
| "learning_rate": 6.209778825812784e-05, |
| "loss": 2.7193, |
| "step": 48000 |
| }, |
| { |
| "epoch": 45.283085633404106, |
| "eval_loss": 3.4386990070343018, |
| "eval_runtime": 89.9503, |
| "eval_samples_per_second": 167.548, |
| "eval_steps_per_second": 5.236, |
| "step": 48000 |
| }, |
| { |
| "epoch": 45.377447511205474, |
| "grad_norm": 0.8279402256011963, |
| "learning_rate": 6.19283678207289e-05, |
| "loss": 2.7255, |
| "step": 48100 |
| }, |
| { |
| "epoch": 45.47180938900684, |
| "grad_norm": 0.8281998634338379, |
| "learning_rate": 6.175880203993243e-05, |
| "loss": 2.7342, |
| "step": 48200 |
| }, |
| { |
| "epoch": 45.56617126680821, |
| "grad_norm": 0.8443288803100586, |
| "learning_rate": 6.158909298184395e-05, |
| "loss": 2.7397, |
| "step": 48300 |
| }, |
| { |
| "epoch": 45.66053314460958, |
| "grad_norm": 0.8450830578804016, |
| "learning_rate": 6.14192427143148e-05, |
| "loss": 2.7542, |
| "step": 48400 |
| }, |
| { |
| "epoch": 45.75489502241095, |
| "grad_norm": 0.8576694130897522, |
| "learning_rate": 6.124925330691685e-05, |
| "loss": 2.7591, |
| "step": 48500 |
| }, |
| { |
| "epoch": 45.84925690021232, |
| "grad_norm": 0.8564294576644897, |
| "learning_rate": 6.107912683091741e-05, |
| "loss": 2.7539, |
| "step": 48600 |
| }, |
| { |
| "epoch": 45.94361877801368, |
| "grad_norm": 0.8432328104972839, |
| "learning_rate": 6.0908865359253886e-05, |
| "loss": 2.7737, |
| "step": 48700 |
| }, |
| { |
| "epoch": 46.03774475112055, |
| "grad_norm": 0.8429199457168579, |
| "learning_rate": 6.07384709665086e-05, |
| "loss": 2.7328, |
| "step": 48800 |
| }, |
| { |
| "epoch": 46.13210662892192, |
| "grad_norm": 0.8714138269424438, |
| "learning_rate": 6.0567945728883435e-05, |
| "loss": 2.6807, |
| "step": 48900 |
| }, |
| { |
| "epoch": 46.226468506723286, |
| "grad_norm": 0.8653603196144104, |
| "learning_rate": 6.03972917241746e-05, |
| "loss": 2.7003, |
| "step": 49000 |
| }, |
| { |
| "epoch": 46.226468506723286, |
| "eval_loss": 3.442552089691162, |
| "eval_runtime": 89.9431, |
| "eval_samples_per_second": 167.561, |
| "eval_steps_per_second": 5.237, |
| "step": 49000 |
| }, |
| { |
| "epoch": 46.320830384524655, |
| "grad_norm": 0.8579922914505005, |
| "learning_rate": 6.02265110317473e-05, |
| "loss": 2.7197, |
| "step": 49100 |
| }, |
| { |
| "epoch": 46.41519226232602, |
| "grad_norm": 0.8620674014091492, |
| "learning_rate": 6.005560573251037e-05, |
| "loss": 2.7202, |
| "step": 49200 |
| }, |
| { |
| "epoch": 46.50955414012739, |
| "grad_norm": 0.8350743651390076, |
| "learning_rate": 5.9884577908890926e-05, |
| "loss": 2.7325, |
| "step": 49300 |
| }, |
| { |
| "epoch": 46.60391601792875, |
| "grad_norm": 0.8624823093414307, |
| "learning_rate": 5.971342964480906e-05, |
| "loss": 2.741, |
| "step": 49400 |
| }, |
| { |
| "epoch": 46.69827789573012, |
| "grad_norm": 0.8552801012992859, |
| "learning_rate": 5.954216302565235e-05, |
| "loss": 2.7404, |
| "step": 49500 |
| }, |
| { |
| "epoch": 46.79263977353149, |
| "grad_norm": 0.8489238619804382, |
| "learning_rate": 5.9370780138250484e-05, |
| "loss": 2.7522, |
| "step": 49600 |
| }, |
| { |
| "epoch": 46.88700165133286, |
| "grad_norm": 0.8348067402839661, |
| "learning_rate": 5.9199283070849875e-05, |
| "loss": 2.7549, |
| "step": 49700 |
| }, |
| { |
| "epoch": 46.98136352913423, |
| "grad_norm": 0.8578048348426819, |
| "learning_rate": 5.9027673913088165e-05, |
| "loss": 2.7582, |
| "step": 49800 |
| }, |
| { |
| "epoch": 47.0754895022411, |
| "grad_norm": 0.8323913812637329, |
| "learning_rate": 5.885595475596878e-05, |
| "loss": 2.6933, |
| "step": 49900 |
| }, |
| { |
| "epoch": 47.169851380042466, |
| "grad_norm": 0.8476216793060303, |
| "learning_rate": 5.868412769183547e-05, |
| "loss": 2.6913, |
| "step": 50000 |
| }, |
| { |
| "epoch": 47.169851380042466, |
| "eval_loss": 3.446800470352173, |
| "eval_runtime": 89.9173, |
| "eval_samples_per_second": 167.61, |
| "eval_steps_per_second": 5.238, |
| "step": 50000 |
| }, |
| { |
| "epoch": 47.26421325784383, |
| "grad_norm": 0.8619811534881592, |
| "learning_rate": 5.8512194814346775e-05, |
| "loss": 2.6924, |
| "step": 50100 |
| }, |
| { |
| "epoch": 47.3585751356452, |
| "grad_norm": 0.8556803464889526, |
| "learning_rate": 5.8340158218450555e-05, |
| "loss": 2.7015, |
| "step": 50200 |
| }, |
| { |
| "epoch": 47.452937013446565, |
| "grad_norm": 0.8642596006393433, |
| "learning_rate": 5.8168020000358435e-05, |
| "loss": 2.7113, |
| "step": 50300 |
| }, |
| { |
| "epoch": 47.547298891247934, |
| "grad_norm": 0.8575654029846191, |
| "learning_rate": 5.799578225752028e-05, |
| "loss": 2.7235, |
| "step": 50400 |
| }, |
| { |
| "epoch": 47.6416607690493, |
| "grad_norm": 0.8716620802879333, |
| "learning_rate": 5.7823447088598624e-05, |
| "loss": 2.728, |
| "step": 50500 |
| }, |
| { |
| "epoch": 47.73602264685067, |
| "grad_norm": 0.8643260598182678, |
| "learning_rate": 5.765101659344313e-05, |
| "loss": 2.7372, |
| "step": 50600 |
| }, |
| { |
| "epoch": 47.83038452465204, |
| "grad_norm": 0.8748656511306763, |
| "learning_rate": 5.747849287306496e-05, |
| "loss": 2.7513, |
| "step": 50700 |
| }, |
| { |
| "epoch": 47.92474640245341, |
| "grad_norm": 0.8508985042572021, |
| "learning_rate": 5.730587802961119e-05, |
| "loss": 2.7486, |
| "step": 50800 |
| }, |
| { |
| "epoch": 48.01887237556027, |
| "grad_norm": 0.838749349117279, |
| "learning_rate": 5.7133174166339245e-05, |
| "loss": 2.7313, |
| "step": 50900 |
| }, |
| { |
| "epoch": 48.11323425336164, |
| "grad_norm": 0.9001404047012329, |
| "learning_rate": 5.696038338759117e-05, |
| "loss": 2.6701, |
| "step": 51000 |
| }, |
| { |
| "epoch": 48.11323425336164, |
| "eval_loss": 3.448244571685791, |
| "eval_runtime": 89.967, |
| "eval_samples_per_second": 167.517, |
| "eval_steps_per_second": 5.235, |
| "step": 51000 |
| }, |
| { |
| "epoch": 48.20759613116301, |
| "grad_norm": 0.8677713871002197, |
| "learning_rate": 5.678750779876807e-05, |
| "loss": 2.6845, |
| "step": 51100 |
| }, |
| { |
| "epoch": 48.30195800896438, |
| "grad_norm": 0.8787952065467834, |
| "learning_rate": 5.661454950630445e-05, |
| "loss": 2.6908, |
| "step": 51200 |
| }, |
| { |
| "epoch": 48.396319886765745, |
| "grad_norm": 0.8668954968452454, |
| "learning_rate": 5.6441510617642526e-05, |
| "loss": 2.7085, |
| "step": 51300 |
| }, |
| { |
| "epoch": 48.490681764567114, |
| "grad_norm": 0.8844050765037537, |
| "learning_rate": 5.626839324120654e-05, |
| "loss": 2.7093, |
| "step": 51400 |
| }, |
| { |
| "epoch": 48.58504364236848, |
| "grad_norm": 0.8611410856246948, |
| "learning_rate": 5.609519948637708e-05, |
| "loss": 2.7179, |
| "step": 51500 |
| }, |
| { |
| "epoch": 48.67940552016985, |
| "grad_norm": 0.8534366488456726, |
| "learning_rate": 5.592193146346543e-05, |
| "loss": 2.7177, |
| "step": 51600 |
| }, |
| { |
| "epoch": 48.77376739797122, |
| "grad_norm": 0.8651390671730042, |
| "learning_rate": 5.5748591283687725e-05, |
| "loss": 2.729, |
| "step": 51700 |
| }, |
| { |
| "epoch": 48.86812927577259, |
| "grad_norm": 0.882972776889801, |
| "learning_rate": 5.557518105913939e-05, |
| "loss": 2.7283, |
| "step": 51800 |
| }, |
| { |
| "epoch": 48.96249115357396, |
| "grad_norm": 0.8630602359771729, |
| "learning_rate": 5.540170290276927e-05, |
| "loss": 2.7332, |
| "step": 51900 |
| }, |
| { |
| "epoch": 49.05661712668082, |
| "grad_norm": 0.8470357060432434, |
| "learning_rate": 5.5228158928353944e-05, |
| "loss": 2.6902, |
| "step": 52000 |
| }, |
| { |
| "epoch": 49.05661712668082, |
| "eval_loss": 3.449252128601074, |
| "eval_runtime": 89.9538, |
| "eval_samples_per_second": 167.542, |
| "eval_steps_per_second": 5.236, |
| "step": 52000 |
| }, |
| { |
| "epoch": 49.15097900448219, |
| "grad_norm": 0.8805975317955017, |
| "learning_rate": 5.5054551250471985e-05, |
| "loss": 2.6685, |
| "step": 52100 |
| }, |
| { |
| "epoch": 49.24534088228356, |
| "grad_norm": 0.8662949800491333, |
| "learning_rate": 5.488088198447816e-05, |
| "loss": 2.6802, |
| "step": 52200 |
| }, |
| { |
| "epoch": 49.339702760084926, |
| "grad_norm": 0.8698300719261169, |
| "learning_rate": 5.470715324647766e-05, |
| "loss": 2.6919, |
| "step": 52300 |
| }, |
| { |
| "epoch": 49.434064637886294, |
| "grad_norm": 0.8933809995651245, |
| "learning_rate": 5.453336715330034e-05, |
| "loss": 2.702, |
| "step": 52400 |
| }, |
| { |
| "epoch": 49.52842651568766, |
| "grad_norm": 0.8648493885993958, |
| "learning_rate": 5.43595258224749e-05, |
| "loss": 2.7047, |
| "step": 52500 |
| }, |
| { |
| "epoch": 49.62278839348903, |
| "grad_norm": 0.87412428855896, |
| "learning_rate": 5.4185631372203106e-05, |
| "loss": 2.7082, |
| "step": 52600 |
| }, |
| { |
| "epoch": 49.7171502712904, |
| "grad_norm": 0.8642207980155945, |
| "learning_rate": 5.401168592133394e-05, |
| "loss": 2.7095, |
| "step": 52700 |
| }, |
| { |
| "epoch": 49.81151214909177, |
| "grad_norm": 0.8690053224563599, |
| "learning_rate": 5.3837691589337833e-05, |
| "loss": 2.7143, |
| "step": 52800 |
| }, |
| { |
| "epoch": 49.90587402689314, |
| "grad_norm": 0.866743803024292, |
| "learning_rate": 5.3663650496280814e-05, |
| "loss": 2.7287, |
| "step": 52900 |
| }, |
| { |
| "epoch": 50.0, |
| "grad_norm": 1.0572550296783447, |
| "learning_rate": 5.348956476279867e-05, |
| "loss": 2.7342, |
| "step": 53000 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_loss": 3.4417190551757812, |
| "eval_runtime": 89.9736, |
| "eval_samples_per_second": 167.505, |
| "eval_steps_per_second": 5.235, |
| "step": 53000 |
| }, |
| { |
| "epoch": 50.09436187780137, |
| "grad_norm": 0.8832667469978333, |
| "learning_rate": 5.331543651007114e-05, |
| "loss": 2.6499, |
| "step": 53100 |
| }, |
| { |
| "epoch": 50.18872375560274, |
| "grad_norm": 0.8885585069656372, |
| "learning_rate": 5.314126785979601e-05, |
| "loss": 2.6571, |
| "step": 53200 |
| }, |
| { |
| "epoch": 50.283085633404106, |
| "grad_norm": 0.8922543525695801, |
| "learning_rate": 5.296706093416334e-05, |
| "loss": 2.6718, |
| "step": 53300 |
| }, |
| { |
| "epoch": 50.377447511205474, |
| "grad_norm": 0.8803562521934509, |
| "learning_rate": 5.2792817855829534e-05, |
| "loss": 2.6878, |
| "step": 53400 |
| }, |
| { |
| "epoch": 50.47180938900684, |
| "grad_norm": 0.8851462602615356, |
| "learning_rate": 5.261854074789151e-05, |
| "loss": 2.6938, |
| "step": 53500 |
| }, |
| { |
| "epoch": 50.56617126680821, |
| "grad_norm": 0.8722124099731445, |
| "learning_rate": 5.244423173386084e-05, |
| "loss": 2.6999, |
| "step": 53600 |
| }, |
| { |
| "epoch": 50.66053314460958, |
| "grad_norm": 0.8915928602218628, |
| "learning_rate": 5.226989293763784e-05, |
| "loss": 2.7019, |
| "step": 53700 |
| }, |
| { |
| "epoch": 50.75489502241095, |
| "grad_norm": 0.8628272414207458, |
| "learning_rate": 5.2095526483485736e-05, |
| "loss": 2.7118, |
| "step": 53800 |
| }, |
| { |
| "epoch": 50.84925690021232, |
| "grad_norm": 0.8903440833091736, |
| "learning_rate": 5.192113449600473e-05, |
| "loss": 2.7135, |
| "step": 53900 |
| }, |
| { |
| "epoch": 50.94361877801368, |
| "grad_norm": 0.8711543083190918, |
| "learning_rate": 5.1746719100106164e-05, |
| "loss": 2.7131, |
| "step": 54000 |
| }, |
| { |
| "epoch": 50.94361877801368, |
| "eval_loss": 3.4459497928619385, |
| "eval_runtime": 89.9187, |
| "eval_samples_per_second": 167.607, |
| "eval_steps_per_second": 5.238, |
| "step": 54000 |
| }, |
| { |
| "epoch": 51.03774475112055, |
| "grad_norm": 0.9009747505187988, |
| "learning_rate": 5.1572282420986615e-05, |
| "loss": 2.6861, |
| "step": 54100 |
| }, |
| { |
| "epoch": 51.13210662892192, |
| "grad_norm": 0.8729134798049927, |
| "learning_rate": 5.139782658410193e-05, |
| "loss": 2.6493, |
| "step": 54200 |
| }, |
| { |
| "epoch": 51.226468506723286, |
| "grad_norm": 0.8997048139572144, |
| "learning_rate": 5.122335371514144e-05, |
| "loss": 2.6571, |
| "step": 54300 |
| }, |
| { |
| "epoch": 51.320830384524655, |
| "grad_norm": 0.8706479668617249, |
| "learning_rate": 5.1048865940002e-05, |
| "loss": 2.6727, |
| "step": 54400 |
| }, |
| { |
| "epoch": 51.41519226232602, |
| "grad_norm": 0.889145016670227, |
| "learning_rate": 5.0874365384762093e-05, |
| "loss": 2.6779, |
| "step": 54500 |
| }, |
| { |
| "epoch": 51.50955414012739, |
| "grad_norm": 0.8947675824165344, |
| "learning_rate": 5.069985417565589e-05, |
| "loss": 2.6921, |
| "step": 54600 |
| }, |
| { |
| "epoch": 51.60391601792875, |
| "grad_norm": 0.8841854929924011, |
| "learning_rate": 5.0525334439047435e-05, |
| "loss": 2.6914, |
| "step": 54700 |
| }, |
| { |
| "epoch": 51.69827789573012, |
| "grad_norm": 0.8952610492706299, |
| "learning_rate": 5.035080830140462e-05, |
| "loss": 2.7012, |
| "step": 54800 |
| }, |
| { |
| "epoch": 51.79263977353149, |
| "grad_norm": 0.9060307741165161, |
| "learning_rate": 5.017627788927336e-05, |
| "loss": 2.6982, |
| "step": 54900 |
| }, |
| { |
| "epoch": 51.88700165133286, |
| "grad_norm": 0.877874493598938, |
| "learning_rate": 5.000174532925165e-05, |
| "loss": 2.7039, |
| "step": 55000 |
| }, |
| { |
| "epoch": 51.88700165133286, |
| "eval_loss": 3.4530842304229736, |
| "eval_runtime": 89.9152, |
| "eval_samples_per_second": 167.614, |
| "eval_steps_per_second": 5.238, |
| "step": 55000 |
| }, |
| { |
| "epoch": 51.98136352913423, |
| "grad_norm": 0.8987739086151123, |
| "learning_rate": 4.982721274796365e-05, |
| "loss": 2.7111, |
| "step": 55100 |
| }, |
| { |
| "epoch": 52.0754895022411, |
| "grad_norm": 0.8870150446891785, |
| "learning_rate": 4.9652682272033776e-05, |
| "loss": 2.6464, |
| "step": 55200 |
| }, |
| { |
| "epoch": 52.169851380042466, |
| "grad_norm": 0.8857084512710571, |
| "learning_rate": 4.947815602806083e-05, |
| "loss": 2.6467, |
| "step": 55300 |
| }, |
| { |
| "epoch": 52.26421325784383, |
| "grad_norm": 0.8918935656547546, |
| "learning_rate": 4.9303636142592005e-05, |
| "loss": 2.6567, |
| "step": 55400 |
| }, |
| { |
| "epoch": 52.3585751356452, |
| "grad_norm": 0.9171741604804993, |
| "learning_rate": 4.912912474209699e-05, |
| "loss": 2.6714, |
| "step": 55500 |
| }, |
| { |
| "epoch": 52.452937013446565, |
| "grad_norm": 0.8810116052627563, |
| "learning_rate": 4.8954623952942196e-05, |
| "loss": 2.6652, |
| "step": 55600 |
| }, |
| { |
| "epoch": 52.547298891247934, |
| "grad_norm": 0.8983612656593323, |
| "learning_rate": 4.878013590136461e-05, |
| "loss": 2.6773, |
| "step": 55700 |
| }, |
| { |
| "epoch": 52.6416607690493, |
| "grad_norm": 0.8998178839683533, |
| "learning_rate": 4.860566271344612e-05, |
| "loss": 2.6851, |
| "step": 55800 |
| }, |
| { |
| "epoch": 52.73602264685067, |
| "grad_norm": 0.893043577671051, |
| "learning_rate": 4.8431206515087425e-05, |
| "loss": 2.6827, |
| "step": 55900 |
| }, |
| { |
| "epoch": 52.83038452465204, |
| "grad_norm": 0.8974281549453735, |
| "learning_rate": 4.825676943198228e-05, |
| "loss": 2.697, |
| "step": 56000 |
| }, |
| { |
| "epoch": 52.83038452465204, |
| "eval_loss": 3.4545717239379883, |
| "eval_runtime": 89.9579, |
| "eval_samples_per_second": 167.534, |
| "eval_steps_per_second": 5.236, |
| "step": 56000 |
| }, |
| { |
| "epoch": 52.92474640245341, |
| "grad_norm": 0.9062679409980774, |
| "learning_rate": 4.808235358959146e-05, |
| "loss": 2.7024, |
| "step": 56100 |
| }, |
| { |
| "epoch": 53.01887237556027, |
| "grad_norm": 0.8707714676856995, |
| "learning_rate": 4.790796111311697e-05, |
| "loss": 2.6894, |
| "step": 56200 |
| }, |
| { |
| "epoch": 53.11323425336164, |
| "grad_norm": 0.9064081907272339, |
| "learning_rate": 4.773359412747614e-05, |
| "loss": 2.6359, |
| "step": 56300 |
| }, |
| { |
| "epoch": 53.20759613116301, |
| "grad_norm": 0.9040234684944153, |
| "learning_rate": 4.75592547572756e-05, |
| "loss": 2.6381, |
| "step": 56400 |
| }, |
| { |
| "epoch": 53.30195800896438, |
| "grad_norm": 0.9168184399604797, |
| "learning_rate": 4.738494512678562e-05, |
| "loss": 2.6504, |
| "step": 56500 |
| }, |
| { |
| "epoch": 53.396319886765745, |
| "grad_norm": 0.9072760343551636, |
| "learning_rate": 4.7210667359913984e-05, |
| "loss": 2.6615, |
| "step": 56600 |
| }, |
| { |
| "epoch": 53.490681764567114, |
| "grad_norm": 0.9487440586090088, |
| "learning_rate": 4.7036423580180325e-05, |
| "loss": 2.6672, |
| "step": 56700 |
| }, |
| { |
| "epoch": 53.58504364236848, |
| "grad_norm": 0.9152506589889526, |
| "learning_rate": 4.6862215910690103e-05, |
| "loss": 2.6724, |
| "step": 56800 |
| }, |
| { |
| "epoch": 53.67940552016985, |
| "grad_norm": 0.911768913269043, |
| "learning_rate": 4.668804647410876e-05, |
| "loss": 2.6786, |
| "step": 56900 |
| }, |
| { |
| "epoch": 53.77376739797122, |
| "grad_norm": 0.8736267685890198, |
| "learning_rate": 4.6513917392635945e-05, |
| "loss": 2.6786, |
| "step": 57000 |
| }, |
| { |
| "epoch": 53.77376739797122, |
| "eval_loss": 3.4597623348236084, |
| "eval_runtime": 89.9335, |
| "eval_samples_per_second": 167.579, |
| "eval_steps_per_second": 5.237, |
| "step": 57000 |
| }, |
| { |
| "epoch": 53.86812927577259, |
| "grad_norm": 0.8981320858001709, |
| "learning_rate": 4.6339830787979574e-05, |
| "loss": 2.6876, |
| "step": 57100 |
| }, |
| { |
| "epoch": 53.96249115357396, |
| "grad_norm": 0.9124193787574768, |
| "learning_rate": 4.616578878132996e-05, |
| "loss": 2.6932, |
| "step": 57200 |
| }, |
| { |
| "epoch": 54.05661712668082, |
| "grad_norm": 0.9008533954620361, |
| "learning_rate": 4.5991793493334035e-05, |
| "loss": 2.6504, |
| "step": 57300 |
| }, |
| { |
| "epoch": 54.15097900448219, |
| "grad_norm": 0.9169653058052063, |
| "learning_rate": 4.58178470440695e-05, |
| "loss": 2.6309, |
| "step": 57400 |
| }, |
| { |
| "epoch": 54.24534088228356, |
| "grad_norm": 0.8994284868240356, |
| "learning_rate": 4.564395155301891e-05, |
| "loss": 2.6393, |
| "step": 57500 |
| }, |
| { |
| "epoch": 54.339702760084926, |
| "grad_norm": 0.9167579412460327, |
| "learning_rate": 4.5470109139043984e-05, |
| "loss": 2.6483, |
| "step": 57600 |
| }, |
| { |
| "epoch": 54.434064637886294, |
| "grad_norm": 0.9248393774032593, |
| "learning_rate": 4.529632192035965e-05, |
| "loss": 2.6541, |
| "step": 57700 |
| }, |
| { |
| "epoch": 54.52842651568766, |
| "grad_norm": 0.9072225093841553, |
| "learning_rate": 4.51225920145083e-05, |
| "loss": 2.663, |
| "step": 57800 |
| }, |
| { |
| "epoch": 54.62278839348903, |
| "grad_norm": 0.8942064642906189, |
| "learning_rate": 4.494892153833406e-05, |
| "loss": 2.6634, |
| "step": 57900 |
| }, |
| { |
| "epoch": 54.7171502712904, |
| "grad_norm": 0.8993781805038452, |
| "learning_rate": 4.477531260795683e-05, |
| "loss": 2.6687, |
| "step": 58000 |
| }, |
| { |
| "epoch": 54.7171502712904, |
| "eval_loss": 3.4639813899993896, |
| "eval_runtime": 89.93, |
| "eval_samples_per_second": 167.586, |
| "eval_steps_per_second": 5.237, |
| "step": 58000 |
| }, |
| { |
| "epoch": 54.81151214909177, |
| "grad_norm": 0.9311773180961609, |
| "learning_rate": 4.460176733874668e-05, |
| "loss": 2.673, |
| "step": 58100 |
| }, |
| { |
| "epoch": 54.90587402689314, |
| "grad_norm": 0.8964095115661621, |
| "learning_rate": 4.442828784529791e-05, |
| "loss": 2.6861, |
| "step": 58200 |
| }, |
| { |
| "epoch": 55.0, |
| "grad_norm": 1.1009083986282349, |
| "learning_rate": 4.4254876241403444e-05, |
| "loss": 2.6853, |
| "step": 58300 |
| }, |
| { |
| "epoch": 55.09436187780137, |
| "grad_norm": 0.9027389287948608, |
| "learning_rate": 4.4081534640028924e-05, |
| "loss": 2.6084, |
| "step": 58400 |
| }, |
| { |
| "epoch": 55.18872375560274, |
| "grad_norm": 0.9201973080635071, |
| "learning_rate": 4.390826515328704e-05, |
| "loss": 2.625, |
| "step": 58500 |
| }, |
| { |
| "epoch": 55.283085633404106, |
| "grad_norm": 0.9164554476737976, |
| "learning_rate": 4.373506989241186e-05, |
| "loss": 2.6345, |
| "step": 58600 |
| }, |
| { |
| "epoch": 55.377447511205474, |
| "grad_norm": 0.9286631941795349, |
| "learning_rate": 4.356195096773292e-05, |
| "loss": 2.6464, |
| "step": 58700 |
| }, |
| { |
| "epoch": 55.47180938900684, |
| "grad_norm": 0.9139711856842041, |
| "learning_rate": 4.338891048864973e-05, |
| "loss": 2.6536, |
| "step": 58800 |
| }, |
| { |
| "epoch": 55.56617126680821, |
| "grad_norm": 0.9070996046066284, |
| "learning_rate": 4.321595056360589e-05, |
| "loss": 2.6551, |
| "step": 58900 |
| }, |
| { |
| "epoch": 55.66053314460958, |
| "grad_norm": 0.9211101531982422, |
| "learning_rate": 4.304307330006352e-05, |
| "loss": 2.6571, |
| "step": 59000 |
| }, |
| { |
| "epoch": 55.66053314460958, |
| "eval_loss": 3.4707727432250977, |
| "eval_runtime": 89.9246, |
| "eval_samples_per_second": 167.596, |
| "eval_steps_per_second": 5.238, |
| "step": 59000 |
| }, |
| { |
| "epoch": 55.75489502241095, |
| "grad_norm": 0.9334980845451355, |
| "learning_rate": 4.2870280804477525e-05, |
| "loss": 2.6642, |
| "step": 59100 |
| }, |
| { |
| "epoch": 55.84925690021232, |
| "grad_norm": 0.9159435033798218, |
| "learning_rate": 4.2697575182269924e-05, |
| "loss": 2.6693, |
| "step": 59200 |
| }, |
| { |
| "epoch": 55.94361877801368, |
| "grad_norm": 0.9436858892440796, |
| "learning_rate": 4.2524958537804226e-05, |
| "loss": 2.6754, |
| "step": 59300 |
| }, |
| { |
| "epoch": 56.03774475112055, |
| "grad_norm": 0.9315575957298279, |
| "learning_rate": 4.235243297435975e-05, |
| "loss": 2.6502, |
| "step": 59400 |
| }, |
| { |
| "epoch": 56.13210662892192, |
| "grad_norm": 0.9315222501754761, |
| "learning_rate": 4.2180000594106076e-05, |
| "loss": 2.6076, |
| "step": 59500 |
| }, |
| { |
| "epoch": 56.226468506723286, |
| "grad_norm": 0.9421327114105225, |
| "learning_rate": 4.200766349807731e-05, |
| "loss": 2.62, |
| "step": 59600 |
| }, |
| { |
| "epoch": 56.320830384524655, |
| "grad_norm": 0.9382652044296265, |
| "learning_rate": 4.18354237861466e-05, |
| "loss": 2.634, |
| "step": 59700 |
| }, |
| { |
| "epoch": 56.41519226232602, |
| "grad_norm": 0.9161766171455383, |
| "learning_rate": 4.1663283557000455e-05, |
| "loss": 2.6375, |
| "step": 59800 |
| }, |
| { |
| "epoch": 56.50955414012739, |
| "grad_norm": 0.9238489866256714, |
| "learning_rate": 4.1491244908113266e-05, |
| "loss": 2.6367, |
| "step": 59900 |
| }, |
| { |
| "epoch": 56.60391601792875, |
| "grad_norm": 0.9288871884346008, |
| "learning_rate": 4.1319309935721695e-05, |
| "loss": 2.6462, |
| "step": 60000 |
| }, |
| { |
| "epoch": 56.60391601792875, |
| "eval_loss": 3.473339557647705, |
| "eval_runtime": 89.9214, |
| "eval_samples_per_second": 167.602, |
| "eval_steps_per_second": 5.238, |
| "step": 60000 |
| }, |
| { |
| "epoch": 56.69827789573012, |
| "grad_norm": 0.9236829280853271, |
| "learning_rate": 4.114748073479907e-05, |
| "loss": 2.6554, |
| "step": 60100 |
| }, |
| { |
| "epoch": 56.79263977353149, |
| "grad_norm": 0.9447175860404968, |
| "learning_rate": 4.097575939903003e-05, |
| "loss": 2.6616, |
| "step": 60200 |
| }, |
| { |
| "epoch": 56.88700165133286, |
| "grad_norm": 0.9397159814834595, |
| "learning_rate": 4.080414802078481e-05, |
| "loss": 2.6658, |
| "step": 60300 |
| }, |
| { |
| "epoch": 56.98136352913423, |
| "grad_norm": 0.922020673751831, |
| "learning_rate": 4.063264869109395e-05, |
| "loss": 2.6718, |
| "step": 60400 |
| }, |
| { |
| "epoch": 57.0754895022411, |
| "grad_norm": 0.9354454874992371, |
| "learning_rate": 4.046126349962261e-05, |
| "loss": 2.6197, |
| "step": 60500 |
| }, |
| { |
| "epoch": 57.169851380042466, |
| "grad_norm": 0.9390712380409241, |
| "learning_rate": 4.0289994534645305e-05, |
| "loss": 2.61, |
| "step": 60600 |
| }, |
| { |
| "epoch": 57.26421325784383, |
| "grad_norm": 0.9291863441467285, |
| "learning_rate": 4.01188438830203e-05, |
| "loss": 2.6188, |
| "step": 60700 |
| }, |
| { |
| "epoch": 57.3585751356452, |
| "grad_norm": 0.929843008518219, |
| "learning_rate": 3.994781363016427e-05, |
| "loss": 2.6235, |
| "step": 60800 |
| }, |
| { |
| "epoch": 57.452937013446565, |
| "grad_norm": 0.9367436170578003, |
| "learning_rate": 3.977690586002688e-05, |
| "loss": 2.6303, |
| "step": 60900 |
| }, |
| { |
| "epoch": 57.547298891247934, |
| "grad_norm": 0.9399311542510986, |
| "learning_rate": 3.9606122655065365e-05, |
| "loss": 2.6432, |
| "step": 61000 |
| }, |
| { |
| "epoch": 57.547298891247934, |
| "eval_loss": 3.4750752449035645, |
| "eval_runtime": 89.9203, |
| "eval_samples_per_second": 167.604, |
| "eval_steps_per_second": 5.238, |
| "step": 61000 |
| }, |
| { |
| "epoch": 57.6416607690493, |
| "grad_norm": 0.928033173084259, |
| "learning_rate": 3.943546609621921e-05, |
| "loss": 2.6394, |
| "step": 61100 |
| }, |
| { |
| "epoch": 57.73602264685067, |
| "grad_norm": 0.9516855478286743, |
| "learning_rate": 3.926493826288469e-05, |
| "loss": 2.6455, |
| "step": 61200 |
| }, |
| { |
| "epoch": 57.83038452465204, |
| "grad_norm": 0.9388333559036255, |
| "learning_rate": 3.909454123288968e-05, |
| "loss": 2.6551, |
| "step": 61300 |
| }, |
| { |
| "epoch": 57.92474640245341, |
| "grad_norm": 0.9445894956588745, |
| "learning_rate": 3.892427708246818e-05, |
| "loss": 2.6583, |
| "step": 61400 |
| }, |
| { |
| "epoch": 58.01887237556027, |
| "grad_norm": 0.9752274751663208, |
| "learning_rate": 3.875414788623515e-05, |
| "loss": 2.6422, |
| "step": 61500 |
| }, |
| { |
| "epoch": 58.11323425336164, |
| "grad_norm": 0.9434385895729065, |
| "learning_rate": 3.858415571716116e-05, |
| "loss": 2.5958, |
| "step": 61600 |
| }, |
| { |
| "epoch": 58.20759613116301, |
| "grad_norm": 0.9802889227867126, |
| "learning_rate": 3.8414302646547114e-05, |
| "loss": 2.6074, |
| "step": 61700 |
| }, |
| { |
| "epoch": 58.30195800896438, |
| "grad_norm": 0.9747982621192932, |
| "learning_rate": 3.824459074399911e-05, |
| "loss": 2.6163, |
| "step": 61800 |
| }, |
| { |
| "epoch": 58.396319886765745, |
| "grad_norm": 0.9481346011161804, |
| "learning_rate": 3.8075022077403095e-05, |
| "loss": 2.6123, |
| "step": 61900 |
| }, |
| { |
| "epoch": 58.490681764567114, |
| "grad_norm": 0.9717242121696472, |
| "learning_rate": 3.790559871289979e-05, |
| "loss": 2.6289, |
| "step": 62000 |
| }, |
| { |
| "epoch": 58.490681764567114, |
| "eval_loss": 3.47990345954895, |
| "eval_runtime": 89.9379, |
| "eval_samples_per_second": 167.571, |
| "eval_steps_per_second": 5.237, |
| "step": 62000 |
| }, |
| { |
| "epoch": 58.58504364236848, |
| "grad_norm": 0.9299172163009644, |
| "learning_rate": 3.77363227148594e-05, |
| "loss": 2.6348, |
| "step": 62100 |
| }, |
| { |
| "epoch": 58.67940552016985, |
| "grad_norm": 0.9443178176879883, |
| "learning_rate": 3.756719614585656e-05, |
| "loss": 2.6384, |
| "step": 62200 |
| }, |
| { |
| "epoch": 58.77376739797122, |
| "grad_norm": 0.9289298057556152, |
| "learning_rate": 3.739822106664513e-05, |
| "loss": 2.641, |
| "step": 62300 |
| }, |
| { |
| "epoch": 58.86812927577259, |
| "grad_norm": 0.9584905505180359, |
| "learning_rate": 3.7229399536133106e-05, |
| "loss": 2.6465, |
| "step": 62400 |
| }, |
| { |
| "epoch": 58.96249115357396, |
| "grad_norm": 0.922694981098175, |
| "learning_rate": 3.706073361135759e-05, |
| "loss": 2.6494, |
| "step": 62500 |
| }, |
| { |
| "epoch": 59.05661712668082, |
| "grad_norm": 0.9424399137496948, |
| "learning_rate": 3.6892225347459624e-05, |
| "loss": 2.6179, |
| "step": 62600 |
| }, |
| { |
| "epoch": 59.15097900448219, |
| "grad_norm": 0.9429112672805786, |
| "learning_rate": 3.672387679765925e-05, |
| "loss": 2.588, |
| "step": 62700 |
| }, |
| { |
| "epoch": 59.24534088228356, |
| "grad_norm": 0.941720724105835, |
| "learning_rate": 3.65556900132304e-05, |
| "loss": 2.6073, |
| "step": 62800 |
| }, |
| { |
| "epoch": 59.339702760084926, |
| "grad_norm": 0.947265088558197, |
| "learning_rate": 3.638766704347598e-05, |
| "loss": 2.6183, |
| "step": 62900 |
| }, |
| { |
| "epoch": 59.434064637886294, |
| "grad_norm": 0.9350099563598633, |
| "learning_rate": 3.621980993570283e-05, |
| "loss": 2.6128, |
| "step": 63000 |
| }, |
| { |
| "epoch": 59.434064637886294, |
| "eval_loss": 3.4838671684265137, |
| "eval_runtime": 89.9316, |
| "eval_samples_per_second": 167.583, |
| "eval_steps_per_second": 5.237, |
| "step": 63000 |
| }, |
| { |
| "epoch": 59.52842651568766, |
| "grad_norm": 0.9562307596206665, |
| "learning_rate": 3.605212073519687e-05, |
| "loss": 2.625, |
| "step": 63100 |
| }, |
| { |
| "epoch": 59.62278839348903, |
| "grad_norm": 0.9589905142784119, |
| "learning_rate": 3.588460148519808e-05, |
| "loss": 2.6283, |
| "step": 63200 |
| }, |
| { |
| "epoch": 59.7171502712904, |
| "grad_norm": 0.9597828984260559, |
| "learning_rate": 3.5717254226875605e-05, |
| "loss": 2.6324, |
| "step": 63300 |
| }, |
| { |
| "epoch": 59.81151214909177, |
| "grad_norm": 0.9642577171325684, |
| "learning_rate": 3.555008099930305e-05, |
| "loss": 2.6328, |
| "step": 63400 |
| }, |
| { |
| "epoch": 59.90587402689314, |
| "grad_norm": 0.9308410286903381, |
| "learning_rate": 3.5383083839433385e-05, |
| "loss": 2.6374, |
| "step": 63500 |
| }, |
| { |
| "epoch": 60.0, |
| "grad_norm": 1.1493489742279053, |
| "learning_rate": 3.521626478207432e-05, |
| "loss": 2.6411, |
| "step": 63600 |
| }, |
| { |
| "epoch": 60.09436187780137, |
| "grad_norm": 0.9469171166419983, |
| "learning_rate": 3.504962585986342e-05, |
| "loss": 2.5832, |
| "step": 63700 |
| }, |
| { |
| "epoch": 60.18872375560274, |
| "grad_norm": 0.964799702167511, |
| "learning_rate": 3.488316910324338e-05, |
| "loss": 2.5958, |
| "step": 63800 |
| }, |
| { |
| "epoch": 60.283085633404106, |
| "grad_norm": 0.9498776197433472, |
| "learning_rate": 3.471689654043724e-05, |
| "loss": 2.6038, |
| "step": 63900 |
| }, |
| { |
| "epoch": 60.377447511205474, |
| "grad_norm": 0.9644718170166016, |
| "learning_rate": 3.455081019742368e-05, |
| "loss": 2.6016, |
| "step": 64000 |
| }, |
| { |
| "epoch": 60.377447511205474, |
| "eval_loss": 3.48707914352417, |
| "eval_runtime": 89.9399, |
| "eval_samples_per_second": 167.567, |
| "eval_steps_per_second": 5.237, |
| "step": 64000 |
| }, |
| { |
| "epoch": 60.47180938900684, |
| "grad_norm": 0.9748337864875793, |
| "learning_rate": 3.438491209791242e-05, |
| "loss": 2.6067, |
| "step": 64100 |
| }, |
| { |
| "epoch": 60.56617126680821, |
| "grad_norm": 0.9413148164749146, |
| "learning_rate": 3.42192042633194e-05, |
| "loss": 2.6144, |
| "step": 64200 |
| }, |
| { |
| "epoch": 60.66053314460958, |
| "grad_norm": 0.9474183917045593, |
| "learning_rate": 3.405368871274234e-05, |
| "loss": 2.6153, |
| "step": 64300 |
| }, |
| { |
| "epoch": 60.75489502241095, |
| "grad_norm": 0.955107569694519, |
| "learning_rate": 3.3888367462935946e-05, |
| "loss": 2.6298, |
| "step": 64400 |
| }, |
| { |
| "epoch": 60.84925690021232, |
| "grad_norm": 0.9560959339141846, |
| "learning_rate": 3.3723242528287515e-05, |
| "loss": 2.6291, |
| "step": 64500 |
| }, |
| { |
| "epoch": 60.94361877801368, |
| "grad_norm": 0.9528979659080505, |
| "learning_rate": 3.355831592079223e-05, |
| "loss": 2.6374, |
| "step": 64600 |
| }, |
| { |
| "epoch": 61.03774475112055, |
| "grad_norm": 0.9570056796073914, |
| "learning_rate": 3.3393589650028766e-05, |
| "loss": 2.6132, |
| "step": 64700 |
| }, |
| { |
| "epoch": 61.13210662892192, |
| "grad_norm": 0.9864018559455872, |
| "learning_rate": 3.322906572313477e-05, |
| "loss": 2.5732, |
| "step": 64800 |
| }, |
| { |
| "epoch": 61.226468506723286, |
| "grad_norm": 0.948508620262146, |
| "learning_rate": 3.306474614478234e-05, |
| "loss": 2.587, |
| "step": 64900 |
| }, |
| { |
| "epoch": 61.320830384524655, |
| "grad_norm": 0.9861729145050049, |
| "learning_rate": 3.2900632917153705e-05, |
| "loss": 2.6049, |
| "step": 65000 |
| }, |
| { |
| "epoch": 61.320830384524655, |
| "eval_loss": 3.48958683013916, |
| "eval_runtime": 89.9577, |
| "eval_samples_per_second": 167.534, |
| "eval_steps_per_second": 5.236, |
| "step": 65000 |
| }, |
| { |
| "epoch": 61.41519226232602, |
| "grad_norm": 0.9540084600448608, |
| "learning_rate": 3.273672803991673e-05, |
| "loss": 2.5993, |
| "step": 65100 |
| }, |
| { |
| "epoch": 61.50955414012739, |
| "grad_norm": 0.9662155508995056, |
| "learning_rate": 3.257303351020066e-05, |
| "loss": 2.6071, |
| "step": 65200 |
| }, |
| { |
| "epoch": 61.60391601792875, |
| "grad_norm": 0.9816598892211914, |
| "learning_rate": 3.240955132257162e-05, |
| "loss": 2.6101, |
| "step": 65300 |
| }, |
| { |
| "epoch": 61.69827789573012, |
| "grad_norm": 0.9878286719322205, |
| "learning_rate": 3.224628346900853e-05, |
| "loss": 2.617, |
| "step": 65400 |
| }, |
| { |
| "epoch": 61.79263977353149, |
| "grad_norm": 0.9473662972450256, |
| "learning_rate": 3.208323193887863e-05, |
| "loss": 2.6219, |
| "step": 65500 |
| }, |
| { |
| "epoch": 61.88700165133286, |
| "grad_norm": 0.9426729679107666, |
| "learning_rate": 3.192039871891336e-05, |
| "loss": 2.6228, |
| "step": 65600 |
| }, |
| { |
| "epoch": 61.98136352913423, |
| "grad_norm": 0.9848515391349792, |
| "learning_rate": 3.1757785793184144e-05, |
| "loss": 2.6279, |
| "step": 65700 |
| }, |
| { |
| "epoch": 62.0754895022411, |
| "grad_norm": 0.9706041812896729, |
| "learning_rate": 3.159539514307812e-05, |
| "loss": 2.5873, |
| "step": 65800 |
| }, |
| { |
| "epoch": 62.169851380042466, |
| "grad_norm": 0.9837217330932617, |
| "learning_rate": 3.143322874727417e-05, |
| "loss": 2.5714, |
| "step": 65900 |
| }, |
| { |
| "epoch": 62.26421325784383, |
| "grad_norm": 0.9780051112174988, |
| "learning_rate": 3.1271288581718586e-05, |
| "loss": 2.587, |
| "step": 66000 |
| }, |
| { |
| "epoch": 62.26421325784383, |
| "eval_loss": 3.492661237716675, |
| "eval_runtime": 89.913, |
| "eval_samples_per_second": 167.618, |
| "eval_steps_per_second": 5.238, |
| "step": 66000 |
| }, |
| { |
| "epoch": 62.3585751356452, |
| "grad_norm": 0.9650771617889404, |
| "learning_rate": 3.1109576619601245e-05, |
| "loss": 2.5869, |
| "step": 66100 |
| }, |
| { |
| "epoch": 62.452937013446565, |
| "grad_norm": 0.9946165680885315, |
| "learning_rate": 3.0948094831331334e-05, |
| "loss": 2.6024, |
| "step": 66200 |
| }, |
| { |
| "epoch": 62.547298891247934, |
| "grad_norm": 0.971069872379303, |
| "learning_rate": 3.078684518451346e-05, |
| "loss": 2.6002, |
| "step": 66300 |
| }, |
| { |
| "epoch": 62.6416607690493, |
| "grad_norm": 0.978173553943634, |
| "learning_rate": 3.062582964392373e-05, |
| "loss": 2.6079, |
| "step": 66400 |
| }, |
| { |
| "epoch": 62.73602264685067, |
| "grad_norm": 0.957497239112854, |
| "learning_rate": 3.0465050171485677e-05, |
| "loss": 2.6131, |
| "step": 66500 |
| }, |
| { |
| "epoch": 62.83038452465204, |
| "grad_norm": 0.9888630509376526, |
| "learning_rate": 3.0304508726246428e-05, |
| "loss": 2.6085, |
| "step": 66600 |
| }, |
| { |
| "epoch": 62.92474640245341, |
| "grad_norm": 0.9818923473358154, |
| "learning_rate": 3.0144207264352814e-05, |
| "loss": 2.6175, |
| "step": 66700 |
| }, |
| { |
| "epoch": 63.01887237556027, |
| "grad_norm": 0.975088894367218, |
| "learning_rate": 2.99841477390276e-05, |
| "loss": 2.6067, |
| "step": 66800 |
| }, |
| { |
| "epoch": 63.11323425336164, |
| "grad_norm": 0.9827061891555786, |
| "learning_rate": 2.982433210054557e-05, |
| "loss": 2.5636, |
| "step": 66900 |
| }, |
| { |
| "epoch": 63.20759613116301, |
| "grad_norm": 0.9684129357337952, |
| "learning_rate": 2.9664762296209824e-05, |
| "loss": 2.5811, |
| "step": 67000 |
| }, |
| { |
| "epoch": 63.20759613116301, |
| "eval_loss": 3.4976654052734375, |
| "eval_runtime": 89.9173, |
| "eval_samples_per_second": 167.61, |
| "eval_steps_per_second": 5.238, |
| "step": 67000 |
| }, |
| { |
| "epoch": 63.30195800896438, |
| "grad_norm": 0.9793866872787476, |
| "learning_rate": 2.9505440270328112e-05, |
| "loss": 2.5806, |
| "step": 67100 |
| }, |
| { |
| "epoch": 63.396319886765745, |
| "grad_norm": 0.9836909770965576, |
| "learning_rate": 2.9346367964188992e-05, |
| "loss": 2.5833, |
| "step": 67200 |
| }, |
| { |
| "epoch": 63.490681764567114, |
| "grad_norm": 0.9712016582489014, |
| "learning_rate": 2.918754731603835e-05, |
| "loss": 2.5929, |
| "step": 67300 |
| }, |
| { |
| "epoch": 63.58504364236848, |
| "grad_norm": 0.9777830839157104, |
| "learning_rate": 2.9028980261055637e-05, |
| "loss": 2.6031, |
| "step": 67400 |
| }, |
| { |
| "epoch": 63.67940552016985, |
| "grad_norm": 0.9632680416107178, |
| "learning_rate": 2.8870668731330426e-05, |
| "loss": 2.6045, |
| "step": 67500 |
| }, |
| { |
| "epoch": 63.77376739797122, |
| "grad_norm": 0.9569922685623169, |
| "learning_rate": 2.8712614655838683e-05, |
| "loss": 2.6036, |
| "step": 67600 |
| }, |
| { |
| "epoch": 63.86812927577259, |
| "grad_norm": 0.9730768203735352, |
| "learning_rate": 2.8554819960419493e-05, |
| "loss": 2.6091, |
| "step": 67700 |
| }, |
| { |
| "epoch": 63.96249115357396, |
| "grad_norm": 0.9747135043144226, |
| "learning_rate": 2.8397286567751397e-05, |
| "loss": 2.6073, |
| "step": 67800 |
| }, |
| { |
| "epoch": 64.05661712668082, |
| "grad_norm": 1.009427785873413, |
| "learning_rate": 2.824001639732905e-05, |
| "loss": 2.5767, |
| "step": 67900 |
| }, |
| { |
| "epoch": 64.1509790044822, |
| "grad_norm": 0.982246458530426, |
| "learning_rate": 2.8083011365439892e-05, |
| "loss": 2.5632, |
| "step": 68000 |
| }, |
| { |
| "epoch": 64.1509790044822, |
| "eval_loss": 3.4971301555633545, |
| "eval_runtime": 89.9117, |
| "eval_samples_per_second": 167.62, |
| "eval_steps_per_second": 5.238, |
| "step": 68000 |
| }, |
| { |
| "epoch": 64.24534088228356, |
| "grad_norm": 0.9694870114326477, |
| "learning_rate": 2.792627338514065e-05, |
| "loss": 2.5706, |
| "step": 68100 |
| }, |
| { |
| "epoch": 64.33970276008493, |
| "grad_norm": 0.9907557964324951, |
| "learning_rate": 2.7769804366234187e-05, |
| "loss": 2.581, |
| "step": 68200 |
| }, |
| { |
| "epoch": 64.4340646378863, |
| "grad_norm": 0.9872937202453613, |
| "learning_rate": 2.7613606215246067e-05, |
| "loss": 2.5803, |
| "step": 68300 |
| }, |
| { |
| "epoch": 64.52842651568766, |
| "grad_norm": 0.9905672669410706, |
| "learning_rate": 2.7457680835401533e-05, |
| "loss": 2.5882, |
| "step": 68400 |
| }, |
| { |
| "epoch": 64.62278839348903, |
| "grad_norm": 0.9775338768959045, |
| "learning_rate": 2.730203012660209e-05, |
| "loss": 2.5936, |
| "step": 68500 |
| }, |
| { |
| "epoch": 64.7171502712904, |
| "grad_norm": 1.0013508796691895, |
| "learning_rate": 2.714665598540249e-05, |
| "loss": 2.5954, |
| "step": 68600 |
| }, |
| { |
| "epoch": 64.81151214909177, |
| "grad_norm": 0.965290904045105, |
| "learning_rate": 2.699156030498764e-05, |
| "loss": 2.5983, |
| "step": 68700 |
| }, |
| { |
| "epoch": 64.90587402689313, |
| "grad_norm": 1.0086421966552734, |
| "learning_rate": 2.6836744975149463e-05, |
| "loss": 2.6026, |
| "step": 68800 |
| }, |
| { |
| "epoch": 65.0, |
| "grad_norm": 1.220382571220398, |
| "learning_rate": 2.6682211882263873e-05, |
| "loss": 2.6107, |
| "step": 68900 |
| }, |
| { |
| "epoch": 65.09436187780136, |
| "grad_norm": 0.981444776058197, |
| "learning_rate": 2.6527962909267856e-05, |
| "loss": 2.5515, |
| "step": 69000 |
| }, |
| { |
| "epoch": 65.09436187780136, |
| "eval_loss": 3.49924635887146, |
| "eval_runtime": 89.9129, |
| "eval_samples_per_second": 167.618, |
| "eval_steps_per_second": 5.238, |
| "step": 69000 |
| }, |
| { |
| "epoch": 65.18872375560274, |
| "grad_norm": 0.9748377203941345, |
| "learning_rate": 2.637399993563648e-05, |
| "loss": 2.556, |
| "step": 69100 |
| }, |
| { |
| "epoch": 65.2830856334041, |
| "grad_norm": 1.0094541311264038, |
| "learning_rate": 2.6220324837359956e-05, |
| "loss": 2.5734, |
| "step": 69200 |
| }, |
| { |
| "epoch": 65.37744751120547, |
| "grad_norm": 1.0065981149673462, |
| "learning_rate": 2.6066939486920904e-05, |
| "loss": 2.5733, |
| "step": 69300 |
| }, |
| { |
| "epoch": 65.47180938900684, |
| "grad_norm": 0.9851287007331848, |
| "learning_rate": 2.5913845753271393e-05, |
| "loss": 2.5768, |
| "step": 69400 |
| }, |
| { |
| "epoch": 65.56617126680821, |
| "grad_norm": 1.0075511932373047, |
| "learning_rate": 2.5761045501810222e-05, |
| "loss": 2.5891, |
| "step": 69500 |
| }, |
| { |
| "epoch": 65.66053314460957, |
| "grad_norm": 0.9952249526977539, |
| "learning_rate": 2.560854059436029e-05, |
| "loss": 2.5925, |
| "step": 69600 |
| }, |
| { |
| "epoch": 65.75489502241095, |
| "grad_norm": 0.9942532777786255, |
| "learning_rate": 2.5456332889145718e-05, |
| "loss": 2.5929, |
| "step": 69700 |
| }, |
| { |
| "epoch": 65.84925690021231, |
| "grad_norm": 0.992847740650177, |
| "learning_rate": 2.530442424076941e-05, |
| "loss": 2.5896, |
| "step": 69800 |
| }, |
| { |
| "epoch": 65.94361877801369, |
| "grad_norm": 0.9890574812889099, |
| "learning_rate": 2.5152816500190253e-05, |
| "loss": 2.5912, |
| "step": 69900 |
| }, |
| { |
| "epoch": 66.03774475112054, |
| "grad_norm": 0.9975290298461914, |
| "learning_rate": 2.500151151470077e-05, |
| "loss": 2.586, |
| "step": 70000 |
| }, |
| { |
| "epoch": 66.03774475112054, |
| "eval_loss": 3.501796245574951, |
| "eval_runtime": 89.9053, |
| "eval_samples_per_second": 167.632, |
| "eval_steps_per_second": 5.239, |
| "step": 70000 |
| }, |
| { |
| "epoch": 66.13210662892192, |
| "grad_norm": 0.9931478500366211, |
| "learning_rate": 2.4850511127904437e-05, |
| "loss": 2.5538, |
| "step": 70100 |
| }, |
| { |
| "epoch": 66.22646850672328, |
| "grad_norm": 1.002678632736206, |
| "learning_rate": 2.469981717969329e-05, |
| "loss": 2.5557, |
| "step": 70200 |
| }, |
| { |
| "epoch": 66.32083038452465, |
| "grad_norm": 0.9973838329315186, |
| "learning_rate": 2.4549431506225586e-05, |
| "loss": 2.571, |
| "step": 70300 |
| }, |
| { |
| "epoch": 66.41519226232602, |
| "grad_norm": 1.0087188482284546, |
| "learning_rate": 2.4399355939903245e-05, |
| "loss": 2.5711, |
| "step": 70400 |
| }, |
| { |
| "epoch": 66.50955414012739, |
| "grad_norm": 1.0123405456542969, |
| "learning_rate": 2.4249592309349728e-05, |
| "loss": 2.5793, |
| "step": 70500 |
| }, |
| { |
| "epoch": 66.60391601792875, |
| "grad_norm": 0.9953839182853699, |
| "learning_rate": 2.410014243938757e-05, |
| "loss": 2.576, |
| "step": 70600 |
| }, |
| { |
| "epoch": 66.69827789573013, |
| "grad_norm": 0.9909201860427856, |
| "learning_rate": 2.3951008151016285e-05, |
| "loss": 2.5811, |
| "step": 70700 |
| }, |
| { |
| "epoch": 66.79263977353149, |
| "grad_norm": 0.997779369354248, |
| "learning_rate": 2.380219126139014e-05, |
| "loss": 2.5825, |
| "step": 70800 |
| }, |
| { |
| "epoch": 66.88700165133287, |
| "grad_norm": 0.9807635545730591, |
| "learning_rate": 2.3653693583795932e-05, |
| "loss": 2.5833, |
| "step": 70900 |
| }, |
| { |
| "epoch": 66.98136352913423, |
| "grad_norm": 0.9934784173965454, |
| "learning_rate": 2.3505516927631037e-05, |
| "loss": 2.5906, |
| "step": 71000 |
| }, |
| { |
| "epoch": 66.98136352913423, |
| "eval_loss": 3.4984776973724365, |
| "eval_runtime": 89.9142, |
| "eval_samples_per_second": 167.615, |
| "eval_steps_per_second": 5.238, |
| "step": 71000 |
| }, |
| { |
| "epoch": 67.0754895022411, |
| "grad_norm": 1.0115574598312378, |
| "learning_rate": 2.3357663098381217e-05, |
| "loss": 2.5508, |
| "step": 71100 |
| }, |
| { |
| "epoch": 67.16985138004246, |
| "grad_norm": 0.9923783540725708, |
| "learning_rate": 2.3210133897598744e-05, |
| "loss": 2.5533, |
| "step": 71200 |
| }, |
| { |
| "epoch": 67.26421325784383, |
| "grad_norm": 0.9832899570465088, |
| "learning_rate": 2.3062931122880348e-05, |
| "loss": 2.5495, |
| "step": 71300 |
| }, |
| { |
| "epoch": 67.3585751356452, |
| "grad_norm": 1.005965232849121, |
| "learning_rate": 2.2916056567845418e-05, |
| "loss": 2.5595, |
| "step": 71400 |
| }, |
| { |
| "epoch": 67.45293701344657, |
| "grad_norm": 1.0081592798233032, |
| "learning_rate": 2.276951202211402e-05, |
| "loss": 2.5684, |
| "step": 71500 |
| }, |
| { |
| "epoch": 67.54729889124793, |
| "grad_norm": 0.9831011891365051, |
| "learning_rate": 2.262329927128523e-05, |
| "loss": 2.5704, |
| "step": 71600 |
| }, |
| { |
| "epoch": 67.64166076904931, |
| "grad_norm": 0.9909459352493286, |
| "learning_rate": 2.2477420096915257e-05, |
| "loss": 2.5721, |
| "step": 71700 |
| }, |
| { |
| "epoch": 67.73602264685067, |
| "grad_norm": 1.0143030881881714, |
| "learning_rate": 2.2331876276495796e-05, |
| "loss": 2.5757, |
| "step": 71800 |
| }, |
| { |
| "epoch": 67.83038452465205, |
| "grad_norm": 1.0063189268112183, |
| "learning_rate": 2.218666958343239e-05, |
| "loss": 2.5865, |
| "step": 71900 |
| }, |
| { |
| "epoch": 67.92474640245341, |
| "grad_norm": 1.0109525918960571, |
| "learning_rate": 2.2041801787022742e-05, |
| "loss": 2.5861, |
| "step": 72000 |
| }, |
| { |
| "epoch": 67.92474640245341, |
| "eval_loss": 3.5032782554626465, |
| "eval_runtime": 89.9205, |
| "eval_samples_per_second": 167.604, |
| "eval_steps_per_second": 5.238, |
| "step": 72000 |
| }, |
| { |
| "epoch": 68.01887237556028, |
| "grad_norm": 0.9933319091796875, |
| "learning_rate": 2.189727465243527e-05, |
| "loss": 2.5741, |
| "step": 72100 |
| }, |
| { |
| "epoch": 68.11323425336164, |
| "grad_norm": 1.0024901628494263, |
| "learning_rate": 2.1753089940687456e-05, |
| "loss": 2.5397, |
| "step": 72200 |
| }, |
| { |
| "epoch": 68.20759613116302, |
| "grad_norm": 1.0160821676254272, |
| "learning_rate": 2.1609249408624547e-05, |
| "loss": 2.5429, |
| "step": 72300 |
| }, |
| { |
| "epoch": 68.30195800896438, |
| "grad_norm": 0.9907500147819519, |
| "learning_rate": 2.1465754808898007e-05, |
| "loss": 2.5527, |
| "step": 72400 |
| }, |
| { |
| "epoch": 68.39631988676575, |
| "grad_norm": 1.0081429481506348, |
| "learning_rate": 2.132260788994428e-05, |
| "loss": 2.5562, |
| "step": 72500 |
| }, |
| { |
| "epoch": 68.49068176456711, |
| "grad_norm": 1.0024923086166382, |
| "learning_rate": 2.1179810395963363e-05, |
| "loss": 2.5671, |
| "step": 72600 |
| }, |
| { |
| "epoch": 68.58504364236849, |
| "grad_norm": 1.021855354309082, |
| "learning_rate": 2.1037364066897674e-05, |
| "loss": 2.5708, |
| "step": 72700 |
| }, |
| { |
| "epoch": 68.67940552016985, |
| "grad_norm": 1.0236481428146362, |
| "learning_rate": 2.0895270638410797e-05, |
| "loss": 2.5632, |
| "step": 72800 |
| }, |
| { |
| "epoch": 68.77376739797121, |
| "grad_norm": 1.0253878831863403, |
| "learning_rate": 2.0753531841866282e-05, |
| "loss": 2.5667, |
| "step": 72900 |
| }, |
| { |
| "epoch": 68.86812927577259, |
| "grad_norm": 1.031447410583496, |
| "learning_rate": 2.0612149404306664e-05, |
| "loss": 2.5735, |
| "step": 73000 |
| }, |
| { |
| "epoch": 68.86812927577259, |
| "eval_loss": 3.5056300163269043, |
| "eval_runtime": 89.9181, |
| "eval_samples_per_second": 167.608, |
| "eval_steps_per_second": 5.238, |
| "step": 73000 |
| }, |
| { |
| "epoch": 68.96249115357395, |
| "grad_norm": 1.0303679704666138, |
| "learning_rate": 2.0471125048432317e-05, |
| "loss": 2.587, |
| "step": 73100 |
| }, |
| { |
| "epoch": 69.05661712668082, |
| "grad_norm": 1.0058467388153076, |
| "learning_rate": 2.033046049258049e-05, |
| "loss": 2.5513, |
| "step": 73200 |
| }, |
| { |
| "epoch": 69.1509790044822, |
| "grad_norm": 1.029050588607788, |
| "learning_rate": 2.0190157450704443e-05, |
| "loss": 2.5324, |
| "step": 73300 |
| }, |
| { |
| "epoch": 69.24534088228356, |
| "grad_norm": 0.9888247847557068, |
| "learning_rate": 2.005021763235243e-05, |
| "loss": 2.5505, |
| "step": 73400 |
| }, |
| { |
| "epoch": 69.33970276008493, |
| "grad_norm": 1.0089259147644043, |
| "learning_rate": 1.9910642742647013e-05, |
| "loss": 2.549, |
| "step": 73500 |
| }, |
| { |
| "epoch": 69.4340646378863, |
| "grad_norm": 1.0188043117523193, |
| "learning_rate": 1.9771434482264132e-05, |
| "loss": 2.5493, |
| "step": 73600 |
| }, |
| { |
| "epoch": 69.52842651568766, |
| "grad_norm": 1.021933913230896, |
| "learning_rate": 1.963259454741255e-05, |
| "loss": 2.5613, |
| "step": 73700 |
| }, |
| { |
| "epoch": 69.62278839348903, |
| "grad_norm": 0.9995942115783691, |
| "learning_rate": 1.949412462981302e-05, |
| "loss": 2.5605, |
| "step": 73800 |
| }, |
| { |
| "epoch": 69.7171502712904, |
| "grad_norm": 1.024236798286438, |
| "learning_rate": 1.935602641667783e-05, |
| "loss": 2.565, |
| "step": 73900 |
| }, |
| { |
| "epoch": 69.81151214909177, |
| "grad_norm": 1.0142080783843994, |
| "learning_rate": 1.9218301590690103e-05, |
| "loss": 2.566, |
| "step": 74000 |
| }, |
| { |
| "epoch": 69.81151214909177, |
| "eval_loss": 3.508208990097046, |
| "eval_runtime": 89.9057, |
| "eval_samples_per_second": 167.631, |
| "eval_steps_per_second": 5.239, |
| "step": 74000 |
| }, |
| { |
| "epoch": 69.90587402689313, |
| "grad_norm": 1.0000115633010864, |
| "learning_rate": 1.9080951829983358e-05, |
| "loss": 2.5689, |
| "step": 74100 |
| }, |
| { |
| "epoch": 70.0, |
| "grad_norm": 1.2522852420806885, |
| "learning_rate": 1.894397880812113e-05, |
| "loss": 2.5776, |
| "step": 74200 |
| }, |
| { |
| "epoch": 70.09436187780136, |
| "grad_norm": 1.0207244157791138, |
| "learning_rate": 1.8807384194076426e-05, |
| "loss": 2.5298, |
| "step": 74300 |
| }, |
| { |
| "epoch": 70.18872375560274, |
| "grad_norm": 1.0133506059646606, |
| "learning_rate": 1.8671169652211524e-05, |
| "loss": 2.5354, |
| "step": 74400 |
| }, |
| { |
| "epoch": 70.2830856334041, |
| "grad_norm": 1.022737741470337, |
| "learning_rate": 1.8535336842257657e-05, |
| "loss": 2.5467, |
| "step": 74500 |
| }, |
| { |
| "epoch": 70.37744751120547, |
| "grad_norm": 1.0356816053390503, |
| "learning_rate": 1.8399887419294696e-05, |
| "loss": 2.54, |
| "step": 74600 |
| }, |
| { |
| "epoch": 70.47180938900684, |
| "grad_norm": 1.031171441078186, |
| "learning_rate": 1.826482303373117e-05, |
| "loss": 2.5501, |
| "step": 74700 |
| }, |
| { |
| "epoch": 70.56617126680821, |
| "grad_norm": 1.0288480520248413, |
| "learning_rate": 1.8130145331283948e-05, |
| "loss": 2.5445, |
| "step": 74800 |
| }, |
| { |
| "epoch": 70.66053314460957, |
| "grad_norm": 1.0107218027114868, |
| "learning_rate": 1.799585595295837e-05, |
| "loss": 2.5604, |
| "step": 74900 |
| }, |
| { |
| "epoch": 70.75489502241095, |
| "grad_norm": 1.025320053100586, |
| "learning_rate": 1.786195653502809e-05, |
| "loss": 2.5648, |
| "step": 75000 |
| }, |
| { |
| "epoch": 70.75489502241095, |
| "eval_loss": 3.510239601135254, |
| "eval_runtime": 89.9964, |
| "eval_samples_per_second": 167.462, |
| "eval_steps_per_second": 5.234, |
| "step": 75000 |
| }, |
| { |
| "epoch": 70.84925690021231, |
| "grad_norm": 1.0341417789459229, |
| "learning_rate": 1.7728448709015304e-05, |
| "loss": 2.5618, |
| "step": 75100 |
| }, |
| { |
| "epoch": 70.94361877801369, |
| "grad_norm": 0.989520251750946, |
| "learning_rate": 1.7595334101670703e-05, |
| "loss": 2.567, |
| "step": 75200 |
| }, |
| { |
| "epoch": 71.03774475112054, |
| "grad_norm": 1.0228410959243774, |
| "learning_rate": 1.7462614334953798e-05, |
| "loss": 2.5553, |
| "step": 75300 |
| }, |
| { |
| "epoch": 71.13210662892192, |
| "grad_norm": 1.029288411140442, |
| "learning_rate": 1.733029102601303e-05, |
| "loss": 2.5319, |
| "step": 75400 |
| }, |
| { |
| "epoch": 71.22646850672328, |
| "grad_norm": 1.024125337600708, |
| "learning_rate": 1.71983657871662e-05, |
| "loss": 2.5339, |
| "step": 75500 |
| }, |
| { |
| "epoch": 71.32083038452465, |
| "grad_norm": 1.0339561700820923, |
| "learning_rate": 1.706684022588068e-05, |
| "loss": 2.5445, |
| "step": 75600 |
| }, |
| { |
| "epoch": 71.41519226232602, |
| "grad_norm": 1.0182002782821655, |
| "learning_rate": 1.6935715944753928e-05, |
| "loss": 2.5438, |
| "step": 75700 |
| }, |
| { |
| "epoch": 71.50955414012739, |
| "grad_norm": 1.0119465589523315, |
| "learning_rate": 1.6804994541493953e-05, |
| "loss": 2.5432, |
| "step": 75800 |
| }, |
| { |
| "epoch": 71.60391601792875, |
| "grad_norm": 1.0229612588882446, |
| "learning_rate": 1.6674677608899763e-05, |
| "loss": 2.5516, |
| "step": 75900 |
| }, |
| { |
| "epoch": 71.69827789573013, |
| "grad_norm": 1.053076982498169, |
| "learning_rate": 1.6544766734842093e-05, |
| "loss": 2.554, |
| "step": 76000 |
| }, |
| { |
| "epoch": 71.69827789573013, |
| "eval_loss": 3.5122032165527344, |
| "eval_runtime": 89.9374, |
| "eval_samples_per_second": 167.572, |
| "eval_steps_per_second": 5.237, |
| "step": 76000 |
| }, |
| { |
| "epoch": 71.79263977353149, |
| "grad_norm": 1.0219072103500366, |
| "learning_rate": 1.641526350224392e-05, |
| "loss": 2.5512, |
| "step": 76100 |
| }, |
| { |
| "epoch": 71.88700165133287, |
| "grad_norm": 1.0074644088745117, |
| "learning_rate": 1.628616948906129e-05, |
| "loss": 2.5542, |
| "step": 76200 |
| }, |
| { |
| "epoch": 71.98136352913423, |
| "grad_norm": 1.0306816101074219, |
| "learning_rate": 1.615748626826398e-05, |
| "loss": 2.557, |
| "step": 76300 |
| }, |
| { |
| "epoch": 72.0754895022411, |
| "grad_norm": 1.015085220336914, |
| "learning_rate": 1.602921540781645e-05, |
| "loss": 2.5319, |
| "step": 76400 |
| }, |
| { |
| "epoch": 72.16985138004246, |
| "grad_norm": 1.0190519094467163, |
| "learning_rate": 1.5901358470658667e-05, |
| "loss": 2.5283, |
| "step": 76500 |
| }, |
| { |
| "epoch": 72.26421325784383, |
| "grad_norm": 1.0237911939620972, |
| "learning_rate": 1.5773917014687024e-05, |
| "loss": 2.5352, |
| "step": 76600 |
| }, |
| { |
| "epoch": 72.3585751356452, |
| "grad_norm": 1.0036261081695557, |
| "learning_rate": 1.5646892592735478e-05, |
| "loss": 2.5312, |
| "step": 76700 |
| }, |
| { |
| "epoch": 72.45293701344657, |
| "grad_norm": 1.032339096069336, |
| "learning_rate": 1.55202867525565e-05, |
| "loss": 2.5387, |
| "step": 76800 |
| }, |
| { |
| "epoch": 72.54729889124793, |
| "grad_norm": 1.0219571590423584, |
| "learning_rate": 1.5394101036802316e-05, |
| "loss": 2.5443, |
| "step": 76900 |
| }, |
| { |
| "epoch": 72.64166076904931, |
| "grad_norm": 1.0258891582489014, |
| "learning_rate": 1.5268336983006048e-05, |
| "loss": 2.5451, |
| "step": 77000 |
| }, |
| { |
| "epoch": 72.64166076904931, |
| "eval_loss": 3.5152316093444824, |
| "eval_runtime": 89.969, |
| "eval_samples_per_second": 167.513, |
| "eval_steps_per_second": 5.235, |
| "step": 77000 |
| }, |
| { |
| "epoch": 72.73602264685067, |
| "grad_norm": 1.041174292564392, |
| "learning_rate": 1.514299612356298e-05, |
| "loss": 2.5453, |
| "step": 77100 |
| }, |
| { |
| "epoch": 72.83038452465205, |
| "grad_norm": 1.0180433988571167, |
| "learning_rate": 1.5018079985711963e-05, |
| "loss": 2.5543, |
| "step": 77200 |
| }, |
| { |
| "epoch": 72.92474640245341, |
| "grad_norm": 1.046834111213684, |
| "learning_rate": 1.4893590091516686e-05, |
| "loss": 2.5545, |
| "step": 77300 |
| }, |
| { |
| "epoch": 73.01887237556028, |
| "grad_norm": 1.0277193784713745, |
| "learning_rate": 1.4769527957847246e-05, |
| "loss": 2.5443, |
| "step": 77400 |
| }, |
| { |
| "epoch": 73.11323425336164, |
| "grad_norm": 1.0245246887207031, |
| "learning_rate": 1.4645895096361568e-05, |
| "loss": 2.5189, |
| "step": 77500 |
| }, |
| { |
| "epoch": 73.20759613116302, |
| "grad_norm": 1.0065805912017822, |
| "learning_rate": 1.4522693013487077e-05, |
| "loss": 2.5251, |
| "step": 77600 |
| }, |
| { |
| "epoch": 73.30195800896438, |
| "grad_norm": 1.0437077283859253, |
| "learning_rate": 1.439992321040225e-05, |
| "loss": 2.5347, |
| "step": 77700 |
| }, |
| { |
| "epoch": 73.39631988676575, |
| "grad_norm": 1.0220911502838135, |
| "learning_rate": 1.427758718301842e-05, |
| "loss": 2.5339, |
| "step": 77800 |
| }, |
| { |
| "epoch": 73.49068176456711, |
| "grad_norm": 1.0436162948608398, |
| "learning_rate": 1.4155686421961456e-05, |
| "loss": 2.5351, |
| "step": 77900 |
| }, |
| { |
| "epoch": 73.58504364236849, |
| "grad_norm": 1.0351018905639648, |
| "learning_rate": 1.4034222412553655e-05, |
| "loss": 2.5397, |
| "step": 78000 |
| }, |
| { |
| "epoch": 73.58504364236849, |
| "eval_loss": 3.5180113315582275, |
| "eval_runtime": 89.9383, |
| "eval_samples_per_second": 167.57, |
| "eval_steps_per_second": 5.237, |
| "step": 78000 |
| }, |
| { |
| "epoch": 73.67940552016985, |
| "grad_norm": 1.0252406597137451, |
| "learning_rate": 1.3913196634795644e-05, |
| "loss": 2.5413, |
| "step": 78100 |
| }, |
| { |
| "epoch": 73.77376739797121, |
| "grad_norm": 1.0012431144714355, |
| "learning_rate": 1.3792610563348352e-05, |
| "loss": 2.5421, |
| "step": 78200 |
| }, |
| { |
| "epoch": 73.86812927577259, |
| "grad_norm": 1.0136473178863525, |
| "learning_rate": 1.3672465667514977e-05, |
| "loss": 2.5438, |
| "step": 78300 |
| }, |
| { |
| "epoch": 73.96249115357395, |
| "grad_norm": 1.0364450216293335, |
| "learning_rate": 1.3552763411223173e-05, |
| "loss": 2.5459, |
| "step": 78400 |
| }, |
| { |
| "epoch": 74.05661712668082, |
| "grad_norm": 1.0424617528915405, |
| "learning_rate": 1.3433505253007172e-05, |
| "loss": 2.5315, |
| "step": 78500 |
| }, |
| { |
| "epoch": 74.1509790044822, |
| "grad_norm": 1.0614136457443237, |
| "learning_rate": 1.3314692645989978e-05, |
| "loss": 2.5198, |
| "step": 78600 |
| }, |
| { |
| "epoch": 74.24534088228356, |
| "grad_norm": 1.0322601795196533, |
| "learning_rate": 1.3196327037865701e-05, |
| "loss": 2.5281, |
| "step": 78700 |
| }, |
| { |
| "epoch": 74.33970276008493, |
| "grad_norm": 1.065558671951294, |
| "learning_rate": 1.3078409870881952e-05, |
| "loss": 2.527, |
| "step": 78800 |
| }, |
| { |
| "epoch": 74.4340646378863, |
| "grad_norm": 1.050769567489624, |
| "learning_rate": 1.2960942581822166e-05, |
| "loss": 2.5292, |
| "step": 78900 |
| }, |
| { |
| "epoch": 74.52842651568766, |
| "grad_norm": 1.046426773071289, |
| "learning_rate": 1.2843926601988227e-05, |
| "loss": 2.5304, |
| "step": 79000 |
| }, |
| { |
| "epoch": 74.52842651568766, |
| "eval_loss": 3.5176949501037598, |
| "eval_runtime": 89.9419, |
| "eval_samples_per_second": 167.564, |
| "eval_steps_per_second": 5.237, |
| "step": 79000 |
| }, |
| { |
| "epoch": 74.62278839348903, |
| "grad_norm": 1.0452642440795898, |
| "learning_rate": 1.272736335718288e-05, |
| "loss": 2.5298, |
| "step": 79100 |
| }, |
| { |
| "epoch": 74.7171502712904, |
| "grad_norm": 1.0447572469711304, |
| "learning_rate": 1.2611254267692518e-05, |
| "loss": 2.5374, |
| "step": 79200 |
| }, |
| { |
| "epoch": 74.81151214909177, |
| "grad_norm": 1.0334386825561523, |
| "learning_rate": 1.2495600748269732e-05, |
| "loss": 2.5421, |
| "step": 79300 |
| }, |
| { |
| "epoch": 74.90587402689313, |
| "grad_norm": 1.033453345298767, |
| "learning_rate": 1.2380404208116148e-05, |
| "loss": 2.5359, |
| "step": 79400 |
| }, |
| { |
| "epoch": 75.0, |
| "grad_norm": 1.2676030397415161, |
| "learning_rate": 1.2265666050865283e-05, |
| "loss": 2.5502, |
| "step": 79500 |
| }, |
| { |
| "epoch": 75.09436187780136, |
| "grad_norm": 1.0433539152145386, |
| "learning_rate": 1.215138767456534e-05, |
| "loss": 2.5152, |
| "step": 79600 |
| }, |
| { |
| "epoch": 75.18872375560274, |
| "grad_norm": 1.046495795249939, |
| "learning_rate": 1.2037570471662307e-05, |
| "loss": 2.5191, |
| "step": 79700 |
| }, |
| { |
| "epoch": 75.2830856334041, |
| "grad_norm": 1.0373479127883911, |
| "learning_rate": 1.1924215828982842e-05, |
| "loss": 2.518, |
| "step": 79800 |
| }, |
| { |
| "epoch": 75.37744751120547, |
| "grad_norm": 1.0448527336120605, |
| "learning_rate": 1.1811325127717544e-05, |
| "loss": 2.5275, |
| "step": 79900 |
| }, |
| { |
| "epoch": 75.47180938900684, |
| "grad_norm": 1.0297698974609375, |
| "learning_rate": 1.169889974340393e-05, |
| "loss": 2.5265, |
| "step": 80000 |
| }, |
| { |
| "epoch": 75.47180938900684, |
| "eval_loss": 3.520663022994995, |
| "eval_runtime": 89.963, |
| "eval_samples_per_second": 167.524, |
| "eval_steps_per_second": 5.235, |
| "step": 80000 |
| }, |
| { |
| "epoch": 75.56617126680821, |
| "grad_norm": 1.0429484844207764, |
| "learning_rate": 1.158694104590985e-05, |
| "loss": 2.5249, |
| "step": 80100 |
| }, |
| { |
| "epoch": 75.66053314460957, |
| "grad_norm": 1.0250797271728516, |
| "learning_rate": 1.1475450399416721e-05, |
| "loss": 2.5293, |
| "step": 80200 |
| }, |
| { |
| "epoch": 75.75489502241095, |
| "grad_norm": 1.0294713973999023, |
| "learning_rate": 1.1364429162402857e-05, |
| "loss": 2.5352, |
| "step": 80300 |
| }, |
| { |
| "epoch": 75.84925690021231, |
| "grad_norm": 1.0261166095733643, |
| "learning_rate": 1.1253878687627017e-05, |
| "loss": 2.5372, |
| "step": 80400 |
| }, |
| { |
| "epoch": 75.94361877801369, |
| "grad_norm": 1.052734613418579, |
| "learning_rate": 1.1143800322111825e-05, |
| "loss": 2.5384, |
| "step": 80500 |
| }, |
| { |
| "epoch": 76.03774475112054, |
| "grad_norm": 1.0278379917144775, |
| "learning_rate": 1.1034195407127451e-05, |
| "loss": 2.5258, |
| "step": 80600 |
| }, |
| { |
| "epoch": 76.13210662892192, |
| "grad_norm": 1.0214896202087402, |
| "learning_rate": 1.0925065278175173e-05, |
| "loss": 2.5091, |
| "step": 80700 |
| }, |
| { |
| "epoch": 76.22646850672328, |
| "grad_norm": 1.0401657819747925, |
| "learning_rate": 1.0816411264971193e-05, |
| "loss": 2.5178, |
| "step": 80800 |
| }, |
| { |
| "epoch": 76.32083038452465, |
| "grad_norm": 1.0354347229003906, |
| "learning_rate": 1.0708234691430364e-05, |
| "loss": 2.5193, |
| "step": 80900 |
| }, |
| { |
| "epoch": 76.41519226232602, |
| "grad_norm": 1.0436785221099854, |
| "learning_rate": 1.060053687565008e-05, |
| "loss": 2.5209, |
| "step": 81000 |
| }, |
| { |
| "epoch": 76.41519226232602, |
| "eval_loss": 3.522312641143799, |
| "eval_runtime": 89.9194, |
| "eval_samples_per_second": 167.606, |
| "eval_steps_per_second": 5.238, |
| "step": 81000 |
| }, |
| { |
| "epoch": 76.50955414012739, |
| "grad_norm": 1.0483065843582153, |
| "learning_rate": 1.0493319129894263e-05, |
| "loss": 2.5299, |
| "step": 81100 |
| }, |
| { |
| "epoch": 76.60391601792875, |
| "grad_norm": 1.0402249097824097, |
| "learning_rate": 1.0386582760577296e-05, |
| "loss": 2.5216, |
| "step": 81200 |
| }, |
| { |
| "epoch": 76.69827789573013, |
| "grad_norm": 1.0522435903549194, |
| "learning_rate": 1.0280329068248162e-05, |
| "loss": 2.5262, |
| "step": 81300 |
| }, |
| { |
| "epoch": 76.79263977353149, |
| "grad_norm": 1.047003984451294, |
| "learning_rate": 1.0174559347574564e-05, |
| "loss": 2.5315, |
| "step": 81400 |
| }, |
| { |
| "epoch": 76.88700165133287, |
| "grad_norm": 1.0495613813400269, |
| "learning_rate": 1.006927488732718e-05, |
| "loss": 2.5355, |
| "step": 81500 |
| }, |
| { |
| "epoch": 76.98136352913423, |
| "grad_norm": 1.0396546125411987, |
| "learning_rate": 9.964476970363913e-06, |
| "loss": 2.529, |
| "step": 81600 |
| }, |
| { |
| "epoch": 77.0754895022411, |
| "grad_norm": 1.0478557348251343, |
| "learning_rate": 9.860166873614335e-06, |
| "loss": 2.509, |
| "step": 81700 |
| }, |
| { |
| "epoch": 77.16985138004246, |
| "grad_norm": 1.0440701246261597, |
| "learning_rate": 9.756345868064026e-06, |
| "loss": 2.5131, |
| "step": 81800 |
| }, |
| { |
| "epoch": 77.26421325784383, |
| "grad_norm": 1.0421642065048218, |
| "learning_rate": 9.653015218739208e-06, |
| "loss": 2.5191, |
| "step": 81900 |
| }, |
| { |
| "epoch": 77.3585751356452, |
| "grad_norm": 1.0573500394821167, |
| "learning_rate": 9.550176184691206e-06, |
| "loss": 2.5203, |
| "step": 82000 |
| }, |
| { |
| "epoch": 77.3585751356452, |
| "eval_loss": 3.524492025375366, |
| "eval_runtime": 89.9003, |
| "eval_samples_per_second": 167.641, |
| "eval_steps_per_second": 5.239, |
| "step": 82000 |
| }, |
| { |
| "epoch": 77.45293701344657, |
| "grad_norm": 1.055014729499817, |
| "learning_rate": 9.447830018981202e-06, |
| "loss": 2.5145, |
| "step": 82100 |
| }, |
| { |
| "epoch": 77.54729889124793, |
| "grad_norm": 1.0383727550506592, |
| "learning_rate": 9.345977968664948e-06, |
| "loss": 2.5222, |
| "step": 82200 |
| }, |
| { |
| "epoch": 77.64166076904931, |
| "grad_norm": 1.0465056896209717, |
| "learning_rate": 9.24462127477751e-06, |
| "loss": 2.5143, |
| "step": 82300 |
| }, |
| { |
| "epoch": 77.73602264685067, |
| "grad_norm": 1.0561587810516357, |
| "learning_rate": 9.143761172318232e-06, |
| "loss": 2.5266, |
| "step": 82400 |
| }, |
| { |
| "epoch": 77.83038452465205, |
| "grad_norm": 1.055894374847412, |
| "learning_rate": 9.043398890235632e-06, |
| "loss": 2.5318, |
| "step": 82500 |
| }, |
| { |
| "epoch": 77.92474640245341, |
| "grad_norm": 1.0386089086532593, |
| "learning_rate": 8.943535651412422e-06, |
| "loss": 2.5229, |
| "step": 82600 |
| }, |
| { |
| "epoch": 78.01887237556028, |
| "grad_norm": 1.0355311632156372, |
| "learning_rate": 8.84417267265068e-06, |
| "loss": 2.5221, |
| "step": 82700 |
| }, |
| { |
| "epoch": 78.11323425336164, |
| "grad_norm": 1.0528194904327393, |
| "learning_rate": 8.745311164656928e-06, |
| "loss": 2.506, |
| "step": 82800 |
| }, |
| { |
| "epoch": 78.20759613116302, |
| "grad_norm": 1.0460840463638306, |
| "learning_rate": 8.646952332027453e-06, |
| "loss": 2.51, |
| "step": 82900 |
| }, |
| { |
| "epoch": 78.30195800896438, |
| "grad_norm": 1.0365525484085083, |
| "learning_rate": 8.549097373233578e-06, |
| "loss": 2.509, |
| "step": 83000 |
| }, |
| { |
| "epoch": 78.30195800896438, |
| "eval_loss": 3.5256645679473877, |
| "eval_runtime": 89.9438, |
| "eval_samples_per_second": 167.56, |
| "eval_steps_per_second": 5.237, |
| "step": 83000 |
| }, |
| { |
| "epoch": 78.39631988676575, |
| "grad_norm": 1.0634748935699463, |
| "learning_rate": 8.451747480607119e-06, |
| "loss": 2.5174, |
| "step": 83100 |
| }, |
| { |
| "epoch": 78.49068176456711, |
| "grad_norm": 1.0390421152114868, |
| "learning_rate": 8.354903840325773e-06, |
| "loss": 2.5152, |
| "step": 83200 |
| }, |
| { |
| "epoch": 78.58504364236849, |
| "grad_norm": 1.0396511554718018, |
| "learning_rate": 8.258567632398734e-06, |
| "loss": 2.5143, |
| "step": 83300 |
| }, |
| { |
| "epoch": 78.67940552016985, |
| "grad_norm": 1.0683192014694214, |
| "learning_rate": 8.16274003065231e-06, |
| "loss": 2.5149, |
| "step": 83400 |
| }, |
| { |
| "epoch": 78.77376739797121, |
| "grad_norm": 1.0451536178588867, |
| "learning_rate": 8.067422202715569e-06, |
| "loss": 2.5197, |
| "step": 83500 |
| }, |
| { |
| "epoch": 78.86812927577259, |
| "grad_norm": 1.0467932224273682, |
| "learning_rate": 7.972615310006181e-06, |
| "loss": 2.5279, |
| "step": 83600 |
| }, |
| { |
| "epoch": 78.96249115357395, |
| "grad_norm": 1.0642589330673218, |
| "learning_rate": 7.878320507716197e-06, |
| "loss": 2.5273, |
| "step": 83700 |
| }, |
| { |
| "epoch": 79.05661712668082, |
| "grad_norm": 1.061995029449463, |
| "learning_rate": 7.78453894479803e-06, |
| "loss": 2.5137, |
| "step": 83800 |
| }, |
| { |
| "epoch": 79.1509790044822, |
| "grad_norm": 1.0708853006362915, |
| "learning_rate": 7.691271763950453e-06, |
| "loss": 2.5001, |
| "step": 83900 |
| }, |
| { |
| "epoch": 79.24534088228356, |
| "grad_norm": 1.0408705472946167, |
| "learning_rate": 7.5985201016045945e-06, |
| "loss": 2.506, |
| "step": 84000 |
| }, |
| { |
| "epoch": 79.24534088228356, |
| "eval_loss": 3.5260493755340576, |
| "eval_runtime": 89.9073, |
| "eval_samples_per_second": 167.628, |
| "eval_steps_per_second": 5.239, |
| "step": 84000 |
| }, |
| { |
| "epoch": 79.33970276008493, |
| "grad_norm": 1.0534332990646362, |
| "learning_rate": 7.50628508791022e-06, |
| "loss": 2.513, |
| "step": 84100 |
| }, |
| { |
| "epoch": 79.4340646378863, |
| "grad_norm": 1.0592793226242065, |
| "learning_rate": 7.414567846721837e-06, |
| "loss": 2.5123, |
| "step": 84200 |
| }, |
| { |
| "epoch": 79.52842651568766, |
| "grad_norm": 1.066508173942566, |
| "learning_rate": 7.323369495585114e-06, |
| "loss": 2.5135, |
| "step": 84300 |
| }, |
| { |
| "epoch": 79.62278839348903, |
| "grad_norm": 1.0478541851043701, |
| "learning_rate": 7.232691145723147e-06, |
| "loss": 2.5178, |
| "step": 84400 |
| }, |
| { |
| "epoch": 79.7171502712904, |
| "grad_norm": 1.0479909181594849, |
| "learning_rate": 7.142533902023046e-06, |
| "loss": 2.5168, |
| "step": 84500 |
| }, |
| { |
| "epoch": 79.81151214909177, |
| "grad_norm": 1.0606141090393066, |
| "learning_rate": 7.052898863022344e-06, |
| "loss": 2.52, |
| "step": 84600 |
| }, |
| { |
| "epoch": 79.90587402689313, |
| "grad_norm": 1.0583046674728394, |
| "learning_rate": 6.963787120895726e-06, |
| "loss": 2.5145, |
| "step": 84700 |
| }, |
| { |
| "epoch": 80.0, |
| "grad_norm": 1.3051832914352417, |
| "learning_rate": 6.875199761441642e-06, |
| "loss": 2.5214, |
| "step": 84800 |
| }, |
| { |
| "epoch": 80.09436187780136, |
| "grad_norm": 1.051279902458191, |
| "learning_rate": 6.787137864069093e-06, |
| "loss": 2.5031, |
| "step": 84900 |
| }, |
| { |
| "epoch": 80.18872375560274, |
| "grad_norm": 1.0184589624404907, |
| "learning_rate": 6.699602501784535e-06, |
| "loss": 2.4947, |
| "step": 85000 |
| }, |
| { |
| "epoch": 80.18872375560274, |
| "eval_loss": 3.5270843505859375, |
| "eval_runtime": 89.9406, |
| "eval_samples_per_second": 167.566, |
| "eval_steps_per_second": 5.237, |
| "step": 85000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 100000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 95, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.8419166568448e+18, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |