| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 862, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011614401858304297, |
| "grad_norm": 1.2486553192138672, |
| "learning_rate": 1.111111111111111e-06, |
| "loss": 1.3051, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.023228803716608595, |
| "grad_norm": 1.02664053440094, |
| "learning_rate": 2.4999999999999998e-06, |
| "loss": 1.3323, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03484320557491289, |
| "grad_norm": 0.8225151896476746, |
| "learning_rate": 3.888888888888889e-06, |
| "loss": 1.3047, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04645760743321719, |
| "grad_norm": 0.7819671630859375, |
| "learning_rate": 5.277777777777778e-06, |
| "loss": 1.2887, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05807200929152149, |
| "grad_norm": 0.6035093069076538, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.3248, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06968641114982578, |
| "grad_norm": 0.5274394750595093, |
| "learning_rate": 8.055555555555557e-06, |
| "loss": 1.2981, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08130081300813008, |
| "grad_norm": 0.4473659098148346, |
| "learning_rate": 9.444444444444445e-06, |
| "loss": 1.2253, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09291521486643438, |
| "grad_norm": 0.5344942808151245, |
| "learning_rate": 1.0833333333333334e-05, |
| "loss": 1.2101, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10452961672473868, |
| "grad_norm": 0.42955347895622253, |
| "learning_rate": 1.2222222222222222e-05, |
| "loss": 1.2136, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.11614401858304298, |
| "grad_norm": 0.4606517553329468, |
| "learning_rate": 1.3611111111111111e-05, |
| "loss": 1.2501, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12775842044134728, |
| "grad_norm": 0.43243467807769775, |
| "learning_rate": 1.5e-05, |
| "loss": 1.2287, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.13937282229965156, |
| "grad_norm": 0.45852982997894287, |
| "learning_rate": 1.638888888888889e-05, |
| "loss": 1.2272, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15098722415795587, |
| "grad_norm": 0.422735333442688, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 1.1591, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.16260162601626016, |
| "grad_norm": 0.48696285486221313, |
| "learning_rate": 1.9166666666666667e-05, |
| "loss": 1.2238, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.17421602787456447, |
| "grad_norm": 0.6145470142364502, |
| "learning_rate": 2.0555555555555558e-05, |
| "loss": 1.2375, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.18583042973286876, |
| "grad_norm": 0.5468384623527527, |
| "learning_rate": 2.1944444444444445e-05, |
| "loss": 1.1484, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.19744483159117304, |
| "grad_norm": 0.4397794008255005, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 1.1567, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.20905923344947736, |
| "grad_norm": 0.4396965801715851, |
| "learning_rate": 2.4722222222222223e-05, |
| "loss": 1.1665, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.22067363530778164, |
| "grad_norm": 0.5894995927810669, |
| "learning_rate": 2.611111111111111e-05, |
| "loss": 1.1519, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.23228803716608595, |
| "grad_norm": 0.4796619117259979, |
| "learning_rate": 2.75e-05, |
| "loss": 1.1242, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.24390243902439024, |
| "grad_norm": 0.7274127006530762, |
| "learning_rate": 2.8888888888888888e-05, |
| "loss": 1.1197, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.25551684088269455, |
| "grad_norm": 0.5538251996040344, |
| "learning_rate": 2.999998233452831e-05, |
| "loss": 1.09, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.26713124274099886, |
| "grad_norm": 0.4963814616203308, |
| "learning_rate": 2.999936404738799e-05, |
| "loss": 1.14, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2787456445993031, |
| "grad_norm": 0.5045656561851501, |
| "learning_rate": 2.9997862528271754e-05, |
| "loss": 1.0846, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.29036004645760743, |
| "grad_norm": 0.5694212913513184, |
| "learning_rate": 2.999547786559598e-05, |
| "loss": 1.0566, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.30197444831591175, |
| "grad_norm": 0.5698084831237793, |
| "learning_rate": 2.9992210199780657e-05, |
| "loss": 1.0812, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.313588850174216, |
| "grad_norm": 0.5170321464538574, |
| "learning_rate": 2.9988059723241064e-05, |
| "loss": 1.0414, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.3252032520325203, |
| "grad_norm": 0.5222340226173401, |
| "learning_rate": 2.9983026680376472e-05, |
| "loss": 1.0631, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.33681765389082463, |
| "grad_norm": 0.5456163883209229, |
| "learning_rate": 2.997711136755574e-05, |
| "loss": 1.0866, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.34843205574912894, |
| "grad_norm": 0.5397904515266418, |
| "learning_rate": 2.9970314133099855e-05, |
| "loss": 1.1579, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3600464576074332, |
| "grad_norm": 0.6353335380554199, |
| "learning_rate": 2.9962635377261457e-05, |
| "loss": 0.9954, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.3716608594657375, |
| "grad_norm": 0.6079632639884949, |
| "learning_rate": 2.9954075552201222e-05, |
| "loss": 1.0647, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3832752613240418, |
| "grad_norm": 0.6199746131896973, |
| "learning_rate": 2.994463516196126e-05, |
| "loss": 1.0637, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.3948896631823461, |
| "grad_norm": 0.6312094926834106, |
| "learning_rate": 2.9934314762435444e-05, |
| "loss": 1.0242, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4065040650406504, |
| "grad_norm": 0.677851140499115, |
| "learning_rate": 2.9923114961336672e-05, |
| "loss": 0.9563, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.4181184668989547, |
| "grad_norm": 0.6415532827377319, |
| "learning_rate": 2.9911036418161058e-05, |
| "loss": 0.9728, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.429732868757259, |
| "grad_norm": 0.7620236873626709, |
| "learning_rate": 2.9898079844149132e-05, |
| "loss": 1.0066, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4413472706155633, |
| "grad_norm": 0.6720292568206787, |
| "learning_rate": 2.9884246002243936e-05, |
| "loss": 1.0059, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4529616724738676, |
| "grad_norm": 0.7387037873268127, |
| "learning_rate": 2.9869535707046104e-05, |
| "loss": 0.9314, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4645760743321719, |
| "grad_norm": 0.6071408987045288, |
| "learning_rate": 2.985394982476591e-05, |
| "loss": 0.9805, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 0.7038552165031433, |
| "learning_rate": 2.9837489273172232e-05, |
| "loss": 0.9766, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.4878048780487805, |
| "grad_norm": 0.7892246842384338, |
| "learning_rate": 2.9820155021538533e-05, |
| "loss": 0.9558, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.4994192799070848, |
| "grad_norm": 0.7563413381576538, |
| "learning_rate": 2.980194809058577e-05, |
| "loss": 0.9455, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.5110336817653891, |
| "grad_norm": 0.7255906462669373, |
| "learning_rate": 2.9782869552422316e-05, |
| "loss": 0.9183, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5226480836236934, |
| "grad_norm": 0.7455199956893921, |
| "learning_rate": 2.9762920530480788e-05, |
| "loss": 0.9363, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5342624854819977, |
| "grad_norm": 0.7163615822792053, |
| "learning_rate": 2.974210219945193e-05, |
| "loss": 0.9436, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5458768873403019, |
| "grad_norm": 0.6869027018547058, |
| "learning_rate": 2.9720415785215428e-05, |
| "loss": 0.8932, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5574912891986062, |
| "grad_norm": 0.776175320148468, |
| "learning_rate": 2.969786256476772e-05, |
| "loss": 0.9234, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5691056910569106, |
| "grad_norm": 0.687179446220398, |
| "learning_rate": 2.9674443866146807e-05, |
| "loss": 0.9165, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5807200929152149, |
| "grad_norm": 0.8298389911651611, |
| "learning_rate": 2.9650161068354054e-05, |
| "loss": 0.8863, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5923344947735192, |
| "grad_norm": 0.7793598175048828, |
| "learning_rate": 2.9625015601272974e-05, |
| "loss": 0.8693, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6039488966318235, |
| "grad_norm": 0.9237378835678101, |
| "learning_rate": 2.9599008945585066e-05, |
| "loss": 0.8435, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.6155632984901278, |
| "grad_norm": 0.8277608752250671, |
| "learning_rate": 2.9572142632682562e-05, |
| "loss": 0.8683, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.627177700348432, |
| "grad_norm": 0.7906745076179504, |
| "learning_rate": 2.954441824457832e-05, |
| "loss": 0.9144, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6387921022067363, |
| "grad_norm": 0.8107167482376099, |
| "learning_rate": 2.951583741381263e-05, |
| "loss": 0.8696, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.6504065040650406, |
| "grad_norm": 0.734391987323761, |
| "learning_rate": 2.948640182335708e-05, |
| "loss": 0.871, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.662020905923345, |
| "grad_norm": 0.8837676048278809, |
| "learning_rate": 2.9456113206515475e-05, |
| "loss": 0.8883, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6736353077816493, |
| "grad_norm": 0.8748791217803955, |
| "learning_rate": 2.942497334682176e-05, |
| "loss": 0.7801, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6852497096399536, |
| "grad_norm": 0.8546818494796753, |
| "learning_rate": 2.9392984077934987e-05, |
| "loss": 0.8724, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.6968641114982579, |
| "grad_norm": 0.7864014506340027, |
| "learning_rate": 2.9360147283531373e-05, |
| "loss": 0.8214, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7084785133565621, |
| "grad_norm": 0.7984604835510254, |
| "learning_rate": 2.9326464897193343e-05, |
| "loss": 0.836, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.7200929152148664, |
| "grad_norm": 0.9750844836235046, |
| "learning_rate": 2.9291938902295695e-05, |
| "loss": 0.7873, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.7317073170731707, |
| "grad_norm": 0.8499948382377625, |
| "learning_rate": 2.925657133188881e-05, |
| "loss": 0.8412, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.743321718931475, |
| "grad_norm": 0.86067134141922, |
| "learning_rate": 2.9220364268578922e-05, |
| "loss": 0.8256, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.7549361207897793, |
| "grad_norm": 0.8807356357574463, |
| "learning_rate": 2.918331984440549e-05, |
| "loss": 0.8364, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.7665505226480837, |
| "grad_norm": 0.8951946496963501, |
| "learning_rate": 2.9145440240715657e-05, |
| "loss": 0.8415, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.778164924506388, |
| "grad_norm": 0.8705118894577026, |
| "learning_rate": 2.9106727688035814e-05, |
| "loss": 0.7762, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.7897793263646922, |
| "grad_norm": 0.9147759675979614, |
| "learning_rate": 2.9067184465940225e-05, |
| "loss": 0.7983, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.8013937282229965, |
| "grad_norm": 0.8624527454376221, |
| "learning_rate": 2.9026812902916834e-05, |
| "loss": 0.7434, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.8130081300813008, |
| "grad_norm": 0.9535494446754456, |
| "learning_rate": 2.898561537623011e-05, |
| "loss": 0.8049, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8246225319396051, |
| "grad_norm": 0.836469292640686, |
| "learning_rate": 2.8943594311781104e-05, |
| "loss": 0.777, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.8362369337979094, |
| "grad_norm": 0.9156211018562317, |
| "learning_rate": 2.8900752183964573e-05, |
| "loss": 0.7746, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.8478513356562137, |
| "grad_norm": 0.9176764488220215, |
| "learning_rate": 2.8857091515523287e-05, |
| "loss": 0.7827, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.859465737514518, |
| "grad_norm": 0.8853964805603027, |
| "learning_rate": 2.8812614877399476e-05, |
| "loss": 0.754, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8710801393728222, |
| "grad_norm": 0.8558771014213562, |
| "learning_rate": 2.876732488858344e-05, |
| "loss": 0.7039, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.8826945412311266, |
| "grad_norm": 0.8176305294036865, |
| "learning_rate": 2.8721224215959335e-05, |
| "loss": 0.7892, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8943089430894309, |
| "grad_norm": 0.9530948996543884, |
| "learning_rate": 2.8674315574148126e-05, |
| "loss": 0.7049, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.9059233449477352, |
| "grad_norm": 0.9136149287223816, |
| "learning_rate": 2.862660172534776e-05, |
| "loss": 0.7614, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.9175377468060395, |
| "grad_norm": 0.9666270017623901, |
| "learning_rate": 2.8578085479170478e-05, |
| "loss": 0.7265, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.9291521486643438, |
| "grad_norm": 0.9841073751449585, |
| "learning_rate": 2.85287696924774e-05, |
| "loss": 0.6903, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.9407665505226481, |
| "grad_norm": 1.1007276773452759, |
| "learning_rate": 2.8478657269210294e-05, |
| "loss": 0.7114, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.9525352120399475, |
| "learning_rate": 2.8427751160220573e-05, |
| "loss": 0.7115, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.9639953542392566, |
| "grad_norm": 0.8143008351325989, |
| "learning_rate": 2.8376054363095545e-05, |
| "loss": 0.6784, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.975609756097561, |
| "grad_norm": 0.9916719198226929, |
| "learning_rate": 2.8323569921981885e-05, |
| "loss": 0.7366, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9872241579558653, |
| "grad_norm": 0.9247941374778748, |
| "learning_rate": 2.82703009274064e-05, |
| "loss": 0.6974, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.9988385598141696, |
| "grad_norm": 0.9906225800514221, |
| "learning_rate": 2.8216250516094027e-05, |
| "loss": 0.6673, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.0092915214866434, |
| "grad_norm": 0.9208662509918213, |
| "learning_rate": 2.816142187078315e-05, |
| "loss": 0.603, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.0209059233449478, |
| "grad_norm": 1.0540852546691895, |
| "learning_rate": 2.8105818220038167e-05, |
| "loss": 0.6062, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.032520325203252, |
| "grad_norm": 0.9971239566802979, |
| "learning_rate": 2.804944283805938e-05, |
| "loss": 0.5986, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.0441347270615564, |
| "grad_norm": 1.0977699756622314, |
| "learning_rate": 2.7992299044490192e-05, |
| "loss": 0.5556, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.0557491289198606, |
| "grad_norm": 1.0428180694580078, |
| "learning_rate": 2.793439020422165e-05, |
| "loss": 0.599, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.0673635307781648, |
| "grad_norm": 1.2783362865447998, |
| "learning_rate": 2.787571972719429e-05, |
| "loss": 0.5589, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.0789779326364692, |
| "grad_norm": 1.0532207489013672, |
| "learning_rate": 2.781629106819733e-05, |
| "loss": 0.5534, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.0905923344947734, |
| "grad_norm": 1.0525319576263428, |
| "learning_rate": 2.775610772666527e-05, |
| "loss": 0.577, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.1022067363530779, |
| "grad_norm": 0.9602555632591248, |
| "learning_rate": 2.7695173246471803e-05, |
| "loss": 0.5625, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.113821138211382, |
| "grad_norm": 1.0547131299972534, |
| "learning_rate": 2.763349121572114e-05, |
| "loss": 0.6173, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.1254355400696865, |
| "grad_norm": 0.963524341583252, |
| "learning_rate": 2.7571065266536737e-05, |
| "loss": 0.62, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.1370499419279907, |
| "grad_norm": 1.066751480102539, |
| "learning_rate": 2.7507899074847394e-05, |
| "loss": 0.5829, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.1486643437862951, |
| "grad_norm": 0.9866227507591248, |
| "learning_rate": 2.7443996360170836e-05, |
| "loss": 0.5729, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.1602787456445993, |
| "grad_norm": 1.037983775138855, |
| "learning_rate": 2.7379360885394664e-05, |
| "loss": 0.5955, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1718931475029035, |
| "grad_norm": 1.2030766010284424, |
| "learning_rate": 2.731399645655477e-05, |
| "loss": 0.6062, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.183507549361208, |
| "grad_norm": 1.0329967737197876, |
| "learning_rate": 2.7247906922611254e-05, |
| "loss": 0.5871, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.1951219512195121, |
| "grad_norm": 1.1776123046875, |
| "learning_rate": 2.7181096175221757e-05, |
| "loss": 0.5755, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.2067363530778166, |
| "grad_norm": 1.0537413358688354, |
| "learning_rate": 2.7113568148512296e-05, |
| "loss": 0.545, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.2183507549361208, |
| "grad_norm": 0.9902351498603821, |
| "learning_rate": 2.704532681884562e-05, |
| "loss": 0.5564, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.229965156794425, |
| "grad_norm": 1.0097850561141968, |
| "learning_rate": 2.697637620458706e-05, |
| "loss": 0.5056, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.2415795586527294, |
| "grad_norm": 0.9843356013298035, |
| "learning_rate": 2.690672036586791e-05, |
| "loss": 0.5348, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.2531939605110336, |
| "grad_norm": 0.9540740251541138, |
| "learning_rate": 2.6836363404346324e-05, |
| "loss": 0.5411, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.264808362369338, |
| "grad_norm": 1.0271143913269043, |
| "learning_rate": 2.6765309462965845e-05, |
| "loss": 0.5065, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.2764227642276422, |
| "grad_norm": 1.077792763710022, |
| "learning_rate": 2.669356272571138e-05, |
| "loss": 0.5307, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.2880371660859466, |
| "grad_norm": 1.0701345205307007, |
| "learning_rate": 2.6621127417362886e-05, |
| "loss": 0.5587, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.2996515679442509, |
| "grad_norm": 1.0792369842529297, |
| "learning_rate": 2.6548007803246575e-05, |
| "loss": 0.5264, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.3112659698025553, |
| "grad_norm": 0.8799574375152588, |
| "learning_rate": 2.647420818898373e-05, |
| "loss": 0.4964, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.3228803716608595, |
| "grad_norm": 1.1636409759521484, |
| "learning_rate": 2.6399732920237212e-05, |
| "loss": 0.5576, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.3344947735191637, |
| "grad_norm": 1.055418610572815, |
| "learning_rate": 2.6324586382455525e-05, |
| "loss": 0.5149, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.346109175377468, |
| "grad_norm": 1.0145419836044312, |
| "learning_rate": 2.624877300061462e-05, |
| "loss": 0.4917, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.3577235772357723, |
| "grad_norm": 0.9634491801261902, |
| "learning_rate": 2.6172297238957297e-05, |
| "loss": 0.515, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.3693379790940767, |
| "grad_norm": 0.9772712588310242, |
| "learning_rate": 2.6095163600730355e-05, |
| "loss": 0.5012, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.380952380952381, |
| "grad_norm": 0.9910968542098999, |
| "learning_rate": 2.6017376627919405e-05, |
| "loss": 0.4839, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.3925667828106851, |
| "grad_norm": 1.103830099105835, |
| "learning_rate": 2.5938940900981424e-05, |
| "loss": 0.562, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.4041811846689896, |
| "grad_norm": 1.1236923933029175, |
| "learning_rate": 2.5859861038575035e-05, |
| "loss": 0.493, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.415795586527294, |
| "grad_norm": 0.9853049516677856, |
| "learning_rate": 2.5780141697288537e-05, |
| "loss": 0.4578, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.4274099883855982, |
| "grad_norm": 1.02285635471344, |
| "learning_rate": 2.5699787571365704e-05, |
| "loss": 0.5141, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.4390243902439024, |
| "grad_norm": 1.1389844417572021, |
| "learning_rate": 2.5618803392429373e-05, |
| "loss": 0.4684, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.4506387921022068, |
| "grad_norm": 1.1375923156738281, |
| "learning_rate": 2.5537193929202815e-05, |
| "loss": 0.4761, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.462253193960511, |
| "grad_norm": 1.046151876449585, |
| "learning_rate": 2.5454963987228926e-05, |
| "loss": 0.5179, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.4738675958188154, |
| "grad_norm": 1.1057745218276978, |
| "learning_rate": 2.5372118408587284e-05, |
| "loss": 0.489, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.4854819976771196, |
| "grad_norm": 1.032041311264038, |
| "learning_rate": 2.5288662071608975e-05, |
| "loss": 0.4871, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.4970963995354238, |
| "grad_norm": 1.0814687013626099, |
| "learning_rate": 2.520459989058939e-05, |
| "loss": 0.5172, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.5087108013937283, |
| "grad_norm": 1.0195306539535522, |
| "learning_rate": 2.5119936815498797e-05, |
| "loss": 0.5218, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.5203252032520327, |
| "grad_norm": 0.9469033479690552, |
| "learning_rate": 2.503467783169091e-05, |
| "loss": 0.4883, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.5319396051103369, |
| "grad_norm": 1.1030808687210083, |
| "learning_rate": 2.4948827959609285e-05, |
| "loss": 0.501, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.543554006968641, |
| "grad_norm": 1.0832819938659668, |
| "learning_rate": 2.4862392254491736e-05, |
| "loss": 0.4813, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.5551684088269453, |
| "grad_norm": 1.1702861785888672, |
| "learning_rate": 2.477537580607261e-05, |
| "loss": 0.4737, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.5667828106852497, |
| "grad_norm": 1.0441266298294067, |
| "learning_rate": 2.4687783738283144e-05, |
| "loss": 0.454, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.5783972125435541, |
| "grad_norm": 1.1927803754806519, |
| "learning_rate": 2.4599621208949674e-05, |
| "loss": 0.4328, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.5900116144018583, |
| "grad_norm": 1.1129752397537231, |
| "learning_rate": 2.4510893409489967e-05, |
| "loss": 0.4024, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.6016260162601625, |
| "grad_norm": 1.0736514329910278, |
| "learning_rate": 2.4421605564607514e-05, |
| "loss": 0.4499, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.6132404181184667, |
| "grad_norm": 1.0657414197921753, |
| "learning_rate": 2.4331762931983866e-05, |
| "loss": 0.4649, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.6248548199767712, |
| "grad_norm": 1.0618281364440918, |
| "learning_rate": 2.4241370801969045e-05, |
| "loss": 0.4337, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.6364692218350756, |
| "grad_norm": 1.1562613248825073, |
| "learning_rate": 2.415043449727003e-05, |
| "loss": 0.4429, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.6480836236933798, |
| "grad_norm": 1.1286054849624634, |
| "learning_rate": 2.4058959372637304e-05, |
| "loss": 0.4714, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.659698025551684, |
| "grad_norm": 1.043707251548767, |
| "learning_rate": 2.396695081454959e-05, |
| "loss": 0.4549, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.6713124274099884, |
| "grad_norm": 1.001025676727295, |
| "learning_rate": 2.387441424089662e-05, |
| "loss": 0.4843, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.6829268292682928, |
| "grad_norm": 1.0033694505691528, |
| "learning_rate": 2.378135510066013e-05, |
| "loss": 0.4203, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.694541231126597, |
| "grad_norm": 1.041979432106018, |
| "learning_rate": 2.3687778873593e-05, |
| "loss": 0.4609, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.7061556329849012, |
| "grad_norm": 1.0614110231399536, |
| "learning_rate": 2.3593691069896582e-05, |
| "loss": 0.4328, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.7177700348432055, |
| "grad_norm": 1.1184892654418945, |
| "learning_rate": 2.3499097229896213e-05, |
| "loss": 0.4323, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.7293844367015099, |
| "grad_norm": 1.0571225881576538, |
| "learning_rate": 2.340400292371499e-05, |
| "loss": 0.3688, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.7409988385598143, |
| "grad_norm": 0.9831321239471436, |
| "learning_rate": 2.3308413750945788e-05, |
| "loss": 0.4091, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.7526132404181185, |
| "grad_norm": 1.334681749343872, |
| "learning_rate": 2.3212335340321518e-05, |
| "loss": 0.4363, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.7642276422764227, |
| "grad_norm": 1.0677987337112427, |
| "learning_rate": 2.3115773349383658e-05, |
| "loss": 0.4202, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.775842044134727, |
| "grad_norm": 1.1645525693893433, |
| "learning_rate": 2.3018733464149156e-05, |
| "loss": 0.4515, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.7874564459930313, |
| "grad_norm": 1.0856503248214722, |
| "learning_rate": 2.292122139877558e-05, |
| "loss": 0.401, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.7990708478513358, |
| "grad_norm": 1.0238492488861084, |
| "learning_rate": 2.2823242895224643e-05, |
| "loss": 0.3955, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.81068524970964, |
| "grad_norm": 1.1932063102722168, |
| "learning_rate": 2.2724803722924106e-05, |
| "loss": 0.4154, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.8222996515679442, |
| "grad_norm": 1.0874030590057373, |
| "learning_rate": 2.2625909678428038e-05, |
| "loss": 0.4082, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.8339140534262486, |
| "grad_norm": 1.109732747077942, |
| "learning_rate": 2.2526566585075485e-05, |
| "loss": 0.3826, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.845528455284553, |
| "grad_norm": 1.0591766834259033, |
| "learning_rate": 2.2426780292647568e-05, |
| "loss": 0.388, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.8571428571428572, |
| "grad_norm": 1.1858235597610474, |
| "learning_rate": 2.2326556677023017e-05, |
| "loss": 0.4026, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.8687572590011614, |
| "grad_norm": 1.0293341875076294, |
| "learning_rate": 2.2225901639832188e-05, |
| "loss": 0.419, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.8803716608594656, |
| "grad_norm": 1.1112502813339233, |
| "learning_rate": 2.2124821108109515e-05, |
| "loss": 0.4005, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.89198606271777, |
| "grad_norm": 0.9960114359855652, |
| "learning_rate": 2.2023321033944544e-05, |
| "loss": 0.328, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.9036004645760745, |
| "grad_norm": 1.1106208562850952, |
| "learning_rate": 2.1921407394131406e-05, |
| "loss": 0.3667, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.9152148664343787, |
| "grad_norm": 1.0423858165740967, |
| "learning_rate": 2.1819086189816893e-05, |
| "loss": 0.3773, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.9268292682926829, |
| "grad_norm": 1.13150155544281, |
| "learning_rate": 2.171636344614708e-05, |
| "loss": 0.3491, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.938443670150987, |
| "grad_norm": 1.0611273050308228, |
| "learning_rate": 2.1613245211912554e-05, |
| "loss": 0.3323, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.9500580720092915, |
| "grad_norm": 0.9943997263908386, |
| "learning_rate": 2.1509737559192188e-05, |
| "loss": 0.3741, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.961672473867596, |
| "grad_norm": 0.9484174847602844, |
| "learning_rate": 2.140584658299564e-05, |
| "loss": 0.3415, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.9732868757259001, |
| "grad_norm": 1.0280919075012207, |
| "learning_rate": 2.1301578400904424e-05, |
| "loss": 0.3724, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.9849012775842043, |
| "grad_norm": 1.1084401607513428, |
| "learning_rate": 2.119693915271168e-05, |
| "loss": 0.3867, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.9965156794425087, |
| "grad_norm": 1.1195417642593384, |
| "learning_rate": 2.1091935000060637e-05, |
| "loss": 0.3641, |
| "step": 860 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2155, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3574620943199764e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|