| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.4686278543509568, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016457519029006377, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 4e-05, | |
| "loss": 3.4409, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.032915038058012755, | |
| "grad_norm": 1.125, | |
| "learning_rate": 8e-05, | |
| "loss": 2.834, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04937255708701913, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 0.00012, | |
| "loss": 2.0422, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06583007611602551, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 0.00016, | |
| "loss": 1.6617, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08228759514503188, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 0.0002, | |
| "loss": 1.5035, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09874511417403826, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 0.0001988716502115656, | |
| "loss": 1.4449, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11520263320304464, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 0.00019774330042313118, | |
| "loss": 1.3653, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.13166015223205102, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 0.00019661495063469676, | |
| "loss": 1.3634, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1481176712610574, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 0.00019548660084626237, | |
| "loss": 1.3116, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16457519029006376, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 0.00019435825105782795, | |
| "loss": 1.2961, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18103270931907015, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.00019322990126939354, | |
| "loss": 1.2868, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.19749022834807653, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 0.0001921015514809591, | |
| "loss": 1.2902, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.21394774737708291, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 0.00019097320169252468, | |
| "loss": 1.209, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23040526640608927, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 0.00018984485190409026, | |
| "loss": 1.2912, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.24686278543509566, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 0.00018871650211565587, | |
| "loss": 1.2733, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26332030446410204, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 0.00018758815232722145, | |
| "loss": 1.1895, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2797778234931084, | |
| "grad_norm": 1.25, | |
| "learning_rate": 0.00018645980253878704, | |
| "loss": 1.2259, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.2962353425221148, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 0.00018533145275035262, | |
| "loss": 1.2636, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3126928615511212, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 0.0001842031029619182, | |
| "loss": 1.2748, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3291503805801275, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 0.0001830747531734838, | |
| "loss": 1.1335, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3456078996091339, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 0.00018194640338504937, | |
| "loss": 1.1985, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.3620654186381403, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 0.00018081805359661496, | |
| "loss": 1.2015, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.3785229376671467, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 0.00017968970380818057, | |
| "loss": 1.1548, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.39498045669615306, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.00017856135401974612, | |
| "loss": 1.1249, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.41143797572515944, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 0.0001774330042313117, | |
| "loss": 1.1734, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.42789549475416583, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 0.0001763046544428773, | |
| "loss": 1.1772, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.4443530137831722, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 0.00017517630465444287, | |
| "loss": 1.1092, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.46081053281217854, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 0.00017404795486600846, | |
| "loss": 1.1306, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4772680518411849, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.00017291960507757407, | |
| "loss": 1.1457, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.4937255708701913, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 0.00017179125528913965, | |
| "loss": 1.1501, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5101830898991977, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 0.00017066290550070523, | |
| "loss": 1.1539, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.5266406089282041, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 0.00016953455571227082, | |
| "loss": 1.1242, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5430981279572105, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 0.0001684062059238364, | |
| "loss": 1.1551, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5595556469862168, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 0.00016727785613540198, | |
| "loss": 1.1252, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5760131660152232, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 0.00016614950634696757, | |
| "loss": 1.1023, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5924706850442296, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 0.00016502115655853315, | |
| "loss": 1.1136, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.608928204073236, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 0.00016389280677009873, | |
| "loss": 1.1363, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6253857231022424, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 0.00016276445698166432, | |
| "loss": 1.0763, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6418432421312487, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 0.0001616361071932299, | |
| "loss": 1.1558, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.658300761160255, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 0.00016050775740479548, | |
| "loss": 1.0374, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6747582801892614, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.00015937940761636107, | |
| "loss": 1.1492, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.6912157992182678, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.00015825105782792665, | |
| "loss": 1.1261, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7076733182472742, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 0.00015712270803949226, | |
| "loss": 1.1236, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7241308372762806, | |
| "grad_norm": 1.5, | |
| "learning_rate": 0.00015599435825105785, | |
| "loss": 1.0795, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.740588356305287, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 0.00015486600846262343, | |
| "loss": 1.0866, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7570458753342934, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 0.000153737658674189, | |
| "loss": 1.0635, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.7735033943632997, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 0.0001526093088857546, | |
| "loss": 1.1106, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.7899609133923061, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 0.00015148095909732018, | |
| "loss": 1.1241, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8064184324213125, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 0.00015035260930888576, | |
| "loss": 1.078, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.8228759514503189, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.00014922425952045135, | |
| "loss": 1.1545, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8393334704793253, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 0.00014809590973201693, | |
| "loss": 1.1102, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8557909895083317, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 0.0001469675599435825, | |
| "loss": 1.0258, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.872248508537338, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 0.0001458392101551481, | |
| "loss": 1.1205, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.8887060275663444, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.00014471086036671368, | |
| "loss": 1.0562, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9051635465953507, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 0.00014358251057827926, | |
| "loss": 1.1234, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9216210656243571, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 0.00014245416078984485, | |
| "loss": 1.0511, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.9380785846533635, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 0.00014132581100141046, | |
| "loss": 1.0451, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.9545361036823699, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 0.00014019746121297604, | |
| "loss": 1.0539, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.9709936227113762, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 0.00013906911142454162, | |
| "loss": 1.0957, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.9874511417403826, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 0.0001379407616361072, | |
| "loss": 1.1191, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.003908660769389, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 0.0001368124118476728, | |
| "loss": 1.0942, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.0203661797983954, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.00013568406205923835, | |
| "loss": 0.9808, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.0368236988274018, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.00013455571227080396, | |
| "loss": 0.9606, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0532812178564082, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.00013342736248236954, | |
| "loss": 0.9658, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.0697387368854145, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 0.00013229901269393512, | |
| "loss": 0.8955, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.086196255914421, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 0.0001311706629055007, | |
| "loss": 0.9449, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.1026537749434273, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 0.0001300423131170663, | |
| "loss": 0.936, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.1191112939724337, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 0.00012891396332863187, | |
| "loss": 0.8886, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.13556881300144, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 0.00012778561354019746, | |
| "loss": 0.9193, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.1520263320304465, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 0.00012665726375176307, | |
| "loss": 0.8895, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1684838510594528, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 0.00012552891396332865, | |
| "loss": 0.9278, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.1849413700884592, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 0.00012440056417489424, | |
| "loss": 0.9096, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.2013988891174656, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 0.00012327221438645982, | |
| "loss": 0.9084, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.217856408146472, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 0.0001221438645980254, | |
| "loss": 0.971, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.2343139271754784, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 0.00012101551480959097, | |
| "loss": 0.9663, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2507714462044848, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 0.00011988716502115656, | |
| "loss": 0.8331, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.2672289652334912, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 0.00011875881523272214, | |
| "loss": 0.9695, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.2836864842624975, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 0.00011763046544428774, | |
| "loss": 0.9584, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.300144003291504, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 0.00011650211565585332, | |
| "loss": 0.975, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.3166015223205103, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.0001153737658674189, | |
| "loss": 0.9561, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.3330590413495167, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 0.00011424541607898449, | |
| "loss": 0.9068, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.3495165603785229, | |
| "grad_norm": 1.375, | |
| "learning_rate": 0.00011311706629055008, | |
| "loss": 0.9088, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.3659740794075292, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 0.00011198871650211567, | |
| "loss": 0.8766, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.3824315984365356, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.00011086036671368125, | |
| "loss": 0.9034, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.398889117465542, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 0.00010973201692524683, | |
| "loss": 0.9532, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.4153466364945484, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 0.00010860366713681243, | |
| "loss": 0.8913, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.4318041555235548, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 0.00010747531734837801, | |
| "loss": 0.886, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.4482616745525612, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 0.00010634696755994358, | |
| "loss": 0.9405, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.4647191935815675, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 0.00010521861777150917, | |
| "loss": 0.9094, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.481176712610574, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 0.00010409026798307475, | |
| "loss": 0.9985, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.4976342316395803, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 0.00010296191819464033, | |
| "loss": 0.9577, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.5140917506685867, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 0.00010183356840620593, | |
| "loss": 0.9125, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.530549269697593, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 0.00010070521861777152, | |
| "loss": 0.9305, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.5470067887265995, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.95768688293371e-05, | |
| "loss": 0.9372, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.5634643077556059, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.844851904090268e-05, | |
| "loss": 0.9658, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.5799218267846122, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 9.732016925246828e-05, | |
| "loss": 0.8637, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.5963793458136186, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 9.619181946403385e-05, | |
| "loss": 0.8523, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.612836864842625, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.506346967559943e-05, | |
| "loss": 0.8689, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.6292943838716314, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.393511988716503e-05, | |
| "loss": 0.8898, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.6457519029006378, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 9.280677009873061e-05, | |
| "loss": 0.9299, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6622094219296442, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 9.16784203102962e-05, | |
| "loss": 0.9165, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.6786669409586503, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 9.055007052186178e-05, | |
| "loss": 0.897, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.6951244599876567, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 8.942172073342738e-05, | |
| "loss": 0.9132, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.711581979016663, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 8.829337094499295e-05, | |
| "loss": 0.8897, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.7280394980456695, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 8.716502115655853e-05, | |
| "loss": 0.9079, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.7444970170746759, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.603667136812413e-05, | |
| "loss": 0.8468, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.7609545361036822, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 8.490832157968971e-05, | |
| "loss": 0.862, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.7774120551326886, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 8.37799717912553e-05, | |
| "loss": 0.901, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.793869574161695, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 8.265162200282088e-05, | |
| "loss": 0.864, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.8103270931907014, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 8.152327221438646e-05, | |
| "loss": 0.8773, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.8267846122197078, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 8.039492242595204e-05, | |
| "loss": 0.8714, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.8432421312487142, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 7.926657263751763e-05, | |
| "loss": 0.9346, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.8596996502777206, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 7.813822284908322e-05, | |
| "loss": 0.877, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.876157169306727, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 7.700987306064881e-05, | |
| "loss": 0.9007, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.8926146883357333, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 7.588152327221439e-05, | |
| "loss": 0.9481, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.9090722073647397, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 7.475317348377997e-05, | |
| "loss": 0.8854, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.925529726393746, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 7.362482369534556e-05, | |
| "loss": 0.9206, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.9419872454227525, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 7.249647390691114e-05, | |
| "loss": 0.8936, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.9584447644517589, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 7.136812411847673e-05, | |
| "loss": 0.8942, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.9749022834807652, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 7.023977433004232e-05, | |
| "loss": 0.881, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.9913598025097716, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 6.91114245416079e-05, | |
| "loss": 0.9034, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.007817321538778, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 6.798307475317349e-05, | |
| "loss": 0.8102, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.0242748405677844, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 6.685472496473907e-05, | |
| "loss": 0.7575, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.040732359596791, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 6.572637517630466e-05, | |
| "loss": 0.7636, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.057189878625797, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 6.459802538787024e-05, | |
| "loss": 0.8552, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.0736473976548035, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 6.346967559943582e-05, | |
| "loss": 0.747, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.09010491668381, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 6.234132581100142e-05, | |
| "loss": 0.7759, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.1065624357128163, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 6.1212976022567e-05, | |
| "loss": 0.7866, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.1230199547418227, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 6.008462623413258e-05, | |
| "loss": 0.7525, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.139477473770829, | |
| "grad_norm": 1.75, | |
| "learning_rate": 5.8956276445698163e-05, | |
| "loss": 0.7364, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.1559349927998355, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 5.7827926657263754e-05, | |
| "loss": 0.7933, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.172392511828842, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 5.669957686882934e-05, | |
| "loss": 0.795, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.1888500308578482, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.557122708039493e-05, | |
| "loss": 0.7651, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.2053075498868546, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.444287729196052e-05, | |
| "loss": 0.7425, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.221765068915861, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 5.331452750352609e-05, | |
| "loss": 0.7951, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.2382225879448674, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.218617771509168e-05, | |
| "loss": 0.7468, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.2546801069738738, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 5.105782792665727e-05, | |
| "loss": 0.7959, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.27113762600288, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.992947813822285e-05, | |
| "loss": 0.7291, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.2875951450318865, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 4.880112834978844e-05, | |
| "loss": 0.7307, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.304052664060893, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 4.767277856135402e-05, | |
| "loss": 0.7085, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.3205101830898993, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.654442877291961e-05, | |
| "loss": 0.6951, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.3369677021189057, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 4.541607898448519e-05, | |
| "loss": 0.7688, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.353425221147912, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 4.4287729196050775e-05, | |
| "loss": 0.7248, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.3698827401769185, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 4.3159379407616366e-05, | |
| "loss": 0.8123, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.386340259205925, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.203102961918195e-05, | |
| "loss": 0.8186, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.4027977782349312, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 4.090267983074753e-05, | |
| "loss": 0.7557, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.4192552972639376, | |
| "grad_norm": 1.625, | |
| "learning_rate": 3.9774330042313116e-05, | |
| "loss": 0.7121, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.435712816292944, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.8645980253878706e-05, | |
| "loss": 0.7574, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.4521703353219504, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.751763046544429e-05, | |
| "loss": 0.7817, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.4686278543509568, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.638928067700987e-05, | |
| "loss": 0.7675, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 3645, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.004969207187456e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |