| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 31635, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00948316737790422, | |
| "grad_norm": 3.985076904296875, | |
| "learning_rate": 9.7e-06, | |
| "loss": 2.4371, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01896633475580844, | |
| "grad_norm": 3.8551318645477295, | |
| "learning_rate": 1.97e-05, | |
| "loss": 2.1056, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02844950213371266, | |
| "grad_norm": 4.302079200744629, | |
| "learning_rate": 2.97e-05, | |
| "loss": 1.9608, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03793266951161688, | |
| "grad_norm": 3.31756329536438, | |
| "learning_rate": 3.97e-05, | |
| "loss": 1.8338, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0474158368895211, | |
| "grad_norm": 2.4619405269622803, | |
| "learning_rate": 4.97e-05, | |
| "loss": 1.7855, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0474158368895211, | |
| "eval_loss": 1.6501274108886719, | |
| "eval_runtime": 72.2019, | |
| "eval_samples_per_second": 129.83, | |
| "eval_steps_per_second": 16.232, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.05689900426742532, | |
| "grad_norm": 2.553483724594116, | |
| "learning_rate": 4.9844226754456404e-05, | |
| "loss": 1.7277, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.06638217164532954, | |
| "grad_norm": 2.0428194999694824, | |
| "learning_rate": 4.9683635779669185e-05, | |
| "loss": 1.6971, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.07586533902323377, | |
| "grad_norm": 1.9449608325958252, | |
| "learning_rate": 4.9523044804881966e-05, | |
| "loss": 1.6537, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.08534850640113797, | |
| "grad_norm": 2.5439252853393555, | |
| "learning_rate": 4.9362453830094753e-05, | |
| "loss": 1.6464, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0948316737790422, | |
| "grad_norm": 2.118544578552246, | |
| "learning_rate": 4.9201862855307534e-05, | |
| "loss": 1.5804, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0948316737790422, | |
| "eval_loss": 1.5088456869125366, | |
| "eval_runtime": 72.0739, | |
| "eval_samples_per_second": 130.061, | |
| "eval_steps_per_second": 16.261, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.10431484115694642, | |
| "grad_norm": 1.8551363945007324, | |
| "learning_rate": 4.9041271880520315e-05, | |
| "loss": 1.6341, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.11379800853485064, | |
| "grad_norm": 1.9903297424316406, | |
| "learning_rate": 4.88806809057331e-05, | |
| "loss": 1.5718, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.12328117591275486, | |
| "grad_norm": 2.2142210006713867, | |
| "learning_rate": 4.8720089930945884e-05, | |
| "loss": 1.5718, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.1327643432906591, | |
| "grad_norm": 2.2737417221069336, | |
| "learning_rate": 4.8559498956158664e-05, | |
| "loss": 1.5137, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.1422475106685633, | |
| "grad_norm": 2.3361587524414062, | |
| "learning_rate": 4.839890798137145e-05, | |
| "loss": 1.5332, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1422475106685633, | |
| "eval_loss": 1.4451285600662231, | |
| "eval_runtime": 72.138, | |
| "eval_samples_per_second": 129.945, | |
| "eval_steps_per_second": 16.247, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.15173067804646753, | |
| "grad_norm": 2.335610866546631, | |
| "learning_rate": 4.823831700658423e-05, | |
| "loss": 1.5669, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.16121384542437173, | |
| "grad_norm": 1.811543583869934, | |
| "learning_rate": 4.8077726031797014e-05, | |
| "loss": 1.4985, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.17069701280227595, | |
| "grad_norm": 2.1588528156280518, | |
| "learning_rate": 4.79171350570098e-05, | |
| "loss": 1.4979, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.18018018018018017, | |
| "grad_norm": 1.7643985748291016, | |
| "learning_rate": 4.775654408222258e-05, | |
| "loss": 1.5246, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1896633475580844, | |
| "grad_norm": 1.9193495512008667, | |
| "learning_rate": 4.759595310743536e-05, | |
| "loss": 1.4915, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1896633475580844, | |
| "eval_loss": 1.403477430343628, | |
| "eval_runtime": 71.9579, | |
| "eval_samples_per_second": 130.271, | |
| "eval_steps_per_second": 16.287, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.19914651493598862, | |
| "grad_norm": 1.8307377099990845, | |
| "learning_rate": 4.743536213264815e-05, | |
| "loss": 1.5009, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.20862968231389284, | |
| "grad_norm": 1.7923104763031006, | |
| "learning_rate": 4.727477115786093e-05, | |
| "loss": 1.4968, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.21811284969179706, | |
| "grad_norm": 1.925938367843628, | |
| "learning_rate": 4.711418018307371e-05, | |
| "loss": 1.4696, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.22759601706970128, | |
| "grad_norm": 2.106110095977783, | |
| "learning_rate": 4.69535892082865e-05, | |
| "loss": 1.4853, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.2370791844476055, | |
| "grad_norm": 2.345017433166504, | |
| "learning_rate": 4.679299823349928e-05, | |
| "loss": 1.4868, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.2370791844476055, | |
| "eval_loss": 1.3772392272949219, | |
| "eval_runtime": 72.0321, | |
| "eval_samples_per_second": 130.136, | |
| "eval_steps_per_second": 16.271, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.24656235182550973, | |
| "grad_norm": 1.5003846883773804, | |
| "learning_rate": 4.663240725871206e-05, | |
| "loss": 1.4641, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.25604551920341395, | |
| "grad_norm": 1.8472124338150024, | |
| "learning_rate": 4.647181628392485e-05, | |
| "loss": 1.4594, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.2655286865813182, | |
| "grad_norm": 1.8818256855010986, | |
| "learning_rate": 4.631122530913763e-05, | |
| "loss": 1.4547, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.2750118539592224, | |
| "grad_norm": 1.5926233530044556, | |
| "learning_rate": 4.615063433435041e-05, | |
| "loss": 1.4414, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.2844950213371266, | |
| "grad_norm": 1.505327820777893, | |
| "learning_rate": 4.59900433595632e-05, | |
| "loss": 1.4165, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.2844950213371266, | |
| "eval_loss": 1.3518378734588623, | |
| "eval_runtime": 71.9886, | |
| "eval_samples_per_second": 130.215, | |
| "eval_steps_per_second": 16.28, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.29397818871503084, | |
| "grad_norm": 1.77092707157135, | |
| "learning_rate": 4.582945238477598e-05, | |
| "loss": 1.4222, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.30346135609293506, | |
| "grad_norm": 2.265411376953125, | |
| "learning_rate": 4.566886140998876e-05, | |
| "loss": 1.3973, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.3129445234708393, | |
| "grad_norm": 1.4207345247268677, | |
| "learning_rate": 4.550827043520154e-05, | |
| "loss": 1.4423, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.32242769084874345, | |
| "grad_norm": 1.72047758102417, | |
| "learning_rate": 4.534767946041433e-05, | |
| "loss": 1.3939, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.3319108582266477, | |
| "grad_norm": 1.7695670127868652, | |
| "learning_rate": 4.518708848562711e-05, | |
| "loss": 1.3911, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.3319108582266477, | |
| "eval_loss": 1.3347505331039429, | |
| "eval_runtime": 72.0526, | |
| "eval_samples_per_second": 130.099, | |
| "eval_steps_per_second": 16.266, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.3413940256045519, | |
| "grad_norm": 1.93614661693573, | |
| "learning_rate": 4.502649751083989e-05, | |
| "loss": 1.405, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.3508771929824561, | |
| "grad_norm": 1.4412301778793335, | |
| "learning_rate": 4.486590653605268e-05, | |
| "loss": 1.421, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.36036036036036034, | |
| "grad_norm": 1.5761134624481201, | |
| "learning_rate": 4.470531556126546e-05, | |
| "loss": 1.3758, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.36984352773826457, | |
| "grad_norm": 1.7923239469528198, | |
| "learning_rate": 4.454472458647824e-05, | |
| "loss": 1.4087, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.3793266951161688, | |
| "grad_norm": 2.2492587566375732, | |
| "learning_rate": 4.438413361169103e-05, | |
| "loss": 1.3797, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.3793266951161688, | |
| "eval_loss": 1.3214360475540161, | |
| "eval_runtime": 72.0741, | |
| "eval_samples_per_second": 130.061, | |
| "eval_steps_per_second": 16.261, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.388809862494073, | |
| "grad_norm": 1.978060245513916, | |
| "learning_rate": 4.422354263690381e-05, | |
| "loss": 1.4024, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.39829302987197723, | |
| "grad_norm": 1.7838459014892578, | |
| "learning_rate": 4.406295166211659e-05, | |
| "loss": 1.4047, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.40777619724988146, | |
| "grad_norm": 1.682637333869934, | |
| "learning_rate": 4.3902360687329377e-05, | |
| "loss": 1.3709, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.4172593646277857, | |
| "grad_norm": 1.5510674715042114, | |
| "learning_rate": 4.374176971254216e-05, | |
| "loss": 1.4175, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.4267425320056899, | |
| "grad_norm": 1.7401492595672607, | |
| "learning_rate": 4.358117873775494e-05, | |
| "loss": 1.3801, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.4267425320056899, | |
| "eval_loss": 1.3049076795578003, | |
| "eval_runtime": 72.1294, | |
| "eval_samples_per_second": 129.961, | |
| "eval_steps_per_second": 16.249, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.4362256993835941, | |
| "grad_norm": 1.6590989828109741, | |
| "learning_rate": 4.3420587762967726e-05, | |
| "loss": 1.3827, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.44570886676149835, | |
| "grad_norm": 1.5440171957015991, | |
| "learning_rate": 4.325999678818051e-05, | |
| "loss": 1.3617, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.45519203413940257, | |
| "grad_norm": 1.716539978981018, | |
| "learning_rate": 4.309940581339329e-05, | |
| "loss": 1.3463, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.4646752015173068, | |
| "grad_norm": 1.3042521476745605, | |
| "learning_rate": 4.2938814838606075e-05, | |
| "loss": 1.3456, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.474158368895211, | |
| "grad_norm": 1.3467687368392944, | |
| "learning_rate": 4.2778223863818856e-05, | |
| "loss": 1.3559, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.474158368895211, | |
| "eval_loss": 1.2918757200241089, | |
| "eval_runtime": 72.0072, | |
| "eval_samples_per_second": 130.181, | |
| "eval_steps_per_second": 16.276, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.48364153627311524, | |
| "grad_norm": 1.3807010650634766, | |
| "learning_rate": 4.261763288903164e-05, | |
| "loss": 1.3507, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.49312470365101946, | |
| "grad_norm": 1.3885177373886108, | |
| "learning_rate": 4.2457041914244425e-05, | |
| "loss": 1.3552, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.5026078710289237, | |
| "grad_norm": 1.2807698249816895, | |
| "learning_rate": 4.2296450939457205e-05, | |
| "loss": 1.3642, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.5120910384068279, | |
| "grad_norm": 1.4009428024291992, | |
| "learning_rate": 4.2135859964669986e-05, | |
| "loss": 1.3781, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.5215742057847321, | |
| "grad_norm": 1.3763035535812378, | |
| "learning_rate": 4.1975268989882774e-05, | |
| "loss": 1.3717, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.5215742057847321, | |
| "eval_loss": 1.280537724494934, | |
| "eval_runtime": 72.1115, | |
| "eval_samples_per_second": 129.993, | |
| "eval_steps_per_second": 16.253, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.5310573731626363, | |
| "grad_norm": 1.5511786937713623, | |
| "learning_rate": 4.1814678015095555e-05, | |
| "loss": 1.3502, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.5405405405405406, | |
| "grad_norm": 1.4995437860488892, | |
| "learning_rate": 4.1654087040308336e-05, | |
| "loss": 1.3599, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.5500237079184448, | |
| "grad_norm": 1.3496274948120117, | |
| "learning_rate": 4.149349606552112e-05, | |
| "loss": 1.3421, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.559506875296349, | |
| "grad_norm": 1.3634631633758545, | |
| "learning_rate": 4.1332905090733904e-05, | |
| "loss": 1.3617, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.5689900426742532, | |
| "grad_norm": 1.5579423904418945, | |
| "learning_rate": 4.1172314115946685e-05, | |
| "loss": 1.3604, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.5689900426742532, | |
| "eval_loss": 1.2698478698730469, | |
| "eval_runtime": 72.1231, | |
| "eval_samples_per_second": 129.972, | |
| "eval_steps_per_second": 16.25, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.5784732100521575, | |
| "grad_norm": 1.380241870880127, | |
| "learning_rate": 4.101332905090734e-05, | |
| "loss": 1.3379, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.5879563774300617, | |
| "grad_norm": 1.764551043510437, | |
| "learning_rate": 4.085273807612012e-05, | |
| "loss": 1.3208, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.5974395448079659, | |
| "grad_norm": 1.627012848854065, | |
| "learning_rate": 4.069214710133291e-05, | |
| "loss": 1.3448, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.6069227121858701, | |
| "grad_norm": 1.539115071296692, | |
| "learning_rate": 4.053155612654569e-05, | |
| "loss": 1.3422, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.6164058795637744, | |
| "grad_norm": 1.4698444604873657, | |
| "learning_rate": 4.037257106150635e-05, | |
| "loss": 1.3264, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.6164058795637744, | |
| "eval_loss": 1.259299635887146, | |
| "eval_runtime": 72.1176, | |
| "eval_samples_per_second": 129.982, | |
| "eval_steps_per_second": 16.251, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.6258890469416786, | |
| "grad_norm": 1.8150815963745117, | |
| "learning_rate": 4.021198008671913e-05, | |
| "loss": 1.3262, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.6353722143195828, | |
| "grad_norm": 1.4278889894485474, | |
| "learning_rate": 4.005138911193191e-05, | |
| "loss": 1.334, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.6448553816974869, | |
| "grad_norm": 1.4713215827941895, | |
| "learning_rate": 3.98907981371447e-05, | |
| "loss": 1.2924, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.6543385490753911, | |
| "grad_norm": 1.626541018486023, | |
| "learning_rate": 3.9731813072105354e-05, | |
| "loss": 1.3057, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.6638217164532954, | |
| "grad_norm": 1.7835373878479004, | |
| "learning_rate": 3.9571222097318134e-05, | |
| "loss": 1.328, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.6638217164532954, | |
| "eval_loss": 1.252388834953308, | |
| "eval_runtime": 72.2427, | |
| "eval_samples_per_second": 129.757, | |
| "eval_steps_per_second": 16.223, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.6733048838311996, | |
| "grad_norm": 1.8675563335418701, | |
| "learning_rate": 3.9410631122530915e-05, | |
| "loss": 1.322, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.6827880512091038, | |
| "grad_norm": 1.5719430446624756, | |
| "learning_rate": 3.92500401477437e-05, | |
| "loss": 1.3464, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.692271218587008, | |
| "grad_norm": 1.5038641691207886, | |
| "learning_rate": 3.9089449172956484e-05, | |
| "loss": 1.3315, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 1.777970314025879, | |
| "learning_rate": 3.8928858198169265e-05, | |
| "loss": 1.3549, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.7112375533428165, | |
| "grad_norm": 1.8796472549438477, | |
| "learning_rate": 3.8768267223382045e-05, | |
| "loss": 1.2907, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.7112375533428165, | |
| "eval_loss": 1.2450358867645264, | |
| "eval_runtime": 72.1657, | |
| "eval_samples_per_second": 129.895, | |
| "eval_steps_per_second": 16.24, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.7207207207207207, | |
| "grad_norm": 1.7477796077728271, | |
| "learning_rate": 3.860767624859483e-05, | |
| "loss": 1.3196, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.7302038880986249, | |
| "grad_norm": 1.6598505973815918, | |
| "learning_rate": 3.8447085273807614e-05, | |
| "loss": 1.2799, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.7396870554765291, | |
| "grad_norm": 1.7319283485412598, | |
| "learning_rate": 3.8286494299020395e-05, | |
| "loss": 1.3354, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.7491702228544334, | |
| "grad_norm": 1.847347617149353, | |
| "learning_rate": 3.812590332423318e-05, | |
| "loss": 1.3034, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.7586533902323376, | |
| "grad_norm": 1.6584995985031128, | |
| "learning_rate": 3.796531234944596e-05, | |
| "loss": 1.3092, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.7586533902323376, | |
| "eval_loss": 1.2385543584823608, | |
| "eval_runtime": 72.1594, | |
| "eval_samples_per_second": 129.907, | |
| "eval_steps_per_second": 16.242, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.7681365576102418, | |
| "grad_norm": 1.581036925315857, | |
| "learning_rate": 3.7804721374658744e-05, | |
| "loss": 1.3064, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.777619724988146, | |
| "grad_norm": 1.6824501752853394, | |
| "learning_rate": 3.764413039987153e-05, | |
| "loss": 1.3039, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.7871028923660502, | |
| "grad_norm": 1.4804019927978516, | |
| "learning_rate": 3.748353942508431e-05, | |
| "loss": 1.2774, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.7965860597439545, | |
| "grad_norm": 1.5401322841644287, | |
| "learning_rate": 3.732294845029709e-05, | |
| "loss": 1.3042, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.8060692271218587, | |
| "grad_norm": 1.9226937294006348, | |
| "learning_rate": 3.716235747550988e-05, | |
| "loss": 1.3186, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.8060692271218587, | |
| "eval_loss": 1.2315117120742798, | |
| "eval_runtime": 72.0639, | |
| "eval_samples_per_second": 130.079, | |
| "eval_steps_per_second": 16.263, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.8155523944997629, | |
| "grad_norm": 1.3993178606033325, | |
| "learning_rate": 3.700176650072266e-05, | |
| "loss": 1.3074, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.8250355618776671, | |
| "grad_norm": 1.6044120788574219, | |
| "learning_rate": 3.684117552593544e-05, | |
| "loss": 1.2681, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.8345187292555714, | |
| "grad_norm": 1.6285070180892944, | |
| "learning_rate": 3.668058455114823e-05, | |
| "loss": 1.3198, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.8440018966334756, | |
| "grad_norm": 2.002086639404297, | |
| "learning_rate": 3.651999357636101e-05, | |
| "loss": 1.3227, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.8534850640113798, | |
| "grad_norm": 1.5941271781921387, | |
| "learning_rate": 3.635940260157379e-05, | |
| "loss": 1.2914, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.8534850640113798, | |
| "eval_loss": 1.2264697551727295, | |
| "eval_runtime": 72.0482, | |
| "eval_samples_per_second": 130.107, | |
| "eval_steps_per_second": 16.267, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.862968231389284, | |
| "grad_norm": 1.5721193552017212, | |
| "learning_rate": 3.619881162678658e-05, | |
| "loss": 1.3268, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.8724513987671882, | |
| "grad_norm": 1.7066916227340698, | |
| "learning_rate": 3.603822065199936e-05, | |
| "loss": 1.2845, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.8819345661450925, | |
| "grad_norm": 1.5683172941207886, | |
| "learning_rate": 3.587762967721214e-05, | |
| "loss": 1.2779, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.8914177335229967, | |
| "grad_norm": 1.7200586795806885, | |
| "learning_rate": 3.571703870242493e-05, | |
| "loss": 1.3161, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.9009009009009009, | |
| "grad_norm": 1.4963386058807373, | |
| "learning_rate": 3.555644772763771e-05, | |
| "loss": 1.2668, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.9009009009009009, | |
| "eval_loss": 1.2190866470336914, | |
| "eval_runtime": 72.0991, | |
| "eval_samples_per_second": 130.015, | |
| "eval_steps_per_second": 16.255, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.9103840682788051, | |
| "grad_norm": 1.5414083003997803, | |
| "learning_rate": 3.539585675285049e-05, | |
| "loss": 1.3185, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.9198672356567094, | |
| "grad_norm": 1.46302330493927, | |
| "learning_rate": 3.523526577806328e-05, | |
| "loss": 1.2485, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.9293504030346136, | |
| "grad_norm": 1.4815856218338013, | |
| "learning_rate": 3.507467480327606e-05, | |
| "loss": 1.2912, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.9388335704125178, | |
| "grad_norm": 1.5166754722595215, | |
| "learning_rate": 3.491408382848884e-05, | |
| "loss": 1.2722, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.948316737790422, | |
| "grad_norm": 1.9628846645355225, | |
| "learning_rate": 3.475349285370163e-05, | |
| "loss": 1.2538, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.948316737790422, | |
| "eval_loss": 1.2150416374206543, | |
| "eval_runtime": 72.1513, | |
| "eval_samples_per_second": 129.921, | |
| "eval_steps_per_second": 16.244, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.9577999051683262, | |
| "grad_norm": 1.6791901588439941, | |
| "learning_rate": 3.459290187891441e-05, | |
| "loss": 1.2624, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.9672830725462305, | |
| "grad_norm": 1.5026668310165405, | |
| "learning_rate": 3.443231090412719e-05, | |
| "loss": 1.2696, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.9767662399241347, | |
| "grad_norm": 1.176558017730713, | |
| "learning_rate": 3.427171992933998e-05, | |
| "loss": 1.29, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.9862494073020389, | |
| "grad_norm": 1.5698468685150146, | |
| "learning_rate": 3.411112895455276e-05, | |
| "loss": 1.2874, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.9957325746799431, | |
| "grad_norm": 1.4970085620880127, | |
| "learning_rate": 3.395053797976554e-05, | |
| "loss": 1.2874, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.9957325746799431, | |
| "eval_loss": 1.2110899686813354, | |
| "eval_runtime": 72.0475, | |
| "eval_samples_per_second": 130.109, | |
| "eval_steps_per_second": 16.267, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.0052157420578474, | |
| "grad_norm": 1.284839391708374, | |
| "learning_rate": 3.3789947004978326e-05, | |
| "loss": 1.2793, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.0146989094357515, | |
| "grad_norm": 1.680851697921753, | |
| "learning_rate": 3.362935603019111e-05, | |
| "loss": 1.2487, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.0241820768136558, | |
| "grad_norm": 1.659610629081726, | |
| "learning_rate": 3.346876505540389e-05, | |
| "loss": 1.2454, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.03366524419156, | |
| "grad_norm": 1.6641312837600708, | |
| "learning_rate": 3.330817408061667e-05, | |
| "loss": 1.2323, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.0431484115694643, | |
| "grad_norm": 1.481063723564148, | |
| "learning_rate": 3.3147583105829456e-05, | |
| "loss": 1.2646, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.0431484115694643, | |
| "eval_loss": 1.2060637474060059, | |
| "eval_runtime": 71.9819, | |
| "eval_samples_per_second": 130.227, | |
| "eval_steps_per_second": 16.282, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 1.699491024017334, | |
| "learning_rate": 3.298699213104224e-05, | |
| "loss": 1.2828, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.0621147463252727, | |
| "grad_norm": 2.0708415508270264, | |
| "learning_rate": 3.282640115625502e-05, | |
| "loss": 1.2648, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.0715979137031768, | |
| "grad_norm": 1.4921772480010986, | |
| "learning_rate": 3.266741609121567e-05, | |
| "loss": 1.2611, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.0810810810810811, | |
| "grad_norm": 1.744384765625, | |
| "learning_rate": 3.250682511642846e-05, | |
| "loss": 1.2435, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.0905642484589853, | |
| "grad_norm": 1.1988921165466309, | |
| "learning_rate": 3.234623414164124e-05, | |
| "loss": 1.2525, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.0905642484589853, | |
| "eval_loss": 1.2018728256225586, | |
| "eval_runtime": 71.9385, | |
| "eval_samples_per_second": 130.306, | |
| "eval_steps_per_second": 16.292, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.1000474158368896, | |
| "grad_norm": 1.5618336200714111, | |
| "learning_rate": 3.218564316685402e-05, | |
| "loss": 1.2387, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.1095305832147937, | |
| "grad_norm": 1.512651801109314, | |
| "learning_rate": 3.202505219206681e-05, | |
| "loss": 1.2507, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.119013750592698, | |
| "grad_norm": 2.1945042610168457, | |
| "learning_rate": 3.186446121727959e-05, | |
| "loss": 1.2316, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.1284969179706021, | |
| "grad_norm": 1.3046265840530396, | |
| "learning_rate": 3.170387024249237e-05, | |
| "loss": 1.2352, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.1379800853485065, | |
| "grad_norm": 1.5922869443893433, | |
| "learning_rate": 3.154327926770516e-05, | |
| "loss": 1.2361, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.1379800853485065, | |
| "eval_loss": 1.1982355117797852, | |
| "eval_runtime": 72.0166, | |
| "eval_samples_per_second": 130.164, | |
| "eval_steps_per_second": 16.274, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.1474632527264106, | |
| "grad_norm": 1.2342475652694702, | |
| "learning_rate": 3.138268829291794e-05, | |
| "loss": 1.2318, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.156946420104315, | |
| "grad_norm": 1.630129337310791, | |
| "learning_rate": 3.122209731813072e-05, | |
| "loss": 1.2185, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.166429587482219, | |
| "grad_norm": 1.4030356407165527, | |
| "learning_rate": 3.106150634334351e-05, | |
| "loss": 1.2635, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.1759127548601234, | |
| "grad_norm": 1.372003436088562, | |
| "learning_rate": 3.090091536855629e-05, | |
| "loss": 1.2131, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.1853959222380275, | |
| "grad_norm": 1.1380951404571533, | |
| "learning_rate": 3.074032439376907e-05, | |
| "loss": 1.2553, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.1853959222380275, | |
| "eval_loss": 1.1942973136901855, | |
| "eval_runtime": 71.9892, | |
| "eval_samples_per_second": 130.214, | |
| "eval_steps_per_second": 16.28, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.1948790896159318, | |
| "grad_norm": 1.8760716915130615, | |
| "learning_rate": 3.057973341898186e-05, | |
| "loss": 1.2479, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.204362256993836, | |
| "grad_norm": 1.7070045471191406, | |
| "learning_rate": 3.0419142444194638e-05, | |
| "loss": 1.2283, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.2138454243717403, | |
| "grad_norm": 1.6677838563919067, | |
| "learning_rate": 3.025855146940742e-05, | |
| "loss": 1.2527, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.2233285917496444, | |
| "grad_norm": 1.5015747547149658, | |
| "learning_rate": 3.0097960494620203e-05, | |
| "loss": 1.2402, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.2328117591275487, | |
| "grad_norm": 1.613587737083435, | |
| "learning_rate": 2.9937369519832987e-05, | |
| "loss": 1.2288, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.2328117591275487, | |
| "eval_loss": 1.1904593706130981, | |
| "eval_runtime": 72.0827, | |
| "eval_samples_per_second": 130.045, | |
| "eval_steps_per_second": 16.259, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.2422949265054528, | |
| "grad_norm": 1.7170720100402832, | |
| "learning_rate": 2.9776778545045768e-05, | |
| "loss": 1.2199, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.251778093883357, | |
| "grad_norm": 1.3260998725891113, | |
| "learning_rate": 2.9616187570258552e-05, | |
| "loss": 1.2575, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.2612612612612613, | |
| "grad_norm": 1.450626254081726, | |
| "learning_rate": 2.9455596595471337e-05, | |
| "loss": 1.2267, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.2707444286391656, | |
| "grad_norm": 1.51180899143219, | |
| "learning_rate": 2.9295005620684118e-05, | |
| "loss": 1.2546, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.2802275960170697, | |
| "grad_norm": 1.846704125404358, | |
| "learning_rate": 2.9134414645896902e-05, | |
| "loss": 1.2216, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.2802275960170697, | |
| "eval_loss": 1.1853208541870117, | |
| "eval_runtime": 72.0024, | |
| "eval_samples_per_second": 130.19, | |
| "eval_steps_per_second": 16.277, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.2897107633949738, | |
| "grad_norm": 1.5088779926300049, | |
| "learning_rate": 2.8973823671109686e-05, | |
| "loss": 1.2028, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.2991939307728781, | |
| "grad_norm": 1.2047330141067505, | |
| "learning_rate": 2.8813232696322467e-05, | |
| "loss": 1.2326, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.3086770981507825, | |
| "grad_norm": 1.6895666122436523, | |
| "learning_rate": 2.865264172153525e-05, | |
| "loss": 1.2032, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.3181602655286866, | |
| "grad_norm": 1.3885574340820312, | |
| "learning_rate": 2.8492050746748032e-05, | |
| "loss": 1.2438, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.3276434329065907, | |
| "grad_norm": 1.5129587650299072, | |
| "learning_rate": 2.8331459771960816e-05, | |
| "loss": 1.2099, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.3276434329065907, | |
| "eval_loss": 1.1841365098953247, | |
| "eval_runtime": 72.0289, | |
| "eval_samples_per_second": 130.142, | |
| "eval_steps_per_second": 16.271, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.337126600284495, | |
| "grad_norm": 1.5244189500808716, | |
| "learning_rate": 2.81708687971736e-05, | |
| "loss": 1.2528, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.3466097676623994, | |
| "grad_norm": 1.6656090021133423, | |
| "learning_rate": 2.801027782238638e-05, | |
| "loss": 1.2437, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.3560929350403035, | |
| "grad_norm": 1.6365015506744385, | |
| "learning_rate": 2.7849686847599165e-05, | |
| "loss": 1.2481, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.3655761024182076, | |
| "grad_norm": 1.729038953781128, | |
| "learning_rate": 2.768909587281195e-05, | |
| "loss": 1.2363, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.375059269796112, | |
| "grad_norm": 1.663041114807129, | |
| "learning_rate": 2.752850489802473e-05, | |
| "loss": 1.2371, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.375059269796112, | |
| "eval_loss": 1.1793495416641235, | |
| "eval_runtime": 72.0339, | |
| "eval_samples_per_second": 130.133, | |
| "eval_steps_per_second": 16.27, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.384542437174016, | |
| "grad_norm": 1.5626816749572754, | |
| "learning_rate": 2.7367913923237515e-05, | |
| "loss": 1.2287, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.3940256045519204, | |
| "grad_norm": 1.2476764917373657, | |
| "learning_rate": 2.72073229484503e-05, | |
| "loss": 1.2129, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.4035087719298245, | |
| "grad_norm": 1.4796671867370605, | |
| "learning_rate": 2.704673197366308e-05, | |
| "loss": 1.2143, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.4129919393077288, | |
| "grad_norm": 1.8260607719421387, | |
| "learning_rate": 2.6886140998875864e-05, | |
| "loss": 1.2411, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.422475106685633, | |
| "grad_norm": 1.6393589973449707, | |
| "learning_rate": 2.6725550024088648e-05, | |
| "loss": 1.2128, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.422475106685633, | |
| "eval_loss": 1.1766639947891235, | |
| "eval_runtime": 72.0436, | |
| "eval_samples_per_second": 130.116, | |
| "eval_steps_per_second": 16.268, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.4319582740635373, | |
| "grad_norm": 1.2327754497528076, | |
| "learning_rate": 2.656495904930143e-05, | |
| "loss": 1.2218, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.4414414414414414, | |
| "grad_norm": 1.4845291376113892, | |
| "learning_rate": 2.6405973984262084e-05, | |
| "loss": 1.2158, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.4509246088193457, | |
| "grad_norm": 1.5115349292755127, | |
| "learning_rate": 2.6245383009474868e-05, | |
| "loss": 1.2597, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.4604077761972498, | |
| "grad_norm": 1.2558484077453613, | |
| "learning_rate": 2.608479203468765e-05, | |
| "loss": 1.2293, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.4698909435751542, | |
| "grad_norm": 1.412372350692749, | |
| "learning_rate": 2.5924201059900433e-05, | |
| "loss": 1.2078, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.4698909435751542, | |
| "eval_loss": 1.175757646560669, | |
| "eval_runtime": 72.1719, | |
| "eval_samples_per_second": 129.884, | |
| "eval_steps_per_second": 16.239, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.4793741109530583, | |
| "grad_norm": 1.1586443185806274, | |
| "learning_rate": 2.5763610085113217e-05, | |
| "loss": 1.2167, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.4888572783309626, | |
| "grad_norm": 1.535499095916748, | |
| "learning_rate": 2.5603019110325998e-05, | |
| "loss": 1.2177, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.4983404457088667, | |
| "grad_norm": 1.3925201892852783, | |
| "learning_rate": 2.5442428135538782e-05, | |
| "loss": 1.2089, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.5078236130867708, | |
| "grad_norm": 1.239797592163086, | |
| "learning_rate": 2.5281837160751563e-05, | |
| "loss": 1.2183, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.5173067804646752, | |
| "grad_norm": 1.4727925062179565, | |
| "learning_rate": 2.5121246185964347e-05, | |
| "loss": 1.2382, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.5173067804646752, | |
| "eval_loss": 1.1705734729766846, | |
| "eval_runtime": 72.2315, | |
| "eval_samples_per_second": 129.777, | |
| "eval_steps_per_second": 16.226, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.5267899478425795, | |
| "grad_norm": 1.9122114181518555, | |
| "learning_rate": 2.4960655211177135e-05, | |
| "loss": 1.2062, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.5362731152204836, | |
| "grad_norm": 1.705417275428772, | |
| "learning_rate": 2.4800064236389916e-05, | |
| "loss": 1.2002, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.5457562825983877, | |
| "grad_norm": 1.4141908884048462, | |
| "learning_rate": 2.46394732616027e-05, | |
| "loss": 1.2323, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.555239449976292, | |
| "grad_norm": 2.050583839416504, | |
| "learning_rate": 2.4478882286815484e-05, | |
| "loss": 1.2145, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.5647226173541964, | |
| "grad_norm": 1.495006799697876, | |
| "learning_rate": 2.4318291312028265e-05, | |
| "loss": 1.2041, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.5647226173541964, | |
| "eval_loss": 1.1694616079330444, | |
| "eval_runtime": 71.9712, | |
| "eval_samples_per_second": 130.247, | |
| "eval_steps_per_second": 16.284, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.5742057847321005, | |
| "grad_norm": 1.4379011392593384, | |
| "learning_rate": 2.415770033724105e-05, | |
| "loss": 1.2045, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.5836889521100046, | |
| "grad_norm": 1.6558938026428223, | |
| "learning_rate": 2.399710936245383e-05, | |
| "loss": 1.2234, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.593172119487909, | |
| "grad_norm": 1.6931570768356323, | |
| "learning_rate": 2.3836518387666614e-05, | |
| "loss": 1.2061, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.6026552868658133, | |
| "grad_norm": 1.445521593093872, | |
| "learning_rate": 2.36759274128794e-05, | |
| "loss": 1.2243, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.6121384542437174, | |
| "grad_norm": 1.4067689180374146, | |
| "learning_rate": 2.351533643809218e-05, | |
| "loss": 1.2154, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.6121384542437174, | |
| "eval_loss": 1.1659753322601318, | |
| "eval_runtime": 72.1888, | |
| "eval_samples_per_second": 129.854, | |
| "eval_steps_per_second": 16.235, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.6216216216216215, | |
| "grad_norm": 1.0550585985183716, | |
| "learning_rate": 2.3354745463304964e-05, | |
| "loss": 1.2333, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.6311047889995258, | |
| "grad_norm": 1.5547784566879272, | |
| "learning_rate": 2.3194154488517748e-05, | |
| "loss": 1.2088, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.6405879563774302, | |
| "grad_norm": 2.006110191345215, | |
| "learning_rate": 2.303356351373053e-05, | |
| "loss": 1.1881, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.6500711237553343, | |
| "grad_norm": 1.6522830724716187, | |
| "learning_rate": 2.2872972538943313e-05, | |
| "loss": 1.2158, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.6595542911332384, | |
| "grad_norm": 1.2928231954574585, | |
| "learning_rate": 2.2712381564156097e-05, | |
| "loss": 1.2303, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.6595542911332384, | |
| "eval_loss": 1.1643718481063843, | |
| "eval_runtime": 72.2381, | |
| "eval_samples_per_second": 129.765, | |
| "eval_steps_per_second": 16.224, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.6690374585111427, | |
| "grad_norm": 1.38106107711792, | |
| "learning_rate": 2.2551790589368878e-05, | |
| "loss": 1.1969, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.678520625889047, | |
| "grad_norm": 1.3726710081100464, | |
| "learning_rate": 2.2391199614581662e-05, | |
| "loss": 1.2122, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.6880037932669512, | |
| "grad_norm": 1.2017816305160522, | |
| "learning_rate": 2.2230608639794447e-05, | |
| "loss": 1.2331, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.6974869606448553, | |
| "grad_norm": 1.329315423965454, | |
| "learning_rate": 2.2070017665007227e-05, | |
| "loss": 1.2339, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.7069701280227596, | |
| "grad_norm": 1.5352445840835571, | |
| "learning_rate": 2.190942669022001e-05, | |
| "loss": 1.2429, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.7069701280227596, | |
| "eval_loss": 1.1619985103607178, | |
| "eval_runtime": 72.1286, | |
| "eval_samples_per_second": 129.962, | |
| "eval_steps_per_second": 16.249, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.716453295400664, | |
| "grad_norm": 1.5836015939712524, | |
| "learning_rate": 2.1748835715432796e-05, | |
| "loss": 1.1925, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.725936462778568, | |
| "grad_norm": 1.7755178213119507, | |
| "learning_rate": 2.1588244740645577e-05, | |
| "loss": 1.2146, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.7354196301564722, | |
| "grad_norm": 1.3868217468261719, | |
| "learning_rate": 2.142765376585836e-05, | |
| "loss": 1.2082, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.7449027975343765, | |
| "grad_norm": 1.320333480834961, | |
| "learning_rate": 2.1267062791071142e-05, | |
| "loss": 1.213, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "grad_norm": 1.5032850503921509, | |
| "learning_rate": 2.1106471816283926e-05, | |
| "loss": 1.2048, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "eval_loss": 1.1578137874603271, | |
| "eval_runtime": 72.0841, | |
| "eval_samples_per_second": 130.043, | |
| "eval_steps_per_second": 16.259, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.763869132290185, | |
| "grad_norm": 1.5423904657363892, | |
| "learning_rate": 2.094588084149671e-05, | |
| "loss": 1.2282, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.773352299668089, | |
| "grad_norm": 1.439765453338623, | |
| "learning_rate": 2.078528986670949e-05, | |
| "loss": 1.2171, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.7828354670459934, | |
| "grad_norm": 1.573088526725769, | |
| "learning_rate": 2.0624698891922275e-05, | |
| "loss": 1.2149, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.7923186344238977, | |
| "grad_norm": 1.4882514476776123, | |
| "learning_rate": 2.046410791713506e-05, | |
| "loss": 1.2278, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.8018018018018018, | |
| "grad_norm": 1.9028195142745972, | |
| "learning_rate": 2.030351694234784e-05, | |
| "loss": 1.2247, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.8018018018018018, | |
| "eval_loss": 1.157362937927246, | |
| "eval_runtime": 72.1036, | |
| "eval_samples_per_second": 130.007, | |
| "eval_steps_per_second": 16.254, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.811284969179706, | |
| "grad_norm": 1.289600133895874, | |
| "learning_rate": 2.0142925967560625e-05, | |
| "loss": 1.215, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.8207681365576103, | |
| "grad_norm": 1.4183131456375122, | |
| "learning_rate": 1.998233499277341e-05, | |
| "loss": 1.2284, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.8302513039355146, | |
| "grad_norm": 1.235146403312683, | |
| "learning_rate": 1.982174401798619e-05, | |
| "loss": 1.2067, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.8397344713134187, | |
| "grad_norm": 1.486122488975525, | |
| "learning_rate": 1.9661153043198974e-05, | |
| "loss": 1.183, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.8492176386913228, | |
| "grad_norm": 1.4615782499313354, | |
| "learning_rate": 1.9500562068411758e-05, | |
| "loss": 1.1847, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.8492176386913228, | |
| "eval_loss": 1.1544617414474487, | |
| "eval_runtime": 72.1411, | |
| "eval_samples_per_second": 129.94, | |
| "eval_steps_per_second": 16.246, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.8587008060692272, | |
| "grad_norm": 1.3062597513198853, | |
| "learning_rate": 1.933997109362454e-05, | |
| "loss": 1.1998, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.8681839734471315, | |
| "grad_norm": 1.7676483392715454, | |
| "learning_rate": 1.9180986028585193e-05, | |
| "loss": 1.1985, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.8776671408250356, | |
| "grad_norm": 1.55678129196167, | |
| "learning_rate": 1.9020395053797978e-05, | |
| "loss": 1.2155, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.8871503082029397, | |
| "grad_norm": 1.2260453701019287, | |
| "learning_rate": 1.885980407901076e-05, | |
| "loss": 1.2282, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.896633475580844, | |
| "grad_norm": 1.6828114986419678, | |
| "learning_rate": 1.8699213104223543e-05, | |
| "loss": 1.2183, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.896633475580844, | |
| "eval_loss": 1.1521168947219849, | |
| "eval_runtime": 72.1018, | |
| "eval_samples_per_second": 130.011, | |
| "eval_steps_per_second": 16.255, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.9061166429587484, | |
| "grad_norm": 1.6691786050796509, | |
| "learning_rate": 1.8538622129436327e-05, | |
| "loss": 1.1651, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.9155998103366523, | |
| "grad_norm": 1.4728951454162598, | |
| "learning_rate": 1.8378031154649108e-05, | |
| "loss": 1.2022, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.9250829777145566, | |
| "grad_norm": 1.6341995000839233, | |
| "learning_rate": 1.8217440179861892e-05, | |
| "loss": 1.1777, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.934566145092461, | |
| "grad_norm": 1.4492669105529785, | |
| "learning_rate": 1.8056849205074676e-05, | |
| "loss": 1.2081, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.944049312470365, | |
| "grad_norm": 1.6642097234725952, | |
| "learning_rate": 1.7896258230287457e-05, | |
| "loss": 1.1848, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.944049312470365, | |
| "eval_loss": 1.150140404701233, | |
| "eval_runtime": 72.0779, | |
| "eval_samples_per_second": 130.054, | |
| "eval_steps_per_second": 16.26, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.9535324798482692, | |
| "grad_norm": 1.8986822366714478, | |
| "learning_rate": 1.773566725550024e-05, | |
| "loss": 1.2223, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.9630156472261735, | |
| "grad_norm": 1.390931248664856, | |
| "learning_rate": 1.7575076280713022e-05, | |
| "loss": 1.2068, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.9724988146040778, | |
| "grad_norm": 1.3856289386749268, | |
| "learning_rate": 1.7414485305925806e-05, | |
| "loss": 1.1828, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.981981981981982, | |
| "grad_norm": 1.2241305112838745, | |
| "learning_rate": 1.725389433113859e-05, | |
| "loss": 1.1938, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.991465149359886, | |
| "grad_norm": 1.5855077505111694, | |
| "learning_rate": 1.709330335635137e-05, | |
| "loss": 1.206, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.991465149359886, | |
| "eval_loss": 1.1497843265533447, | |
| "eval_runtime": 72.1674, | |
| "eval_samples_per_second": 129.893, | |
| "eval_steps_per_second": 16.24, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.0009483167377904, | |
| "grad_norm": 2.0832741260528564, | |
| "learning_rate": 1.6932712381564156e-05, | |
| "loss": 1.1805, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.0104314841156947, | |
| "grad_norm": 1.893350601196289, | |
| "learning_rate": 1.677212140677694e-05, | |
| "loss": 1.1757, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.019914651493599, | |
| "grad_norm": 1.346118688583374, | |
| "learning_rate": 1.661153043198972e-05, | |
| "loss": 1.1938, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.029397818871503, | |
| "grad_norm": 1.658034086227417, | |
| "learning_rate": 1.6450939457202505e-05, | |
| "loss": 1.1773, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.0388809862494073, | |
| "grad_norm": 1.4759783744812012, | |
| "learning_rate": 1.629034848241529e-05, | |
| "loss": 1.1735, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.0388809862494073, | |
| "eval_loss": 1.1474945545196533, | |
| "eval_runtime": 71.9179, | |
| "eval_samples_per_second": 130.343, | |
| "eval_steps_per_second": 16.296, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.0483641536273116, | |
| "grad_norm": 1.2887206077575684, | |
| "learning_rate": 1.612975750762807e-05, | |
| "loss": 1.1701, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.057847321005216, | |
| "grad_norm": 1.552646279335022, | |
| "learning_rate": 1.5969166532840854e-05, | |
| "loss": 1.1734, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.06733048838312, | |
| "grad_norm": 1.6683566570281982, | |
| "learning_rate": 1.581018146780151e-05, | |
| "loss": 1.1883, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.076813655761024, | |
| "grad_norm": 1.4613324403762817, | |
| "learning_rate": 1.5649590493014293e-05, | |
| "loss": 1.1845, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.0862968231389285, | |
| "grad_norm": 1.5622040033340454, | |
| "learning_rate": 1.5488999518227077e-05, | |
| "loss": 1.1584, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.0862968231389285, | |
| "eval_loss": 1.1467849016189575, | |
| "eval_runtime": 72.0497, | |
| "eval_samples_per_second": 130.105, | |
| "eval_steps_per_second": 16.267, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.095779990516833, | |
| "grad_norm": 1.721030831336975, | |
| "learning_rate": 1.5328408543439858e-05, | |
| "loss": 1.2018, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 1.3872593641281128, | |
| "learning_rate": 1.5167817568652642e-05, | |
| "loss": 1.1659, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.114746325272641, | |
| "grad_norm": 1.655704140663147, | |
| "learning_rate": 1.5007226593865425e-05, | |
| "loss": 1.1503, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.1242294926505454, | |
| "grad_norm": 1.5672900676727295, | |
| "learning_rate": 1.4848241528826081e-05, | |
| "loss": 1.1879, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.1337126600284497, | |
| "grad_norm": 1.6815894842147827, | |
| "learning_rate": 1.4687650554038865e-05, | |
| "loss": 1.1719, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.1337126600284497, | |
| "eval_loss": 1.1450951099395752, | |
| "eval_runtime": 72.1598, | |
| "eval_samples_per_second": 129.906, | |
| "eval_steps_per_second": 16.242, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.1431958274063536, | |
| "grad_norm": 1.040648102760315, | |
| "learning_rate": 1.4527059579251648e-05, | |
| "loss": 1.1629, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.152678994784258, | |
| "grad_norm": 1.5001453161239624, | |
| "learning_rate": 1.436646860446443e-05, | |
| "loss": 1.1796, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.1621621621621623, | |
| "grad_norm": 1.7325968742370605, | |
| "learning_rate": 1.4205877629677215e-05, | |
| "loss": 1.1757, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.171645329540066, | |
| "grad_norm": 1.7485188245773315, | |
| "learning_rate": 1.4045286654889997e-05, | |
| "loss": 1.1485, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.1811284969179705, | |
| "grad_norm": 1.4972156286239624, | |
| "learning_rate": 1.388469568010278e-05, | |
| "loss": 1.1667, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.1811284969179705, | |
| "eval_loss": 1.144049048423767, | |
| "eval_runtime": 72.1218, | |
| "eval_samples_per_second": 129.975, | |
| "eval_steps_per_second": 16.25, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.190611664295875, | |
| "grad_norm": 1.2919082641601562, | |
| "learning_rate": 1.3724104705315564e-05, | |
| "loss": 1.1764, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.200094831673779, | |
| "grad_norm": 1.6442806720733643, | |
| "learning_rate": 1.3563513730528346e-05, | |
| "loss": 1.174, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.209577999051683, | |
| "grad_norm": 1.480901837348938, | |
| "learning_rate": 1.3402922755741129e-05, | |
| "loss": 1.1666, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.2190611664295874, | |
| "grad_norm": 1.6193006038665771, | |
| "learning_rate": 1.3242331780953911e-05, | |
| "loss": 1.1975, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 2.2285443338074917, | |
| "grad_norm": 1.2970917224884033, | |
| "learning_rate": 1.3081740806166696e-05, | |
| "loss": 1.1579, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.2285443338074917, | |
| "eval_loss": 1.1433159112930298, | |
| "eval_runtime": 72.0832, | |
| "eval_samples_per_second": 130.044, | |
| "eval_steps_per_second": 16.259, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.238027501185396, | |
| "grad_norm": 1.4054538011550903, | |
| "learning_rate": 1.2921149831379478e-05, | |
| "loss": 1.1779, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 2.2475106685633, | |
| "grad_norm": 1.5161010026931763, | |
| "learning_rate": 1.276055885659226e-05, | |
| "loss": 1.1709, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 2.2569938359412043, | |
| "grad_norm": 2.040818929672241, | |
| "learning_rate": 1.2599967881805045e-05, | |
| "loss": 1.1692, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 2.2664770033191086, | |
| "grad_norm": 1.3812401294708252, | |
| "learning_rate": 1.2439376907017826e-05, | |
| "loss": 1.1733, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 2.275960170697013, | |
| "grad_norm": 2.113886833190918, | |
| "learning_rate": 1.2278785932230608e-05, | |
| "loss": 1.1682, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.275960170697013, | |
| "eval_loss": 1.1404303312301636, | |
| "eval_runtime": 72.1649, | |
| "eval_samples_per_second": 129.897, | |
| "eval_steps_per_second": 16.241, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.285443338074917, | |
| "grad_norm": 1.3256770372390747, | |
| "learning_rate": 1.2118194957443393e-05, | |
| "loss": 1.1847, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 2.294926505452821, | |
| "grad_norm": 1.4699623584747314, | |
| "learning_rate": 1.1957603982656175e-05, | |
| "loss": 1.1576, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 2.3044096728307255, | |
| "grad_norm": 1.5492583513259888, | |
| "learning_rate": 1.1797013007868958e-05, | |
| "loss": 1.1532, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 2.31389284020863, | |
| "grad_norm": 1.409488558769226, | |
| "learning_rate": 1.1636422033081742e-05, | |
| "loss": 1.1626, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 2.3233760075865337, | |
| "grad_norm": 1.642247200012207, | |
| "learning_rate": 1.1475831058294524e-05, | |
| "loss": 1.1943, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.3233760075865337, | |
| "eval_loss": 1.139186978340149, | |
| "eval_runtime": 72.1131, | |
| "eval_samples_per_second": 129.99, | |
| "eval_steps_per_second": 16.252, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.332859174964438, | |
| "grad_norm": 1.4776501655578613, | |
| "learning_rate": 1.1315240083507307e-05, | |
| "loss": 1.1566, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 2.3423423423423424, | |
| "grad_norm": 1.475188136100769, | |
| "learning_rate": 1.115464910872009e-05, | |
| "loss": 1.1743, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 2.3518255097202467, | |
| "grad_norm": 1.48451828956604, | |
| "learning_rate": 1.0994058133932874e-05, | |
| "loss": 1.1539, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 2.3613086770981506, | |
| "grad_norm": 1.4650864601135254, | |
| "learning_rate": 1.0833467159145656e-05, | |
| "loss": 1.2073, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 2.370791844476055, | |
| "grad_norm": 1.71983003616333, | |
| "learning_rate": 1.0672876184358439e-05, | |
| "loss": 1.2021, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.370791844476055, | |
| "eval_loss": 1.1377766132354736, | |
| "eval_runtime": 71.9749, | |
| "eval_samples_per_second": 130.24, | |
| "eval_steps_per_second": 16.283, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.3802750118539593, | |
| "grad_norm": 1.3838121891021729, | |
| "learning_rate": 1.0512285209571223e-05, | |
| "loss": 1.1791, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 2.3897581792318636, | |
| "grad_norm": 1.8836325407028198, | |
| "learning_rate": 1.0351694234784006e-05, | |
| "loss": 1.1834, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 2.3992413466097675, | |
| "grad_norm": 1.3679293394088745, | |
| "learning_rate": 1.0191103259996788e-05, | |
| "loss": 1.183, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 2.408724513987672, | |
| "grad_norm": 1.5593743324279785, | |
| "learning_rate": 1.003051228520957e-05, | |
| "loss": 1.1703, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 2.418207681365576, | |
| "grad_norm": 1.4257512092590332, | |
| "learning_rate": 9.869921310422355e-06, | |
| "loss": 1.172, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.418207681365576, | |
| "eval_loss": 1.1378742456436157, | |
| "eval_runtime": 72.0363, | |
| "eval_samples_per_second": 130.129, | |
| "eval_steps_per_second": 16.27, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.4276908487434805, | |
| "grad_norm": 1.771941065788269, | |
| "learning_rate": 9.709330335635137e-06, | |
| "loss": 1.1676, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 2.4371740161213844, | |
| "grad_norm": 1.7247157096862793, | |
| "learning_rate": 9.54873936084792e-06, | |
| "loss": 1.1753, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 2.4466571834992887, | |
| "grad_norm": 1.5509614944458008, | |
| "learning_rate": 9.388148386060704e-06, | |
| "loss": 1.1705, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 2.456140350877193, | |
| "grad_norm": 1.8205307722091675, | |
| "learning_rate": 9.227557411273487e-06, | |
| "loss": 1.1938, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 2.4656235182550974, | |
| "grad_norm": 1.501631498336792, | |
| "learning_rate": 9.06696643648627e-06, | |
| "loss": 1.1737, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.4656235182550974, | |
| "eval_loss": 1.1362242698669434, | |
| "eval_runtime": 71.9608, | |
| "eval_samples_per_second": 130.265, | |
| "eval_steps_per_second": 16.287, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.4751066856330013, | |
| "grad_norm": 1.4233213663101196, | |
| "learning_rate": 8.906375461699054e-06, | |
| "loss": 1.1728, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.4845898530109056, | |
| "grad_norm": 1.597785472869873, | |
| "learning_rate": 8.745784486911836e-06, | |
| "loss": 1.1559, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.49407302038881, | |
| "grad_norm": 1.2396786212921143, | |
| "learning_rate": 8.585193512124619e-06, | |
| "loss": 1.1645, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.503556187766714, | |
| "grad_norm": 1.643211841583252, | |
| "learning_rate": 8.424602537337401e-06, | |
| "loss": 1.1948, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.513039355144618, | |
| "grad_norm": 1.688436508178711, | |
| "learning_rate": 8.264011562550185e-06, | |
| "loss": 1.1875, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.513039355144618, | |
| "eval_loss": 1.134669303894043, | |
| "eval_runtime": 72.1082, | |
| "eval_samples_per_second": 129.999, | |
| "eval_steps_per_second": 16.253, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.5225225225225225, | |
| "grad_norm": 1.6127384901046753, | |
| "learning_rate": 8.103420587762968e-06, | |
| "loss": 1.1657, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.532005689900427, | |
| "grad_norm": 2.12892484664917, | |
| "learning_rate": 7.944435522723622e-06, | |
| "loss": 1.1636, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.541488857278331, | |
| "grad_norm": 1.173686146736145, | |
| "learning_rate": 7.783844547936407e-06, | |
| "loss": 1.1866, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.550972024656235, | |
| "grad_norm": 1.4527802467346191, | |
| "learning_rate": 7.623253573149189e-06, | |
| "loss": 1.1755, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.5604551920341394, | |
| "grad_norm": 1.6228667497634888, | |
| "learning_rate": 7.462662598361972e-06, | |
| "loss": 1.1427, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.5604551920341394, | |
| "eval_loss": 1.134996771812439, | |
| "eval_runtime": 72.1853, | |
| "eval_samples_per_second": 129.86, | |
| "eval_steps_per_second": 16.236, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.5699383594120437, | |
| "grad_norm": 1.5179518461227417, | |
| "learning_rate": 7.302071623574755e-06, | |
| "loss": 1.1496, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.5794215267899476, | |
| "grad_norm": 1.2633978128433228, | |
| "learning_rate": 7.141480648787538e-06, | |
| "loss": 1.1633, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.588904694167852, | |
| "grad_norm": 1.3050264120101929, | |
| "learning_rate": 6.980889674000321e-06, | |
| "loss": 1.1614, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.5983878615457563, | |
| "grad_norm": 1.432268500328064, | |
| "learning_rate": 6.820298699213104e-06, | |
| "loss": 1.1684, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.6078710289236606, | |
| "grad_norm": 1.6904171705245972, | |
| "learning_rate": 6.659707724425887e-06, | |
| "loss": 1.1673, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.6078710289236606, | |
| "eval_loss": 1.1333271265029907, | |
| "eval_runtime": 72.187, | |
| "eval_samples_per_second": 129.857, | |
| "eval_steps_per_second": 16.236, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.617354196301565, | |
| "grad_norm": 1.2229042053222656, | |
| "learning_rate": 6.49911674963867e-06, | |
| "loss": 1.1793, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.626837363679469, | |
| "grad_norm": 1.7409764528274536, | |
| "learning_rate": 6.338525774851453e-06, | |
| "loss": 1.1963, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.636320531057373, | |
| "grad_norm": 1.4706058502197266, | |
| "learning_rate": 6.177934800064237e-06, | |
| "loss": 1.1836, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.6458036984352775, | |
| "grad_norm": 1.3871138095855713, | |
| "learning_rate": 6.01734382527702e-06, | |
| "loss": 1.1669, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 2.6552868658131814, | |
| "grad_norm": 1.5841022729873657, | |
| "learning_rate": 5.856752850489803e-06, | |
| "loss": 1.1765, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.6552868658131814, | |
| "eval_loss": 1.1325418949127197, | |
| "eval_runtime": 72.1699, | |
| "eval_samples_per_second": 129.888, | |
| "eval_steps_per_second": 16.239, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.6647700331910857, | |
| "grad_norm": 1.2488940954208374, | |
| "learning_rate": 5.696161875702586e-06, | |
| "loss": 1.1581, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 2.67425320056899, | |
| "grad_norm": 1.633123517036438, | |
| "learning_rate": 5.535570900915369e-06, | |
| "loss": 1.1829, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 2.6837363679468944, | |
| "grad_norm": 1.558030366897583, | |
| "learning_rate": 5.374979926128152e-06, | |
| "loss": 1.1816, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 2.6932195353247987, | |
| "grad_norm": 1.5178041458129883, | |
| "learning_rate": 5.214388951340935e-06, | |
| "loss": 1.1789, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 2.7027027027027026, | |
| "grad_norm": 1.8317012786865234, | |
| "learning_rate": 5.053797976553718e-06, | |
| "loss": 1.1612, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.7027027027027026, | |
| "eval_loss": 1.1320453882217407, | |
| "eval_runtime": 72.3445, | |
| "eval_samples_per_second": 129.575, | |
| "eval_steps_per_second": 16.2, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.712185870080607, | |
| "grad_norm": 1.4248275756835938, | |
| "learning_rate": 4.893207001766502e-06, | |
| "loss": 1.1583, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 2.7216690374585113, | |
| "grad_norm": 1.3696835041046143, | |
| "learning_rate": 4.732616026979284e-06, | |
| "loss": 1.1302, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 2.731152204836415, | |
| "grad_norm": 1.4212887287139893, | |
| "learning_rate": 4.5720250521920675e-06, | |
| "loss": 1.1396, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.7406353722143195, | |
| "grad_norm": 1.6230417490005493, | |
| "learning_rate": 4.41143407740485e-06, | |
| "loss": 1.167, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 2.750118539592224, | |
| "grad_norm": 1.4556254148483276, | |
| "learning_rate": 4.252449012365505e-06, | |
| "loss": 1.2229, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.750118539592224, | |
| "eval_loss": 1.1307094097137451, | |
| "eval_runtime": 72.2019, | |
| "eval_samples_per_second": 129.83, | |
| "eval_steps_per_second": 16.232, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.759601706970128, | |
| "grad_norm": 1.399604082107544, | |
| "learning_rate": 4.091858037578288e-06, | |
| "loss": 1.183, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 2.769084874348032, | |
| "grad_norm": 1.3562369346618652, | |
| "learning_rate": 3.931267062791071e-06, | |
| "loss": 1.1729, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 2.7785680417259364, | |
| "grad_norm": 1.4427545070648193, | |
| "learning_rate": 3.7706760880038542e-06, | |
| "loss": 1.1636, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 2.7880512091038407, | |
| "grad_norm": 1.6153539419174194, | |
| "learning_rate": 3.610085113216637e-06, | |
| "loss": 1.1608, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 2.797534376481745, | |
| "grad_norm": 1.553841233253479, | |
| "learning_rate": 3.44949413842942e-06, | |
| "loss": 1.1727, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.797534376481745, | |
| "eval_loss": 1.1305798292160034, | |
| "eval_runtime": 72.4369, | |
| "eval_samples_per_second": 129.409, | |
| "eval_steps_per_second": 16.18, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.807017543859649, | |
| "grad_norm": 1.4503796100616455, | |
| "learning_rate": 3.2889031636422036e-06, | |
| "loss": 1.1533, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 2.8165007112375533, | |
| "grad_norm": 2.3234095573425293, | |
| "learning_rate": 3.1283121888549865e-06, | |
| "loss": 1.1849, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 2.8259838786154576, | |
| "grad_norm": 1.6692347526550293, | |
| "learning_rate": 2.9677212140677695e-06, | |
| "loss": 1.1629, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 2.8354670459933615, | |
| "grad_norm": 1.6683822870254517, | |
| "learning_rate": 2.8071302392805524e-06, | |
| "loss": 1.1584, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 2.844950213371266, | |
| "grad_norm": 1.371102213859558, | |
| "learning_rate": 2.6465392644933354e-06, | |
| "loss": 1.1208, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.844950213371266, | |
| "eval_loss": 1.1299171447753906, | |
| "eval_runtime": 72.1885, | |
| "eval_samples_per_second": 129.854, | |
| "eval_steps_per_second": 16.235, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.85443338074917, | |
| "grad_norm": 1.9285227060317993, | |
| "learning_rate": 2.4859482897061184e-06, | |
| "loss": 1.1871, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 2.8639165481270745, | |
| "grad_norm": 1.5394768714904785, | |
| "learning_rate": 2.3253573149189017e-06, | |
| "loss": 1.1786, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 2.873399715504979, | |
| "grad_norm": 1.606779932975769, | |
| "learning_rate": 2.1647663401316847e-06, | |
| "loss": 1.181, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 2.8828828828828827, | |
| "grad_norm": 1.6637898683547974, | |
| "learning_rate": 2.0041753653444677e-06, | |
| "loss": 1.1435, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 2.892366050260787, | |
| "grad_norm": 1.4190491437911987, | |
| "learning_rate": 1.8435843905572506e-06, | |
| "loss": 1.158, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.892366050260787, | |
| "eval_loss": 1.129961371421814, | |
| "eval_runtime": 72.2984, | |
| "eval_samples_per_second": 129.657, | |
| "eval_steps_per_second": 16.211, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.9018492176386914, | |
| "grad_norm": 1.3839406967163086, | |
| "learning_rate": 1.6829934157700338e-06, | |
| "loss": 1.1716, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 2.9113323850165953, | |
| "grad_norm": 1.2562811374664307, | |
| "learning_rate": 1.5224024409828168e-06, | |
| "loss": 1.1466, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 2.9208155523944996, | |
| "grad_norm": 1.4180203676223755, | |
| "learning_rate": 1.3618114661955997e-06, | |
| "loss": 1.1405, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 2.930298719772404, | |
| "grad_norm": 1.7891360521316528, | |
| "learning_rate": 1.2012204914083829e-06, | |
| "loss": 1.1591, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 2.9397818871503083, | |
| "grad_norm": 1.7551426887512207, | |
| "learning_rate": 1.0406295166211659e-06, | |
| "loss": 1.1833, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.9397818871503083, | |
| "eval_loss": 1.129394292831421, | |
| "eval_runtime": 72.2972, | |
| "eval_samples_per_second": 129.659, | |
| "eval_steps_per_second": 16.211, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.9492650545282126, | |
| "grad_norm": 1.4321238994598389, | |
| "learning_rate": 8.800385418339489e-07, | |
| "loss": 1.1879, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 2.9587482219061165, | |
| "grad_norm": 1.732853651046753, | |
| "learning_rate": 7.210534767946041e-07, | |
| "loss": 1.1682, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 2.968231389284021, | |
| "grad_norm": 1.473656415939331, | |
| "learning_rate": 5.604625020073872e-07, | |
| "loss": 1.1708, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 2.977714556661925, | |
| "grad_norm": 1.2021667957305908, | |
| "learning_rate": 3.998715272201702e-07, | |
| "loss": 1.1679, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 2.987197724039829, | |
| "grad_norm": 1.4972681999206543, | |
| "learning_rate": 2.4088646218082545e-07, | |
| "loss": 1.1678, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.987197724039829, | |
| "eval_loss": 1.129324197769165, | |
| "eval_runtime": 72.2219, | |
| "eval_samples_per_second": 129.794, | |
| "eval_steps_per_second": 16.228, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.9966808914177334, | |
| "grad_norm": 1.7410774230957031, | |
| "learning_rate": 8.029548739360848e-08, | |
| "loss": 1.1645, | |
| "step": 31600 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 31635, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.32254007164928e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |