{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 421,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0023788284269997025,
      "grad_norm": 1.0281301736831665,
      "learning_rate": 0.0,
      "loss": 2.0766,
      "step": 1
    },
    {
      "epoch": 0.004757656853999405,
      "grad_norm": 1.2767698764801025,
      "learning_rate": 1e-05,
      "loss": 2.3525,
      "step": 2
    },
    {
      "epoch": 0.007136485280999108,
      "grad_norm": 1.3647927045822144,
      "learning_rate": 2e-05,
      "loss": 2.4261,
      "step": 3
    },
    {
      "epoch": 0.00951531370799881,
      "grad_norm": 0.8589039444923401,
      "learning_rate": 3e-05,
      "loss": 1.7271,
      "step": 4
    },
    {
      "epoch": 0.011894142134998514,
      "grad_norm": 1.1814510822296143,
      "learning_rate": 4e-05,
      "loss": 2.1379,
      "step": 5
    },
    {
      "epoch": 0.014272970561998216,
      "grad_norm": 1.012987732887268,
      "learning_rate": 5e-05,
      "loss": 1.8598,
      "step": 6
    },
    {
      "epoch": 0.016651798988997917,
      "grad_norm": 1.052037239074707,
      "learning_rate": 6e-05,
      "loss": 1.9315,
      "step": 7
    },
    {
      "epoch": 0.01903062741599762,
      "grad_norm": 0.8621532320976257,
      "learning_rate": 7e-05,
      "loss": 1.7205,
      "step": 8
    },
    {
      "epoch": 0.021409455842997322,
      "grad_norm": 0.7924615740776062,
      "learning_rate": 8e-05,
      "loss": 1.589,
      "step": 9
    },
    {
      "epoch": 0.023788284269997028,
      "grad_norm": 0.6322035789489746,
      "learning_rate": 9e-05,
      "loss": 1.3374,
      "step": 10
    },
    {
      "epoch": 0.02616711269699673,
      "grad_norm": 0.7854194641113281,
      "learning_rate": 0.0001,
      "loss": 1.4292,
      "step": 11
    },
    {
      "epoch": 0.028545941123996433,
      "grad_norm": 0.6697576642036438,
      "learning_rate": 9.975669099756692e-05,
      "loss": 1.206,
      "step": 12
    },
    {
      "epoch": 0.030924769550996135,
      "grad_norm": 0.7266756296157837,
      "learning_rate": 9.951338199513383e-05,
      "loss": 1.2737,
      "step": 13
    },
    {
      "epoch": 0.033303597977995834,
      "grad_norm": 0.7196414470672607,
      "learning_rate": 9.927007299270074e-05,
      "loss": 1.2023,
      "step": 14
    },
    {
      "epoch": 0.03568242640499554,
      "grad_norm": 0.7868577241897583,
      "learning_rate": 9.902676399026765e-05,
      "loss": 1.1486,
      "step": 15
    },
    {
      "epoch": 0.03806125483199524,
      "grad_norm": 0.7759497761726379,
      "learning_rate": 9.878345498783455e-05,
      "loss": 1.112,
      "step": 16
    },
    {
      "epoch": 0.040440083258994945,
      "grad_norm": 0.7133831977844238,
      "learning_rate": 9.854014598540146e-05,
      "loss": 1.0158,
      "step": 17
    },
    {
      "epoch": 0.042818911685994644,
      "grad_norm": 0.644658088684082,
      "learning_rate": 9.829683698296837e-05,
      "loss": 0.9656,
      "step": 18
    },
    {
      "epoch": 0.04519774011299435,
      "grad_norm": 0.6295652985572815,
      "learning_rate": 9.805352798053529e-05,
      "loss": 0.9295,
      "step": 19
    },
    {
      "epoch": 0.047576568539994056,
      "grad_norm": 0.6460481882095337,
      "learning_rate": 9.78102189781022e-05,
      "loss": 0.9851,
      "step": 20
    },
    {
      "epoch": 0.049955396966993755,
      "grad_norm": 0.5485146045684814,
      "learning_rate": 9.756690997566911e-05,
      "loss": 0.864,
      "step": 21
    },
    {
      "epoch": 0.05233422539399346,
      "grad_norm": 0.6552495360374451,
      "learning_rate": 9.732360097323602e-05,
      "loss": 0.7903,
      "step": 22
    },
    {
      "epoch": 0.05471305382099316,
      "grad_norm": 0.5481061935424805,
      "learning_rate": 9.708029197080293e-05,
      "loss": 0.843,
      "step": 23
    },
    {
      "epoch": 0.057091882247992866,
      "grad_norm": 0.6262676119804382,
      "learning_rate": 9.683698296836983e-05,
      "loss": 0.9421,
      "step": 24
    },
    {
      "epoch": 0.059470710674992565,
      "grad_norm": 0.5493852496147156,
      "learning_rate": 9.659367396593674e-05,
      "loss": 0.9045,
      "step": 25
    },
    {
      "epoch": 0.06184953910199227,
      "grad_norm": 0.542617678642273,
      "learning_rate": 9.635036496350366e-05,
      "loss": 0.8028,
      "step": 26
    },
    {
      "epoch": 0.06422836752899197,
      "grad_norm": 0.4842965602874756,
      "learning_rate": 9.610705596107057e-05,
      "loss": 0.716,
      "step": 27
    },
    {
      "epoch": 0.06660719595599167,
      "grad_norm": 0.5457621216773987,
      "learning_rate": 9.586374695863748e-05,
      "loss": 0.7069,
      "step": 28
    },
    {
      "epoch": 0.06898602438299138,
      "grad_norm": 0.5163421034812927,
      "learning_rate": 9.562043795620439e-05,
      "loss": 0.6999,
      "step": 29
    },
    {
      "epoch": 0.07136485280999108,
      "grad_norm": 0.4519389569759369,
      "learning_rate": 9.537712895377129e-05,
      "loss": 0.6219,
      "step": 30
    },
    {
      "epoch": 0.07374368123699078,
      "grad_norm": 0.5117298364639282,
      "learning_rate": 9.51338199513382e-05,
      "loss": 0.597,
      "step": 31
    },
    {
      "epoch": 0.07612250966399048,
      "grad_norm": 0.5518686771392822,
      "learning_rate": 9.489051094890511e-05,
      "loss": 0.6789,
      "step": 32
    },
    {
      "epoch": 0.07850133809099019,
      "grad_norm": 0.5353817939758301,
      "learning_rate": 9.464720194647201e-05,
      "loss": 0.6493,
      "step": 33
    },
    {
      "epoch": 0.08088016651798989,
      "grad_norm": 0.48304057121276855,
      "learning_rate": 9.440389294403893e-05,
      "loss": 0.5638,
      "step": 34
    },
    {
      "epoch": 0.08325899494498959,
      "grad_norm": 0.42356157302856445,
      "learning_rate": 9.416058394160584e-05,
      "loss": 0.5589,
      "step": 35
    },
    {
      "epoch": 0.08563782337198929,
      "grad_norm": 0.39856433868408203,
      "learning_rate": 9.391727493917275e-05,
      "loss": 0.5208,
      "step": 36
    },
    {
      "epoch": 0.088016651798989,
      "grad_norm": 0.4774276912212372,
      "learning_rate": 9.367396593673966e-05,
      "loss": 0.6367,
      "step": 37
    },
    {
      "epoch": 0.0903954802259887,
      "grad_norm": 0.48944222927093506,
      "learning_rate": 9.343065693430657e-05,
      "loss": 0.6261,
      "step": 38
    },
    {
      "epoch": 0.0927743086529884,
      "grad_norm": 0.4067060053348541,
      "learning_rate": 9.318734793187348e-05,
      "loss": 0.5512,
      "step": 39
    },
    {
      "epoch": 0.09515313707998811,
      "grad_norm": 0.4157446622848511,
      "learning_rate": 9.29440389294404e-05,
      "loss": 0.5823,
      "step": 40
    },
    {
      "epoch": 0.09753196550698781,
      "grad_norm": 0.43023228645324707,
      "learning_rate": 9.27007299270073e-05,
      "loss": 0.5499,
      "step": 41
    },
    {
      "epoch": 0.09991079393398751,
      "grad_norm": 0.40139999985694885,
      "learning_rate": 9.245742092457421e-05,
      "loss": 0.5808,
      "step": 42
    },
    {
      "epoch": 0.10228962236098721,
      "grad_norm": 0.4564228653907776,
      "learning_rate": 9.221411192214112e-05,
      "loss": 0.652,
      "step": 43
    },
    {
      "epoch": 0.10466845078798692,
      "grad_norm": 0.40338802337646484,
      "learning_rate": 9.197080291970803e-05,
      "loss": 0.5065,
      "step": 44
    },
    {
      "epoch": 0.10704727921498662,
      "grad_norm": 0.5054526925086975,
      "learning_rate": 9.172749391727494e-05,
      "loss": 0.6194,
      "step": 45
    },
    {
      "epoch": 0.10942610764198632,
      "grad_norm": 0.44123998284339905,
      "learning_rate": 9.148418491484186e-05,
      "loss": 0.5657,
      "step": 46
    },
    {
      "epoch": 0.11180493606898602,
      "grad_norm": 0.4445621371269226,
      "learning_rate": 9.124087591240877e-05,
      "loss": 0.5808,
      "step": 47
    },
    {
      "epoch": 0.11418376449598573,
      "grad_norm": 0.3913380801677704,
      "learning_rate": 9.099756690997568e-05,
      "loss": 0.5816,
      "step": 48
    },
    {
      "epoch": 0.11656259292298543,
      "grad_norm": 0.43966594338417053,
      "learning_rate": 9.075425790754258e-05,
      "loss": 0.4836,
      "step": 49
    },
    {
      "epoch": 0.11894142134998513,
      "grad_norm": 0.4385397434234619,
      "learning_rate": 9.051094890510949e-05,
      "loss": 0.5962,
      "step": 50
    },
    {
      "epoch": 0.12132024977698483,
      "grad_norm": 0.41545379161834717,
      "learning_rate": 9.02676399026764e-05,
      "loss": 0.5748,
      "step": 51
    },
    {
      "epoch": 0.12369907820398454,
      "grad_norm": 0.3504558801651001,
      "learning_rate": 9.002433090024331e-05,
      "loss": 0.5202,
      "step": 52
    },
    {
      "epoch": 0.12607790663098423,
      "grad_norm": 0.44601067900657654,
      "learning_rate": 8.978102189781023e-05,
      "loss": 0.5451,
      "step": 53
    },
    {
      "epoch": 0.12845673505798394,
      "grad_norm": 0.3965899646282196,
      "learning_rate": 8.953771289537714e-05,
      "loss": 0.5578,
      "step": 54
    },
    {
      "epoch": 0.13083556348498365,
      "grad_norm": 0.44455817341804504,
      "learning_rate": 8.929440389294405e-05,
      "loss": 0.4968,
      "step": 55
    },
    {
      "epoch": 0.13321439191198334,
      "grad_norm": 0.45583397150039673,
      "learning_rate": 8.905109489051096e-05,
      "loss": 0.598,
      "step": 56
    },
    {
      "epoch": 0.13559322033898305,
      "grad_norm": 0.46639132499694824,
      "learning_rate": 8.880778588807786e-05,
      "loss": 0.5857,
      "step": 57
    },
    {
      "epoch": 0.13797204876598276,
      "grad_norm": 0.4358392655849457,
      "learning_rate": 8.856447688564477e-05,
      "loss": 0.5299,
      "step": 58
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.4803287088871002,
      "learning_rate": 8.832116788321168e-05,
      "loss": 0.5082,
      "step": 59
    },
    {
      "epoch": 0.14272970561998216,
      "grad_norm": 0.41488590836524963,
      "learning_rate": 8.80778588807786e-05,
      "loss": 0.5223,
      "step": 60
    },
    {
      "epoch": 0.14510853404698187,
      "grad_norm": 0.46010011434555054,
      "learning_rate": 8.783454987834551e-05,
      "loss": 0.5125,
      "step": 61
    },
    {
      "epoch": 0.14748736247398156,
      "grad_norm": 0.3907822370529175,
      "learning_rate": 8.759124087591242e-05,
      "loss": 0.5121,
      "step": 62
    },
    {
      "epoch": 0.14986619090098127,
      "grad_norm": 0.4548039138317108,
      "learning_rate": 8.734793187347933e-05,
      "loss": 0.577,
      "step": 63
    },
    {
      "epoch": 0.15224501932798096,
      "grad_norm": 0.3938556909561157,
      "learning_rate": 8.710462287104624e-05,
      "loss": 0.4726,
      "step": 64
    },
    {
      "epoch": 0.15462384775498067,
      "grad_norm": 0.46850043535232544,
      "learning_rate": 8.686131386861314e-05,
      "loss": 0.5203,
      "step": 65
    },
    {
      "epoch": 0.15700267618198038,
      "grad_norm": 0.43241533637046814,
      "learning_rate": 8.661800486618005e-05,
      "loss": 0.4683,
      "step": 66
    },
    {
      "epoch": 0.15938150460898007,
      "grad_norm": 0.43365252017974854,
      "learning_rate": 8.637469586374697e-05,
      "loss": 0.5508,
      "step": 67
    },
    {
      "epoch": 0.16176033303597978,
      "grad_norm": 0.4271024763584137,
      "learning_rate": 8.613138686131386e-05,
      "loss": 0.4992,
      "step": 68
    },
    {
      "epoch": 0.1641391614629795,
      "grad_norm": 0.4089069366455078,
      "learning_rate": 8.588807785888078e-05,
      "loss": 0.4575,
      "step": 69
    },
    {
      "epoch": 0.16651798988997918,
      "grad_norm": 0.43635204434394836,
      "learning_rate": 8.564476885644769e-05,
      "loss": 0.5513,
      "step": 70
    },
    {
      "epoch": 0.1688968183169789,
      "grad_norm": 0.4246712625026703,
      "learning_rate": 8.54014598540146e-05,
      "loss": 0.4759,
      "step": 71
    },
    {
      "epoch": 0.17127564674397858,
      "grad_norm": 0.463638573884964,
      "learning_rate": 8.515815085158151e-05,
      "loss": 0.5323,
      "step": 72
    },
    {
      "epoch": 0.1736544751709783,
      "grad_norm": 0.4437566101551056,
      "learning_rate": 8.491484184914842e-05,
      "loss": 0.5446,
      "step": 73
    },
    {
      "epoch": 0.176033303597978,
      "grad_norm": 0.4280001223087311,
      "learning_rate": 8.467153284671534e-05,
      "loss": 0.5092,
      "step": 74
    },
    {
      "epoch": 0.1784121320249777,
      "grad_norm": 0.4512069523334503,
      "learning_rate": 8.442822384428223e-05,
      "loss": 0.5002,
      "step": 75
    },
    {
      "epoch": 0.1807909604519774,
      "grad_norm": 0.39247390627861023,
      "learning_rate": 8.418491484184915e-05,
      "loss": 0.4522,
      "step": 76
    },
    {
      "epoch": 0.1831697888789771,
      "grad_norm": 0.531852662563324,
      "learning_rate": 8.394160583941606e-05,
      "loss": 0.5507,
      "step": 77
    },
    {
      "epoch": 0.1855486173059768,
      "grad_norm": 0.49727141857147217,
      "learning_rate": 8.369829683698297e-05,
      "loss": 0.519,
      "step": 78
    },
    {
      "epoch": 0.1879274457329765,
      "grad_norm": 0.46292659640312195,
      "learning_rate": 8.345498783454988e-05,
      "loss": 0.5271,
      "step": 79
    },
    {
      "epoch": 0.19030627415997622,
      "grad_norm": 0.4514133036136627,
      "learning_rate": 8.32116788321168e-05,
      "loss": 0.5499,
      "step": 80
    },
    {
      "epoch": 0.1926851025869759,
      "grad_norm": 0.42186304926872253,
      "learning_rate": 8.29683698296837e-05,
      "loss": 0.541,
      "step": 81
    },
    {
      "epoch": 0.19506393101397562,
      "grad_norm": 0.4527208209037781,
      "learning_rate": 8.272506082725062e-05,
      "loss": 0.482,
      "step": 82
    },
    {
      "epoch": 0.1974427594409753,
      "grad_norm": 0.4818190634250641,
      "learning_rate": 8.248175182481752e-05,
      "loss": 0.5627,
      "step": 83
    },
    {
      "epoch": 0.19982158786797502,
      "grad_norm": 0.44998934864997864,
      "learning_rate": 8.223844282238443e-05,
      "loss": 0.5169,
      "step": 84
    },
    {
      "epoch": 0.20220041629497473,
      "grad_norm": 0.42806658148765564,
      "learning_rate": 8.199513381995134e-05,
      "loss": 0.5613,
      "step": 85
    },
    {
      "epoch": 0.20457924472197442,
      "grad_norm": 0.4187542796134949,
      "learning_rate": 8.175182481751825e-05,
      "loss": 0.4584,
      "step": 86
    },
    {
      "epoch": 0.20695807314897413,
      "grad_norm": 0.5182843804359436,
      "learning_rate": 8.150851581508516e-05,
      "loss": 0.5253,
      "step": 87
    },
    {
      "epoch": 0.20933690157597384,
      "grad_norm": 0.4145914614200592,
      "learning_rate": 8.126520681265208e-05,
      "loss": 0.4828,
      "step": 88
    },
    {
      "epoch": 0.21171573000297353,
      "grad_norm": 0.46647554636001587,
      "learning_rate": 8.102189781021899e-05,
      "loss": 0.4496,
      "step": 89
    },
    {
      "epoch": 0.21409455842997324,
      "grad_norm": 0.4022408723831177,
      "learning_rate": 8.07785888077859e-05,
      "loss": 0.4329,
      "step": 90
    },
    {
      "epoch": 0.21647338685697295,
      "grad_norm": 0.4754613935947418,
      "learning_rate": 8.05352798053528e-05,
      "loss": 0.5314,
      "step": 91
    },
    {
      "epoch": 0.21885221528397264,
      "grad_norm": 0.41736510396003723,
      "learning_rate": 8.029197080291971e-05,
      "loss": 0.4906,
      "step": 92
    },
    {
      "epoch": 0.22123104371097235,
      "grad_norm": 0.4199044406414032,
      "learning_rate": 8.004866180048662e-05,
      "loss": 0.495,
      "step": 93
    },
    {
      "epoch": 0.22360987213797204,
      "grad_norm": 0.4374317526817322,
      "learning_rate": 7.980535279805353e-05,
      "loss": 0.533,
      "step": 94
    },
    {
      "epoch": 0.22598870056497175,
      "grad_norm": 0.4060494899749756,
      "learning_rate": 7.956204379562045e-05,
      "loss": 0.4711,
      "step": 95
    },
    {
      "epoch": 0.22836752899197146,
      "grad_norm": 0.49371036887168884,
      "learning_rate": 7.931873479318736e-05,
      "loss": 0.5417,
      "step": 96
    },
    {
      "epoch": 0.23074635741897115,
      "grad_norm": 0.4903344511985779,
      "learning_rate": 7.907542579075427e-05,
      "loss": 0.4595,
      "step": 97
    },
    {
      "epoch": 0.23312518584597086,
      "grad_norm": 0.4055982530117035,
      "learning_rate": 7.883211678832118e-05,
      "loss": 0.4926,
      "step": 98
    },
    {
      "epoch": 0.23550401427297057,
      "grad_norm": 0.4552580416202545,
      "learning_rate": 7.858880778588808e-05,
      "loss": 0.5644,
      "step": 99
    },
    {
      "epoch": 0.23788284269997026,
      "grad_norm": 0.4261854588985443,
      "learning_rate": 7.834549878345499e-05,
      "loss": 0.4915,
      "step": 100
    },
    {
      "epoch": 0.23788284269997026,
      "eval_loss": 0.48822253942489624,
      "eval_runtime": 27.0528,
      "eval_samples_per_second": 27.65,
      "eval_steps_per_second": 13.825,
      "step": 100
    },
    {
      "epoch": 0.24026167112696997,
      "grad_norm": 0.48629841208457947,
      "learning_rate": 7.81021897810219e-05,
      "loss": 0.4824,
      "step": 101
    },
    {
      "epoch": 0.24264049955396966,
      "grad_norm": 0.4699622094631195,
      "learning_rate": 7.785888077858882e-05,
      "loss": 0.4642,
      "step": 102
    },
    {
      "epoch": 0.24501932798096937,
      "grad_norm": 0.39009714126586914,
      "learning_rate": 7.761557177615573e-05,
      "loss": 0.4614,
      "step": 103
    },
    {
      "epoch": 0.24739815640796908,
      "grad_norm": 0.4852162003517151,
      "learning_rate": 7.737226277372264e-05,
      "loss": 0.5112,
      "step": 104
    },
    {
      "epoch": 0.24977698483496877,
      "grad_norm": 0.5088614821434021,
      "learning_rate": 7.712895377128954e-05,
      "loss": 0.4914,
      "step": 105
    },
    {
      "epoch": 0.25215581326196845,
      "grad_norm": 0.4846271276473999,
      "learning_rate": 7.688564476885645e-05,
      "loss": 0.4832,
      "step": 106
    },
    {
      "epoch": 0.25453464168896817,
      "grad_norm": 0.48459744453430176,
      "learning_rate": 7.664233576642336e-05,
      "loss": 0.5386,
      "step": 107
    },
    {
      "epoch": 0.2569134701159679,
      "grad_norm": 0.47507190704345703,
      "learning_rate": 7.639902676399027e-05,
      "loss": 0.4593,
      "step": 108
    },
    {
      "epoch": 0.2592922985429676,
      "grad_norm": 0.46605178713798523,
      "learning_rate": 7.615571776155717e-05,
      "loss": 0.5265,
      "step": 109
    },
    {
      "epoch": 0.2616711269699673,
      "grad_norm": 0.4616071283817291,
      "learning_rate": 7.591240875912408e-05,
      "loss": 0.4323,
      "step": 110
    },
    {
      "epoch": 0.264049955396967,
      "grad_norm": 0.5419961214065552,
      "learning_rate": 7.5669099756691e-05,
      "loss": 0.5403,
      "step": 111
    },
    {
      "epoch": 0.2664287838239667,
      "grad_norm": 0.4992631673812866,
      "learning_rate": 7.542579075425791e-05,
      "loss": 0.4558,
      "step": 112
    },
    {
      "epoch": 0.2688076122509664,
      "grad_norm": 0.5071640610694885,
      "learning_rate": 7.518248175182482e-05,
      "loss": 0.5385,
      "step": 113
    },
    {
      "epoch": 0.2711864406779661,
      "grad_norm": 0.5004437565803528,
      "learning_rate": 7.493917274939173e-05,
      "loss": 0.5106,
      "step": 114
    },
    {
      "epoch": 0.2735652691049658,
      "grad_norm": 0.4706288278102875,
      "learning_rate": 7.469586374695864e-05,
      "loss": 0.5333,
      "step": 115
    },
    {
      "epoch": 0.2759440975319655,
      "grad_norm": 0.5518192052841187,
      "learning_rate": 7.445255474452556e-05,
      "loss": 0.5945,
      "step": 116
    },
    {
      "epoch": 0.2783229259589652,
      "grad_norm": 0.4758547246456146,
      "learning_rate": 7.420924574209245e-05,
      "loss": 0.533,
      "step": 117
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.4491472840309143,
      "learning_rate": 7.396593673965937e-05,
      "loss": 0.4213,
      "step": 118
    },
    {
      "epoch": 0.2830805828129646,
      "grad_norm": 0.4540884494781494,
      "learning_rate": 7.372262773722628e-05,
      "loss": 0.4747,
      "step": 119
    },
    {
      "epoch": 0.2854594112399643,
      "grad_norm": 0.4252176880836487,
      "learning_rate": 7.347931873479319e-05,
      "loss": 0.4639,
      "step": 120
    },
    {
      "epoch": 0.28783823966696404,
      "grad_norm": 0.4695313274860382,
      "learning_rate": 7.32360097323601e-05,
      "loss": 0.5563,
      "step": 121
    },
    {
      "epoch": 0.29021706809396375,
      "grad_norm": 0.49951449036598206,
      "learning_rate": 7.299270072992701e-05,
      "loss": 0.4517,
      "step": 122
    },
    {
      "epoch": 0.2925958965209634,
      "grad_norm": 0.4763878285884857,
      "learning_rate": 7.274939172749393e-05,
      "loss": 0.5223,
      "step": 123
    },
    {
      "epoch": 0.2949747249479631,
      "grad_norm": 0.49518251419067383,
      "learning_rate": 7.250608272506084e-05,
      "loss": 0.5832,
      "step": 124
    },
    {
      "epoch": 0.29735355337496283,
      "grad_norm": 0.4721708595752716,
      "learning_rate": 7.226277372262774e-05,
      "loss": 0.5259,
      "step": 125
    },
    {
      "epoch": 0.29973238180196254,
      "grad_norm": 0.4356157183647156,
      "learning_rate": 7.201946472019465e-05,
      "loss": 0.408,
      "step": 126
    },
    {
      "epoch": 0.30211121022896226,
      "grad_norm": 0.4771776795387268,
      "learning_rate": 7.177615571776156e-05,
      "loss": 0.4909,
      "step": 127
    },
    {
      "epoch": 0.3044900386559619,
      "grad_norm": 0.5367064476013184,
      "learning_rate": 7.153284671532847e-05,
      "loss": 0.556,
      "step": 128
    },
    {
      "epoch": 0.3068688670829616,
      "grad_norm": 0.43037423491477966,
      "learning_rate": 7.128953771289538e-05,
      "loss": 0.3805,
      "step": 129
    },
    {
      "epoch": 0.30924769550996134,
      "grad_norm": 0.4576215445995331,
      "learning_rate": 7.10462287104623e-05,
      "loss": 0.437,
      "step": 130
    },
    {
      "epoch": 0.31162652393696105,
      "grad_norm": 0.46385103464126587,
      "learning_rate": 7.080291970802921e-05,
      "loss": 0.4561,
      "step": 131
    },
    {
      "epoch": 0.31400535236396077,
      "grad_norm": 0.4961807131767273,
      "learning_rate": 7.055961070559612e-05,
      "loss": 0.3691,
      "step": 132
    },
    {
      "epoch": 0.3163841807909605,
      "grad_norm": 0.5330118536949158,
      "learning_rate": 7.031630170316302e-05,
      "loss": 0.5341,
      "step": 133
    },
    {
      "epoch": 0.31876300921796014,
      "grad_norm": 0.4410998225212097,
      "learning_rate": 7.007299270072993e-05,
      "loss": 0.4867,
      "step": 134
    },
    {
      "epoch": 0.32114183764495985,
      "grad_norm": 0.4794428050518036,
      "learning_rate": 6.982968369829684e-05,
      "loss": 0.4818,
      "step": 135
    },
    {
      "epoch": 0.32352066607195956,
      "grad_norm": 0.49389636516571045,
      "learning_rate": 6.958637469586375e-05,
      "loss": 0.5156,
      "step": 136
    },
    {
      "epoch": 0.3258994944989593,
      "grad_norm": 0.49501174688339233,
      "learning_rate": 6.934306569343067e-05,
      "loss": 0.5122,
      "step": 137
    },
    {
      "epoch": 0.328278322925959,
      "grad_norm": 0.4967164695262909,
      "learning_rate": 6.909975669099758e-05,
      "loss": 0.4853,
      "step": 138
    },
    {
      "epoch": 0.33065715135295864,
      "grad_norm": 0.5276123881340027,
      "learning_rate": 6.885644768856449e-05,
      "loss": 0.4703,
      "step": 139
    },
    {
      "epoch": 0.33303597977995836,
      "grad_norm": 0.5336246490478516,
      "learning_rate": 6.86131386861314e-05,
      "loss": 0.4652,
      "step": 140
    },
    {
      "epoch": 0.33541480820695807,
      "grad_norm": 0.3898225426673889,
      "learning_rate": 6.83698296836983e-05,
      "loss": 0.3983,
      "step": 141
    },
    {
      "epoch": 0.3377936366339578,
      "grad_norm": 0.4705723226070404,
      "learning_rate": 6.81265206812652e-05,
      "loss": 0.4646,
      "step": 142
    },
    {
      "epoch": 0.3401724650609575,
      "grad_norm": 0.4973605275154114,
      "learning_rate": 6.788321167883211e-05,
      "loss": 0.4848,
      "step": 143
    },
    {
      "epoch": 0.34255129348795715,
      "grad_norm": 0.5203383564949036,
      "learning_rate": 6.763990267639902e-05,
      "loss": 0.4992,
      "step": 144
    },
    {
      "epoch": 0.34493012191495687,
      "grad_norm": 0.47000083327293396,
      "learning_rate": 6.739659367396593e-05,
      "loss": 0.4665,
      "step": 145
    },
    {
      "epoch": 0.3473089503419566,
      "grad_norm": 0.4483359456062317,
      "learning_rate": 6.715328467153285e-05,
      "loss": 0.4553,
      "step": 146
    },
    {
      "epoch": 0.3496877787689563,
      "grad_norm": 0.4584805965423584,
      "learning_rate": 6.690997566909976e-05,
      "loss": 0.4945,
      "step": 147
    },
    {
      "epoch": 0.352066607195956,
      "grad_norm": 0.5037974119186401,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.4723,
      "step": 148
    },
    {
      "epoch": 0.3544454356229557,
      "grad_norm": 0.4981141984462738,
      "learning_rate": 6.642335766423358e-05,
      "loss": 0.5417,
      "step": 149
    },
    {
      "epoch": 0.3568242640499554,
      "grad_norm": 0.49627479910850525,
      "learning_rate": 6.618004866180048e-05,
      "loss": 0.4569,
      "step": 150
    },
    {
      "epoch": 0.3592030924769551,
      "grad_norm": 0.49889135360717773,
      "learning_rate": 6.593673965936739e-05,
      "loss": 0.5391,
      "step": 151
    },
    {
      "epoch": 0.3615819209039548,
      "grad_norm": 0.5222321152687073,
      "learning_rate": 6.56934306569343e-05,
      "loss": 0.4482,
      "step": 152
    },
    {
      "epoch": 0.3639607493309545,
      "grad_norm": 0.48798415064811707,
      "learning_rate": 6.545012165450122e-05,
      "loss": 0.5148,
      "step": 153
    },
    {
      "epoch": 0.3663395777579542,
      "grad_norm": 0.4458797574043274,
      "learning_rate": 6.520681265206813e-05,
      "loss": 0.4516,
      "step": 154
    },
    {
      "epoch": 0.3687184061849539,
      "grad_norm": 0.4997321665287018,
      "learning_rate": 6.496350364963504e-05,
      "loss": 0.4769,
      "step": 155
    },
    {
      "epoch": 0.3710972346119536,
      "grad_norm": 0.5233137011528015,
      "learning_rate": 6.472019464720195e-05,
      "loss": 0.5108,
      "step": 156
    },
    {
      "epoch": 0.3734760630389533,
      "grad_norm": 0.5136488676071167,
      "learning_rate": 6.447688564476886e-05,
      "loss": 0.4755,
      "step": 157
    },
    {
      "epoch": 0.375854891465953,
      "grad_norm": 0.4736205041408539,
      "learning_rate": 6.423357664233576e-05,
      "loss": 0.3712,
      "step": 158
    },
    {
      "epoch": 0.37823371989295274,
      "grad_norm": 0.43539518117904663,
      "learning_rate": 6.399026763990267e-05,
      "loss": 0.416,
      "step": 159
    },
    {
      "epoch": 0.38061254831995245,
      "grad_norm": 0.44481900334358215,
      "learning_rate": 6.374695863746959e-05,
      "loss": 0.4628,
      "step": 160
    },
    {
      "epoch": 0.3829913767469521,
      "grad_norm": 0.45447418093681335,
      "learning_rate": 6.35036496350365e-05,
      "loss": 0.4273,
      "step": 161
    },
    {
      "epoch": 0.3853702051739518,
      "grad_norm": 0.43469730019569397,
      "learning_rate": 6.326034063260341e-05,
      "loss": 0.4584,
      "step": 162
    },
    {
      "epoch": 0.38774903360095153,
      "grad_norm": 0.4581772983074188,
      "learning_rate": 6.301703163017032e-05,
      "loss": 0.4379,
      "step": 163
    },
    {
      "epoch": 0.39012786202795124,
      "grad_norm": 0.42583826184272766,
      "learning_rate": 6.277372262773723e-05,
      "loss": 0.4336,
      "step": 164
    },
    {
      "epoch": 0.39250669045495096,
      "grad_norm": 0.5775184631347656,
      "learning_rate": 6.253041362530415e-05,
      "loss": 0.4901,
      "step": 165
    },
    {
      "epoch": 0.3948855188819506,
      "grad_norm": 0.4797421991825104,
      "learning_rate": 6.228710462287104e-05,
      "loss": 0.4319,
      "step": 166
    },
    {
      "epoch": 0.3972643473089503,
      "grad_norm": 0.5240707397460938,
      "learning_rate": 6.204379562043796e-05,
      "loss": 0.4638,
      "step": 167
    },
    {
      "epoch": 0.39964317573595004,
      "grad_norm": 0.5036562085151672,
      "learning_rate": 6.180048661800487e-05,
      "loss": 0.5198,
      "step": 168
    },
    {
      "epoch": 0.40202200416294975,
      "grad_norm": 0.5376377105712891,
      "learning_rate": 6.155717761557178e-05,
      "loss": 0.4564,
      "step": 169
    },
    {
      "epoch": 0.40440083258994947,
      "grad_norm": 0.5044119954109192,
      "learning_rate": 6.131386861313869e-05,
      "loss": 0.4269,
      "step": 170
    },
    {
      "epoch": 0.4067796610169492,
      "grad_norm": 0.5195197463035583,
      "learning_rate": 6.10705596107056e-05,
      "loss": 0.4866,
      "step": 171
    },
    {
      "epoch": 0.40915848944394884,
      "grad_norm": 0.4676225483417511,
      "learning_rate": 6.082725060827251e-05,
      "loss": 0.4509,
      "step": 172
    },
    {
      "epoch": 0.41153731787094855,
      "grad_norm": 0.4422922134399414,
      "learning_rate": 6.058394160583942e-05,
      "loss": 0.4498,
      "step": 173
    },
    {
      "epoch": 0.41391614629794826,
      "grad_norm": 0.5001342296600342,
      "learning_rate": 6.034063260340633e-05,
      "loss": 0.4743,
      "step": 174
    },
    {
      "epoch": 0.416294974724948,
      "grad_norm": 0.4759523272514343,
      "learning_rate": 6.0097323600973245e-05,
      "loss": 0.4514,
      "step": 175
    },
    {
      "epoch": 0.4186738031519477,
      "grad_norm": 0.46963706612586975,
      "learning_rate": 5.985401459854016e-05,
      "loss": 0.4776,
      "step": 176
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.49028345942497253,
      "learning_rate": 5.961070559610706e-05,
      "loss": 0.4632,
      "step": 177
    },
    {
      "epoch": 0.42343146000594706,
      "grad_norm": 0.4388274848461151,
      "learning_rate": 5.9367396593673974e-05,
      "loss": 0.4822,
      "step": 178
    },
    {
      "epoch": 0.42581028843294677,
      "grad_norm": 0.4349250793457031,
      "learning_rate": 5.9124087591240886e-05,
      "loss": 0.4468,
      "step": 179
    },
    {
      "epoch": 0.4281891168599465,
      "grad_norm": 0.4884635806083679,
      "learning_rate": 5.8880778588807784e-05,
      "loss": 0.4724,
      "step": 180
    },
    {
      "epoch": 0.4305679452869462,
      "grad_norm": 0.4598194360733032,
      "learning_rate": 5.8637469586374696e-05,
      "loss": 0.4762,
      "step": 181
    },
    {
      "epoch": 0.4329467737139459,
      "grad_norm": 0.5073289275169373,
      "learning_rate": 5.83941605839416e-05,
      "loss": 0.4732,
      "step": 182
    },
    {
      "epoch": 0.43532560214094557,
      "grad_norm": 0.4630277752876282,
      "learning_rate": 5.815085158150851e-05,
      "loss": 0.4301,
      "step": 183
    },
    {
      "epoch": 0.4377044305679453,
      "grad_norm": 0.42805591225624084,
      "learning_rate": 5.7907542579075425e-05,
      "loss": 0.47,
      "step": 184
    },
    {
      "epoch": 0.440083258994945,
      "grad_norm": 0.6053451299667358,
      "learning_rate": 5.766423357664234e-05,
      "loss": 0.4977,
      "step": 185
    },
    {
      "epoch": 0.4424620874219447,
      "grad_norm": 0.530331552028656,
      "learning_rate": 5.742092457420924e-05,
      "loss": 0.4492,
      "step": 186
    },
    {
      "epoch": 0.4448409158489444,
      "grad_norm": 0.46791204810142517,
      "learning_rate": 5.7177615571776154e-05,
      "loss": 0.4402,
      "step": 187
    },
    {
      "epoch": 0.4472197442759441,
      "grad_norm": 0.543755829334259,
      "learning_rate": 5.6934306569343066e-05,
      "loss": 0.5958,
      "step": 188
    },
    {
      "epoch": 0.4495985727029438,
      "grad_norm": 0.4579405188560486,
      "learning_rate": 5.669099756690998e-05,
      "loss": 0.4145,
      "step": 189
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 0.5464953184127808,
      "learning_rate": 5.644768856447688e-05,
      "loss": 0.4869,
      "step": 190
    },
    {
      "epoch": 0.4543562295569432,
      "grad_norm": 0.4730067551136017,
      "learning_rate": 5.6204379562043795e-05,
      "loss": 0.4432,
      "step": 191
    },
    {
      "epoch": 0.4567350579839429,
      "grad_norm": 0.4859981834888458,
      "learning_rate": 5.596107055961071e-05,
      "loss": 0.4376,
      "step": 192
    },
    {
      "epoch": 0.4591138864109426,
      "grad_norm": 0.4743499159812927,
      "learning_rate": 5.571776155717762e-05,
      "loss": 0.4542,
      "step": 193
    },
    {
      "epoch": 0.4614927148379423,
      "grad_norm": 0.5267781019210815,
      "learning_rate": 5.5474452554744524e-05,
      "loss": 0.4684,
      "step": 194
    },
    {
      "epoch": 0.463871543264942,
      "grad_norm": 0.5235748291015625,
      "learning_rate": 5.5231143552311436e-05,
      "loss": 0.4412,
      "step": 195
    },
    {
      "epoch": 0.4662503716919417,
      "grad_norm": 0.4266457259654999,
      "learning_rate": 5.498783454987835e-05,
      "loss": 0.3855,
      "step": 196
    },
    {
      "epoch": 0.46862920011894144,
      "grad_norm": 0.49032875895500183,
      "learning_rate": 5.474452554744526e-05,
      "loss": 0.4413,
      "step": 197
    },
    {
      "epoch": 0.47100802854594115,
      "grad_norm": 0.5913426280021667,
      "learning_rate": 5.4501216545012165e-05,
      "loss": 0.5653,
      "step": 198
    },
    {
      "epoch": 0.4733868569729408,
      "grad_norm": 0.5137138366699219,
      "learning_rate": 5.425790754257908e-05,
      "loss": 0.4835,
      "step": 199
    },
    {
      "epoch": 0.4757656853999405,
      "grad_norm": 0.43370646238327026,
      "learning_rate": 5.401459854014599e-05,
      "loss": 0.41,
      "step": 200
    },
    {
      "epoch": 0.4757656853999405,
      "eval_loss": 0.4463058412075043,
      "eval_runtime": 24.577,
      "eval_samples_per_second": 30.435,
      "eval_steps_per_second": 15.218,
      "step": 200
    },
    {
      "epoch": 0.47814451382694023,
      "grad_norm": 0.5559374690055847,
      "learning_rate": 5.37712895377129e-05,
      "loss": 0.5726,
      "step": 201
    },
    {
      "epoch": 0.48052334225393994,
      "grad_norm": 0.48858100175857544,
      "learning_rate": 5.3527980535279806e-05,
      "loss": 0.481,
      "step": 202
    },
    {
      "epoch": 0.48290217068093966,
      "grad_norm": 0.45764482021331787,
      "learning_rate": 5.328467153284672e-05,
      "loss": 0.4345,
      "step": 203
    },
    {
      "epoch": 0.4852809991079393,
      "grad_norm": 0.5081002116203308,
      "learning_rate": 5.304136253041363e-05,
      "loss": 0.4091,
      "step": 204
    },
    {
      "epoch": 0.487659827534939,
      "grad_norm": 0.47981277108192444,
      "learning_rate": 5.279805352798054e-05,
      "loss": 0.4198,
      "step": 205
    },
    {
      "epoch": 0.49003865596193874,
      "grad_norm": 0.5422961115837097,
      "learning_rate": 5.255474452554745e-05,
      "loss": 0.4645,
      "step": 206
    },
    {
      "epoch": 0.49241748438893845,
      "grad_norm": 0.5617045164108276,
      "learning_rate": 5.231143552311436e-05,
      "loss": 0.499,
      "step": 207
    },
    {
      "epoch": 0.49479631281593817,
      "grad_norm": 0.5472312569618225,
      "learning_rate": 5.206812652068127e-05,
      "loss": 0.4303,
      "step": 208
    },
    {
      "epoch": 0.4971751412429379,
      "grad_norm": 0.560845136642456,
      "learning_rate": 5.182481751824818e-05,
      "loss": 0.507,
      "step": 209
    },
    {
      "epoch": 0.49955396966993754,
      "grad_norm": 0.5158256888389587,
      "learning_rate": 5.158150851581509e-05,
      "loss": 0.5126,
      "step": 210
    },
    {
      "epoch": 0.5019327980969372,
      "grad_norm": 0.49810492992401123,
      "learning_rate": 5.1338199513382e-05,
      "loss": 0.3799,
      "step": 211
    },
    {
      "epoch": 0.5043116265239369,
      "grad_norm": 0.5269343852996826,
      "learning_rate": 5.109489051094891e-05,
      "loss": 0.5066,
      "step": 212
    },
    {
      "epoch": 0.5066904549509367,
      "grad_norm": 0.48883453011512756,
      "learning_rate": 5.0851581508515824e-05,
      "loss": 0.4593,
      "step": 213
    },
    {
      "epoch": 0.5090692833779363,
      "grad_norm": 0.5003328919410706,
      "learning_rate": 5.060827250608273e-05,
      "loss": 0.4672,
      "step": 214
    },
    {
      "epoch": 0.5114481118049361,
      "grad_norm": 0.5346857905387878,
      "learning_rate": 5.036496350364964e-05,
      "loss": 0.459,
      "step": 215
    },
    {
      "epoch": 0.5138269402319358,
      "grad_norm": 0.5018550157546997,
      "learning_rate": 5.012165450121655e-05,
      "loss": 0.427,
      "step": 216
    },
    {
      "epoch": 0.5162057686589355,
      "grad_norm": 0.5394279956817627,
      "learning_rate": 4.987834549878346e-05,
      "loss": 0.4906,
      "step": 217
    },
    {
      "epoch": 0.5185845970859352,
      "grad_norm": 0.41929885745048523,
      "learning_rate": 4.963503649635037e-05,
      "loss": 0.4719,
      "step": 218
    },
    {
      "epoch": 0.5209634255129348,
      "grad_norm": 0.5181702375411987,
      "learning_rate": 4.9391727493917275e-05,
      "loss": 0.4187,
      "step": 219
    },
    {
      "epoch": 0.5233422539399346,
      "grad_norm": 0.5157968997955322,
      "learning_rate": 4.914841849148419e-05,
      "loss": 0.3988,
      "step": 220
    },
    {
      "epoch": 0.5257210823669343,
      "grad_norm": 0.5713850259780884,
      "learning_rate": 4.89051094890511e-05,
      "loss": 0.4988,
      "step": 221
    },
    {
      "epoch": 0.528099910793934,
      "grad_norm": 0.5276979804039001,
      "learning_rate": 4.866180048661801e-05,
      "loss": 0.4289,
      "step": 222
    },
    {
      "epoch": 0.5304787392209337,
      "grad_norm": 0.5099546313285828,
      "learning_rate": 4.8418491484184916e-05,
      "loss": 0.4143,
      "step": 223
    },
    {
      "epoch": 0.5328575676479334,
      "grad_norm": 0.5845214128494263,
      "learning_rate": 4.817518248175183e-05,
      "loss": 0.528,
      "step": 224
    },
    {
      "epoch": 0.5352363960749331,
      "grad_norm": 0.5065467357635498,
      "learning_rate": 4.793187347931874e-05,
      "loss": 0.4109,
      "step": 225
    },
    {
      "epoch": 0.5376152245019328,
      "grad_norm": 0.46541669964790344,
      "learning_rate": 4.7688564476885646e-05,
      "loss": 0.4202,
      "step": 226
    },
    {
      "epoch": 0.5399940529289325,
      "grad_norm": 0.571874737739563,
      "learning_rate": 4.744525547445256e-05,
      "loss": 0.5088,
      "step": 227
    },
    {
      "epoch": 0.5423728813559322,
      "grad_norm": 0.542327880859375,
      "learning_rate": 4.720194647201946e-05,
      "loss": 0.458,
      "step": 228
    },
    {
      "epoch": 0.5447517097829319,
      "grad_norm": 0.48291677236557007,
      "learning_rate": 4.6958637469586375e-05,
      "loss": 0.3743,
      "step": 229
    },
    {
      "epoch": 0.5471305382099316,
      "grad_norm": 0.5330439805984497,
      "learning_rate": 4.6715328467153287e-05,
      "loss": 0.4217,
      "step": 230
    },
    {
      "epoch": 0.5495093666369313,
      "grad_norm": 0.47011399269104004,
      "learning_rate": 4.64720194647202e-05,
      "loss": 0.4582,
      "step": 231
    },
    {
      "epoch": 0.551888195063931,
      "grad_norm": 0.5513572692871094,
      "learning_rate": 4.6228710462287104e-05,
      "loss": 0.4527,
      "step": 232
    },
    {
      "epoch": 0.5542670234909307,
      "grad_norm": 0.61734539270401,
      "learning_rate": 4.5985401459854016e-05,
      "loss": 0.5963,
      "step": 233
    },
    {
      "epoch": 0.5566458519179304,
      "grad_norm": 0.4924643933773041,
      "learning_rate": 4.574209245742093e-05,
      "loss": 0.3916,
      "step": 234
    },
    {
      "epoch": 0.5590246803449301,
      "grad_norm": 0.4713283181190491,
      "learning_rate": 4.549878345498784e-05,
      "loss": 0.3963,
      "step": 235
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.4790879786014557,
      "learning_rate": 4.5255474452554745e-05,
      "loss": 0.4097,
      "step": 236
    },
    {
      "epoch": 0.5637823371989296,
      "grad_norm": 0.4752825200557709,
      "learning_rate": 4.5012165450121657e-05,
      "loss": 0.3671,
      "step": 237
    },
    {
      "epoch": 0.5661611656259292,
      "grad_norm": 0.6054632663726807,
      "learning_rate": 4.476885644768857e-05,
      "loss": 0.478,
      "step": 238
    },
    {
      "epoch": 0.568539994052929,
      "grad_norm": 0.5796523094177246,
      "learning_rate": 4.452554744525548e-05,
      "loss": 0.4677,
      "step": 239
    },
    {
      "epoch": 0.5709188224799286,
      "grad_norm": 0.7159845232963562,
      "learning_rate": 4.4282238442822386e-05,
      "loss": 0.6,
      "step": 240
    },
    {
      "epoch": 0.5732976509069283,
      "grad_norm": 0.4576638638973236,
      "learning_rate": 4.40389294403893e-05,
      "loss": 0.3984,
      "step": 241
    },
    {
      "epoch": 0.5756764793339281,
      "grad_norm": 0.5089020133018494,
      "learning_rate": 4.379562043795621e-05,
      "loss": 0.4245,
      "step": 242
    },
    {
      "epoch": 0.5780553077609277,
      "grad_norm": 0.4587903618812561,
      "learning_rate": 4.355231143552312e-05,
      "loss": 0.399,
      "step": 243
    },
    {
      "epoch": 0.5804341361879275,
      "grad_norm": 0.46936094760894775,
      "learning_rate": 4.3309002433090027e-05,
      "loss": 0.3892,
      "step": 244
    },
    {
      "epoch": 0.5828129646149272,
      "grad_norm": 0.5407763719558716,
      "learning_rate": 4.306569343065693e-05,
      "loss": 0.5055,
      "step": 245
    },
    {
      "epoch": 0.5851917930419268,
      "grad_norm": 0.5131598114967346,
      "learning_rate": 4.2822384428223844e-05,
      "loss": 0.469,
      "step": 246
    },
    {
      "epoch": 0.5875706214689266,
      "grad_norm": 0.48786240816116333,
      "learning_rate": 4.2579075425790756e-05,
      "loss": 0.3993,
      "step": 247
    },
    {
      "epoch": 0.5899494498959262,
      "grad_norm": 0.5068132281303406,
      "learning_rate": 4.233576642335767e-05,
      "loss": 0.4114,
      "step": 248
    },
    {
      "epoch": 0.592328278322926,
      "grad_norm": 0.5316363573074341,
      "learning_rate": 4.209245742092457e-05,
      "loss": 0.446,
      "step": 249
    },
    {
      "epoch": 0.5947071067499257,
      "grad_norm": 0.511885404586792,
      "learning_rate": 4.1849148418491485e-05,
      "loss": 0.4738,
      "step": 250
    },
    {
      "epoch": 0.5970859351769253,
      "grad_norm": 0.6263471245765686,
      "learning_rate": 4.16058394160584e-05,
      "loss": 0.5074,
      "step": 251
    },
    {
      "epoch": 0.5994647636039251,
      "grad_norm": 0.5359822511672974,
      "learning_rate": 4.136253041362531e-05,
      "loss": 0.4239,
      "step": 252
    },
    {
      "epoch": 0.6018435920309247,
      "grad_norm": 0.5764995217323303,
      "learning_rate": 4.1119221411192214e-05,
      "loss": 0.5247,
      "step": 253
    },
    {
      "epoch": 0.6042224204579245,
      "grad_norm": 0.5081723928451538,
      "learning_rate": 4.0875912408759126e-05,
      "loss": 0.4104,
      "step": 254
    },
    {
      "epoch": 0.6066012488849242,
      "grad_norm": 0.5557652115821838,
      "learning_rate": 4.063260340632604e-05,
      "loss": 0.4889,
      "step": 255
    },
    {
      "epoch": 0.6089800773119238,
      "grad_norm": 0.48941561579704285,
      "learning_rate": 4.038929440389295e-05,
      "loss": 0.4557,
      "step": 256
    },
    {
      "epoch": 0.6113589057389236,
      "grad_norm": 0.565647304058075,
      "learning_rate": 4.0145985401459855e-05,
      "loss": 0.4924,
      "step": 257
    },
    {
      "epoch": 0.6137377341659233,
      "grad_norm": 0.5029006004333496,
      "learning_rate": 3.990267639902677e-05,
      "loss": 0.4885,
      "step": 258
    },
    {
      "epoch": 0.616116562592923,
      "grad_norm": 0.5224466323852539,
      "learning_rate": 3.965936739659368e-05,
      "loss": 0.4433,
      "step": 259
    },
    {
      "epoch": 0.6184953910199227,
      "grad_norm": 0.6890097260475159,
      "learning_rate": 3.941605839416059e-05,
      "loss": 0.5399,
      "step": 260
    },
    {
      "epoch": 0.6208742194469223,
      "grad_norm": 0.48153895139694214,
      "learning_rate": 3.9172749391727496e-05,
      "loss": 0.3881,
      "step": 261
    },
    {
      "epoch": 0.6232530478739221,
      "grad_norm": 0.5819414258003235,
      "learning_rate": 3.892944038929441e-05,
      "loss": 0.4568,
      "step": 262
    },
    {
      "epoch": 0.6256318763009218,
      "grad_norm": 0.4706772267818451,
      "learning_rate": 3.868613138686132e-05,
      "loss": 0.4382,
      "step": 263
    },
    {
      "epoch": 0.6280107047279215,
      "grad_norm": 0.5553036332130432,
      "learning_rate": 3.8442822384428225e-05,
      "loss": 0.4611,
      "step": 264
    },
    {
      "epoch": 0.6303895331549212,
      "grad_norm": 0.5100443959236145,
      "learning_rate": 3.819951338199514e-05,
      "loss": 0.424,
      "step": 265
    },
    {
      "epoch": 0.632768361581921,
      "grad_norm": 0.47650212049484253,
      "learning_rate": 3.795620437956204e-05,
      "loss": 0.4642,
      "step": 266
    },
    {
      "epoch": 0.6351471900089206,
      "grad_norm": 0.5056859254837036,
      "learning_rate": 3.7712895377128954e-05,
      "loss": 0.4643,
      "step": 267
    },
    {
      "epoch": 0.6375260184359203,
      "grad_norm": 0.48869505524635315,
      "learning_rate": 3.7469586374695866e-05,
      "loss": 0.4058,
      "step": 268
    },
    {
      "epoch": 0.63990484686292,
      "grad_norm": 0.5686202645301819,
      "learning_rate": 3.722627737226278e-05,
      "loss": 0.4772,
      "step": 269
    },
    {
      "epoch": 0.6422836752899197,
      "grad_norm": 0.523289680480957,
      "learning_rate": 3.698296836982968e-05,
      "loss": 0.4257,
      "step": 270
    },
    {
      "epoch": 0.6446625037169195,
      "grad_norm": 0.4355628192424774,
      "learning_rate": 3.6739659367396595e-05,
      "loss": 0.4035,
      "step": 271
    },
    {
      "epoch": 0.6470413321439191,
      "grad_norm": 0.5667319297790527,
      "learning_rate": 3.649635036496351e-05,
      "loss": 0.5109,
      "step": 272
    },
    {
      "epoch": 0.6494201605709188,
      "grad_norm": 0.49869677424430847,
      "learning_rate": 3.625304136253042e-05,
      "loss": 0.4496,
      "step": 273
    },
    {
      "epoch": 0.6517989889979185,
      "grad_norm": 0.4514950215816498,
      "learning_rate": 3.6009732360097324e-05,
      "loss": 0.3948,
      "step": 274
    },
    {
      "epoch": 0.6541778174249182,
      "grad_norm": 0.5341079235076904,
      "learning_rate": 3.5766423357664236e-05,
      "loss": 0.5351,
      "step": 275
    },
    {
      "epoch": 0.656556645851918,
      "grad_norm": 0.5494990348815918,
      "learning_rate": 3.552311435523115e-05,
      "loss": 0.5355,
      "step": 276
    },
    {
      "epoch": 0.6589354742789176,
      "grad_norm": 0.5039032697677612,
      "learning_rate": 3.527980535279806e-05,
      "loss": 0.388,
      "step": 277
    },
    {
      "epoch": 0.6613143027059173,
      "grad_norm": 0.543323278427124,
      "learning_rate": 3.5036496350364965e-05,
      "loss": 0.5407,
      "step": 278
    },
    {
      "epoch": 0.6636931311329171,
      "grad_norm": 0.5307977795600891,
      "learning_rate": 3.479318734793188e-05,
      "loss": 0.4416,
      "step": 279
    },
    {
      "epoch": 0.6660719595599167,
      "grad_norm": 0.5810051560401917,
      "learning_rate": 3.454987834549879e-05,
      "loss": 0.4984,
      "step": 280
    },
    {
      "epoch": 0.6684507879869165,
      "grad_norm": 0.5557127594947815,
      "learning_rate": 3.43065693430657e-05,
      "loss": 0.4426,
      "step": 281
    },
    {
      "epoch": 0.6708296164139161,
      "grad_norm": 0.49247419834136963,
      "learning_rate": 3.40632603406326e-05,
      "loss": 0.4353,
      "step": 282
    },
    {
      "epoch": 0.6732084448409158,
      "grad_norm": 0.5093483328819275,
      "learning_rate": 3.381995133819951e-05,
      "loss": 0.4141,
      "step": 283
    },
    {
      "epoch": 0.6755872732679156,
      "grad_norm": 0.5625473856925964,
      "learning_rate": 3.357664233576642e-05,
      "loss": 0.5592,
      "step": 284
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.47916609048843384,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.4519,
      "step": 285
    },
    {
      "epoch": 0.680344930121915,
      "grad_norm": 0.514183759689331,
      "learning_rate": 3.309002433090024e-05,
      "loss": 0.3704,
      "step": 286
    },
    {
      "epoch": 0.6827237585489146,
      "grad_norm": 0.5638673305511475,
      "learning_rate": 3.284671532846715e-05,
      "loss": 0.4813,
      "step": 287
    },
    {
      "epoch": 0.6851025869759143,
      "grad_norm": 0.436166375875473,
      "learning_rate": 3.2603406326034064e-05,
      "loss": 0.4546,
      "step": 288
    },
    {
      "epoch": 0.6874814154029141,
      "grad_norm": 0.5311592221260071,
      "learning_rate": 3.2360097323600976e-05,
      "loss": 0.4948,
      "step": 289
    },
    {
      "epoch": 0.6898602438299137,
      "grad_norm": 0.5682978630065918,
      "learning_rate": 3.211678832116788e-05,
      "loss": 0.4595,
      "step": 290
    },
    {
      "epoch": 0.6922390722569135,
      "grad_norm": 0.5363884568214417,
      "learning_rate": 3.187347931873479e-05,
      "loss": 0.3572,
      "step": 291
    },
    {
      "epoch": 0.6946179006839132,
      "grad_norm": 0.517888069152832,
      "learning_rate": 3.1630170316301705e-05,
      "loss": 0.3949,
      "step": 292
    },
    {
      "epoch": 0.6969967291109129,
      "grad_norm": 0.5211926698684692,
      "learning_rate": 3.138686131386862e-05,
      "loss": 0.4904,
      "step": 293
    },
    {
      "epoch": 0.6993755575379126,
      "grad_norm": 0.5304282903671265,
      "learning_rate": 3.114355231143552e-05,
      "loss": 0.4632,
      "step": 294
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.4735037386417389,
      "learning_rate": 3.0900243309002434e-05,
      "loss": 0.4002,
      "step": 295
    },
    {
      "epoch": 0.704133214391912,
      "grad_norm": 0.5239513516426086,
      "learning_rate": 3.0656934306569346e-05,
      "loss": 0.4758,
      "step": 296
    },
    {
      "epoch": 0.7065120428189117,
      "grad_norm": 0.5408491492271423,
      "learning_rate": 3.0413625304136255e-05,
      "loss": 0.4993,
      "step": 297
    },
    {
      "epoch": 0.7088908712459114,
      "grad_norm": 0.6086730360984802,
      "learning_rate": 3.0170316301703166e-05,
      "loss": 0.5048,
      "step": 298
    },
    {
      "epoch": 0.7112696996729111,
      "grad_norm": 0.4720120131969452,
      "learning_rate": 2.992700729927008e-05,
      "loss": 0.4215,
      "step": 299
    },
    {
      "epoch": 0.7136485280999108,
      "grad_norm": 0.5266088843345642,
      "learning_rate": 2.9683698296836987e-05,
      "loss": 0.4553,
      "step": 300
    },
    {
      "epoch": 0.7136485280999108,
      "eval_loss": 0.42479127645492554,
      "eval_runtime": 24.6017,
      "eval_samples_per_second": 30.404,
      "eval_steps_per_second": 15.202,
      "step": 300
    },
    {
      "epoch": 0.7160273565269105,
      "grad_norm": 0.5402945280075073,
      "learning_rate": 2.9440389294403892e-05,
      "loss": 0.4477,
      "step": 301
    },
    {
      "epoch": 0.7184061849539102,
      "grad_norm": 0.5225419402122498,
      "learning_rate": 2.91970802919708e-05,
      "loss": 0.4112,
      "step": 302
    },
    {
      "epoch": 0.72078501338091,
      "grad_norm": 0.516687273979187,
      "learning_rate": 2.8953771289537713e-05,
      "loss": 0.51,
      "step": 303
    },
    {
      "epoch": 0.7231638418079096,
      "grad_norm": 0.5380510091781616,
      "learning_rate": 2.871046228710462e-05,
      "loss": 0.3956,
      "step": 304
    },
    {
      "epoch": 0.7255426702349093,
      "grad_norm": 0.5149944424629211,
      "learning_rate": 2.8467153284671533e-05,
      "loss": 0.4596,
      "step": 305
    },
    {
      "epoch": 0.727921498661909,
      "grad_norm": 0.5109550356864929,
      "learning_rate": 2.822384428223844e-05,
      "loss": 0.4538,
      "step": 306
    },
    {
      "epoch": 0.7303003270889087,
      "grad_norm": 0.4751725196838379,
      "learning_rate": 2.7980535279805354e-05,
      "loss": 0.3858,
      "step": 307
    },
    {
      "epoch": 0.7326791555159085,
      "grad_norm": 0.5320805907249451,
      "learning_rate": 2.7737226277372262e-05,
      "loss": 0.4388,
      "step": 308
    },
    {
      "epoch": 0.7350579839429081,
      "grad_norm": 0.4881283938884735,
      "learning_rate": 2.7493917274939174e-05,
      "loss": 0.3787,
      "step": 309
    },
    {
      "epoch": 0.7374368123699078,
      "grad_norm": 0.5177855491638184,
      "learning_rate": 2.7250608272506083e-05,
      "loss": 0.4489,
      "step": 310
    },
    {
      "epoch": 0.7398156407969075,
      "grad_norm": 0.4821816682815552,
      "learning_rate": 2.7007299270072995e-05,
      "loss": 0.4178,
      "step": 311
    },
    {
      "epoch": 0.7421944692239072,
      "grad_norm": 0.5222703218460083,
      "learning_rate": 2.6763990267639903e-05,
      "loss": 0.3892,
      "step": 312
    },
    {
      "epoch": 0.744573297650907,
      "grad_norm": 0.554428219795227,
      "learning_rate": 2.6520681265206815e-05,
      "loss": 0.425,
      "step": 313
    },
    {
      "epoch": 0.7469521260779066,
      "grad_norm": 0.47558993101119995,
      "learning_rate": 2.6277372262773724e-05,
      "loss": 0.4414,
      "step": 314
    },
    {
      "epoch": 0.7493309545049064,
      "grad_norm": 0.5236990451812744,
      "learning_rate": 2.6034063260340636e-05,
      "loss": 0.4194,
      "step": 315
    },
    {
      "epoch": 0.751709782931906,
      "grad_norm": 0.5379797220230103,
      "learning_rate": 2.5790754257907544e-05,
      "loss": 0.4434,
      "step": 316
    },
    {
      "epoch": 0.7540886113589057,
      "grad_norm": 0.7105163931846619,
      "learning_rate": 2.5547445255474456e-05,
      "loss": 0.6032,
      "step": 317
    },
    {
      "epoch": 0.7564674397859055,
      "grad_norm": 0.5832124352455139,
      "learning_rate": 2.5304136253041365e-05,
      "loss": 0.5152,
      "step": 318
    },
    {
      "epoch": 0.7588462682129051,
      "grad_norm": 0.5089643597602844,
      "learning_rate": 2.5060827250608277e-05,
      "loss": 0.4333,
      "step": 319
    },
    {
      "epoch": 0.7612250966399049,
      "grad_norm": 0.509891152381897,
      "learning_rate": 2.4817518248175185e-05,
      "loss": 0.4357,
      "step": 320
    },
    {
      "epoch": 0.7636039250669046,
      "grad_norm": 0.6722425222396851,
      "learning_rate": 2.4574209245742094e-05,
      "loss": 0.5146,
      "step": 321
    },
    {
      "epoch": 0.7659827534939042,
      "grad_norm": 0.5415557622909546,
      "learning_rate": 2.4330900243309006e-05,
      "loss": 0.5069,
      "step": 322
    },
    {
      "epoch": 0.768361581920904,
      "grad_norm": 0.4898138642311096,
      "learning_rate": 2.4087591240875914e-05,
      "loss": 0.4025,
      "step": 323
    },
    {
      "epoch": 0.7707404103479036,
      "grad_norm": 0.47411227226257324,
      "learning_rate": 2.3844282238442823e-05,
      "loss": 0.4339,
      "step": 324
    },
    {
      "epoch": 0.7731192387749034,
      "grad_norm": 0.5482645034790039,
      "learning_rate": 2.360097323600973e-05,
      "loss": 0.4023,
      "step": 325
    },
    {
      "epoch": 0.7754980672019031,
      "grad_norm": 0.4828038513660431,
      "learning_rate": 2.3357664233576643e-05,
      "loss": 0.4683,
      "step": 326
    },
    {
      "epoch": 0.7778768956289027,
      "grad_norm": 0.47106465697288513,
      "learning_rate": 2.3114355231143552e-05,
      "loss": 0.3926,
      "step": 327
    },
    {
      "epoch": 0.7802557240559025,
      "grad_norm": 0.5016714930534363,
      "learning_rate": 2.2871046228710464e-05,
      "loss": 0.3897,
      "step": 328
    },
    {
      "epoch": 0.7826345524829021,
      "grad_norm": 0.48692432045936584,
      "learning_rate": 2.2627737226277372e-05,
      "loss": 0.4213,
      "step": 329
    },
    {
      "epoch": 0.7850133809099019,
      "grad_norm": 0.625466525554657,
      "learning_rate": 2.2384428223844284e-05,
      "loss": 0.4761,
      "step": 330
    },
    {
      "epoch": 0.7873922093369016,
      "grad_norm": 0.5269051194190979,
      "learning_rate": 2.2141119221411193e-05,
      "loss": 0.4216,
      "step": 331
    },
    {
      "epoch": 0.7897710377639012,
      "grad_norm": 0.6055566072463989,
      "learning_rate": 2.1897810218978105e-05,
      "loss": 0.5351,
      "step": 332
    },
    {
      "epoch": 0.792149866190901,
      "grad_norm": 0.5403903126716614,
      "learning_rate": 2.1654501216545013e-05,
      "loss": 0.4028,
      "step": 333
    },
    {
      "epoch": 0.7945286946179007,
      "grad_norm": 0.6619285941123962,
      "learning_rate": 2.1411192214111922e-05,
      "loss": 0.4291,
      "step": 334
    },
    {
      "epoch": 0.7969075230449004,
      "grad_norm": 0.48977240920066833,
      "learning_rate": 2.1167883211678834e-05,
      "loss": 0.4171,
      "step": 335
    },
    {
      "epoch": 0.7992863514719001,
      "grad_norm": 0.5986789464950562,
      "learning_rate": 2.0924574209245742e-05,
      "loss": 0.3742,
      "step": 336
    },
    {
      "epoch": 0.8016651798988997,
      "grad_norm": 0.5153005719184875,
      "learning_rate": 2.0681265206812654e-05,
| "loss": 0.4147, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.8040440083258995, | |
| "grad_norm": 0.5173148512840271, | |
| "learning_rate": 2.0437956204379563e-05, | |
| "loss": 0.4648, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.8064228367528992, | |
| "grad_norm": 0.5250167846679688, | |
| "learning_rate": 2.0194647201946475e-05, | |
| "loss": 0.4919, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.8088016651798989, | |
| "grad_norm": 0.5532633066177368, | |
| "learning_rate": 1.9951338199513383e-05, | |
| "loss": 0.3462, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8111804936068986, | |
| "grad_norm": 0.4855538308620453, | |
| "learning_rate": 1.9708029197080295e-05, | |
| "loss": 0.4902, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.8135593220338984, | |
| "grad_norm": 0.597204864025116, | |
| "learning_rate": 1.9464720194647204e-05, | |
| "loss": 0.4741, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.815938150460898, | |
| "grad_norm": 0.5064167380332947, | |
| "learning_rate": 1.9221411192214112e-05, | |
| "loss": 0.4488, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.8183169788878977, | |
| "grad_norm": 0.5640056133270264, | |
| "learning_rate": 1.897810218978102e-05, | |
| "loss": 0.4408, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.8206958073148974, | |
| "grad_norm": 0.5452293753623962, | |
| "learning_rate": 1.8734793187347933e-05, | |
| "loss": 0.4406, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.8230746357418971, | |
| "grad_norm": 0.5527360439300537, | |
| "learning_rate": 1.849148418491484e-05, | |
| "loss": 0.432, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.8254534641688969, | |
| "grad_norm": 0.5353720784187317, | |
| "learning_rate": 1.8248175182481753e-05, | |
| "loss": 0.4195, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.8278322925958965, | |
| "grad_norm": 0.4622812569141388, | |
| "learning_rate": 1.8004866180048662e-05, | |
| "loss": 0.4146, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.8302111210228962, | |
| "grad_norm": 0.5472270250320435, | |
| "learning_rate": 1.7761557177615574e-05, | |
| "loss": 0.4521, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.832589949449896, | |
| "grad_norm": 0.5854270458221436, | |
| "learning_rate": 1.7518248175182482e-05, | |
| "loss": 0.3974, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8349687778768956, | |
| "grad_norm": 0.4779118597507477, | |
| "learning_rate": 1.7274939172749394e-05, | |
| "loss": 0.3683, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.8373476063038954, | |
| "grad_norm": 0.5334904789924622, | |
| "learning_rate": 1.70316301703163e-05, | |
| "loss": 0.4319, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.839726434730895, | |
| "grad_norm": 0.49920886754989624, | |
| "learning_rate": 1.678832116788321e-05, | |
| "loss": 0.4115, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.6039828658103943, | |
| "learning_rate": 1.654501216545012e-05, | |
| "loss": 0.4133, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.8444840915848945, | |
| "grad_norm": 0.5769008994102478, | |
| "learning_rate": 1.6301703163017032e-05, | |
| "loss": 0.4316, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.8468629200118941, | |
| "grad_norm": 0.5622073411941528, | |
| "learning_rate": 1.605839416058394e-05, | |
| "loss": 0.5002, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.8492417484388939, | |
| "grad_norm": 0.5158018469810486, | |
| "learning_rate": 1.5815085158150852e-05, | |
| "loss": 0.3537, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.8516205768658935, | |
| "grad_norm": 0.5640501379966736, | |
| "learning_rate": 1.557177615571776e-05, | |
| "loss": 0.5049, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.8539994052928932, | |
| "grad_norm": 0.5024840831756592, | |
| "learning_rate": 1.5328467153284673e-05, | |
| "loss": 0.4185, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.856378233719893, | |
| "grad_norm": 0.5789101123809814, | |
| "learning_rate": 1.5085158150851583e-05, | |
| "loss": 0.4016, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8587570621468926, | |
| "grad_norm": 0.5746119022369385, | |
| "learning_rate": 1.4841849148418493e-05, | |
| "loss": 0.4421, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.8611358905738924, | |
| "grad_norm": 0.5970929861068726, | |
| "learning_rate": 1.45985401459854e-05, | |
| "loss": 0.5766, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.863514719000892, | |
| "grad_norm": 0.5322396755218506, | |
| "learning_rate": 1.435523114355231e-05, | |
| "loss": 0.4418, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.8658935474278918, | |
| "grad_norm": 0.5240874886512756, | |
| "learning_rate": 1.411192214111922e-05, | |
| "loss": 0.461, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.8682723758548915, | |
| "grad_norm": 0.6501151919364929, | |
| "learning_rate": 1.3868613138686131e-05, | |
| "loss": 0.3628, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8706512042818911, | |
| "grad_norm": 0.5303117036819458, | |
| "learning_rate": 1.3625304136253041e-05, | |
| "loss": 0.441, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.8730300327088909, | |
| "grad_norm": 0.48986294865608215, | |
| "learning_rate": 1.3381995133819952e-05, | |
| "loss": 0.4371, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.8754088611358906, | |
| "grad_norm": 0.5813177824020386, | |
| "learning_rate": 1.3138686131386862e-05, | |
| "loss": 0.4787, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.8777876895628903, | |
| "grad_norm": 0.5057734251022339, | |
| "learning_rate": 1.2895377128953772e-05, | |
| "loss": 0.3852, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.88016651798989, | |
| "grad_norm": 0.5309827923774719, | |
| "learning_rate": 1.2652068126520682e-05, | |
| "loss": 0.4518, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8825453464168896, | |
| "grad_norm": 0.5106115341186523, | |
| "learning_rate": 1.2408759124087593e-05, | |
| "loss": 0.414, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.8849241748438894, | |
| "grad_norm": 0.6071767210960388, | |
| "learning_rate": 1.2165450121654503e-05, | |
| "loss": 0.4988, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.8873030032708891, | |
| "grad_norm": 0.5328472852706909, | |
| "learning_rate": 1.1922141119221411e-05, | |
| "loss": 0.5115, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.8896818316978888, | |
| "grad_norm": 0.4671655595302582, | |
| "learning_rate": 1.1678832116788322e-05, | |
| "loss": 0.3692, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.8920606601248885, | |
| "grad_norm": 0.5364991426467896, | |
| "learning_rate": 1.1435523114355232e-05, | |
| "loss": 0.369, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8944394885518882, | |
| "grad_norm": 0.5912413597106934, | |
| "learning_rate": 1.1192214111922142e-05, | |
| "loss": 0.4404, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.8968183169788879, | |
| "grad_norm": 0.54830402135849, | |
| "learning_rate": 1.0948905109489052e-05, | |
| "loss": 0.3858, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.8991971454058876, | |
| "grad_norm": 0.5137721300125122, | |
| "learning_rate": 1.0705596107055961e-05, | |
| "loss": 0.3976, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.9015759738328873, | |
| "grad_norm": 0.5718546509742737, | |
| "learning_rate": 1.0462287104622871e-05, | |
| "loss": 0.4714, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.903954802259887, | |
| "grad_norm": 0.5441231727600098, | |
| "learning_rate": 1.0218978102189781e-05, | |
| "loss": 0.4706, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9063336306868867, | |
| "grad_norm": 0.5782937407493591, | |
| "learning_rate": 9.975669099756692e-06, | |
| "loss": 0.4758, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.9087124591138864, | |
| "grad_norm": 0.5077878832817078, | |
| "learning_rate": 9.732360097323602e-06, | |
| "loss": 0.4739, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.9110912875408861, | |
| "grad_norm": 0.5690765380859375, | |
| "learning_rate": 9.48905109489051e-06, | |
| "loss": 0.3926, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.9134701159678859, | |
| "grad_norm": 0.5629989504814148, | |
| "learning_rate": 9.24574209245742e-06, | |
| "loss": 0.4728, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.9158489443948855, | |
| "grad_norm": 0.46644964814186096, | |
| "learning_rate": 9.002433090024331e-06, | |
| "loss": 0.3798, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.9182277728218852, | |
| "grad_norm": 0.5197304487228394, | |
| "learning_rate": 8.759124087591241e-06, | |
| "loss": 0.4181, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.9206066012488849, | |
| "grad_norm": 0.5880502462387085, | |
| "learning_rate": 8.51581508515815e-06, | |
| "loss": 0.5106, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.9229854296758846, | |
| "grad_norm": 0.6154156923294067, | |
| "learning_rate": 8.27250608272506e-06, | |
| "loss": 0.4669, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.9253642581028844, | |
| "grad_norm": 0.604262113571167, | |
| "learning_rate": 8.02919708029197e-06, | |
| "loss": 0.451, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.927743086529884, | |
| "grad_norm": 0.5451841354370117, | |
| "learning_rate": 7.78588807785888e-06, | |
| "loss": 0.4477, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9301219149568838, | |
| "grad_norm": 0.6219511032104492, | |
| "learning_rate": 7.542579075425792e-06, | |
| "loss": 0.4633, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.9325007433838834, | |
| "grad_norm": 0.5371522903442383, | |
| "learning_rate": 7.2992700729927e-06, | |
| "loss": 0.4109, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.9348795718108831, | |
| "grad_norm": 0.5737317800521851, | |
| "learning_rate": 7.05596107055961e-06, | |
| "loss": 0.434, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.9372584002378829, | |
| "grad_norm": 0.6237983703613281, | |
| "learning_rate": 6.812652068126521e-06, | |
| "loss": 0.4657, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.9396372286648825, | |
| "grad_norm": 0.5219160914421082, | |
| "learning_rate": 6.569343065693431e-06, | |
| "loss": 0.3983, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.9420160570918823, | |
| "grad_norm": 0.5657615661621094, | |
| "learning_rate": 6.326034063260341e-06, | |
| "loss": 0.3901, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.944394885518882, | |
| "grad_norm": 0.6130802035331726, | |
| "learning_rate": 6.082725060827251e-06, | |
| "loss": 0.4844, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.9467737139458816, | |
| "grad_norm": 0.6125937104225159, | |
| "learning_rate": 5.839416058394161e-06, | |
| "loss": 0.4312, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.9491525423728814, | |
| "grad_norm": 0.5268482565879822, | |
| "learning_rate": 5.596107055961071e-06, | |
| "loss": 0.3764, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.951531370799881, | |
| "grad_norm": 0.601588785648346, | |
| "learning_rate": 5.3527980535279805e-06, | |
| "loss": 0.429, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.951531370799881, | |
| "eval_loss": 0.4152432680130005, | |
| "eval_runtime": 24.6197, | |
| "eval_samples_per_second": 30.382, | |
| "eval_steps_per_second": 15.191, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9539101992268808, | |
| "grad_norm": 0.6177543997764587, | |
| "learning_rate": 5.109489051094891e-06, | |
| "loss": 0.4553, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.9562890276538805, | |
| "grad_norm": 0.5756453275680542, | |
| "learning_rate": 4.866180048661801e-06, | |
| "loss": 0.4026, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.9586678560808801, | |
| "grad_norm": 0.575804591178894, | |
| "learning_rate": 4.62287104622871e-06, | |
| "loss": 0.4351, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.9610466845078799, | |
| "grad_norm": 0.5242466926574707, | |
| "learning_rate": 4.379562043795621e-06, | |
| "loss": 0.4606, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.9634255129348795, | |
| "grad_norm": 0.5496498346328735, | |
| "learning_rate": 4.13625304136253e-06, | |
| "loss": 0.5175, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.9658043413618793, | |
| "grad_norm": 0.5221502184867859, | |
| "learning_rate": 3.89294403892944e-06, | |
| "loss": 0.3652, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.968183169788879, | |
| "grad_norm": 0.5580674409866333, | |
| "learning_rate": 3.64963503649635e-06, | |
| "loss": 0.3914, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.9705619982158786, | |
| "grad_norm": 0.5373774170875549, | |
| "learning_rate": 3.4063260340632603e-06, | |
| "loss": 0.399, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.9729408266428784, | |
| "grad_norm": 0.44664666056632996, | |
| "learning_rate": 3.1630170316301706e-06, | |
| "loss": 0.3659, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.975319655069878, | |
| "grad_norm": 0.5495840311050415, | |
| "learning_rate": 2.9197080291970804e-06, | |
| "loss": 0.4548, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9776984834968778, | |
| "grad_norm": 0.6554842591285706, | |
| "learning_rate": 2.6763990267639902e-06, | |
| "loss": 0.3963, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.9800773119238775, | |
| "grad_norm": 0.5366389155387878, | |
| "learning_rate": 2.4330900243309005e-06, | |
| "loss": 0.391, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.9824561403508771, | |
| "grad_norm": 0.5802214741706848, | |
| "learning_rate": 2.1897810218978103e-06, | |
| "loss": 0.4397, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.9848349687778769, | |
| "grad_norm": 0.5553064942359924, | |
| "learning_rate": 1.94647201946472e-06, | |
| "loss": 0.384, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.9872137972048766, | |
| "grad_norm": 0.5031337141990662, | |
| "learning_rate": 1.7031630170316302e-06, | |
| "loss": 0.3638, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9895926256318763, | |
| "grad_norm": 0.5195873379707336, | |
| "learning_rate": 1.4598540145985402e-06, | |
| "loss": 0.4275, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.991971454058876, | |
| "grad_norm": 0.5617223978042603, | |
| "learning_rate": 1.2165450121654502e-06, | |
| "loss": 0.4383, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.9943502824858758, | |
| "grad_norm": 0.5391303300857544, | |
| "learning_rate": 9.7323600973236e-07, | |
| "loss": 0.5038, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.9967291109128754, | |
| "grad_norm": 0.5291772484779358, | |
| "learning_rate": 7.299270072992701e-07, | |
| "loss": 0.4119, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.9991079393398751, | |
| "grad_norm": 0.52870112657547, | |
| "learning_rate": 4.8661800486618e-07, | |
| "loss": 0.4272, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.8552002310752869, | |
| "learning_rate": 2.4330900243309e-07, | |
| "loss": 0.3481, | |
| "step": 421 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 421, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.4660818389684224e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |