| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.010863661053775122, |
| "eval_steps": 100, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00010863661053775122, |
| "grad_norm": 75.1976089477539, |
| "learning_rate": 6.666666649834951e-06, |
| "loss": 4.3799, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00010863661053775122, |
| "eval_loss": 2.1306209564208984, |
| "eval_runtime": 47.4534, |
| "eval_samples_per_second": 6.238, |
| "eval_steps_per_second": 0.78, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00021727322107550245, |
| "grad_norm": 61.36337661743164, |
| "learning_rate": 1.3333333299669903e-05, |
| "loss": 3.7486, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.00032590983161325367, |
| "grad_norm": 76.23029327392578, |
| "learning_rate": 1.9999999494757503e-05, |
| "loss": 3.994, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0004345464421510049, |
| "grad_norm": 47.53773880004883, |
| "learning_rate": 2.6666666599339806e-05, |
| "loss": 3.322, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0005431830526887561, |
| "grad_norm": 35.830772399902344, |
| "learning_rate": 3.333333370392211e-05, |
| "loss": 2.9118, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0006518196632265073, |
| "grad_norm": 40.87867736816406, |
| "learning_rate": 3.9999998989515007e-05, |
| "loss": 2.962, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0007604562737642585, |
| "grad_norm": 34.244205474853516, |
| "learning_rate": 4.6666664275107905e-05, |
| "loss": 2.5104, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0008690928843020098, |
| "grad_norm": 32.14802551269531, |
| "learning_rate": 5.333333319867961e-05, |
| "loss": 1.7319, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.000977729494839761, |
| "grad_norm": 29.908082962036133, |
| "learning_rate": 6.000000212225132e-05, |
| "loss": 1.8764, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0010863661053775121, |
| "grad_norm": 26.57996940612793, |
| "learning_rate": 6.666666740784422e-05, |
| "loss": 1.9402, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0011950027159152634, |
| "grad_norm": 31.68722915649414, |
| "learning_rate": 7.333333633141592e-05, |
| "loss": 2.4069, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0013036393264530147, |
| "grad_norm": 37.19240188598633, |
| "learning_rate": 7.999999797903001e-05, |
| "loss": 2.241, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.001412275936990766, |
| "grad_norm": 25.371042251586914, |
| "learning_rate": 8.666666690260172e-05, |
| "loss": 1.8175, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.001520912547528517, |
| "grad_norm": 32.323822021484375, |
| "learning_rate": 9.333332855021581e-05, |
| "loss": 1.8046, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0016295491580662683, |
| "grad_norm": 36.42885208129883, |
| "learning_rate": 9.999999747378752e-05, |
| "loss": 1.8406, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0017381857686040196, |
| "grad_norm": 25.119384765625, |
| "learning_rate": 0.00010666666639735922, |
| "loss": 1.5764, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.0018468223791417708, |
| "grad_norm": 40.079566955566406, |
| "learning_rate": 0.00011333332804497331, |
| "loss": 2.3284, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.001955458989679522, |
| "grad_norm": 39.090049743652344, |
| "learning_rate": 0.00012000000424450263, |
| "loss": 2.0081, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.002064095600217273, |
| "grad_norm": 30.269433975219727, |
| "learning_rate": 0.00012666666589211673, |
| "loss": 2.3601, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0021727322107550242, |
| "grad_norm": 23.745521545410156, |
| "learning_rate": 0.00013333333481568843, |
| "loss": 2.0127, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0022813688212927757, |
| "grad_norm": 22.75748634338379, |
| "learning_rate": 0.0001399999891873449, |
| "loss": 1.6383, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.002390005431830527, |
| "grad_norm": 35.27119827270508, |
| "learning_rate": 0.00014666667266283184, |
| "loss": 2.3615, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.0024986420423682783, |
| "grad_norm": 33.7574348449707, |
| "learning_rate": 0.00015333332703448832, |
| "loss": 1.8493, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0026072786529060294, |
| "grad_norm": 23.23124885559082, |
| "learning_rate": 0.00015999999595806003, |
| "loss": 1.8225, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.0027159152634437804, |
| "grad_norm": 20.905302047729492, |
| "learning_rate": 0.00016666666488163173, |
| "loss": 1.7292, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.002824551873981532, |
| "grad_norm": 39.847740173339844, |
| "learning_rate": 0.00017333333380520344, |
| "loss": 2.0045, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.002933188484519283, |
| "grad_norm": 47.51205062866211, |
| "learning_rate": 0.00017999998817685992, |
| "loss": 1.7328, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.003041825095057034, |
| "grad_norm": 25.91466522216797, |
| "learning_rate": 0.00018666665710043162, |
| "loss": 2.275, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0031504617055947855, |
| "grad_norm": 22.333343505859375, |
| "learning_rate": 0.00019333332602400333, |
| "loss": 1.5069, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.0032590983161325366, |
| "grad_norm": 20.31553077697754, |
| "learning_rate": 0.00019999999494757503, |
| "loss": 2.0431, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.003367734926670288, |
| "grad_norm": 26.47588348388672, |
| "learning_rate": 0.00019999999494757503, |
| "loss": 2.6115, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.003476371537208039, |
| "grad_norm": 27.3530330657959, |
| "learning_rate": 0.00019999999494757503, |
| "loss": 2.4115, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.00358500814774579, |
| "grad_norm": 24.26110076904297, |
| "learning_rate": 0.00019999999494757503, |
| "loss": 2.559, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.0036936447582835417, |
| "grad_norm": 18.223628997802734, |
| "learning_rate": 0.0001999999803956598, |
| "loss": 2.1361, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.0038022813688212928, |
| "grad_norm": 15.68957805633545, |
| "learning_rate": 0.0001999999803956598, |
| "loss": 1.1702, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.003910917979359044, |
| "grad_norm": 21.0484676361084, |
| "learning_rate": 0.00019999996584374458, |
| "loss": 2.0919, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.004019554589896795, |
| "grad_norm": 24.82903289794922, |
| "learning_rate": 0.00019999996584374458, |
| "loss": 2.0038, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.004128191200434546, |
| "grad_norm": 26.992136001586914, |
| "learning_rate": 0.00019999996584374458, |
| "loss": 2.0042, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.004236827810972297, |
| "grad_norm": 20.41558265686035, |
| "learning_rate": 0.00019999995129182935, |
| "loss": 2.3824, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.0043454644215100485, |
| "grad_norm": 21.90787696838379, |
| "learning_rate": 0.00019999993673991412, |
| "loss": 1.7127, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0044541010320478, |
| "grad_norm": 16.50169563293457, |
| "learning_rate": 0.00019999990763608366, |
| "loss": 2.1749, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.0045627376425855515, |
| "grad_norm": 19.28976821899414, |
| "learning_rate": 0.00019999989308416843, |
| "loss": 1.8387, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.0046713742531233025, |
| "grad_norm": 26.109228134155273, |
| "learning_rate": 0.00019999989308416843, |
| "loss": 2.5672, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.004780010863661054, |
| "grad_norm": 27.74605369567871, |
| "learning_rate": 0.00019999986398033798, |
| "loss": 2.0975, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.004888647474198805, |
| "grad_norm": 22.240659713745117, |
| "learning_rate": 0.00019999984942842275, |
| "loss": 2.2658, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.004997284084736557, |
| "grad_norm": 22.99379539489746, |
| "learning_rate": 0.00019999983487650752, |
| "loss": 2.3556, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.005105920695274308, |
| "grad_norm": 16.565080642700195, |
| "learning_rate": 0.00019999980577267706, |
| "loss": 1.9804, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.005214557305812059, |
| "grad_norm": 20.19988441467285, |
| "learning_rate": 0.0001999997766688466, |
| "loss": 2.2955, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.00532319391634981, |
| "grad_norm": 22.294095993041992, |
| "learning_rate": 0.00019999976211693138, |
| "loss": 2.2707, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.005431830526887561, |
| "grad_norm": 16.89426612854004, |
| "learning_rate": 0.00019999973301310092, |
| "loss": 1.7356, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.005540467137425313, |
| "grad_norm": 19.89691162109375, |
| "learning_rate": 0.00019999970390927047, |
| "loss": 1.8736, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.005649103747963064, |
| "grad_norm": 19.16709327697754, |
| "learning_rate": 0.00019999968935735524, |
| "loss": 2.5384, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.005757740358500815, |
| "grad_norm": 20.3719482421875, |
| "learning_rate": 0.00019999964570160955, |
| "loss": 2.4689, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.005866376969038566, |
| "grad_norm": 19.757091522216797, |
| "learning_rate": 0.00019999963114969432, |
| "loss": 2.4842, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.005975013579576317, |
| "grad_norm": 17.72039222717285, |
| "learning_rate": 0.00019999958749394864, |
| "loss": 2.2902, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.006083650190114068, |
| "grad_norm": 15.102923393249512, |
| "learning_rate": 0.00019999955839011818, |
| "loss": 1.8482, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.00619228680065182, |
| "grad_norm": 16.011249542236328, |
| "learning_rate": 0.0001999995147343725, |
| "loss": 2.567, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.006300923411189571, |
| "grad_norm": 21.987627029418945, |
| "learning_rate": 0.00019999948563054204, |
| "loss": 2.032, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.006409560021727322, |
| "grad_norm": 17.612539291381836, |
| "learning_rate": 0.00019999944197479635, |
| "loss": 2.0042, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.006518196632265073, |
| "grad_norm": 16.884017944335938, |
| "learning_rate": 0.0001999994128709659, |
| "loss": 2.1906, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.006626833242802824, |
| "grad_norm": 25.764307022094727, |
| "learning_rate": 0.0001999993692152202, |
| "loss": 1.9963, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.006735469853340576, |
| "grad_norm": 18.538373947143555, |
| "learning_rate": 0.00019999932555947453, |
| "loss": 2.8442, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.006844106463878327, |
| "grad_norm": 20.098533630371094, |
| "learning_rate": 0.00019999929645564407, |
| "loss": 2.5989, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.006952743074416078, |
| "grad_norm": 28.170995712280273, |
| "learning_rate": 0.00019999923824798316, |
| "loss": 2.5551, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.007061379684953829, |
| "grad_norm": 15.595468521118164, |
| "learning_rate": 0.00019999919459223747, |
| "loss": 2.032, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00717001629549158, |
| "grad_norm": 19.367572784423828, |
| "learning_rate": 0.0001999991509364918, |
| "loss": 2.4259, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.0072786529060293315, |
| "grad_norm": 22.636173248291016, |
| "learning_rate": 0.0001999991072807461, |
| "loss": 2.8257, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.007387289516567083, |
| "grad_norm": 19.595232009887695, |
| "learning_rate": 0.0001999990490730852, |
| "loss": 2.4524, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.0074959261271048344, |
| "grad_norm": 17.523649215698242, |
| "learning_rate": 0.0001999990054173395, |
| "loss": 2.37, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.0076045627376425855, |
| "grad_norm": 19.54729461669922, |
| "learning_rate": 0.00019999896176159382, |
| "loss": 2.0727, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.007713199348180337, |
| "grad_norm": 21.045974731445312, |
| "learning_rate": 0.0001999989035539329, |
| "loss": 2.1805, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.007821835958718088, |
| "grad_norm": 16.674009323120117, |
| "learning_rate": 0.000199998845346272, |
| "loss": 2.0192, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.00793047256925584, |
| "grad_norm": 20.260122299194336, |
| "learning_rate": 0.00019999878713861108, |
| "loss": 2.0772, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00803910917979359, |
| "grad_norm": 40.26232147216797, |
| "learning_rate": 0.0001999987434828654, |
| "loss": 2.3982, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.008147745790331342, |
| "grad_norm": 13.498699188232422, |
| "learning_rate": 0.00019999868527520448, |
| "loss": 1.894, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.008256382400869093, |
| "grad_norm": 21.632869720458984, |
| "learning_rate": 0.00019999862706754357, |
| "loss": 2.3204, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.008365019011406844, |
| "grad_norm": 21.490217208862305, |
| "learning_rate": 0.00019999856885988265, |
| "loss": 2.9766, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.008473655621944595, |
| "grad_norm": 16.603654861450195, |
| "learning_rate": 0.00019999851065222174, |
| "loss": 1.7165, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.008582292232482346, |
| "grad_norm": 18.105417251586914, |
| "learning_rate": 0.0001999984378926456, |
| "loss": 1.6366, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.008690928843020097, |
| "grad_norm": 17.52101707458496, |
| "learning_rate": 0.00019999837968498468, |
| "loss": 2.1694, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00879956545355785, |
| "grad_norm": 25.236108779907227, |
| "learning_rate": 0.00019999830692540854, |
| "loss": 2.62, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.0089082020640956, |
| "grad_norm": 17.830217361450195, |
| "learning_rate": 0.00019999824871774763, |
| "loss": 2.3372, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.009016838674633352, |
| "grad_norm": 19.23275375366211, |
| "learning_rate": 0.0001999981759581715, |
| "loss": 2.4547, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.009125475285171103, |
| "grad_norm": 21.711286544799805, |
| "learning_rate": 0.00019999810319859535, |
| "loss": 2.5832, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.009234111895708854, |
| "grad_norm": 19.885831832885742, |
| "learning_rate": 0.0001999980304390192, |
| "loss": 1.7628, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.009342748506246605, |
| "grad_norm": 19.456787109375, |
| "learning_rate": 0.00019999795767944306, |
| "loss": 1.9876, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.009451385116784356, |
| "grad_norm": 17.41053581237793, |
| "learning_rate": 0.00019999788491986692, |
| "loss": 2.4501, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.009560021727322107, |
| "grad_norm": 23.842021942138672, |
| "learning_rate": 0.00019999781216029078, |
| "loss": 2.625, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.009668658337859858, |
| "grad_norm": 16.086332321166992, |
| "learning_rate": 0.00019999773940071464, |
| "loss": 2.2009, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.00977729494839761, |
| "grad_norm": 19.22362518310547, |
| "learning_rate": 0.00019999765208922327, |
| "loss": 2.2728, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.00988593155893536, |
| "grad_norm": 19.044240951538086, |
| "learning_rate": 0.00019999759388156235, |
| "loss": 2.5071, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.009994568169473113, |
| "grad_norm": 23.328094482421875, |
| "learning_rate": 0.00019999750657007098, |
| "loss": 2.8502, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.010103204780010864, |
| "grad_norm": 17.411951065063477, |
| "learning_rate": 0.0001999974192585796, |
| "loss": 2.1442, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.010211841390548615, |
| "grad_norm": 19.746517181396484, |
| "learning_rate": 0.00019999733194708824, |
| "loss": 2.4696, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.010320478001086366, |
| "grad_norm": 26.232284545898438, |
| "learning_rate": 0.0001999972591875121, |
| "loss": 2.4734, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.010429114611624117, |
| "grad_norm": 19.801883697509766, |
| "learning_rate": 0.00019999717187602073, |
| "loss": 2.8977, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.010537751222161868, |
| "grad_norm": 24.759071350097656, |
| "learning_rate": 0.00019999708456452936, |
| "loss": 2.1901, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.01064638783269962, |
| "grad_norm": 17.468563079833984, |
| "learning_rate": 0.000199996997253038, |
| "loss": 1.5876, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.01075502444323737, |
| "grad_norm": 21.162076950073242, |
| "learning_rate": 0.00019999690994154662, |
| "loss": 2.3618, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.010863661053775122, |
| "grad_norm": 18.7758731842041, |
| "learning_rate": 0.00019999680807814002, |
| "loss": 2.9082, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.010863661053775122, |
| "eval_loss": 1.1110957860946655, |
| "eval_runtime": 47.3732, |
| "eval_samples_per_second": 6.248, |
| "eval_steps_per_second": 0.781, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 27615, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.57749502246912e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|