| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 250000000, |
| "global_step": 218, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009216589861751152, |
| "grad_norm": 2.1054060459136963, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.7676, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.018433179723502304, |
| "grad_norm": 9.905202865600586, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.9468, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.027649769585253458, |
| "grad_norm": 1.3358978033065796, |
| "learning_rate": 3e-06, |
| "loss": 0.3785, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.03686635944700461, |
| "grad_norm": 3.6846885681152344, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.9081, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.04608294930875576, |
| "grad_norm": 2.038400411605835, |
| "learning_rate": 5e-06, |
| "loss": 0.6648, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.055299539170506916, |
| "grad_norm": 2.199817180633545, |
| "learning_rate": 6e-06, |
| "loss": 0.7593, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.06451612903225806, |
| "grad_norm": 2.8529105186462402, |
| "learning_rate": 7e-06, |
| "loss": 0.8711, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.07373271889400922, |
| "grad_norm": 1.8705589771270752, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.6107, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.08294930875576037, |
| "grad_norm": 2.1660964488983154, |
| "learning_rate": 9e-06, |
| "loss": 0.3503, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.09216589861751152, |
| "grad_norm": 1.2933653593063354, |
| "learning_rate": 1e-05, |
| "loss": 0.3459, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.10138248847926268, |
| "grad_norm": 1.471388578414917, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.3638, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.11059907834101383, |
| "grad_norm": 1.3579180240631104, |
| "learning_rate": 1.2e-05, |
| "loss": 0.3372, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.11981566820276497, |
| "grad_norm": 1.3174383640289307, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.4586, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.12903225806451613, |
| "grad_norm": 1.3872649669647217, |
| "learning_rate": 1.4e-05, |
| "loss": 0.347, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1382488479262673, |
| "grad_norm": 1.552891731262207, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.3831, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.14746543778801843, |
| "grad_norm": 1.5922861099243164, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.4652, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.15668202764976957, |
| "grad_norm": 1.427708387374878, |
| "learning_rate": 1.7e-05, |
| "loss": 0.3327, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.16589861751152074, |
| "grad_norm": 1.6242046356201172, |
| "learning_rate": 1.8e-05, |
| "loss": 0.379, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.17511520737327188, |
| "grad_norm": 2.9465889930725098, |
| "learning_rate": 1.9e-05, |
| "loss": 0.4163, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.18433179723502305, |
| "grad_norm": 2.959738254547119, |
| "learning_rate": 2e-05, |
| "loss": 0.8535, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1935483870967742, |
| "grad_norm": 2.475205898284912, |
| "learning_rate": 1.999988738608264e-05, |
| "loss": 0.5698, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.20276497695852536, |
| "grad_norm": 1.6797610521316528, |
| "learning_rate": 1.9999549547148767e-05, |
| "loss": 0.6646, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.2119815668202765, |
| "grad_norm": 1.5180604457855225, |
| "learning_rate": 1.9998986491652896e-05, |
| "loss": 0.3323, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.22119815668202766, |
| "grad_norm": 1.5638575553894043, |
| "learning_rate": 1.9998198233685676e-05, |
| "loss": 0.3772, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.2304147465437788, |
| "grad_norm": 1.1676337718963623, |
| "learning_rate": 1.9997184792973504e-05, |
| "loss": 0.2302, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.23963133640552994, |
| "grad_norm": 1.8648298978805542, |
| "learning_rate": 1.999594619487806e-05, |
| "loss": 0.53, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.2488479262672811, |
| "grad_norm": 1.3829364776611328, |
| "learning_rate": 1.999448247039565e-05, |
| "loss": 0.279, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.25806451612903225, |
| "grad_norm": 2.869626760482788, |
| "learning_rate": 1.999279365615644e-05, |
| "loss": 0.6919, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.2672811059907834, |
| "grad_norm": 2.110597610473633, |
| "learning_rate": 1.9990879794423536e-05, |
| "loss": 0.5208, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.2764976958525346, |
| "grad_norm": 1.8286411762237549, |
| "learning_rate": 1.9988740933091932e-05, |
| "loss": 0.6699, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.2857142857142857, |
| "grad_norm": 1.846047282218933, |
| "learning_rate": 1.9986377125687305e-05, |
| "loss": 0.5865, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.29493087557603687, |
| "grad_norm": 1.6027253866195679, |
| "learning_rate": 1.998378843136468e-05, |
| "loss": 0.3248, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.30414746543778803, |
| "grad_norm": 1.8412662744522095, |
| "learning_rate": 1.998097491490695e-05, |
| "loss": 0.4681, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.31336405529953915, |
| "grad_norm": 2.7138962745666504, |
| "learning_rate": 1.9977936646723254e-05, |
| "loss": 0.6067, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 1.8338406085968018, |
| "learning_rate": 1.99746737028472e-05, |
| "loss": 0.6356, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.3317972350230415, |
| "grad_norm": 1.3918920755386353, |
| "learning_rate": 1.9971186164934995e-05, |
| "loss": 0.2682, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.34101382488479265, |
| "grad_norm": 1.3505946397781372, |
| "learning_rate": 1.996747412026337e-05, |
| "loss": 0.5671, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.35023041474654376, |
| "grad_norm": 2.4151482582092285, |
| "learning_rate": 1.9963537661727415e-05, |
| "loss": 0.5727, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.35944700460829493, |
| "grad_norm": 1.56583571434021, |
| "learning_rate": 1.995937688783824e-05, |
| "loss": 0.4294, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.3686635944700461, |
| "grad_norm": 1.828765869140625, |
| "learning_rate": 1.995499190272053e-05, |
| "loss": 0.4719, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3778801843317972, |
| "grad_norm": 1.1976102590560913, |
| "learning_rate": 1.9950382816109904e-05, |
| "loss": 0.2039, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.3870967741935484, |
| "grad_norm": 1.4709233045578003, |
| "learning_rate": 1.994554974335022e-05, |
| "loss": 0.5271, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.39631336405529954, |
| "grad_norm": 3.198866844177246, |
| "learning_rate": 1.9940492805390644e-05, |
| "loss": 0.7437, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.4055299539170507, |
| "grad_norm": 1.6070488691329956, |
| "learning_rate": 1.9935212128782637e-05, |
| "loss": 0.3161, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.4147465437788018, |
| "grad_norm": 1.2444138526916504, |
| "learning_rate": 1.9929707845676796e-05, |
| "loss": 0.2362, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.423963133640553, |
| "grad_norm": 1.3169045448303223, |
| "learning_rate": 1.992398009381954e-05, |
| "loss": 0.3135, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.43317972350230416, |
| "grad_norm": 1.3725547790527344, |
| "learning_rate": 1.991802901654966e-05, |
| "loss": 0.3323, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.4423963133640553, |
| "grad_norm": 2.5597922801971436, |
| "learning_rate": 1.9911854762794747e-05, |
| "loss": 0.8516, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.45161290322580644, |
| "grad_norm": 1.195428490638733, |
| "learning_rate": 1.9905457487067438e-05, |
| "loss": 0.3244, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.4608294930875576, |
| "grad_norm": 1.841422438621521, |
| "learning_rate": 1.9898837349461573e-05, |
| "loss": 0.4963, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4700460829493088, |
| "grad_norm": 1.3410362005233765, |
| "learning_rate": 1.989199451564819e-05, |
| "loss": 0.3315, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.4792626728110599, |
| "grad_norm": 1.4391103982925415, |
| "learning_rate": 1.9884929156871348e-05, |
| "loss": 0.4324, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.48847926267281105, |
| "grad_norm": 1.137695550918579, |
| "learning_rate": 1.9877641449943884e-05, |
| "loss": 0.3528, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.4976958525345622, |
| "grad_norm": 1.9326167106628418, |
| "learning_rate": 1.9870131577242958e-05, |
| "loss": 0.5226, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.5069124423963134, |
| "grad_norm": 1.3373956680297852, |
| "learning_rate": 1.98623997267055e-05, |
| "loss": 0.276, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5161290322580645, |
| "grad_norm": 1.9294296503067017, |
| "learning_rate": 1.98544460918235e-05, |
| "loss": 0.4546, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.5253456221198156, |
| "grad_norm": 2.1642041206359863, |
| "learning_rate": 1.984627087163918e-05, |
| "loss": 0.5938, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.5345622119815668, |
| "grad_norm": 1.2871601581573486, |
| "learning_rate": 1.9837874270740005e-05, |
| "loss": 0.3375, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.543778801843318, |
| "grad_norm": 1.8051406145095825, |
| "learning_rate": 1.9829256499253548e-05, |
| "loss": 0.3842, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.5529953917050692, |
| "grad_norm": 1.9581133127212524, |
| "learning_rate": 1.982041777284226e-05, |
| "loss": 0.6373, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5622119815668203, |
| "grad_norm": 1.327392578125, |
| "learning_rate": 1.9811358312698052e-05, |
| "loss": 0.369, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 1.6942732334136963, |
| "learning_rate": 1.980207834553677e-05, |
| "loss": 0.4919, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.5806451612903226, |
| "grad_norm": 1.6956502199172974, |
| "learning_rate": 1.9792578103592506e-05, |
| "loss": 0.3703, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.5898617511520737, |
| "grad_norm": 1.1825238466262817, |
| "learning_rate": 1.978285782461182e-05, |
| "loss": 0.2592, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.5990783410138248, |
| "grad_norm": 1.0454672574996948, |
| "learning_rate": 1.977291775184775e-05, |
| "loss": 0.2206, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6082949308755761, |
| "grad_norm": 2.619781732559204, |
| "learning_rate": 1.976275813405374e-05, |
| "loss": 0.6658, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.6175115207373272, |
| "grad_norm": 1.996297001838684, |
| "learning_rate": 1.9752379225477436e-05, |
| "loss": 0.6573, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.6267281105990783, |
| "grad_norm": 1.7142549753189087, |
| "learning_rate": 1.974178128585429e-05, |
| "loss": 0.5846, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.6359447004608295, |
| "grad_norm": 1.6060806512832642, |
| "learning_rate": 1.973096458040108e-05, |
| "loss": 0.3448, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 2.079186201095581, |
| "learning_rate": 1.9719929379809262e-05, |
| "loss": 0.6707, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6543778801843319, |
| "grad_norm": 2.5870113372802734, |
| "learning_rate": 1.9708675960238214e-05, |
| "loss": 0.8176, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.663594470046083, |
| "grad_norm": 1.8405799865722656, |
| "learning_rate": 1.9697204603308303e-05, |
| "loss": 0.5869, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.6728110599078341, |
| "grad_norm": 1.7923684120178223, |
| "learning_rate": 1.9685515596093844e-05, |
| "loss": 0.7463, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.6820276497695853, |
| "grad_norm": 1.9837538003921509, |
| "learning_rate": 1.967360923111593e-05, |
| "loss": 0.636, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.6912442396313364, |
| "grad_norm": 1.6785178184509277, |
| "learning_rate": 1.9661485806335095e-05, |
| "loss": 0.6603, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7004608294930875, |
| "grad_norm": 1.2299582958221436, |
| "learning_rate": 1.964914562514386e-05, |
| "loss": 0.3568, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.7096774193548387, |
| "grad_norm": 1.5773401260375977, |
| "learning_rate": 1.9636588996359145e-05, |
| "loss": 0.3953, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.7188940092165899, |
| "grad_norm": 1.3753741979599, |
| "learning_rate": 1.9623816234214538e-05, |
| "loss": 0.4119, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.728110599078341, |
| "grad_norm": 2.0371758937835693, |
| "learning_rate": 1.9610827658352448e-05, |
| "loss": 0.6218, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.7373271889400922, |
| "grad_norm": 2.0338077545166016, |
| "learning_rate": 1.959762359381606e-05, |
| "loss": 0.4715, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.7465437788018433, |
| "grad_norm": 1.7828141450881958, |
| "learning_rate": 1.9584204371041257e-05, |
| "loss": 0.4246, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.7557603686635944, |
| "grad_norm": 1.9453110694885254, |
| "learning_rate": 1.957057032584832e-05, |
| "loss": 0.5038, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.7649769585253456, |
| "grad_norm": 0.9295899271965027, |
| "learning_rate": 1.955672179943351e-05, |
| "loss": 0.1661, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.7741935483870968, |
| "grad_norm": 2.042292594909668, |
| "learning_rate": 1.9542659138360575e-05, |
| "loss": 0.4365, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.783410138248848, |
| "grad_norm": 1.303910255432129, |
| "learning_rate": 1.9528382694552033e-05, |
| "loss": 0.2884, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.7926267281105991, |
| "grad_norm": 2.2542123794555664, |
| "learning_rate": 1.9513892825280387e-05, |
| "loss": 0.6708, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.8018433179723502, |
| "grad_norm": 1.465098261833191, |
| "learning_rate": 1.9499189893159178e-05, |
| "loss": 0.3452, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.8110599078341014, |
| "grad_norm": 2.565708875656128, |
| "learning_rate": 1.9484274266133918e-05, |
| "loss": 0.4814, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.8202764976958525, |
| "grad_norm": 1.4185545444488525, |
| "learning_rate": 1.9469146317472867e-05, |
| "loss": 0.2444, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.8294930875576036, |
| "grad_norm": 2.197986125946045, |
| "learning_rate": 1.9453806425757706e-05, |
| "loss": 0.4713, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.8387096774193549, |
| "grad_norm": 1.7127975225448608, |
| "learning_rate": 1.9438254974874055e-05, |
| "loss": 0.2751, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.847926267281106, |
| "grad_norm": 4.733283519744873, |
| "learning_rate": 1.9422492354001876e-05, |
| "loss": 0.7135, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.8571428571428571, |
| "grad_norm": 2.4395713806152344, |
| "learning_rate": 1.9406518957605716e-05, |
| "loss": 0.7377, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.8663594470046083, |
| "grad_norm": 2.862421751022339, |
| "learning_rate": 1.9390335185424852e-05, |
| "loss": 0.6709, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.8755760368663594, |
| "grad_norm": 1.8491294384002686, |
| "learning_rate": 1.9373941442463286e-05, |
| "loss": 0.3542, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.8847926267281107, |
| "grad_norm": 1.7241147756576538, |
| "learning_rate": 1.9357338138979586e-05, |
| "loss": 0.4067, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.8940092165898618, |
| "grad_norm": 1.8195747137069702, |
| "learning_rate": 1.9340525690476665e-05, |
| "loss": 0.52, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.9032258064516129, |
| "grad_norm": 1.7616909742355347, |
| "learning_rate": 1.9323504517691335e-05, |
| "loss": 0.5016, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.9124423963133641, |
| "grad_norm": 1.763556957244873, |
| "learning_rate": 1.9306275046583804e-05, |
| "loss": 0.5356, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.9216589861751152, |
| "grad_norm": 1.6357682943344116, |
| "learning_rate": 1.9288837708327018e-05, |
| "loss": 0.2396, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.9308755760368663, |
| "grad_norm": 0.9229624271392822, |
| "learning_rate": 1.9271192939295863e-05, |
| "loss": 0.1841, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.9400921658986175, |
| "grad_norm": 1.3163220882415771, |
| "learning_rate": 1.925334118105623e-05, |
| "loss": 0.2638, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.9493087557603687, |
| "grad_norm": 1.4951300621032715, |
| "learning_rate": 1.9235282880354e-05, |
| "loss": 0.4424, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.9585253456221198, |
| "grad_norm": 1.6376820802688599, |
| "learning_rate": 1.9217018489103832e-05, |
| "loss": 0.3157, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 1.760594367980957, |
| "learning_rate": 1.9198548464377875e-05, |
| "loss": 0.579, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.9769585253456221, |
| "grad_norm": 1.0897892713546753, |
| "learning_rate": 1.917987326839431e-05, |
| "loss": 0.2917, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.9861751152073732, |
| "grad_norm": 1.8386499881744385, |
| "learning_rate": 1.9160993368505803e-05, |
| "loss": 0.7074, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.9953917050691244, |
| "grad_norm": 1.7440464496612549, |
| "learning_rate": 1.914190923718779e-05, |
| "loss": 0.3854, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.7440464496612549, |
| "learning_rate": 1.914190923718779e-05, |
| "loss": 0.0934, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.0092165898617511, |
| "grad_norm": 1.0595107078552246, |
| "learning_rate": 1.912262135202667e-05, |
| "loss": 0.4695, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.0184331797235022, |
| "grad_norm": 1.5694299936294556, |
| "learning_rate": 1.9103130195707846e-05, |
| "loss": 0.4488, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.0276497695852536, |
| "grad_norm": 1.2001843452453613, |
| "learning_rate": 1.9083436256003643e-05, |
| "loss": 0.2041, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.0368663594470047, |
| "grad_norm": 0.9177109599113464, |
| "learning_rate": 1.906354002576111e-05, |
| "loss": 0.398, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.0460829493087558, |
| "grad_norm": 1.2801213264465332, |
| "learning_rate": 1.9043442002889663e-05, |
| "loss": 0.2568, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.055299539170507, |
| "grad_norm": 1.288846731185913, |
| "learning_rate": 1.9023142690348663e-05, |
| "loss": 0.6494, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.064516129032258, |
| "grad_norm": 1.7401024103164673, |
| "learning_rate": 1.90026425961348e-05, |
| "loss": 0.5024, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.0737327188940091, |
| "grad_norm": 1.4946353435516357, |
| "learning_rate": 1.898194223326939e-05, |
| "loss": 0.655, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.0829493087557605, |
| "grad_norm": 1.5450165271759033, |
| "learning_rate": 1.8961042119785534e-05, |
| "loss": 0.2916, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.0921658986175116, |
| "grad_norm": 1.4745348691940308, |
| "learning_rate": 1.893994277871515e-05, |
| "loss": 0.5693, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.1013824884792627, |
| "grad_norm": 1.7780627012252808, |
| "learning_rate": 1.891864473807589e-05, |
| "loss": 0.4635, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.1105990783410138, |
| "grad_norm": 1.5207669734954834, |
| "learning_rate": 1.8897148530857944e-05, |
| "loss": 0.6628, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.119815668202765, |
| "grad_norm": 1.3493820428848267, |
| "learning_rate": 1.8875454695010655e-05, |
| "loss": 0.3444, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 1.3421741724014282, |
| "learning_rate": 1.8853563773429102e-05, |
| "loss": 0.3206, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.1382488479262673, |
| "grad_norm": 1.4489786624908447, |
| "learning_rate": 1.8831476313940495e-05, |
| "loss": 0.1653, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.1474654377880185, |
| "grad_norm": 1.6162666082382202, |
| "learning_rate": 1.8809192869290463e-05, |
| "loss": 0.5919, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.1566820276497696, |
| "grad_norm": 2.1422464847564697, |
| "learning_rate": 1.878671399712923e-05, |
| "loss": 0.4031, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.1658986175115207, |
| "grad_norm": 1.5537471771240234, |
| "learning_rate": 1.8764040259997642e-05, |
| "loss": 0.3704, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.1751152073732718, |
| "grad_norm": 1.0332657098770142, |
| "learning_rate": 1.874117222531312e-05, |
| "loss": 0.4165, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.1843317972350231, |
| "grad_norm": 1.8113311529159546, |
| "learning_rate": 1.8718110465355436e-05, |
| "loss": 0.749, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.1935483870967742, |
| "grad_norm": 2.1138980388641357, |
| "learning_rate": 1.8694855557252395e-05, |
| "loss": 0.2031, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.2027649769585254, |
| "grad_norm": 1.017136573791504, |
| "learning_rate": 1.8671408082965394e-05, |
| "loss": 0.553, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.2119815668202765, |
| "grad_norm": 1.7025851011276245, |
| "learning_rate": 1.8647768629274865e-05, |
| "loss": 0.6251, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.2211981566820276, |
| "grad_norm": 1.3828938007354736, |
| "learning_rate": 1.8623937787765582e-05, |
| "loss": 0.4867, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.230414746543779, |
| "grad_norm": 1.569438099861145, |
| "learning_rate": 1.8599916154811858e-05, |
| "loss": 0.7632, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.23963133640553, |
| "grad_norm": 2.157384157180786, |
| "learning_rate": 1.8575704331562624e-05, |
| "loss": 0.9114, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.2488479262672811, |
| "grad_norm": 1.4164490699768066, |
| "learning_rate": 1.8551302923926387e-05, |
| "loss": 0.1767, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.2580645161290323, |
| "grad_norm": 1.5470051765441895, |
| "learning_rate": 1.8526712542556054e-05, |
| "loss": 0.3999, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.2672811059907834, |
| "grad_norm": 0.9011774063110352, |
| "learning_rate": 1.8501933802833664e-05, |
| "loss": 0.2622, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.2764976958525347, |
| "grad_norm": 1.5134320259094238, |
| "learning_rate": 1.8476967324854987e-05, |
| "loss": 0.3059, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.2857142857142856, |
| "grad_norm": 1.7358366250991821, |
| "learning_rate": 1.8451813733413998e-05, |
| "loss": 0.6285, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.294930875576037, |
| "grad_norm": 1.9286249876022339, |
| "learning_rate": 1.8426473657987238e-05, |
| "loss": 0.429, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.304147465437788, |
| "grad_norm": 1.0351864099502563, |
| "learning_rate": 1.8400947732718083e-05, |
| "loss": 0.3417, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.3133640552995391, |
| "grad_norm": 1.2357749938964844, |
| "learning_rate": 1.837523659640085e-05, |
| "loss": 0.472, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.3225806451612903, |
| "grad_norm": 1.3678104877471924, |
| "learning_rate": 1.8349340892464827e-05, |
| "loss": 0.397, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.3317972350230414, |
| "grad_norm": 0.8640093803405762, |
| "learning_rate": 1.832326126895816e-05, |
| "loss": 0.2434, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.3410138248847927, |
| "grad_norm": 1.2898294925689697, |
| "learning_rate": 1.8296998378531634e-05, |
| "loss": 0.2458, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.3502304147465438, |
| "grad_norm": 1.467898964881897, |
| "learning_rate": 1.827055287842236e-05, |
| "loss": 0.431, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.359447004608295, |
| "grad_norm": 1.185132622718811, |
| "learning_rate": 1.8243925430437314e-05, |
| "loss": 0.5099, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.368663594470046, |
| "grad_norm": 1.1978954076766968, |
| "learning_rate": 1.821711670093676e-05, |
| "loss": 0.3224, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.3778801843317972, |
| "grad_norm": 2.067573308944702, |
| "learning_rate": 1.81901273608176e-05, |
| "loss": 0.5459, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.3870967741935485, |
| "grad_norm": 1.7116279602050781, |
| "learning_rate": 1.8162958085496572e-05, |
| "loss": 0.5529, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.3963133640552996, |
| "grad_norm": 2.248635768890381, |
| "learning_rate": 1.8135609554893345e-05, |
| "loss": 0.5498, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.4055299539170507, |
| "grad_norm": 1.7334017753601074, |
| "learning_rate": 1.810808245341352e-05, |
| "loss": 0.5408, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.4147465437788018, |
| "grad_norm": 1.073221206665039, |
| "learning_rate": 1.8080377469931468e-05, |
| "loss": 0.4188, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.423963133640553, |
| "grad_norm": 1.0701338052749634, |
| "learning_rate": 1.8052495297773135e-05, |
| "loss": 0.4552, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.4331797235023043, |
| "grad_norm": 2.1286838054656982, |
| "learning_rate": 1.802443663469867e-05, |
| "loss": 0.3399, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.4423963133640554, |
| "grad_norm": 1.0282185077667236, |
| "learning_rate": 1.7996202182884938e-05, |
| "loss": 0.5194, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.4516129032258065, |
| "grad_norm": 2.318063497543335, |
| "learning_rate": 1.7967792648907993e-05, |
| "loss": 0.5981, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.4608294930875576, |
| "grad_norm": 1.4625202417373657, |
| "learning_rate": 1.7939208743725378e-05, |
| "loss": 0.6277, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.4700460829493087, |
| "grad_norm": 2.1962194442749023, |
| "learning_rate": 1.7910451182658318e-05, |
| "loss": 0.5162, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.4792626728110598, |
| "grad_norm": 1.6668182611465454, |
| "learning_rate": 1.7881520685373836e-05, |
| "loss": 0.5978, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.488479262672811, |
| "grad_norm": 1.6689997911453247, |
| "learning_rate": 1.7852417975866735e-05, |
| "loss": 0.5508, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.4976958525345623, |
| "grad_norm": 1.2357803583145142, |
| "learning_rate": 1.7823143782441498e-05, |
| "loss": 0.5443, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.5069124423963134, |
| "grad_norm": 1.4967455863952637, |
| "learning_rate": 1.779369883769403e-05, |
| "loss": 0.5563, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.5161290322580645, |
| "grad_norm": 1.739134669303894, |
| "learning_rate": 1.7764083878493342e-05, |
| "loss": 0.3784, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.5253456221198156, |
| "grad_norm": 1.2600858211517334, |
| "learning_rate": 1.7734299645963126e-05, |
| "loss": 0.4362, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.5345622119815667, |
| "grad_norm": 1.8002910614013672, |
| "learning_rate": 1.7704346885463173e-05, |
| "loss": 0.5586, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.543778801843318, |
| "grad_norm": 1.7405978441238403, |
| "learning_rate": 1.7674226346570756e-05, |
| "loss": 0.8054, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.5529953917050692, |
| "grad_norm": 1.320211410522461, |
| "learning_rate": 1.7643938783061844e-05, |
| "loss": 0.6346, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.5622119815668203, |
| "grad_norm": 1.6102598905563354, |
| "learning_rate": 1.761348495289225e-05, |
| "loss": 0.6073, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.5714285714285714, |
| "grad_norm": 1.765432596206665, |
| "learning_rate": 1.7582865618178673e-05, |
| "loss": 0.381, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.5806451612903225, |
| "grad_norm": 1.5802050828933716, |
| "learning_rate": 1.755208154517961e-05, |
| "loss": 0.7346, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.5898617511520738, |
| "grad_norm": 1.241126537322998, |
| "learning_rate": 1.752113350427617e-05, |
| "loss": 0.8343, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.5990783410138247, |
| "grad_norm": 2.071810007095337, |
| "learning_rate": 1.7490022269952836e-05, |
| "loss": 0.3506, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.608294930875576, |
| "grad_norm": 1.5097633600234985, |
| "learning_rate": 1.7458748620778047e-05, |
| "loss": 0.636, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.6175115207373272, |
| "grad_norm": 1.7457658052444458, |
| "learning_rate": 1.742731333938472e-05, |
| "loss": 0.6356, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.6267281105990783, |
| "grad_norm": 1.6532824039459229, |
| "learning_rate": 1.7395717212450673e-05, |
| "loss": 0.6368, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.6359447004608296, |
| "grad_norm": 1.5569558143615723, |
| "learning_rate": 1.736396103067893e-05, |
| "loss": 0.4563, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.6451612903225805, |
| "grad_norm": 1.1203193664550781, |
| "learning_rate": 1.733204558877795e-05, |
| "loss": 0.2038, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.6543778801843319, |
| "grad_norm": 0.6701219081878662, |
| "learning_rate": 1.729997168544171e-05, |
| "loss": 0.4304, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.663594470046083, |
| "grad_norm": 1.7273935079574585, |
| "learning_rate": 1.7267740123329756e-05, |
| "loss": 0.4431, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.672811059907834, |
| "grad_norm": 1.6673085689544678, |
| "learning_rate": 1.7235351709047072e-05, |
| "loss": 0.3977, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.6820276497695854, |
| "grad_norm": 2.1155049800872803, |
| "learning_rate": 1.720280725312393e-05, |
| "loss": 1.0269, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.6912442396313363, |
| "grad_norm": 1.494881272315979, |
| "learning_rate": 1.7170107569995588e-05, |
| "loss": 0.4423, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.7004608294930876, |
| "grad_norm": 1.590615153312683, |
| "learning_rate": 1.7137253477981916e-05, |
| "loss": 0.4478, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.7096774193548387, |
| "grad_norm": 1.4964159727096558, |
| "learning_rate": 1.7104245799266917e-05, |
| "loss": 0.5488, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.7188940092165899, |
| "grad_norm": 2.062837839126587, |
| "learning_rate": 1.707108535987815e-05, |
| "loss": 0.4705, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.728110599078341, |
| "grad_norm": 2.3107001781463623, |
| "learning_rate": 1.7037772989666043e-05, |
| "loss": 0.5786, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.737327188940092, |
| "grad_norm": 1.247986078262329, |
| "learning_rate": 1.7004309522283162e-05, |
| "loss": 0.6896, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.7465437788018434, |
| "grad_norm": 1.693481206893921, |
| "learning_rate": 1.6970695795163322e-05, |
| "loss": 0.4202, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.7557603686635943, |
| "grad_norm": 1.6047693490982056, |
| "learning_rate": 1.693693264950062e-05, |
| "loss": 0.8269, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.7649769585253456, |
| "grad_norm": 2.1052298545837402, |
| "learning_rate": 1.6903020930228424e-05, |
| "loss": 0.2762, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.843791127204895, |
| "learning_rate": 1.6868961485998178e-05, |
| "loss": 0.5013, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.7834101382488479, |
| "grad_norm": 1.7998207807540894, |
| "learning_rate": 1.683475516915821e-05, |
| "loss": 0.5306, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.7926267281105992, |
| "grad_norm": 1.6218657493591309, |
| "learning_rate": 1.6800402835732367e-05, |
| "loss": 0.4226, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.80184331797235, |
| "grad_norm": 1.7625070810317993, |
| "learning_rate": 1.6765905345398618e-05, |
| "loss": 0.5424, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.8110599078341014, |
| "grad_norm": 2.727060556411743, |
| "learning_rate": 1.6731263561467514e-05, |
| "loss": 0.533, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.8202764976958525, |
| "grad_norm": 1.631017804145813, |
| "learning_rate": 1.6696478350860625e-05, |
| "loss": 0.6217, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.8294930875576036, |
| "grad_norm": 1.3059254884719849, |
| "learning_rate": 1.666155058408879e-05, |
| "loss": 0.2015, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.838709677419355, |
| "grad_norm": 1.4272629022598267, |
| "learning_rate": 1.6626481135230378e-05, |
| "loss": 0.3501, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.8479262672811059, |
| "grad_norm": 2.0381100177764893, |
| "learning_rate": 1.6591270881909393e-05, |
| "loss": 0.3421, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.8571428571428572, |
| "grad_norm": 0.9297720193862915, |
| "learning_rate": 1.6555920705273513e-05, |
| "loss": 0.4342, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.8663594470046083, |
| "grad_norm": 1.7821056842803955, |
| "learning_rate": 1.6520431489972043e-05, |
| "loss": 0.3369, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.8755760368663594, |
| "grad_norm": 1.8966842889785767, |
| "learning_rate": 1.6484804124133772e-05, |
| "loss": 0.4105, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.8847926267281108, |
| "grad_norm": 1.2168216705322266, |
| "learning_rate": 1.6449039499344755e-05, |
| "loss": 0.3152, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.8940092165898617, |
| "grad_norm": 1.5273537635803223, |
| "learning_rate": 1.6413138510625994e-05, |
| "loss": 0.5443, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.903225806451613, |
| "grad_norm": 1.479945421218872, |
| "learning_rate": 1.637710205641103e-05, |
| "loss": 0.3981, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.912442396313364, |
| "grad_norm": 1.8276926279067993, |
| "learning_rate": 1.634093103852349e-05, |
| "loss": 0.5654, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.9216589861751152, |
| "grad_norm": 1.8224812746047974, |
| "learning_rate": 1.6304626362154484e-05, |
| "loss": 0.5854, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.9308755760368663, |
| "grad_norm": 0.9820652604103088, |
| "learning_rate": 1.6268188935839976e-05, |
| "loss": 0.3632, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.9400921658986174, |
| "grad_norm": 1.9651046991348267, |
| "learning_rate": 1.623161967143803e-05, |
| "loss": 0.4772, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.9493087557603688, |
| "grad_norm": 1.4014381170272827, |
| "learning_rate": 1.6194919484106016e-05, |
| "loss": 0.313, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.9585253456221197, |
| "grad_norm": 2.253092050552368, |
| "learning_rate": 1.6158089292277674e-05, |
| "loss": 0.7339, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.967741935483871, |
| "grad_norm": 1.777227520942688, |
| "learning_rate": 1.612113001764016e-05, |
| "loss": 0.3213, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.976958525345622, |
| "grad_norm": 1.4329880475997925, |
| "learning_rate": 1.6084042585110955e-05, |
| "loss": 0.2623, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.9861751152073732, |
| "grad_norm": 1.3090839385986328, |
| "learning_rate": 1.6046827922814746e-05, |
| "loss": 0.6879, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.9953917050691246, |
| "grad_norm": 1.9153245687484741, |
| "learning_rate": 1.6009486962060175e-05, |
| "loss": 0.5763, |
| "step": 217 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.4464911222457886, |
| "learning_rate": 1.597202063731655e-05, |
| "loss": 0.1043, |
| "step": 218 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 648, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.704573679368929e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |