{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0846560846560847,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021164021164021163,
      "grad_norm": 6.048354625701904,
      "learning_rate": 1e-05,
      "loss": 3.1366,
      "mean_token_accuracy": 0.8490224033594131,
      "step": 1
    },
    {
      "epoch": 0.042328042328042326,
      "grad_norm": 6.282668590545654,
      "learning_rate": 2e-05,
      "loss": 3.3445,
      "mean_token_accuracy": 0.8413912802934647,
      "step": 2
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 5.931667327880859,
      "learning_rate": 3e-05,
      "loss": 3.6371,
      "mean_token_accuracy": 0.819273829460144,
      "step": 3
    },
    {
      "epoch": 0.08465608465608465,
      "grad_norm": 5.271315097808838,
      "learning_rate": 4e-05,
      "loss": 2.8728,
      "mean_token_accuracy": 0.864514172077179,
      "step": 4
    },
    {
      "epoch": 0.10582010582010581,
      "grad_norm": 6.306461811065674,
      "learning_rate": 5e-05,
      "loss": 3.2277,
      "mean_token_accuracy": 0.8422431498765945,
      "step": 5
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 7.170639514923096,
      "learning_rate": 4.9473684210526315e-05,
      "loss": 3.1188,
      "mean_token_accuracy": 0.8523378223180771,
      "step": 6
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 6.320353984832764,
      "learning_rate": 4.8947368421052635e-05,
      "loss": 3.2011,
      "mean_token_accuracy": 0.8276277035474777,
      "step": 7
    },
    {
      "epoch": 0.1693121693121693,
      "grad_norm": 7.527361869812012,
      "learning_rate": 4.842105263157895e-05,
      "loss": 3.5806,
      "mean_token_accuracy": 0.8307382017374039,
      "step": 8
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 9.139302253723145,
      "learning_rate": 4.789473684210526e-05,
      "loss": 3.3045,
      "mean_token_accuracy": 0.8234163820743561,
      "step": 9
    },
    {
      "epoch": 0.21164021164021163,
      "grad_norm": 8.344583511352539,
      "learning_rate": 4.736842105263158e-05,
      "loss": 3.4053,
      "mean_token_accuracy": 0.8328999727964401,
      "step": 10
    },
    {
      "epoch": 0.2328042328042328,
      "grad_norm": 7.198660850524902,
      "learning_rate": 4.68421052631579e-05,
      "loss": 3.6999,
      "mean_token_accuracy": 0.8166868537664413,
      "step": 11
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 7.656338691711426,
      "learning_rate": 4.6315789473684214e-05,
      "loss": 3.5236,
      "mean_token_accuracy": 0.8194563537836075,
      "step": 12
    },
    {
      "epoch": 0.2751322751322751,
      "grad_norm": 6.596268653869629,
      "learning_rate": 4.5789473684210527e-05,
      "loss": 3.1002,
      "mean_token_accuracy": 0.8456875830888748,
      "step": 13
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 7.026238918304443,
      "learning_rate": 4.5263157894736846e-05,
      "loss": 3.5912,
      "mean_token_accuracy": 0.8176615834236145,
      "step": 14
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 6.6125640869140625,
      "learning_rate": 4.473684210526316e-05,
      "loss": 3.2967,
      "mean_token_accuracy": 0.8385807871818542,
      "step": 15
    },
    {
      "epoch": 0.3386243386243386,
      "grad_norm": 6.488234043121338,
      "learning_rate": 4.421052631578947e-05,
      "loss": 2.9518,
      "mean_token_accuracy": 0.8566168546676636,
      "step": 16
    },
    {
      "epoch": 0.35978835978835977,
      "grad_norm": 6.3693389892578125,
      "learning_rate": 4.368421052631579e-05,
      "loss": 3.4379,
      "mean_token_accuracy": 0.8285104632377625,
      "step": 17
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 6.490217208862305,
      "learning_rate": 4.3157894736842105e-05,
      "loss": 3.2799,
      "mean_token_accuracy": 0.8258774131536484,
      "step": 18
    },
    {
      "epoch": 0.4021164021164021,
      "grad_norm": 6.357998847961426,
      "learning_rate": 4.2631578947368425e-05,
      "loss": 3.556,
      "mean_token_accuracy": 0.8310040086507797,
      "step": 19
    },
    {
      "epoch": 0.42328042328042326,
      "grad_norm": 6.8284807205200195,
      "learning_rate": 4.210526315789474e-05,
      "loss": 3.225,
      "mean_token_accuracy": 0.8262167721986771,
      "step": 20
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 6.557299613952637,
      "learning_rate": 4.157894736842106e-05,
      "loss": 2.8915,
      "mean_token_accuracy": 0.8450914323329926,
      "step": 21
    },
    {
      "epoch": 0.4656084656084656,
      "grad_norm": 6.227797508239746,
      "learning_rate": 4.105263157894737e-05,
      "loss": 3.5964,
      "mean_token_accuracy": 0.8293115049600601,
      "step": 22
    },
    {
      "epoch": 0.48677248677248675,
      "grad_norm": 6.4811482429504395,
      "learning_rate": 4.0526315789473684e-05,
      "loss": 2.9146,
      "mean_token_accuracy": 0.8511508405208588,
      "step": 23
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 7.390919208526611,
      "learning_rate": 4e-05,
      "loss": 3.0839,
      "mean_token_accuracy": 0.8521782755851746,
      "step": 24
    },
    {
      "epoch": 0.5291005291005291,
      "grad_norm": 7.000582695007324,
      "learning_rate": 3.9473684210526316e-05,
      "loss": 3.419,
      "mean_token_accuracy": 0.8302050828933716,
      "step": 25
    },
    {
      "epoch": 0.5502645502645502,
      "grad_norm": 7.646208763122559,
      "learning_rate": 3.894736842105263e-05,
      "loss": 3.5312,
      "mean_token_accuracy": 0.8222539722919464,
      "step": 26
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 8.202258110046387,
      "learning_rate": 3.842105263157895e-05,
      "loss": 3.2736,
      "mean_token_accuracy": 0.8385538011789322,
      "step": 27
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 7.112030982971191,
      "learning_rate": 3.789473684210527e-05,
      "loss": 3.5607,
      "mean_token_accuracy": 0.8270172476768494,
      "step": 28
    },
    {
      "epoch": 0.6137566137566137,
      "grad_norm": 7.77510404586792,
      "learning_rate": 3.736842105263158e-05,
      "loss": 3.4509,
      "mean_token_accuracy": 0.8338000029325485,
      "step": 29
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 8.627528190612793,
      "learning_rate": 3.6842105263157895e-05,
      "loss": 3.4089,
      "mean_token_accuracy": 0.8251242786645889,
      "step": 30
    },
    {
      "epoch": 0.656084656084656,
      "grad_norm": 7.149316787719727,
      "learning_rate": 3.6315789473684214e-05,
      "loss": 3.0029,
      "mean_token_accuracy": 0.8458760529756546,
      "step": 31
    },
    {
      "epoch": 0.6772486772486772,
      "grad_norm": 8.06534194946289,
      "learning_rate": 3.578947368421053e-05,
      "loss": 3.2129,
      "mean_token_accuracy": 0.8390202522277832,
      "step": 32
    },
    {
      "epoch": 0.6984126984126984,
      "grad_norm": 6.458263397216797,
      "learning_rate": 3.526315789473684e-05,
      "loss": 3.7385,
      "mean_token_accuracy": 0.8196459114551544,
      "step": 33
    },
    {
      "epoch": 0.7195767195767195,
      "grad_norm": 6.388949394226074,
      "learning_rate": 3.473684210526316e-05,
      "loss": 3.0692,
      "mean_token_accuracy": 0.8393152952194214,
      "step": 34
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 7.467120170593262,
      "learning_rate": 3.421052631578947e-05,
      "loss": 3.4118,
      "mean_token_accuracy": 0.8257147669792175,
      "step": 35
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 7.274282932281494,
      "learning_rate": 3.368421052631579e-05,
      "loss": 3.2201,
      "mean_token_accuracy": 0.8417303115129471,
      "step": 36
    },
    {
      "epoch": 0.783068783068783,
      "grad_norm": 7.1840500831604,
      "learning_rate": 3.3157894736842106e-05,
      "loss": 3.2211,
      "mean_token_accuracy": 0.8373906314373016,
      "step": 37
    },
    {
      "epoch": 0.8042328042328042,
      "grad_norm": 6.900996685028076,
      "learning_rate": 3.2631578947368426e-05,
      "loss": 3.4252,
      "mean_token_accuracy": 0.8245490044355392,
      "step": 38
    },
    {
      "epoch": 0.8253968253968254,
      "grad_norm": 6.207952976226807,
      "learning_rate": 3.210526315789474e-05,
      "loss": 3.1038,
      "mean_token_accuracy": 0.8480600118637085,
      "step": 39
    },
    {
      "epoch": 0.8465608465608465,
      "grad_norm": 6.920236587524414,
      "learning_rate": 3.157894736842105e-05,
      "loss": 3.3569,
      "mean_token_accuracy": 0.8315872251987457,
      "step": 40
    },
    {
      "epoch": 0.8677248677248677,
      "grad_norm": 7.361562728881836,
      "learning_rate": 3.105263157894737e-05,
      "loss": 3.6389,
      "mean_token_accuracy": 0.8231257498264313,
      "step": 41
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 6.357892036437988,
      "learning_rate": 3.0526315789473684e-05,
      "loss": 3.3438,
      "mean_token_accuracy": 0.8264831602573395,
      "step": 42
    },
    {
      "epoch": 0.91005291005291,
      "grad_norm": 6.043882369995117,
      "learning_rate": 3e-05,
      "loss": 3.3171,
      "mean_token_accuracy": 0.8378995358943939,
      "step": 43
    },
    {
      "epoch": 0.9312169312169312,
      "grad_norm": 6.81577205657959,
      "learning_rate": 2.9473684210526314e-05,
      "loss": 3.2854,
      "mean_token_accuracy": 0.8394952565431595,
      "step": 44
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 6.861083030700684,
      "learning_rate": 2.8947368421052634e-05,
      "loss": 3.1802,
      "mean_token_accuracy": 0.8438083976507187,
      "step": 45
    },
    {
      "epoch": 0.9735449735449735,
      "grad_norm": 7.255928039550781,
      "learning_rate": 2.842105263157895e-05,
      "loss": 3.7687,
      "mean_token_accuracy": 0.8107610791921616,
      "step": 46
    },
    {
      "epoch": 0.9947089947089947,
      "grad_norm": 6.662189483642578,
      "learning_rate": 2.7894736842105263e-05,
      "loss": 2.8252,
      "mean_token_accuracy": 0.8516946136951447,
      "step": 47
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.8546807765960693,
      "learning_rate": 2.7368421052631583e-05,
      "loss": 0.7518,
      "mean_token_accuracy": 0.8395061492919922,
      "step": 48
    },
    {
      "epoch": 1.0211640211640212,
      "grad_norm": 5.928465843200684,
      "learning_rate": 2.6842105263157896e-05,
      "loss": 2.9837,
      "mean_token_accuracy": 0.8488393574953079,
      "step": 49
    },
    {
      "epoch": 1.0423280423280423,
      "grad_norm": 7.0468363761901855,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 2.9965,
      "mean_token_accuracy": 0.8516333252191544,
      "step": 50
    },
    {
      "epoch": 1.0634920634920635,
      "grad_norm": 6.113859176635742,
      "learning_rate": 2.578947368421053e-05,
      "loss": 2.9239,
      "mean_token_accuracy": 0.846625804901123,
      "step": 51
    },
    {
      "epoch": 1.0846560846560847,
      "grad_norm": 6.449796199798584,
      "learning_rate": 2.5263157894736845e-05,
      "loss": 2.9745,
      "mean_token_accuracy": 0.8498904705047607,
      "step": 52
    },
    {
      "epoch": 1.1058201058201058,
      "grad_norm": 6.227660655975342,
      "learning_rate": 2.4736842105263158e-05,
      "loss": 2.7134,
      "mean_token_accuracy": 0.8572945147752762,
      "step": 53
    },
    {
      "epoch": 1.126984126984127,
      "grad_norm": 6.191722869873047,
      "learning_rate": 2.4210526315789474e-05,
      "loss": 2.9167,
      "mean_token_accuracy": 0.8419267982244492,
      "step": 54
    },
    {
      "epoch": 1.1481481481481481,
      "grad_norm": 6.0019307136535645,
      "learning_rate": 2.368421052631579e-05,
      "loss": 2.8168,
      "mean_token_accuracy": 0.8558094352483749,
      "step": 55
    },
    {
      "epoch": 1.1693121693121693,
      "grad_norm": 6.954141139984131,
      "learning_rate": 2.3157894736842107e-05,
      "loss": 3.0574,
      "mean_token_accuracy": 0.8407768905162811,
      "step": 56
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 6.376841068267822,
      "learning_rate": 2.2631578947368423e-05,
      "loss": 3.1123,
      "mean_token_accuracy": 0.8525369018316269,
      "step": 57
    },
    {
      "epoch": 1.2116402116402116,
      "grad_norm": 6.693148612976074,
      "learning_rate": 2.2105263157894736e-05,
      "loss": 2.351,
      "mean_token_accuracy": 0.8728293180465698,
      "step": 58
    },
    {
      "epoch": 1.2328042328042328,
      "grad_norm": 7.382028102874756,
      "learning_rate": 2.1578947368421053e-05,
      "loss": 3.065,
      "mean_token_accuracy": 0.8475844711065292,
      "step": 59
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 6.0569682121276855,
      "learning_rate": 2.105263157894737e-05,
      "loss": 2.7,
      "mean_token_accuracy": 0.8730548322200775,
      "step": 60
    },
    {
      "epoch": 1.2751322751322751,
      "grad_norm": 6.204995632171631,
      "learning_rate": 2.0526315789473685e-05,
      "loss": 2.4867,
      "mean_token_accuracy": 0.8747055679559708,
      "step": 61
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 7.9132466316223145,
      "learning_rate": 2e-05,
      "loss": 2.634,
      "mean_token_accuracy": 0.8690300732851028,
      "step": 62
    },
    {
      "epoch": 1.3174603174603174,
      "grad_norm": 6.757488250732422,
      "learning_rate": 1.9473684210526315e-05,
      "loss": 2.4285,
      "mean_token_accuracy": 0.8734888136386871,
      "step": 63
    },
    {
      "epoch": 1.3386243386243386,
      "grad_norm": 6.5959930419921875,
      "learning_rate": 1.8947368421052634e-05,
      "loss": 2.7514,
      "mean_token_accuracy": 0.861017182469368,
      "step": 64
    },
    {
      "epoch": 1.3597883597883598,
      "grad_norm": 6.472837448120117,
      "learning_rate": 1.8421052631578947e-05,
      "loss": 2.3507,
      "mean_token_accuracy": 0.8777505159378052,
      "step": 65
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 7.2989959716796875,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 2.5765,
      "mean_token_accuracy": 0.8751945346593857,
      "step": 66
    },
    {
      "epoch": 1.402116402116402,
      "grad_norm": 7.222543716430664,
      "learning_rate": 1.736842105263158e-05,
      "loss": 2.8945,
      "mean_token_accuracy": 0.8649410307407379,
      "step": 67
    },
    {
      "epoch": 1.4232804232804233,
      "grad_norm": 8.021060943603516,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 3.104,
      "mean_token_accuracy": 0.841856375336647,
      "step": 68
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 6.828577995300293,
      "learning_rate": 1.6315789473684213e-05,
      "loss": 2.562,
      "mean_token_accuracy": 0.8617057651281357,
      "step": 69
    },
    {
      "epoch": 1.4656084656084656,
      "grad_norm": 7.925837993621826,
      "learning_rate": 1.5789473684210526e-05,
      "loss": 3.0748,
      "mean_token_accuracy": 0.8369355648756027,
      "step": 70
    },
    {
      "epoch": 1.4867724867724867,
      "grad_norm": 7.627265930175781,
      "learning_rate": 1.5263157894736842e-05,
      "loss": 2.7155,
      "mean_token_accuracy": 0.8586581945419312,
      "step": 71
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 6.797243595123291,
      "learning_rate": 1.4736842105263157e-05,
      "loss": 2.9972,
      "mean_token_accuracy": 0.8588763475418091,
      "step": 72
    },
    {
      "epoch": 1.529100529100529,
      "grad_norm": 6.557271957397461,
      "learning_rate": 1.4210526315789475e-05,
      "loss": 2.7672,
      "mean_token_accuracy": 0.8697129487991333,
      "step": 73
    },
    {
      "epoch": 1.5502645502645502,
      "grad_norm": 7.8767828941345215,
      "learning_rate": 1.3684210526315791e-05,
      "loss": 2.6796,
      "mean_token_accuracy": 0.8595937788486481,
      "step": 74
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 6.961939334869385,
      "learning_rate": 1.3157894736842106e-05,
      "loss": 2.9875,
      "mean_token_accuracy": 0.8416125029325485,
      "step": 75
    },
    {
      "epoch": 1.5925925925925926,
      "grad_norm": 7.682628154754639,
      "learning_rate": 1.2631578947368422e-05,
      "loss": 2.7506,
      "mean_token_accuracy": 0.8638267368078232,
      "step": 76
    },
    {
      "epoch": 1.6137566137566137,
      "grad_norm": 7.030145168304443,
      "learning_rate": 1.2105263157894737e-05,
      "loss": 3.016,
      "mean_token_accuracy": 0.8373551815748215,
      "step": 77
    },
    {
      "epoch": 1.6349206349206349,
      "grad_norm": 6.8483357429504395,
      "learning_rate": 1.1578947368421053e-05,
      "loss": 2.9225,
      "mean_token_accuracy": 0.8537007570266724,
      "step": 78
    },
    {
      "epoch": 1.656084656084656,
      "grad_norm": 6.3635358810424805,
      "learning_rate": 1.1052631578947368e-05,
      "loss": 2.605,
      "mean_token_accuracy": 0.8712249547243118,
      "step": 79
    },
    {
      "epoch": 1.6772486772486772,
      "grad_norm": 6.841208457946777,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 2.649,
      "mean_token_accuracy": 0.8548033833503723,
      "step": 80
    },
    {
      "epoch": 1.6984126984126984,
      "grad_norm": 6.934281826019287,
      "learning_rate": 1e-05,
      "loss": 2.7121,
      "mean_token_accuracy": 0.8624187707901001,
      "step": 81
    },
    {
      "epoch": 1.7195767195767195,
      "grad_norm": 7.023396968841553,
      "learning_rate": 9.473684210526317e-06,
      "loss": 2.7811,
      "mean_token_accuracy": 0.8551411479711533,
      "step": 82
    },
    {
      "epoch": 1.7407407407407407,
      "grad_norm": 6.556440830230713,
      "learning_rate": 8.947368421052632e-06,
      "loss": 2.4777,
      "mean_token_accuracy": 0.8696417659521103,
      "step": 83
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 6.040866374969482,
      "learning_rate": 8.421052631578948e-06,
      "loss": 2.578,
      "mean_token_accuracy": 0.8646088093519211,
      "step": 84
    },
    {
      "epoch": 1.783068783068783,
      "grad_norm": 6.976686000823975,
      "learning_rate": 7.894736842105263e-06,
      "loss": 3.0012,
      "mean_token_accuracy": 0.8455324172973633,
      "step": 85
    },
    {
      "epoch": 1.8042328042328042,
      "grad_norm": 6.477259635925293,
      "learning_rate": 7.3684210526315784e-06,
      "loss": 2.6865,
      "mean_token_accuracy": 0.8627304285764694,
      "step": 86
    },
    {
      "epoch": 1.8253968253968254,
      "grad_norm": 6.601570129394531,
      "learning_rate": 6.842105263157896e-06,
      "loss": 2.8802,
      "mean_token_accuracy": 0.8515039086341858,
      "step": 87
    },
    {
      "epoch": 1.8465608465608465,
      "grad_norm": 6.802147388458252,
      "learning_rate": 6.315789473684211e-06,
      "loss": 2.6333,
      "mean_token_accuracy": 0.8652176707983017,
      "step": 88
    },
    {
      "epoch": 1.8677248677248677,
      "grad_norm": 6.9757843017578125,
      "learning_rate": 5.789473684210527e-06,
      "loss": 2.5824,
      "mean_token_accuracy": 0.8683836162090302,
      "step": 89
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 8.725011825561523,
      "learning_rate": 5.263157894736842e-06,
      "loss": 2.0392,
      "mean_token_accuracy": 0.8908423334360123,
      "step": 90
    },
    {
      "epoch": 1.91005291005291,
      "grad_norm": 6.747620105743408,
      "learning_rate": 4.736842105263159e-06,
      "loss": 2.5522,
      "mean_token_accuracy": 0.8612807095050812,
      "step": 91
    },
    {
      "epoch": 1.9312169312169312,
      "grad_norm": 7.072465419769287,
      "learning_rate": 4.210526315789474e-06,
      "loss": 3.1815,
      "mean_token_accuracy": 0.836863324046135,
      "step": 92
    },
    {
      "epoch": 1.9523809523809523,
      "grad_norm": 5.726354122161865,
      "learning_rate": 3.6842105263157892e-06,
      "loss": 2.8141,
      "mean_token_accuracy": 0.8544116318225861,
      "step": 93
    },
    {
      "epoch": 1.9735449735449735,
      "grad_norm": 6.540099620819092,
      "learning_rate": 3.1578947368421056e-06,
      "loss": 2.2286,
      "mean_token_accuracy": 0.8898429274559021,
      "step": 94
    },
    {
      "epoch": 1.9947089947089947,
      "grad_norm": 6.0961222648620605,
      "learning_rate": 2.631578947368421e-06,
      "loss": 2.634,
      "mean_token_accuracy": 0.8558461666107178,
      "step": 95
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.9811086654663086,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.3701,
      "mean_token_accuracy": 0.8978873491287231,
      "step": 96
    },
    {
      "epoch": 2.0211640211640214,
      "grad_norm": 5.955318450927734,
      "learning_rate": 1.5789473684210528e-06,
      "loss": 2.499,
      "mean_token_accuracy": 0.8697298467159271,
      "step": 97
    },
    {
      "epoch": 2.0423280423280423,
      "grad_norm": 5.380612373352051,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 2.2655,
      "mean_token_accuracy": 0.8864545077085495,
      "step": 98
    },
    {
      "epoch": 2.0634920634920633,
      "grad_norm": 5.8451457023620605,
      "learning_rate": 5.263157894736843e-07,
      "loss": 1.9556,
      "mean_token_accuracy": 0.8944731205701828,
      "step": 99
    },
    {
      "epoch": 2.0846560846560847,
      "grad_norm": 6.115093231201172,
      "learning_rate": 0.0,
      "loss": 2.4345,
      "mean_token_accuracy": 0.8695619106292725,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 514181058527232.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}