{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0846560846560847, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021164021164021163, "grad_norm": 6.048354625701904, "learning_rate": 1e-05, "loss": 3.1366, "mean_token_accuracy": 0.8490224033594131, "step": 1 }, { "epoch": 0.042328042328042326, "grad_norm": 6.282668590545654, "learning_rate": 2e-05, "loss": 3.3445, "mean_token_accuracy": 0.8413912802934647, "step": 2 }, { "epoch": 0.06349206349206349, "grad_norm": 5.931667327880859, "learning_rate": 3e-05, "loss": 3.6371, "mean_token_accuracy": 0.819273829460144, "step": 3 }, { "epoch": 0.08465608465608465, "grad_norm": 5.271315097808838, "learning_rate": 4e-05, "loss": 2.8728, "mean_token_accuracy": 0.864514172077179, "step": 4 }, { "epoch": 0.10582010582010581, "grad_norm": 6.306461811065674, "learning_rate": 5e-05, "loss": 3.2277, "mean_token_accuracy": 0.8422431498765945, "step": 5 }, { "epoch": 0.12698412698412698, "grad_norm": 7.170639514923096, "learning_rate": 4.9473684210526315e-05, "loss": 3.1188, "mean_token_accuracy": 0.8523378223180771, "step": 6 }, { "epoch": 0.14814814814814814, "grad_norm": 6.320353984832764, "learning_rate": 4.8947368421052635e-05, "loss": 3.2011, "mean_token_accuracy": 0.8276277035474777, "step": 7 }, { "epoch": 0.1693121693121693, "grad_norm": 7.527361869812012, "learning_rate": 4.842105263157895e-05, "loss": 3.5806, "mean_token_accuracy": 0.8307382017374039, "step": 8 }, { "epoch": 0.19047619047619047, "grad_norm": 9.139302253723145, "learning_rate": 4.789473684210526e-05, "loss": 3.3045, "mean_token_accuracy": 0.8234163820743561, "step": 9 }, { "epoch": 0.21164021164021163, "grad_norm": 8.344583511352539, "learning_rate": 4.736842105263158e-05, "loss": 3.4053, "mean_token_accuracy": 0.8328999727964401, "step": 10 }, { "epoch": 0.2328042328042328, "grad_norm": 7.198660850524902, "learning_rate": 4.68421052631579e-05, "loss": 3.6999, "mean_token_accuracy": 0.8166868537664413, "step": 11 }, { "epoch": 0.25396825396825395, "grad_norm": 7.656338691711426, "learning_rate": 4.6315789473684214e-05, "loss": 3.5236, "mean_token_accuracy": 0.8194563537836075, "step": 12 }, { "epoch": 0.2751322751322751, "grad_norm": 6.596268653869629, "learning_rate": 4.5789473684210527e-05, "loss": 3.1002, "mean_token_accuracy": 0.8456875830888748, "step": 13 }, { "epoch": 0.2962962962962963, "grad_norm": 7.026238918304443, "learning_rate": 4.5263157894736846e-05, "loss": 3.5912, "mean_token_accuracy": 0.8176615834236145, "step": 14 }, { "epoch": 0.31746031746031744, "grad_norm": 6.6125640869140625, "learning_rate": 4.473684210526316e-05, "loss": 3.2967, "mean_token_accuracy": 0.8385807871818542, "step": 15 }, { "epoch": 0.3386243386243386, "grad_norm": 6.488234043121338, "learning_rate": 4.421052631578947e-05, "loss": 2.9518, "mean_token_accuracy": 0.8566168546676636, "step": 16 }, { "epoch": 0.35978835978835977, "grad_norm": 6.3693389892578125, "learning_rate": 4.368421052631579e-05, "loss": 3.4379, "mean_token_accuracy": 0.8285104632377625, "step": 17 }, { "epoch": 0.38095238095238093, "grad_norm": 6.490217208862305, "learning_rate": 4.3157894736842105e-05, "loss": 3.2799, "mean_token_accuracy": 0.8258774131536484, "step": 18 }, { "epoch": 0.4021164021164021, "grad_norm": 6.357998847961426, "learning_rate": 4.2631578947368425e-05, "loss": 3.556, "mean_token_accuracy": 0.8310040086507797, "step": 19 }, { "epoch": 0.42328042328042326, "grad_norm": 6.8284807205200195, "learning_rate": 4.210526315789474e-05, "loss": 3.225, "mean_token_accuracy": 0.8262167721986771, "step": 20 }, { "epoch": 0.4444444444444444, "grad_norm": 6.557299613952637, "learning_rate": 4.157894736842106e-05, "loss": 2.8915, "mean_token_accuracy": 0.8450914323329926, "step": 21 }, { "epoch": 0.4656084656084656, "grad_norm": 6.227797508239746, "learning_rate": 4.105263157894737e-05, "loss": 3.5964, "mean_token_accuracy": 0.8293115049600601, "step": 22 }, { "epoch": 0.48677248677248675, "grad_norm": 6.4811482429504395, "learning_rate": 4.0526315789473684e-05, "loss": 2.9146, "mean_token_accuracy": 0.8511508405208588, "step": 23 }, { "epoch": 0.5079365079365079, "grad_norm": 7.390919208526611, "learning_rate": 4e-05, "loss": 3.0839, "mean_token_accuracy": 0.8521782755851746, "step": 24 }, { "epoch": 0.5291005291005291, "grad_norm": 7.000582695007324, "learning_rate": 3.9473684210526316e-05, "loss": 3.419, "mean_token_accuracy": 0.8302050828933716, "step": 25 }, { "epoch": 0.5502645502645502, "grad_norm": 7.646208763122559, "learning_rate": 3.894736842105263e-05, "loss": 3.5312, "mean_token_accuracy": 0.8222539722919464, "step": 26 }, { "epoch": 0.5714285714285714, "grad_norm": 8.202258110046387, "learning_rate": 3.842105263157895e-05, "loss": 3.2736, "mean_token_accuracy": 0.8385538011789322, "step": 27 }, { "epoch": 0.5925925925925926, "grad_norm": 7.112030982971191, "learning_rate": 3.789473684210527e-05, "loss": 3.5607, "mean_token_accuracy": 0.8270172476768494, "step": 28 }, { "epoch": 0.6137566137566137, "grad_norm": 7.77510404586792, "learning_rate": 3.736842105263158e-05, "loss": 3.4509, "mean_token_accuracy": 0.8338000029325485, "step": 29 }, { "epoch": 0.6349206349206349, "grad_norm": 8.627528190612793, "learning_rate": 3.6842105263157895e-05, "loss": 3.4089, "mean_token_accuracy": 0.8251242786645889, "step": 30 }, { "epoch": 0.656084656084656, "grad_norm": 7.149316787719727, "learning_rate": 3.6315789473684214e-05, "loss": 3.0029, "mean_token_accuracy": 0.8458760529756546, "step": 31 }, { "epoch": 0.6772486772486772, "grad_norm": 8.06534194946289, "learning_rate": 3.578947368421053e-05, "loss": 3.2129, "mean_token_accuracy": 0.8390202522277832, "step": 32 }, { "epoch": 0.6984126984126984, "grad_norm": 6.458263397216797, "learning_rate": 3.526315789473684e-05, "loss": 3.7385, "mean_token_accuracy": 0.8196459114551544, "step": 33 }, { "epoch": 0.7195767195767195, "grad_norm": 6.388949394226074, "learning_rate": 3.473684210526316e-05, "loss": 3.0692, "mean_token_accuracy": 0.8393152952194214, "step": 34 }, { "epoch": 0.7407407407407407, "grad_norm": 7.467120170593262, "learning_rate": 3.421052631578947e-05, "loss": 3.4118, "mean_token_accuracy": 0.8257147669792175, "step": 35 }, { "epoch": 0.7619047619047619, "grad_norm": 7.274282932281494, "learning_rate": 3.368421052631579e-05, "loss": 3.2201, "mean_token_accuracy": 0.8417303115129471, "step": 36 }, { "epoch": 0.783068783068783, "grad_norm": 7.1840500831604, "learning_rate": 3.3157894736842106e-05, "loss": 3.2211, "mean_token_accuracy": 0.8373906314373016, "step": 37 }, { "epoch": 0.8042328042328042, "grad_norm": 6.900996685028076, "learning_rate": 3.2631578947368426e-05, "loss": 3.4252, "mean_token_accuracy": 0.8245490044355392, "step": 38 }, { "epoch": 0.8253968253968254, "grad_norm": 6.207952976226807, "learning_rate": 3.210526315789474e-05, "loss": 3.1038, "mean_token_accuracy": 0.8480600118637085, "step": 39 }, { "epoch": 0.8465608465608465, "grad_norm": 6.920236587524414, "learning_rate": 3.157894736842105e-05, "loss": 3.3569, "mean_token_accuracy": 0.8315872251987457, "step": 40 }, { "epoch": 0.8677248677248677, "grad_norm": 7.361562728881836, "learning_rate": 3.105263157894737e-05, "loss": 3.6389, "mean_token_accuracy": 0.8231257498264313, "step": 41 }, { "epoch": 0.8888888888888888, "grad_norm": 6.357892036437988, "learning_rate": 3.0526315789473684e-05, "loss": 3.3438, "mean_token_accuracy": 0.8264831602573395, "step": 42 }, { "epoch": 0.91005291005291, "grad_norm": 6.043882369995117, "learning_rate": 3e-05, "loss": 3.3171, "mean_token_accuracy": 0.8378995358943939, "step": 43 }, { "epoch": 0.9312169312169312, "grad_norm": 6.81577205657959, "learning_rate": 2.9473684210526314e-05, "loss": 3.2854, "mean_token_accuracy": 0.8394952565431595, "step": 44 }, { "epoch": 0.9523809523809523, "grad_norm": 6.861083030700684, "learning_rate": 2.8947368421052634e-05, "loss": 3.1802, "mean_token_accuracy": 0.8438083976507187, "step": 45 }, { "epoch": 0.9735449735449735, "grad_norm": 7.255928039550781, "learning_rate": 2.842105263157895e-05, "loss": 3.7687, "mean_token_accuracy": 0.8107610791921616, "step": 46 }, { "epoch": 0.9947089947089947, "grad_norm": 6.662189483642578, "learning_rate": 2.7894736842105263e-05, "loss": 2.8252, "mean_token_accuracy": 0.8516946136951447, "step": 47 }, { "epoch": 1.0, "grad_norm": 2.8546807765960693, "learning_rate": 2.7368421052631583e-05, "loss": 0.7518, "mean_token_accuracy": 0.8395061492919922, "step": 48 }, { "epoch": 1.0211640211640212, "grad_norm": 5.928465843200684, "learning_rate": 2.6842105263157896e-05, "loss": 2.9837, "mean_token_accuracy": 0.8488393574953079, "step": 49 }, { "epoch": 1.0423280423280423, "grad_norm": 7.0468363761901855, "learning_rate": 2.6315789473684212e-05, "loss": 2.9965, "mean_token_accuracy": 0.8516333252191544, "step": 50 }, { "epoch": 1.0634920634920635, "grad_norm": 6.113859176635742, "learning_rate": 2.578947368421053e-05, "loss": 2.9239, "mean_token_accuracy": 0.846625804901123, "step": 51 }, { "epoch": 1.0846560846560847, "grad_norm": 6.449796199798584, "learning_rate": 2.5263157894736845e-05, "loss": 2.9745, "mean_token_accuracy": 0.8498904705047607, "step": 52 }, { "epoch": 1.1058201058201058, "grad_norm": 6.227660655975342, "learning_rate": 2.4736842105263158e-05, "loss": 2.7134, "mean_token_accuracy": 0.8572945147752762, "step": 53 }, { "epoch": 1.126984126984127, "grad_norm": 6.191722869873047, "learning_rate": 2.4210526315789474e-05, "loss": 2.9167, "mean_token_accuracy": 0.8419267982244492, "step": 54 }, { "epoch": 1.1481481481481481, "grad_norm": 6.0019307136535645, "learning_rate": 2.368421052631579e-05, "loss": 2.8168, "mean_token_accuracy": 0.8558094352483749, "step": 55 }, { "epoch": 1.1693121693121693, "grad_norm": 6.954141139984131, "learning_rate": 2.3157894736842107e-05, "loss": 3.0574, "mean_token_accuracy": 0.8407768905162811, "step": 56 }, { "epoch": 1.1904761904761905, "grad_norm": 6.376841068267822, "learning_rate": 2.2631578947368423e-05, "loss": 3.1123, "mean_token_accuracy": 0.8525369018316269, "step": 57 }, { "epoch": 1.2116402116402116, "grad_norm": 6.693148612976074, "learning_rate": 2.2105263157894736e-05, "loss": 2.351, "mean_token_accuracy": 0.8728293180465698, "step": 58 }, { "epoch": 1.2328042328042328, "grad_norm": 7.382028102874756, "learning_rate": 2.1578947368421053e-05, "loss": 3.065, "mean_token_accuracy": 0.8475844711065292, "step": 59 }, { "epoch": 1.253968253968254, "grad_norm": 6.0569682121276855, "learning_rate": 2.105263157894737e-05, "loss": 2.7, "mean_token_accuracy": 0.8730548322200775, "step": 60 }, { "epoch": 1.2751322751322751, "grad_norm": 6.204995632171631, "learning_rate": 2.0526315789473685e-05, "loss": 2.4867, "mean_token_accuracy": 0.8747055679559708, "step": 61 }, { "epoch": 1.2962962962962963, "grad_norm": 7.9132466316223145, "learning_rate": 2e-05, "loss": 2.634, "mean_token_accuracy": 0.8690300732851028, "step": 62 }, { "epoch": 1.3174603174603174, "grad_norm": 6.757488250732422, "learning_rate": 1.9473684210526315e-05, "loss": 2.4285, "mean_token_accuracy": 0.8734888136386871, "step": 63 }, { "epoch": 1.3386243386243386, "grad_norm": 6.5959930419921875, "learning_rate": 1.8947368421052634e-05, "loss": 2.7514, "mean_token_accuracy": 0.861017182469368, "step": 64 }, { "epoch": 1.3597883597883598, "grad_norm": 6.472837448120117, "learning_rate": 1.8421052631578947e-05, "loss": 2.3507, "mean_token_accuracy": 0.8777505159378052, "step": 65 }, { "epoch": 1.380952380952381, "grad_norm": 7.2989959716796875, "learning_rate": 1.7894736842105264e-05, "loss": 2.5765, "mean_token_accuracy": 0.8751945346593857, "step": 66 }, { "epoch": 1.402116402116402, "grad_norm": 7.222543716430664, "learning_rate": 1.736842105263158e-05, "loss": 2.8945, "mean_token_accuracy": 0.8649410307407379, "step": 67 }, { "epoch": 1.4232804232804233, "grad_norm": 8.021060943603516, "learning_rate": 1.6842105263157896e-05, "loss": 3.104, "mean_token_accuracy": 0.841856375336647, "step": 68 }, { "epoch": 1.4444444444444444, "grad_norm": 6.828577995300293, "learning_rate": 1.6315789473684213e-05, "loss": 2.562, "mean_token_accuracy": 0.8617057651281357, "step": 69 }, { "epoch": 1.4656084656084656, "grad_norm": 7.925837993621826, "learning_rate": 1.5789473684210526e-05, "loss": 3.0748, "mean_token_accuracy": 0.8369355648756027, "step": 70 }, { "epoch": 1.4867724867724867, "grad_norm": 7.627265930175781, "learning_rate": 1.5263157894736842e-05, "loss": 2.7155, "mean_token_accuracy": 0.8586581945419312, "step": 71 }, { "epoch": 1.507936507936508, "grad_norm": 6.797243595123291, "learning_rate": 1.4736842105263157e-05, "loss": 2.9972, "mean_token_accuracy": 0.8588763475418091, "step": 72 }, { "epoch": 1.529100529100529, "grad_norm": 6.557271957397461, "learning_rate": 1.4210526315789475e-05, "loss": 2.7672, "mean_token_accuracy": 0.8697129487991333, "step": 73 }, { "epoch": 1.5502645502645502, "grad_norm": 7.8767828941345215, "learning_rate": 1.3684210526315791e-05, "loss": 2.6796, "mean_token_accuracy": 0.8595937788486481, "step": 74 }, { "epoch": 1.5714285714285714, "grad_norm": 6.961939334869385, "learning_rate": 1.3157894736842106e-05, "loss": 2.9875, "mean_token_accuracy": 0.8416125029325485, "step": 75 }, { "epoch": 1.5925925925925926, "grad_norm": 7.682628154754639, "learning_rate": 1.2631578947368422e-05, "loss": 2.7506, "mean_token_accuracy": 0.8638267368078232, "step": 76 }, { "epoch": 1.6137566137566137, "grad_norm": 7.030145168304443, "learning_rate": 1.2105263157894737e-05, "loss": 3.016, "mean_token_accuracy": 0.8373551815748215, "step": 77 }, { "epoch": 1.6349206349206349, "grad_norm": 6.8483357429504395, "learning_rate": 1.1578947368421053e-05, "loss": 2.9225, "mean_token_accuracy": 0.8537007570266724, "step": 78 }, { "epoch": 1.656084656084656, "grad_norm": 6.3635358810424805, "learning_rate": 1.1052631578947368e-05, "loss": 2.605, "mean_token_accuracy": 0.8712249547243118, "step": 79 }, { "epoch": 1.6772486772486772, "grad_norm": 6.841208457946777, "learning_rate": 1.0526315789473684e-05, "loss": 2.649, "mean_token_accuracy": 0.8548033833503723, "step": 80 }, { "epoch": 1.6984126984126984, "grad_norm": 6.934281826019287, "learning_rate": 1e-05, "loss": 2.7121, "mean_token_accuracy": 0.8624187707901001, "step": 81 }, { "epoch": 1.7195767195767195, "grad_norm": 7.023396968841553, "learning_rate": 9.473684210526317e-06, "loss": 2.7811, "mean_token_accuracy": 0.8551411479711533, "step": 82 }, { "epoch": 1.7407407407407407, "grad_norm": 6.556440830230713, "learning_rate": 8.947368421052632e-06, "loss": 2.4777, "mean_token_accuracy": 0.8696417659521103, "step": 83 }, { "epoch": 1.7619047619047619, "grad_norm": 6.040866374969482, "learning_rate": 8.421052631578948e-06, "loss": 2.578, "mean_token_accuracy": 0.8646088093519211, "step": 84 }, { "epoch": 1.783068783068783, "grad_norm": 6.976686000823975, "learning_rate": 7.894736842105263e-06, "loss": 3.0012, "mean_token_accuracy": 0.8455324172973633, "step": 85 }, { "epoch": 1.8042328042328042, "grad_norm": 6.477259635925293, "learning_rate": 7.3684210526315784e-06, "loss": 2.6865, "mean_token_accuracy": 0.8627304285764694, "step": 86 }, { "epoch": 1.8253968253968254, "grad_norm": 6.601570129394531, "learning_rate": 6.842105263157896e-06, "loss": 2.8802, "mean_token_accuracy": 0.8515039086341858, "step": 87 }, { "epoch": 1.8465608465608465, "grad_norm": 6.802147388458252, "learning_rate": 6.315789473684211e-06, "loss": 2.6333, "mean_token_accuracy": 0.8652176707983017, "step": 88 }, { "epoch": 1.8677248677248677, "grad_norm": 6.9757843017578125, "learning_rate": 5.789473684210527e-06, "loss": 2.5824, "mean_token_accuracy": 0.8683836162090302, "step": 89 }, { "epoch": 1.8888888888888888, "grad_norm": 8.725011825561523, "learning_rate": 5.263157894736842e-06, "loss": 2.0392, "mean_token_accuracy": 0.8908423334360123, "step": 90 }, { "epoch": 1.91005291005291, "grad_norm": 6.747620105743408, "learning_rate": 4.736842105263159e-06, "loss": 2.5522, "mean_token_accuracy": 0.8612807095050812, "step": 91 }, { "epoch": 1.9312169312169312, "grad_norm": 7.072465419769287, "learning_rate": 4.210526315789474e-06, "loss": 3.1815, "mean_token_accuracy": 0.836863324046135, "step": 92 }, { "epoch": 1.9523809523809523, "grad_norm": 5.726354122161865, "learning_rate": 3.6842105263157892e-06, "loss": 2.8141, "mean_token_accuracy": 0.8544116318225861, "step": 93 }, { "epoch": 1.9735449735449735, "grad_norm": 6.540099620819092, "learning_rate": 3.1578947368421056e-06, "loss": 2.2286, "mean_token_accuracy": 0.8898429274559021, "step": 94 }, { "epoch": 1.9947089947089947, "grad_norm": 6.0961222648620605, "learning_rate": 2.631578947368421e-06, "loss": 2.634, "mean_token_accuracy": 0.8558461666107178, "step": 95 }, { "epoch": 2.0, "grad_norm": 2.9811086654663086, "learning_rate": 2.105263157894737e-06, "loss": 0.3701, "mean_token_accuracy": 0.8978873491287231, "step": 96 }, { "epoch": 2.0211640211640214, "grad_norm": 5.955318450927734, "learning_rate": 1.5789473684210528e-06, "loss": 2.499, "mean_token_accuracy": 0.8697298467159271, "step": 97 }, { "epoch": 2.0423280423280423, "grad_norm": 5.380612373352051, "learning_rate": 1.0526315789473685e-06, "loss": 2.2655, "mean_token_accuracy": 0.8864545077085495, "step": 98 }, { "epoch": 2.0634920634920633, "grad_norm": 5.8451457023620605, "learning_rate": 5.263157894736843e-07, "loss": 1.9556, "mean_token_accuracy": 0.8944731205701828, "step": 99 }, { "epoch": 2.0846560846560847, "grad_norm": 6.115093231201172, "learning_rate": 0.0, "loss": 2.4345, "mean_token_accuracy": 0.8695619106292725, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 514181058527232.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }