{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.328180737217598, "eval_steps": 500, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03170828378913991, "grad_norm": 11.620649337768555, "learning_rate": 0.00011399999999999999, "loss": 7.7223, "step": 20 }, { "epoch": 0.06341656757827982, "grad_norm": 1.8820377588272095, "learning_rate": 0.000234, "loss": 1.8545, "step": 40 }, { "epoch": 0.09512485136741974, "grad_norm": 1.3315837383270264, "learning_rate": 0.00029868292682926826, "loss": 1.2718, "step": 60 }, { "epoch": 0.12683313515655964, "grad_norm": 1.0603790283203125, "learning_rate": 0.00029575609756097557, "loss": 1.1449, "step": 80 }, { "epoch": 0.15854141894569956, "grad_norm": 1.1477017402648926, "learning_rate": 0.00029282926829268287, "loss": 1.1018, "step": 100 }, { "epoch": 0.1902497027348395, "grad_norm": 1.0604023933410645, "learning_rate": 0.0002899024390243902, "loss": 1.0883, "step": 120 }, { "epoch": 0.22195798652397938, "grad_norm": 0.8798519372940063, "learning_rate": 0.0002869756097560975, "loss": 1.0645, "step": 140 }, { "epoch": 0.2536662703131193, "grad_norm": 1.0906599760055542, "learning_rate": 0.0002840487804878048, "loss": 1.0418, "step": 160 }, { "epoch": 0.2853745541022592, "grad_norm": 0.9430219531059265, "learning_rate": 0.0002811219512195122, "loss": 1.0316, "step": 180 }, { "epoch": 0.3170828378913991, "grad_norm": 1.0706809759140015, "learning_rate": 0.0002781951219512195, "loss": 1.0286, "step": 200 }, { "epoch": 0.34879112168053905, "grad_norm": 0.8156995177268982, "learning_rate": 0.00027526829268292684, "loss": 1.0293, "step": 220 }, { "epoch": 0.380499405469679, "grad_norm": 0.8572260737419128, "learning_rate": 0.00027234146341463414, "loss": 1.0051, "step": 240 }, { "epoch": 0.41220768925881884, "grad_norm": 0.9287059307098389, "learning_rate": 0.00026941463414634144, "loss": 1.0152, "step": 260 }, { "epoch": 0.44391597304795877, "grad_norm": 0.8125821352005005, "learning_rate": 0.00026648780487804874, "loss": 1.0098, "step": 280 }, { "epoch": 0.4756242568370987, "grad_norm": 0.8053847551345825, "learning_rate": 0.0002635609756097561, "loss": 1.0029, "step": 300 }, { "epoch": 0.5073325406262386, "grad_norm": 0.8253501653671265, "learning_rate": 0.0002606341463414634, "loss": 0.9859, "step": 320 }, { "epoch": 0.5390408244153785, "grad_norm": 0.8003745675086975, "learning_rate": 0.0002577073170731707, "loss": 0.9773, "step": 340 }, { "epoch": 0.5707491082045184, "grad_norm": 0.7730509042739868, "learning_rate": 0.000254780487804878, "loss": 0.982, "step": 360 }, { "epoch": 0.6024573919936583, "grad_norm": 0.7878270149230957, "learning_rate": 0.00025185365853658536, "loss": 0.987, "step": 380 }, { "epoch": 0.6341656757827983, "grad_norm": 0.7862978577613831, "learning_rate": 0.00024892682926829266, "loss": 0.9802, "step": 400 }, { "epoch": 0.6658739595719382, "grad_norm": 2.1581475734710693, "learning_rate": 0.00024599999999999996, "loss": 0.9663, "step": 420 }, { "epoch": 0.6975822433610781, "grad_norm": 0.7952772974967957, "learning_rate": 0.00024307317073170732, "loss": 0.9788, "step": 440 }, { "epoch": 0.729290527150218, "grad_norm": 0.697483241558075, "learning_rate": 0.00024014634146341462, "loss": 0.9568, "step": 460 }, { "epoch": 0.760998810939358, "grad_norm": 0.6820840835571289, "learning_rate": 0.00023721951219512195, "loss": 0.9609, "step": 480 }, { "epoch": 0.7927070947284979, "grad_norm": 0.7179127931594849, "learning_rate": 0.00023429268292682925, "loss": 0.9726, "step": 500 }, { "epoch": 0.8244153785176377, "grad_norm": 0.6777030229568481, "learning_rate": 0.00023136585365853658, "loss": 0.9601, "step": 520 }, { "epoch": 0.8561236623067776, "grad_norm": 0.7026005387306213, "learning_rate": 0.00022843902439024388, "loss": 0.9522, "step": 540 }, { "epoch": 0.8878319460959175, "grad_norm": 0.9660900831222534, "learning_rate": 0.0002255121951219512, "loss": 0.9525, "step": 560 }, { "epoch": 0.9195402298850575, "grad_norm": 0.650274932384491, "learning_rate": 0.0002225853658536585, "loss": 0.9511, "step": 580 }, { "epoch": 0.9512485136741974, "grad_norm": 0.6481014490127563, "learning_rate": 0.00021965853658536584, "loss": 0.9527, "step": 600 }, { "epoch": 0.9829567974633373, "grad_norm": 0.6192963719367981, "learning_rate": 0.00021673170731707314, "loss": 0.9525, "step": 620 }, { "epoch": 1.014268727705113, "grad_norm": 0.6381222009658813, "learning_rate": 0.00021380487804878047, "loss": 0.9434, "step": 640 }, { "epoch": 1.0459770114942528, "grad_norm": 0.6611652970314026, "learning_rate": 0.00021087804878048777, "loss": 0.9355, "step": 660 }, { "epoch": 1.0776852952833929, "grad_norm": 0.6494982242584229, "learning_rate": 0.0002079512195121951, "loss": 0.9411, "step": 680 }, { "epoch": 1.1093935790725327, "grad_norm": 2.5105600357055664, "learning_rate": 0.00020502439024390243, "loss": 0.9471, "step": 700 }, { "epoch": 1.1411018628616727, "grad_norm": 0.6972084045410156, "learning_rate": 0.00020209756097560976, "loss": 0.9553, "step": 720 }, { "epoch": 1.1728101466508125, "grad_norm": 0.6206223368644714, "learning_rate": 0.00019917073170731706, "loss": 0.9335, "step": 740 }, { "epoch": 1.2045184304399523, "grad_norm": 0.7724215388298035, "learning_rate": 0.00019624390243902439, "loss": 0.9308, "step": 760 }, { "epoch": 1.2362267142290924, "grad_norm": 0.5925254821777344, "learning_rate": 0.0001933170731707317, "loss": 0.9325, "step": 780 }, { "epoch": 1.2679349980182324, "grad_norm": 0.522939920425415, "learning_rate": 0.00019039024390243902, "loss": 0.9316, "step": 800 }, { "epoch": 1.2996432818073722, "grad_norm": 0.5890282392501831, "learning_rate": 0.00018746341463414632, "loss": 0.9289, "step": 820 }, { "epoch": 1.331351565596512, "grad_norm": 0.7248061299324036, "learning_rate": 0.00018453658536585365, "loss": 0.9194, "step": 840 }, { "epoch": 1.363059849385652, "grad_norm": 0.6553404927253723, "learning_rate": 0.00018160975609756095, "loss": 0.9312, "step": 860 }, { "epoch": 1.3947681331747919, "grad_norm": 0.5486903786659241, "learning_rate": 0.00017868292682926828, "loss": 0.9217, "step": 880 }, { "epoch": 1.426476416963932, "grad_norm": 0.6329432725906372, "learning_rate": 0.00017575609756097558, "loss": 0.9181, "step": 900 }, { "epoch": 1.4581847007530717, "grad_norm": 0.5233189463615417, "learning_rate": 0.0001728292682926829, "loss": 0.942, "step": 920 }, { "epoch": 1.4898929845422115, "grad_norm": 0.6567553877830505, "learning_rate": 0.0001699024390243902, "loss": 0.9111, "step": 940 }, { "epoch": 1.5216012683313516, "grad_norm": 0.5668836236000061, "learning_rate": 0.00016697560975609756, "loss": 0.9253, "step": 960 }, { "epoch": 1.5533095521204916, "grad_norm": 0.5501447916030884, "learning_rate": 0.00016404878048780486, "loss": 0.9208, "step": 980 }, { "epoch": 1.5850178359096314, "grad_norm": 0.543779194355011, "learning_rate": 0.0001611219512195122, "loss": 0.9176, "step": 1000 }, { "epoch": 1.6167261196987712, "grad_norm": 0.6107056140899658, "learning_rate": 0.0001581951219512195, "loss": 0.9147, "step": 1020 }, { "epoch": 1.6484344034879113, "grad_norm": 0.4941338002681732, "learning_rate": 0.00015526829268292682, "loss": 0.9166, "step": 1040 }, { "epoch": 1.6801426872770513, "grad_norm": 0.5821026563644409, "learning_rate": 0.00015234146341463412, "loss": 0.9139, "step": 1060 }, { "epoch": 1.711850971066191, "grad_norm": 0.5568034052848816, "learning_rate": 0.00014941463414634145, "loss": 0.9177, "step": 1080 }, { "epoch": 1.743559254855331, "grad_norm": 0.5890582203865051, "learning_rate": 0.00014648780487804875, "loss": 0.9106, "step": 1100 }, { "epoch": 1.7752675386444707, "grad_norm": 0.6119087338447571, "learning_rate": 0.00014356097560975608, "loss": 0.9051, "step": 1120 }, { "epoch": 1.8069758224336108, "grad_norm": 0.562029242515564, "learning_rate": 0.00014063414634146338, "loss": 0.9212, "step": 1140 }, { "epoch": 1.8386841062227508, "grad_norm": 0.5576140284538269, "learning_rate": 0.0001377073170731707, "loss": 0.9078, "step": 1160 }, { "epoch": 1.8703923900118906, "grad_norm": 0.518469512462616, "learning_rate": 0.00013478048780487804, "loss": 0.9127, "step": 1180 }, { "epoch": 1.9021006738010304, "grad_norm": 0.4536910951137543, "learning_rate": 0.00013185365853658534, "loss": 0.9018, "step": 1200 }, { "epoch": 1.9338089575901705, "grad_norm": 0.5300338268280029, "learning_rate": 0.00012892682926829267, "loss": 0.9142, "step": 1220 }, { "epoch": 1.9655172413793105, "grad_norm": 0.5239934325218201, "learning_rate": 0.00012599999999999997, "loss": 0.9068, "step": 1240 }, { "epoch": 1.9972255251684503, "grad_norm": 0.4621521830558777, "learning_rate": 0.0001230731707317073, "loss": 0.9011, "step": 1260 }, { "epoch": 2.028537455410226, "grad_norm": 0.5628905296325684, "learning_rate": 0.00012014634146341463, "loss": 0.9064, "step": 1280 }, { "epoch": 2.060245739199366, "grad_norm": 0.5678831934928894, "learning_rate": 0.00011721951219512194, "loss": 0.9192, "step": 1300 }, { "epoch": 2.0919540229885056, "grad_norm": 0.5180283188819885, "learning_rate": 0.00011429268292682926, "loss": 0.8935, "step": 1320 }, { "epoch": 2.1236623067776454, "grad_norm": 0.5497546195983887, "learning_rate": 0.00011136585365853657, "loss": 0.8939, "step": 1340 }, { "epoch": 2.1553705905667857, "grad_norm": 0.5264196991920471, "learning_rate": 0.00010843902439024389, "loss": 0.8968, "step": 1360 }, { "epoch": 2.1870788743559255, "grad_norm": 0.48166030645370483, "learning_rate": 0.0001055121951219512, "loss": 0.8891, "step": 1380 }, { "epoch": 2.2187871581450653, "grad_norm": 0.5162549018859863, "learning_rate": 0.00010258536585365853, "loss": 0.9003, "step": 1400 }, { "epoch": 2.250495441934205, "grad_norm": 0.5740045309066772, "learning_rate": 9.965853658536585e-05, "loss": 0.8955, "step": 1420 }, { "epoch": 2.2822037257233454, "grad_norm": 0.507210910320282, "learning_rate": 9.673170731707316e-05, "loss": 0.8845, "step": 1440 }, { "epoch": 2.313912009512485, "grad_norm": 0.5239551663398743, "learning_rate": 9.380487804878048e-05, "loss": 0.8971, "step": 1460 }, { "epoch": 2.345620293301625, "grad_norm": 0.46981072425842285, "learning_rate": 9.08780487804878e-05, "loss": 0.8897, "step": 1480 }, { "epoch": 2.377328577090765, "grad_norm": 0.5130921602249146, "learning_rate": 8.795121951219511e-05, "loss": 0.8939, "step": 1500 }, { "epoch": 2.4090368608799047, "grad_norm": 0.5038473606109619, "learning_rate": 8.502439024390242e-05, "loss": 0.9096, "step": 1520 }, { "epoch": 2.440745144669045, "grad_norm": 0.4756928086280823, "learning_rate": 8.209756097560975e-05, "loss": 0.8815, "step": 1540 }, { "epoch": 2.4724534284581847, "grad_norm": 0.5105359554290771, "learning_rate": 7.917073170731707e-05, "loss": 0.8857, "step": 1560 }, { "epoch": 2.5041617122473245, "grad_norm": 0.5070236921310425, "learning_rate": 7.624390243902438e-05, "loss": 0.8935, "step": 1580 }, { "epoch": 2.535869996036465, "grad_norm": 0.5580913424491882, "learning_rate": 7.33170731707317e-05, "loss": 0.8894, "step": 1600 }, { "epoch": 2.5675782798256046, "grad_norm": 0.5412284731864929, "learning_rate": 7.039024390243901e-05, "loss": 0.8838, "step": 1620 }, { "epoch": 2.5992865636147444, "grad_norm": 0.5017954111099243, "learning_rate": 6.746341463414634e-05, "loss": 0.8764, "step": 1640 }, { "epoch": 2.6309948474038842, "grad_norm": 0.46863794326782227, "learning_rate": 6.453658536585366e-05, "loss": 0.8731, "step": 1660 }, { "epoch": 2.662703131193024, "grad_norm": 0.4468729496002197, "learning_rate": 6.160975609756097e-05, "loss": 0.8901, "step": 1680 }, { "epoch": 2.694411414982164, "grad_norm": 0.5184731483459473, "learning_rate": 5.868292682926829e-05, "loss": 0.8921, "step": 1700 }, { "epoch": 2.726119698771304, "grad_norm": 0.44308602809906006, "learning_rate": 5.575609756097561e-05, "loss": 0.8772, "step": 1720 }, { "epoch": 2.757827982560444, "grad_norm": 0.47546738386154175, "learning_rate": 5.2829268292682916e-05, "loss": 0.8861, "step": 1740 }, { "epoch": 2.7895362663495837, "grad_norm": 0.4518582224845886, "learning_rate": 4.9902439024390244e-05, "loss": 0.8794, "step": 1760 }, { "epoch": 2.821244550138724, "grad_norm": 0.49235859513282776, "learning_rate": 4.697560975609756e-05, "loss": 0.8846, "step": 1780 }, { "epoch": 2.852952833927864, "grad_norm": 0.45942848920822144, "learning_rate": 4.4048780487804874e-05, "loss": 0.8812, "step": 1800 }, { "epoch": 2.8846611177170036, "grad_norm": 0.4411655068397522, "learning_rate": 4.1121951219512196e-05, "loss": 0.8748, "step": 1820 }, { "epoch": 2.9163694015061434, "grad_norm": 0.46850547194480896, "learning_rate": 3.819512195121951e-05, "loss": 0.8783, "step": 1840 }, { "epoch": 2.9480776852952832, "grad_norm": 0.42767903208732605, "learning_rate": 3.5268292682926826e-05, "loss": 0.8776, "step": 1860 }, { "epoch": 2.979785969084423, "grad_norm": 0.47117599844932556, "learning_rate": 3.234146341463414e-05, "loss": 0.9012, "step": 1880 }, { "epoch": 3.0110978993261988, "grad_norm": 0.46736887097358704, "learning_rate": 2.9414634146341463e-05, "loss": 0.8801, "step": 1900 }, { "epoch": 3.042806183115339, "grad_norm": 0.4541435241699219, "learning_rate": 2.6487804878048778e-05, "loss": 0.8877, "step": 1920 }, { "epoch": 3.074514466904479, "grad_norm": 0.49423545598983765, "learning_rate": 2.3560975609756097e-05, "loss": 0.8761, "step": 1940 }, { "epoch": 3.1062227506936186, "grad_norm": 0.432778000831604, "learning_rate": 2.0634146341463415e-05, "loss": 0.8883, "step": 1960 }, { "epoch": 3.1379310344827585, "grad_norm": 0.46009406447410583, "learning_rate": 1.770731707317073e-05, "loss": 0.8798, "step": 1980 }, { "epoch": 3.1696393182718987, "grad_norm": 0.45351386070251465, "learning_rate": 1.4780487804878048e-05, "loss": 0.8706, "step": 2000 }, { "epoch": 3.2013476020610385, "grad_norm": 0.46693041920661926, "learning_rate": 1.1853658536585365e-05, "loss": 0.8805, "step": 2020 }, { "epoch": 3.2330558858501783, "grad_norm": 0.4250204563140869, "learning_rate": 8.926829268292682e-06, "loss": 0.8667, "step": 2040 }, { "epoch": 3.264764169639318, "grad_norm": 0.43274539709091187, "learning_rate": 5.999999999999999e-06, "loss": 0.8879, "step": 2060 }, { "epoch": 3.296472453428458, "grad_norm": 0.3914950489997864, "learning_rate": 3.073170731707317e-06, "loss": 0.8669, "step": 2080 }, { "epoch": 3.328180737217598, "grad_norm": 0.41752079129219055, "learning_rate": 1.4634146341463413e-07, "loss": 0.8665, "step": 2100 } ], "logging_steps": 20, "max_steps": 2100, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0992069136896492e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }