{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4096, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01220703125, "grad_norm": 2.0304784774780273, "learning_rate": 0.0002964111328125, "loss": 3.188, "step": 50 }, { "epoch": 0.0244140625, "grad_norm": 2.4612932205200195, "learning_rate": 0.00029274902343749996, "loss": 2.6364, "step": 100 }, { "epoch": 0.03662109375, "grad_norm": 1.7923061847686768, "learning_rate": 0.0002890869140625, "loss": 2.596, "step": 150 }, { "epoch": 0.048828125, "grad_norm": 1.646581768989563, "learning_rate": 0.00028542480468749995, "loss": 2.4141, "step": 200 }, { "epoch": 0.06103515625, "grad_norm": 1.8792452812194824, "learning_rate": 0.0002817626953125, "loss": 2.2905, "step": 250 }, { "epoch": 0.0732421875, "grad_norm": 2.2685205936431885, "learning_rate": 0.0002781005859375, "loss": 2.3898, "step": 300 }, { "epoch": 0.08544921875, "grad_norm": 1.5816388130187988, "learning_rate": 0.00027443847656249997, "loss": 2.311, "step": 350 }, { "epoch": 0.09765625, "grad_norm": 2.45890474319458, "learning_rate": 0.0002707763671875, "loss": 2.2857, "step": 400 }, { "epoch": 0.10986328125, "grad_norm": 2.430391788482666, "learning_rate": 0.00026711425781249996, "loss": 2.3965, "step": 450 }, { "epoch": 0.1220703125, "grad_norm": 1.7718228101730347, "learning_rate": 0.0002634521484375, "loss": 2.2186, "step": 500 }, { "epoch": 0.13427734375, "grad_norm": 1.6528531312942505, "learning_rate": 0.00025979003906249996, "loss": 2.2731, "step": 550 }, { "epoch": 0.146484375, "grad_norm": 2.2321362495422363, "learning_rate": 0.0002561279296875, "loss": 2.1835, "step": 600 }, { "epoch": 0.15869140625, "grad_norm": 1.4447129964828491, "learning_rate": 0.0002524658203125, "loss": 2.1551, "step": 650 }, { "epoch": 0.1708984375, "grad_norm": 2.109698534011841, "learning_rate": 0.0002488037109375, "loss": 2.1099, "step": 700 }, { "epoch": 0.18310546875, "grad_norm": 2.069277763366699, "learning_rate": 0.0002451416015625, "loss": 2.2276, "step": 750 }, { "epoch": 0.1953125, "grad_norm": 1.364721655845642, "learning_rate": 0.00024147949218749997, "loss": 2.1446, "step": 800 }, { "epoch": 0.20751953125, "grad_norm": 1.3938268423080444, "learning_rate": 0.0002378173828125, "loss": 2.0738, "step": 850 }, { "epoch": 0.2197265625, "grad_norm": 1.6320847272872925, "learning_rate": 0.0002341552734375, "loss": 2.1222, "step": 900 }, { "epoch": 0.23193359375, "grad_norm": 1.6347249746322632, "learning_rate": 0.00023049316406249999, "loss": 2.0938, "step": 950 }, { "epoch": 0.244140625, "grad_norm": 1.4087275266647339, "learning_rate": 0.00022683105468749998, "loss": 2.0383, "step": 1000 }, { "epoch": 0.25634765625, "grad_norm": 2.272686243057251, "learning_rate": 0.00022316894531249998, "loss": 2.2069, "step": 1050 }, { "epoch": 0.2685546875, "grad_norm": 1.3984218835830688, "learning_rate": 0.00021950683593749998, "loss": 2.0441, "step": 1100 }, { "epoch": 0.28076171875, "grad_norm": 1.2001489400863647, "learning_rate": 0.00021584472656249997, "loss": 2.1488, "step": 1150 }, { "epoch": 0.29296875, "grad_norm": 2.4557433128356934, "learning_rate": 0.00021218261718749997, "loss": 2.2129, "step": 1200 }, { "epoch": 0.30517578125, "grad_norm": 1.6851530075073242, "learning_rate": 0.0002085205078125, "loss": 2.113, "step": 1250 }, { "epoch": 0.3173828125, "grad_norm": 2.06068754196167, "learning_rate": 0.0002048583984375, "loss": 2.0173, "step": 1300 }, { "epoch": 0.32958984375, "grad_norm": 1.4833667278289795, "learning_rate": 0.00020119628906249999, "loss": 2.0549, "step": 1350 }, { "epoch": 0.341796875, "grad_norm": 1.8183746337890625, "learning_rate": 0.00019753417968749998, "loss": 2.144, "step": 1400 }, { "epoch": 0.35400390625, "grad_norm": 1.4679142236709595, "learning_rate": 0.00019387207031249998, "loss": 1.9735, "step": 1450 }, { "epoch": 0.3662109375, "grad_norm": 1.4751473665237427, "learning_rate": 0.00019020996093749998, "loss": 2.0885, "step": 1500 }, { "epoch": 0.37841796875, "grad_norm": 1.6953202486038208, "learning_rate": 0.00018654785156249997, "loss": 2.0334, "step": 1550 }, { "epoch": 0.390625, "grad_norm": 1.902228593826294, "learning_rate": 0.0001828857421875, "loss": 2.0944, "step": 1600 }, { "epoch": 0.40283203125, "grad_norm": 2.122025966644287, "learning_rate": 0.0001792236328125, "loss": 2.0127, "step": 1650 }, { "epoch": 0.4150390625, "grad_norm": 1.9450644254684448, "learning_rate": 0.0001755615234375, "loss": 2.0664, "step": 1700 }, { "epoch": 0.42724609375, "grad_norm": 1.5735949277877808, "learning_rate": 0.0001718994140625, "loss": 2.0909, "step": 1750 }, { "epoch": 0.439453125, "grad_norm": 1.5673192739486694, "learning_rate": 0.00016823730468749998, "loss": 1.9894, "step": 1800 }, { "epoch": 0.45166015625, "grad_norm": 1.8855547904968262, "learning_rate": 0.00016457519531249998, "loss": 2.0431, "step": 1850 }, { "epoch": 0.4638671875, "grad_norm": 1.7565497159957886, "learning_rate": 0.00016091308593749998, "loss": 2.004, "step": 1900 }, { "epoch": 0.47607421875, "grad_norm": 2.02719783782959, "learning_rate": 0.00015725097656249997, "loss": 2.031, "step": 1950 }, { "epoch": 0.48828125, "grad_norm": 1.4837744235992432, "learning_rate": 0.0001535888671875, "loss": 1.9936, "step": 2000 }, { "epoch": 0.50048828125, "grad_norm": 1.8650251626968384, "learning_rate": 0.0001499267578125, "loss": 1.8625, "step": 2050 }, { "epoch": 0.5126953125, "grad_norm": 1.4077361822128296, "learning_rate": 0.0001462646484375, "loss": 2.1258, "step": 2100 }, { "epoch": 0.52490234375, "grad_norm": 1.7397881746292114, "learning_rate": 0.0001426025390625, "loss": 1.9648, "step": 2150 }, { "epoch": 0.537109375, "grad_norm": 1.5107667446136475, "learning_rate": 0.00013894042968749999, "loss": 2.0116, "step": 2200 }, { "epoch": 0.54931640625, "grad_norm": 1.8365352153778076, "learning_rate": 0.00013527832031249998, "loss": 1.9269, "step": 2250 }, { "epoch": 0.5615234375, "grad_norm": 2.032548666000366, "learning_rate": 0.00013161621093749998, "loss": 1.9682, "step": 2300 }, { "epoch": 0.57373046875, "grad_norm": 1.6477632522583008, "learning_rate": 0.00012795410156249998, "loss": 1.9131, "step": 2350 }, { "epoch": 0.5859375, "grad_norm": 1.5629210472106934, "learning_rate": 0.0001242919921875, "loss": 1.9753, "step": 2400 }, { "epoch": 0.59814453125, "grad_norm": 1.5665594339370728, "learning_rate": 0.0001206298828125, "loss": 1.9507, "step": 2450 }, { "epoch": 0.6103515625, "grad_norm": 1.4493316411972046, "learning_rate": 0.00011696777343749999, "loss": 1.9458, "step": 2500 }, { "epoch": 0.62255859375, "grad_norm": 1.966045618057251, "learning_rate": 0.00011330566406249999, "loss": 2.0389, "step": 2550 }, { "epoch": 0.634765625, "grad_norm": 1.5992895364761353, "learning_rate": 0.00010964355468749999, "loss": 1.9464, "step": 2600 }, { "epoch": 0.64697265625, "grad_norm": 1.627054214477539, "learning_rate": 0.0001059814453125, "loss": 1.9309, "step": 2650 }, { "epoch": 0.6591796875, "grad_norm": 1.63454008102417, "learning_rate": 0.0001023193359375, "loss": 1.9872, "step": 2700 }, { "epoch": 0.67138671875, "grad_norm": 2.3954381942749023, "learning_rate": 9.865722656249999e-05, "loss": 1.9504, "step": 2750 }, { "epoch": 0.68359375, "grad_norm": 1.3977527618408203, "learning_rate": 9.499511718749999e-05, "loss": 1.9377, "step": 2800 }, { "epoch": 0.69580078125, "grad_norm": 1.9443901777267456, "learning_rate": 9.13330078125e-05, "loss": 1.9843, "step": 2850 }, { "epoch": 0.7080078125, "grad_norm": 2.0238633155822754, "learning_rate": 8.76708984375e-05, "loss": 1.8598, "step": 2900 }, { "epoch": 0.72021484375, "grad_norm": 1.5116349458694458, "learning_rate": 8.400878906249999e-05, "loss": 2.0174, "step": 2950 }, { "epoch": 0.732421875, "grad_norm": 1.8400050401687622, "learning_rate": 8.034667968749999e-05, "loss": 1.9492, "step": 3000 }, { "epoch": 0.74462890625, "grad_norm": 1.6228867769241333, "learning_rate": 7.66845703125e-05, "loss": 2.0785, "step": 3050 }, { "epoch": 0.7568359375, "grad_norm": 1.8740808963775635, "learning_rate": 7.30224609375e-05, "loss": 1.8879, "step": 3100 }, { "epoch": 0.76904296875, "grad_norm": 1.744222640991211, "learning_rate": 6.936035156249999e-05, "loss": 1.9744, "step": 3150 }, { "epoch": 0.78125, "grad_norm": 1.3467113971710205, "learning_rate": 6.569824218749999e-05, "loss": 2.0236, "step": 3200 }, { "epoch": 0.79345703125, "grad_norm": 1.9058157205581665, "learning_rate": 6.20361328125e-05, "loss": 1.9184, "step": 3250 }, { "epoch": 0.8056640625, "grad_norm": 2.2363059520721436, "learning_rate": 5.8374023437499995e-05, "loss": 1.9164, "step": 3300 }, { "epoch": 0.81787109375, "grad_norm": 1.8223354816436768, "learning_rate": 5.47119140625e-05, "loss": 1.918, "step": 3350 }, { "epoch": 0.830078125, "grad_norm": 1.7750871181488037, "learning_rate": 5.1049804687499995e-05, "loss": 1.9546, "step": 3400 }, { "epoch": 0.84228515625, "grad_norm": 1.5826454162597656, "learning_rate": 4.73876953125e-05, "loss": 1.8607, "step": 3450 }, { "epoch": 0.8544921875, "grad_norm": 2.49688458442688, "learning_rate": 4.3725585937499995e-05, "loss": 2.034, "step": 3500 }, { "epoch": 0.86669921875, "grad_norm": 1.9298906326293945, "learning_rate": 4.00634765625e-05, "loss": 1.8788, "step": 3550 }, { "epoch": 0.87890625, "grad_norm": 1.8771145343780518, "learning_rate": 3.6401367187499996e-05, "loss": 1.9758, "step": 3600 }, { "epoch": 0.89111328125, "grad_norm": 1.7721848487854004, "learning_rate": 3.27392578125e-05, "loss": 1.9588, "step": 3650 }, { "epoch": 0.9033203125, "grad_norm": 1.921703577041626, "learning_rate": 2.9077148437499996e-05, "loss": 2.0364, "step": 3700 }, { "epoch": 0.91552734375, "grad_norm": 1.6842303276062012, "learning_rate": 2.5415039062499996e-05, "loss": 1.8983, "step": 3750 }, { "epoch": 0.927734375, "grad_norm": 1.5029670000076294, "learning_rate": 2.17529296875e-05, "loss": 1.9142, "step": 3800 }, { "epoch": 0.93994140625, "grad_norm": 1.4353697299957275, "learning_rate": 1.80908203125e-05, "loss": 1.8683, "step": 3850 }, { "epoch": 0.9521484375, "grad_norm": 1.6614630222320557, "learning_rate": 1.4428710937499998e-05, "loss": 1.9563, "step": 3900 }, { "epoch": 0.96435546875, "grad_norm": 1.5036600828170776, "learning_rate": 1.0766601562499998e-05, "loss": 2.0508, "step": 3950 }, { "epoch": 0.9765625, "grad_norm": 2.6189305782318115, "learning_rate": 7.104492187499999e-06, "loss": 1.8859, "step": 4000 }, { "epoch": 0.98876953125, "grad_norm": 1.81717050075531, "learning_rate": 3.4423828124999995e-06, "loss": 1.8413, "step": 4050 } ], "logging_steps": 50, "max_steps": 4096, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.724093307657216e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }