{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037638202775817454, "grad_norm": 13.610795098502107, "learning_rate": 8.333333333333333e-07, "loss": 0.5914, "step": 10 }, { "epoch": 0.07527640555163491, "grad_norm": 0.9699645754094162, "learning_rate": 1.7592592592592594e-06, "loss": 0.5843, "step": 20 }, { "epoch": 0.11291460832745237, "grad_norm": 0.5845916009861534, "learning_rate": 2.6851851851851856e-06, "loss": 0.5781, "step": 30 }, { "epoch": 0.15055281110326982, "grad_norm": 0.5917279916406908, "learning_rate": 3.6111111111111115e-06, "loss": 0.5653, "step": 40 }, { "epoch": 0.18819101387908727, "grad_norm": 0.5696097079287838, "learning_rate": 4.537037037037038e-06, "loss": 0.5927, "step": 50 }, { "epoch": 0.22582921665490474, "grad_norm": 0.5761389794818529, "learning_rate": 4.998650245168965e-06, "loss": 0.5859, "step": 60 }, { "epoch": 0.26346741943072216, "grad_norm": 0.576477237978447, "learning_rate": 4.987860949769804e-06, "loss": 0.582, "step": 70 }, { "epoch": 0.30110562220653964, "grad_norm": 0.556511371295206, "learning_rate": 4.9663289476829e-06, "loss": 0.5712, "step": 80 }, { "epoch": 0.3387438249823571, "grad_norm": 0.47938070366971103, "learning_rate": 4.934147215158732e-06, "loss": 0.5614, "step": 90 }, { "epoch": 0.37638202775817453, "grad_norm": 0.41700685090674505, "learning_rate": 4.891454714510784e-06, "loss": 0.6062, "step": 100 }, { "epoch": 0.414020230533992, "grad_norm": 0.4084825379730698, "learning_rate": 4.838435794069406e-06, "loss": 0.5703, "step": 110 }, { "epoch": 0.4516584333098095, "grad_norm": 0.4167412677659701, "learning_rate": 4.775319392156593e-06, "loss": 0.5933, "step": 120 }, { "epoch": 0.4892966360856269, "grad_norm": 0.4066934770436394, "learning_rate": 4.70237804851899e-06, "loss": 0.5457, "step": 130 }, { "epoch": 0.5269348388614443, "grad_norm": 0.39475972226137096, "learning_rate": 4.619926727487774e-06, "loss": 0.5797, "step": 140 }, { "epoch": 0.5645730416372619, "grad_norm": 0.42220410240509404, "learning_rate": 4.528321457947091e-06, "loss": 0.5977, "step": 150 }, { "epoch": 0.6022112444130793, "grad_norm": 0.3914884231722468, "learning_rate": 4.427957795983715e-06, "loss": 0.5789, "step": 160 }, { "epoch": 0.6398494471888967, "grad_norm": 0.40266440109112717, "learning_rate": 4.319269116856291e-06, "loss": 0.5716, "step": 170 }, { "epoch": 0.6774876499647142, "grad_norm": 0.41129740802214726, "learning_rate": 4.2027247436595245e-06, "loss": 0.5597, "step": 180 }, { "epoch": 0.7151258527405316, "grad_norm": 0.4686973204175566, "learning_rate": 4.078827920763835e-06, "loss": 0.5803, "step": 190 }, { "epoch": 0.7527640555163491, "grad_norm": 0.38540729078595337, "learning_rate": 3.948113640781265e-06, "loss": 0.5723, "step": 200 }, { "epoch": 0.7904022582921666, "grad_norm": 0.40445228822396667, "learning_rate": 3.8111463344409026e-06, "loss": 0.561, "step": 210 }, { "epoch": 0.828040461067984, "grad_norm": 0.4653487942714477, "learning_rate": 3.668517433349069e-06, "loss": 0.5807, "step": 220 }, { "epoch": 0.8656786638438014, "grad_norm": 0.4241292760945627, "learning_rate": 3.520842816158374e-06, "loss": 0.5889, "step": 230 }, { "epoch": 0.903316866619619, "grad_norm": 0.4124860362346966, "learning_rate": 3.368760149173219e-06, "loss": 0.5976, "step": 240 }, { "epoch": 0.9409550693954364, "grad_norm": 0.40795355323908694, "learning_rate": 3.212926132875141e-06, "loss": 0.5658, "step": 250 }, { "epoch": 0.9785932721712538, "grad_norm": 0.41045446321734286, "learning_rate": 3.054013666257638e-06, "loss": 0.5814, "step": 260 }, { "epoch": 1.015055281110327, "grad_norm": 0.4409995582342298, "learning_rate": 2.8927089412150176e-06, "loss": 0.5907, "step": 270 }, { "epoch": 1.0526934838861444, "grad_norm": 0.4043201145148639, "learning_rate": 2.729708479531844e-06, "loss": 0.5346, "step": 280 }, { "epoch": 1.090331686661962, "grad_norm": 0.3985913687368217, "learning_rate": 2.5657161252674047e-06, "loss": 0.535, "step": 290 }, { "epoch": 1.1279698894377794, "grad_norm": 0.3946559086316167, "learning_rate": 2.4014400055222337e-06, "loss": 0.5177, "step": 300 }, { "epoch": 1.1656080922135967, "grad_norm": 0.42945485549653484, "learning_rate": 2.2375894727102552e-06, "loss": 0.553, "step": 310 }, { "epoch": 1.2032462949894143, "grad_norm": 0.39437996434435546, "learning_rate": 2.0748720415399542e-06, "loss": 0.5312, "step": 320 }, { "epoch": 1.2408844977652318, "grad_norm": 0.3716214397572245, "learning_rate": 1.913990333930858e-06, "loss": 0.5286, "step": 330 }, { "epoch": 1.278522700541049, "grad_norm": 0.40068573317589684, "learning_rate": 1.7556390450573213e-06, "loss": 0.5362, "step": 340 }, { "epoch": 1.3161609033168666, "grad_norm": 0.3616596747761561, "learning_rate": 1.600501943620384e-06, "loss": 0.5286, "step": 350 }, { "epoch": 1.353799106092684, "grad_norm": 0.38779033566892823, "learning_rate": 1.4492489193006884e-06, "loss": 0.5263, "step": 360 }, { "epoch": 1.3914373088685015, "grad_norm": 0.3757853739580612, "learning_rate": 1.302533090141689e-06, "loss": 0.516, "step": 370 }, { "epoch": 1.429075511644319, "grad_norm": 0.38727201501707054, "learning_rate": 1.1609879823536233e-06, "loss": 0.5416, "step": 380 }, { "epoch": 1.4667137144201363, "grad_norm": 0.41256807796835565, "learning_rate": 1.0252247947159846e-06, "loss": 0.5489, "step": 390 }, { "epoch": 1.5043519171959538, "grad_norm": 0.3710868452573983, "learning_rate": 8.95829759390954e-07, "loss": 0.545, "step": 400 }, { "epoch": 1.5419901199717714, "grad_norm": 0.3694905808253818, "learning_rate": 7.733616105439077e-07, "loss": 0.5208, "step": 410 }, { "epoch": 1.5796283227475887, "grad_norm": 0.3615477041656024, "learning_rate": 6.58349171701651e-07, "loss": 0.5237, "step": 420 }, { "epoch": 1.6172665255234062, "grad_norm": 0.3804185781850594, "learning_rate": 5.51289072266255e-07, "loss": 0.5608, "step": 430 }, { "epoch": 1.6549047282992237, "grad_norm": 0.40468195026925746, "learning_rate": 4.5264360304473065e-07, "loss": 0.5576, "step": 440 }, { "epoch": 1.692542931075041, "grad_norm": 0.367137564529641, "learning_rate": 3.6283872005444087e-07, "loss": 0.5312, "step": 450 }, { "epoch": 1.7301811338508586, "grad_norm": 0.3963215645004468, "learning_rate": 2.8226220522394735e-07, "loss": 0.5362, "step": 460 }, { "epoch": 1.7678193366266761, "grad_norm": 0.38136903383145776, "learning_rate": 2.1126199193144904e-07, "loss": 0.558, "step": 470 }, { "epoch": 1.8054575394024934, "grad_norm": 0.38938465164870284, "learning_rate": 1.5014466261124128e-07, "loss": 0.5114, "step": 480 }, { "epoch": 1.843095742178311, "grad_norm": 0.37548505603471355, "learning_rate": 9.917412491559337e-08, "loss": 0.5402, "step": 490 }, { "epoch": 1.8807339449541285, "grad_norm": 0.3575809194576506, "learning_rate": 5.8570472148445633e-08, "loss": 0.5349, "step": 500 }, { "epoch": 1.9183721477299458, "grad_norm": 0.3899130339561275, "learning_rate": 2.8509032891635146e-08, "loss": 0.5419, "step": 510 }, { "epoch": 1.9560103505057633, "grad_norm": 0.4032737933879565, "learning_rate": 9.119613927399684e-09, "loss": 0.5312, "step": 520 }, { "epoch": 1.9936485532815809, "grad_norm": 0.3608459597802774, "learning_rate": 4.859397262726995e-10, "loss": 0.5583, "step": 530 }, { "epoch": 2.0, "step": 532, "total_flos": 8.119097669378376e+17, "train_loss": 0.558736775154458, "train_runtime": 23106.9266, "train_samples_per_second": 11.774, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 532, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.119097669378376e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }