{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018819101387908727, "grad_norm": 119.85382415184827, "learning_rate": 8.333333333333333e-07, "loss": 3.7563, "step": 10 }, { "epoch": 0.037638202775817454, "grad_norm": 33.81891320175042, "learning_rate": 1.7592592592592594e-06, "loss": 3.0, "step": 20 }, { "epoch": 0.056457304163726185, "grad_norm": 11.708086861495808, "learning_rate": 2.6851851851851856e-06, "loss": 1.9966, "step": 30 }, { "epoch": 0.07527640555163491, "grad_norm": 11.971367057507148, "learning_rate": 3.6111111111111115e-06, "loss": 1.6552, "step": 40 }, { "epoch": 0.09409550693954363, "grad_norm": 23.209165550861357, "learning_rate": 4.537037037037038e-06, "loss": 1.4811, "step": 50 }, { "epoch": 0.11291460832745237, "grad_norm": 9.899478278575582, "learning_rate": 4.998650245168965e-06, "loss": 1.3785, "step": 60 }, { "epoch": 0.13173370971536108, "grad_norm": 8.96121751266139, "learning_rate": 4.987860949769804e-06, "loss": 1.2907, "step": 70 }, { "epoch": 0.15055281110326982, "grad_norm": 7.298252173607856, "learning_rate": 4.9663289476829e-06, "loss": 1.2165, "step": 80 }, { "epoch": 0.16937191249117856, "grad_norm": 4.002757463166737, "learning_rate": 4.934147215158732e-06, "loss": 1.1604, "step": 90 }, { "epoch": 0.18819101387908727, "grad_norm": 3.6552314011131473, "learning_rate": 4.891454714510784e-06, "loss": 1.1411, "step": 100 }, { "epoch": 0.207010115266996, "grad_norm": 4.688923092060107, "learning_rate": 4.838435794069406e-06, "loss": 1.1315, "step": 110 }, { "epoch": 0.22582921665490474, "grad_norm": 4.575073558083306, "learning_rate": 4.775319392156593e-06, "loss": 1.1167, "step": 120 }, { "epoch": 0.24464831804281345, "grad_norm": 3.851078841596319, "learning_rate": 4.70237804851899e-06, "loss": 1.1103, "step": 130 }, { "epoch": 0.26346741943072216, "grad_norm": 3.2193694749710215, "learning_rate": 4.619926727487774e-06, "loss": 1.1076, "step": 140 }, { "epoch": 0.2822865208186309, "grad_norm": 2.8438246971893637, "learning_rate": 4.528321457947091e-06, "loss": 1.0984, "step": 150 }, { "epoch": 0.30110562220653964, "grad_norm": 3.0947911049743686, "learning_rate": 4.427957795983715e-06, "loss": 1.0917, "step": 160 }, { "epoch": 0.31992472359444835, "grad_norm": 4.227096537902385, "learning_rate": 4.319269116856291e-06, "loss": 1.0883, "step": 170 }, { "epoch": 0.3387438249823571, "grad_norm": 3.3099714841544965, "learning_rate": 4.2027247436595245e-06, "loss": 1.0899, "step": 180 }, { "epoch": 0.3575629263702658, "grad_norm": 3.074228439753722, "learning_rate": 4.078827920763835e-06, "loss": 1.0809, "step": 190 }, { "epoch": 0.37638202775817453, "grad_norm": 3.9331955228691893, "learning_rate": 3.948113640781265e-06, "loss": 1.0803, "step": 200 }, { "epoch": 0.3952011291460833, "grad_norm": 3.1687316695184595, "learning_rate": 3.8111463344409026e-06, "loss": 1.0752, "step": 210 }, { "epoch": 0.414020230533992, "grad_norm": 3.1086870103207263, "learning_rate": 3.668517433349069e-06, "loss": 1.0801, "step": 220 }, { "epoch": 0.4328393319219007, "grad_norm": 3.569818521101058, "learning_rate": 3.520842816158374e-06, "loss": 1.0737, "step": 230 }, { "epoch": 0.4516584333098095, "grad_norm": 2.821249949263903, "learning_rate": 3.368760149173219e-06, "loss": 1.0578, "step": 240 }, { "epoch": 0.4704775346977182, "grad_norm": 3.3886861570744524, "learning_rate": 3.212926132875141e-06, "loss": 1.07, "step": 250 }, { "epoch": 0.4892966360856269, "grad_norm": 3.1106639034791805, "learning_rate": 3.054013666257638e-06, "loss": 1.0513, "step": 260 }, { "epoch": 0.5081157374735357, "grad_norm": 3.2421734605805144, "learning_rate": 2.8927089412150176e-06, "loss": 1.059, "step": 270 }, { "epoch": 0.5269348388614443, "grad_norm": 2.8138590739611065, "learning_rate": 2.729708479531844e-06, "loss": 1.059, "step": 280 }, { "epoch": 0.5457539402493531, "grad_norm": 3.085937678600229, "learning_rate": 2.5657161252674047e-06, "loss": 1.0506, "step": 290 }, { "epoch": 0.5645730416372619, "grad_norm": 2.5910896828670555, "learning_rate": 2.4014400055222337e-06, "loss": 1.0458, "step": 300 }, { "epoch": 0.5833921430251705, "grad_norm": 2.711615583893317, "learning_rate": 2.2375894727102552e-06, "loss": 1.0489, "step": 310 }, { "epoch": 0.6022112444130793, "grad_norm": 2.8163698166832067, "learning_rate": 2.0748720415399542e-06, "loss": 1.0492, "step": 320 }, { "epoch": 0.621030345800988, "grad_norm": 2.396874440884708, "learning_rate": 1.913990333930858e-06, "loss": 1.0448, "step": 330 }, { "epoch": 0.6398494471888967, "grad_norm": 2.537957050013955, "learning_rate": 1.7556390450573213e-06, "loss": 1.0396, "step": 340 }, { "epoch": 0.6586685485768055, "grad_norm": 2.6769709470166068, "learning_rate": 1.600501943620384e-06, "loss": 1.0483, "step": 350 }, { "epoch": 0.6774876499647142, "grad_norm": 2.6349913820783066, "learning_rate": 1.4492489193006884e-06, "loss": 1.0441, "step": 360 }, { "epoch": 0.6963067513526229, "grad_norm": 2.496611917295113, "learning_rate": 1.302533090141689e-06, "loss": 1.0413, "step": 370 }, { "epoch": 0.7151258527405316, "grad_norm": 2.4335057884143807, "learning_rate": 1.1609879823536233e-06, "loss": 1.0397, "step": 380 }, { "epoch": 0.7339449541284404, "grad_norm": 2.2916566931708533, "learning_rate": 1.0252247947159846e-06, "loss": 1.0288, "step": 390 }, { "epoch": 0.7527640555163491, "grad_norm": 2.597561005712493, "learning_rate": 8.95829759390954e-07, "loss": 1.0419, "step": 400 }, { "epoch": 0.7715831569042578, "grad_norm": 2.40328918066519, "learning_rate": 7.733616105439077e-07, "loss": 1.0387, "step": 410 }, { "epoch": 0.7904022582921666, "grad_norm": 2.3994734100827197, "learning_rate": 6.58349171701651e-07, "loss": 1.036, "step": 420 }, { "epoch": 0.8092213596800752, "grad_norm": 2.2299325222354267, "learning_rate": 5.51289072266255e-07, "loss": 1.0345, "step": 430 }, { "epoch": 0.828040461067984, "grad_norm": 1.9175406133141235, "learning_rate": 4.5264360304473065e-07, "loss": 1.033, "step": 440 }, { "epoch": 0.8468595624558928, "grad_norm": 2.147736712406261, "learning_rate": 3.6283872005444087e-07, "loss": 1.0334, "step": 450 }, { "epoch": 0.8656786638438014, "grad_norm": 1.9503396710553103, "learning_rate": 2.8226220522394735e-07, "loss": 1.0358, "step": 460 }, { "epoch": 0.8844977652317102, "grad_norm": 2.205405844751076, "learning_rate": 2.1126199193144904e-07, "loss": 1.0429, "step": 470 }, { "epoch": 0.903316866619619, "grad_norm": 1.9900450300171217, "learning_rate": 1.5014466261124128e-07, "loss": 1.0368, "step": 480 }, { "epoch": 0.9221359680075276, "grad_norm": 2.0904507985722534, "learning_rate": 9.917412491559337e-08, "loss": 1.0237, "step": 490 }, { "epoch": 0.9409550693954364, "grad_norm": 2.186757730889461, "learning_rate": 5.8570472148445633e-08, "loss": 1.0242, "step": 500 }, { "epoch": 0.9597741707833451, "grad_norm": 2.2178555824642445, "learning_rate": 2.8509032891635146e-08, "loss": 1.0294, "step": 510 }, { "epoch": 0.9785932721712538, "grad_norm": 1.9581112450691653, "learning_rate": 9.119613927399684e-09, "loss": 1.0281, "step": 520 }, { "epoch": 0.9974123735591626, "grad_norm": 1.908843408243316, "learning_rate": 4.859397262726995e-10, "loss": 1.0255, "step": 530 }, { "epoch": 1.0, "step": 532, "total_flos": 8.800004805943624e+17, "train_loss": 1.198594591671363, "train_runtime": 17174.361, "train_samples_per_second": 15.841, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 532, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.800004805943624e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }