{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9958081043316254, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037261294829995344, "grad_norm": 3.177923185690651, "learning_rate": 5e-06, "loss": 1.0446, "step": 10 }, { "epoch": 0.07452258965999069, "grad_norm": 1.5655148840692104, "learning_rate": 5e-06, "loss": 0.9101, "step": 20 }, { "epoch": 0.11178388448998602, "grad_norm": 1.2741458134596388, "learning_rate": 5e-06, "loss": 0.8739, "step": 30 }, { "epoch": 0.14904517931998137, "grad_norm": 1.1521931357563782, "learning_rate": 5e-06, "loss": 0.8458, "step": 40 }, { "epoch": 0.18630647414997673, "grad_norm": 1.1371625192876917, "learning_rate": 5e-06, "loss": 0.8221, "step": 50 }, { "epoch": 0.22356776897997205, "grad_norm": 0.9080984413465669, "learning_rate": 5e-06, "loss": 0.8159, "step": 60 }, { "epoch": 0.2608290638099674, "grad_norm": 0.9330400679788563, "learning_rate": 5e-06, "loss": 0.8017, "step": 70 }, { "epoch": 0.29809035863996275, "grad_norm": 0.7711080271829996, "learning_rate": 5e-06, "loss": 0.7893, "step": 80 }, { "epoch": 0.3353516534699581, "grad_norm": 0.7516775112714674, "learning_rate": 5e-06, "loss": 0.792, "step": 90 }, { "epoch": 0.37261294829995345, "grad_norm": 0.6720880112699107, "learning_rate": 5e-06, "loss": 0.7807, "step": 100 }, { "epoch": 0.40987424312994875, "grad_norm": 1.0015408115889342, "learning_rate": 5e-06, "loss": 0.7739, "step": 110 }, { "epoch": 0.4471355379599441, "grad_norm": 0.7967270535006579, "learning_rate": 5e-06, "loss": 0.7753, "step": 120 }, { "epoch": 0.48439683278993945, "grad_norm": 0.7992952214381639, "learning_rate": 5e-06, "loss": 0.7685, "step": 130 }, { "epoch": 0.5216581276199348, "grad_norm": 0.822692265972509, "learning_rate": 5e-06, "loss": 0.7664, "step": 140 }, { "epoch": 0.5589194224499301, "grad_norm": 0.6248743169023847, "learning_rate": 5e-06, "loss": 0.7619, "step": 150 }, { "epoch": 0.5961807172799255, "grad_norm": 0.7681721127721153, "learning_rate": 5e-06, "loss": 0.7601, "step": 160 }, { "epoch": 0.6334420121099208, "grad_norm": 0.6670980060596134, "learning_rate": 5e-06, "loss": 0.7594, "step": 170 }, { "epoch": 0.6707033069399162, "grad_norm": 1.6054759679741344, "learning_rate": 5e-06, "loss": 0.7627, "step": 180 }, { "epoch": 0.7079646017699115, "grad_norm": 1.0157126085127113, "learning_rate": 5e-06, "loss": 0.7612, "step": 190 }, { "epoch": 0.7452258965999069, "grad_norm": 1.0987509598845462, "learning_rate": 5e-06, "loss": 0.7566, "step": 200 }, { "epoch": 0.7824871914299022, "grad_norm": 0.8874349124844947, "learning_rate": 5e-06, "loss": 0.7578, "step": 210 }, { "epoch": 0.8197484862598975, "grad_norm": 0.8625833938546213, "learning_rate": 5e-06, "loss": 0.7472, "step": 220 }, { "epoch": 0.8570097810898929, "grad_norm": 0.6536315928798102, "learning_rate": 5e-06, "loss": 0.7499, "step": 230 }, { "epoch": 0.8942710759198882, "grad_norm": 0.5976487344814351, "learning_rate": 5e-06, "loss": 0.7493, "step": 240 }, { "epoch": 0.9315323707498836, "grad_norm": 0.6377854255157969, "learning_rate": 5e-06, "loss": 0.7449, "step": 250 }, { "epoch": 0.9687936655798789, "grad_norm": 0.6255963141536746, "learning_rate": 5e-06, "loss": 0.7463, "step": 260 }, { "epoch": 0.9986027014438752, "eval_loss": 0.753118097782135, "eval_runtime": 282.6878, "eval_samples_per_second": 25.572, "eval_steps_per_second": 0.4, "step": 268 }, { "epoch": 1.0060549604098743, "grad_norm": 0.7967440184692027, "learning_rate": 5e-06, "loss": 0.7858, "step": 270 }, { "epoch": 1.0433162552398696, "grad_norm": 0.6961321881435635, "learning_rate": 5e-06, "loss": 0.6975, "step": 280 }, { "epoch": 1.080577550069865, "grad_norm": 0.762772210563081, "learning_rate": 5e-06, "loss": 0.6948, "step": 290 }, { "epoch": 1.1178388448998602, "grad_norm": 0.7180309443204612, "learning_rate": 5e-06, "loss": 0.6976, "step": 300 }, { "epoch": 1.1551001397298557, "grad_norm": 0.7681051261580779, "learning_rate": 5e-06, "loss": 0.6927, "step": 310 }, { "epoch": 1.192361434559851, "grad_norm": 0.5509672175600794, "learning_rate": 5e-06, "loss": 0.6914, "step": 320 }, { "epoch": 1.2296227293898463, "grad_norm": 0.6540568092405752, "learning_rate": 5e-06, "loss": 0.7042, "step": 330 }, { "epoch": 1.2668840242198416, "grad_norm": 0.6081452030296887, "learning_rate": 5e-06, "loss": 0.6964, "step": 340 }, { "epoch": 1.3041453190498369, "grad_norm": 0.6548111894829255, "learning_rate": 5e-06, "loss": 0.6982, "step": 350 }, { "epoch": 1.3414066138798324, "grad_norm": 0.6390670197121491, "learning_rate": 5e-06, "loss": 0.6958, "step": 360 }, { "epoch": 1.3786679087098277, "grad_norm": 0.6212133244114497, "learning_rate": 5e-06, "loss": 0.6966, "step": 370 }, { "epoch": 1.415929203539823, "grad_norm": 0.5799521943921166, "learning_rate": 5e-06, "loss": 0.6891, "step": 380 }, { "epoch": 1.4531904983698183, "grad_norm": 0.6895979564472926, "learning_rate": 5e-06, "loss": 0.6961, "step": 390 }, { "epoch": 1.4904517931998136, "grad_norm": 0.7051355249345896, "learning_rate": 5e-06, "loss": 0.6937, "step": 400 }, { "epoch": 1.527713088029809, "grad_norm": 0.5848394801525845, "learning_rate": 5e-06, "loss": 0.6934, "step": 410 }, { "epoch": 1.5649743828598044, "grad_norm": 0.7559716416869524, "learning_rate": 5e-06, "loss": 0.6895, "step": 420 }, { "epoch": 1.6022356776897997, "grad_norm": 0.6232619179980586, "learning_rate": 5e-06, "loss": 0.6982, "step": 430 }, { "epoch": 1.6394969725197952, "grad_norm": 0.5824898687040235, "learning_rate": 5e-06, "loss": 0.6952, "step": 440 }, { "epoch": 1.6767582673497903, "grad_norm": 0.6210814650937644, "learning_rate": 5e-06, "loss": 0.6973, "step": 450 }, { "epoch": 1.7140195621797858, "grad_norm": 0.6197587117855798, "learning_rate": 5e-06, "loss": 0.6941, "step": 460 }, { "epoch": 1.751280857009781, "grad_norm": 0.5610090469048251, "learning_rate": 5e-06, "loss": 0.6918, "step": 470 }, { "epoch": 1.7885421518397764, "grad_norm": 0.6397472245527632, "learning_rate": 5e-06, "loss": 0.6896, "step": 480 }, { "epoch": 1.825803446669772, "grad_norm": 0.8540795729270456, "learning_rate": 5e-06, "loss": 0.6949, "step": 490 }, { "epoch": 1.863064741499767, "grad_norm": 0.5933184699793366, "learning_rate": 5e-06, "loss": 0.6941, "step": 500 }, { "epoch": 1.9003260363297625, "grad_norm": 0.5702682781752477, "learning_rate": 5e-06, "loss": 0.6947, "step": 510 }, { "epoch": 1.9375873311597578, "grad_norm": 0.6485728293989254, "learning_rate": 5e-06, "loss": 0.6934, "step": 520 }, { "epoch": 1.974848625989753, "grad_norm": 0.658819869120822, "learning_rate": 5e-06, "loss": 0.6935, "step": 530 }, { "epoch": 1.9972054028877504, "eval_loss": 0.7391706705093384, "eval_runtime": 284.9966, "eval_samples_per_second": 25.365, "eval_steps_per_second": 0.396, "step": 536 }, { "epoch": 2.0121099208197486, "grad_norm": 0.8176370133975521, "learning_rate": 5e-06, "loss": 0.7119, "step": 540 }, { "epoch": 2.0493712156497437, "grad_norm": 1.0394836748871732, "learning_rate": 5e-06, "loss": 0.6404, "step": 550 }, { "epoch": 2.086632510479739, "grad_norm": 0.7102912397553353, "learning_rate": 5e-06, "loss": 0.6384, "step": 560 }, { "epoch": 2.1238938053097347, "grad_norm": 0.6303371150790377, "learning_rate": 5e-06, "loss": 0.6368, "step": 570 }, { "epoch": 2.16115510013973, "grad_norm": 0.7194505328759309, "learning_rate": 5e-06, "loss": 0.6331, "step": 580 }, { "epoch": 2.1984163949697253, "grad_norm": 0.8581082758870584, "learning_rate": 5e-06, "loss": 0.6414, "step": 590 }, { "epoch": 2.2356776897997204, "grad_norm": 0.7882079908607245, "learning_rate": 5e-06, "loss": 0.642, "step": 600 }, { "epoch": 2.272938984629716, "grad_norm": 0.7493026584846839, "learning_rate": 5e-06, "loss": 0.644, "step": 610 }, { "epoch": 2.3102002794597114, "grad_norm": 0.6779697760274137, "learning_rate": 5e-06, "loss": 0.6446, "step": 620 }, { "epoch": 2.3474615742897065, "grad_norm": 0.7469773522523896, "learning_rate": 5e-06, "loss": 0.6413, "step": 630 }, { "epoch": 2.384722869119702, "grad_norm": 0.8234476990672971, "learning_rate": 5e-06, "loss": 0.6383, "step": 640 }, { "epoch": 2.421984163949697, "grad_norm": 0.6740528425843195, "learning_rate": 5e-06, "loss": 0.6403, "step": 650 }, { "epoch": 2.4592454587796926, "grad_norm": 0.8699308162958452, "learning_rate": 5e-06, "loss": 0.6424, "step": 660 }, { "epoch": 2.496506753609688, "grad_norm": 0.7201278110102733, "learning_rate": 5e-06, "loss": 0.645, "step": 670 }, { "epoch": 2.533768048439683, "grad_norm": 0.6724034506353075, "learning_rate": 5e-06, "loss": 0.6432, "step": 680 }, { "epoch": 2.5710293432696787, "grad_norm": 0.69360608529311, "learning_rate": 5e-06, "loss": 0.6396, "step": 690 }, { "epoch": 2.6082906380996738, "grad_norm": 0.8274824206163831, "learning_rate": 5e-06, "loss": 0.6392, "step": 700 }, { "epoch": 2.6455519329296693, "grad_norm": 0.6599355927406194, "learning_rate": 5e-06, "loss": 0.6376, "step": 710 }, { "epoch": 2.682813227759665, "grad_norm": 0.6028719649182044, "learning_rate": 5e-06, "loss": 0.6426, "step": 720 }, { "epoch": 2.72007452258966, "grad_norm": 0.8529220528875564, "learning_rate": 5e-06, "loss": 0.6442, "step": 730 }, { "epoch": 2.7573358174196554, "grad_norm": 0.5833740770838279, "learning_rate": 5e-06, "loss": 0.6435, "step": 740 }, { "epoch": 2.794597112249651, "grad_norm": 0.75585676373348, "learning_rate": 5e-06, "loss": 0.643, "step": 750 }, { "epoch": 2.831858407079646, "grad_norm": 0.7316206001732912, "learning_rate": 5e-06, "loss": 0.645, "step": 760 }, { "epoch": 2.8691197019096415, "grad_norm": 0.6022571096523994, "learning_rate": 5e-06, "loss": 0.643, "step": 770 }, { "epoch": 2.9063809967396366, "grad_norm": 0.6980539018883889, "learning_rate": 5e-06, "loss": 0.6444, "step": 780 }, { "epoch": 2.943642291569632, "grad_norm": 0.6379664123364885, "learning_rate": 5e-06, "loss": 0.6418, "step": 790 }, { "epoch": 2.980903586399627, "grad_norm": 0.617861393266688, "learning_rate": 5e-06, "loss": 0.6473, "step": 800 }, { "epoch": 2.9958081043316254, "eval_loss": 0.7408209443092346, "eval_runtime": 283.5662, "eval_samples_per_second": 25.493, "eval_steps_per_second": 0.398, "step": 804 }, { "epoch": 2.9958081043316254, "step": 804, "total_flos": 1346520565678080.0, "train_loss": 0.7105088189466676, "train_runtime": 47269.4774, "train_samples_per_second": 8.717, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1346520565678080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }