{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9958081043316254,
  "eval_steps": 500,
  "global_step": 804,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.037261294829995344,
      "grad_norm": 3.177923185690651,
      "learning_rate": 5e-06,
      "loss": 1.0446,
      "step": 10
    },
    {
      "epoch": 0.07452258965999069,
      "grad_norm": 1.5655148840692104,
      "learning_rate": 5e-06,
      "loss": 0.9101,
      "step": 20
    },
    {
      "epoch": 0.11178388448998602,
      "grad_norm": 1.2741458134596388,
      "learning_rate": 5e-06,
      "loss": 0.8739,
      "step": 30
    },
    {
      "epoch": 0.14904517931998137,
      "grad_norm": 1.1521931357563782,
      "learning_rate": 5e-06,
      "loss": 0.8458,
      "step": 40
    },
    {
      "epoch": 0.18630647414997673,
      "grad_norm": 1.1371625192876917,
      "learning_rate": 5e-06,
      "loss": 0.8221,
      "step": 50
    },
    {
      "epoch": 0.22356776897997205,
      "grad_norm": 0.9080984413465669,
      "learning_rate": 5e-06,
      "loss": 0.8159,
      "step": 60
    },
    {
      "epoch": 0.2608290638099674,
      "grad_norm": 0.9330400679788563,
      "learning_rate": 5e-06,
      "loss": 0.8017,
      "step": 70
    },
    {
      "epoch": 0.29809035863996275,
      "grad_norm": 0.7711080271829996,
      "learning_rate": 5e-06,
      "loss": 0.7893,
      "step": 80
    },
    {
      "epoch": 0.3353516534699581,
      "grad_norm": 0.7516775112714674,
      "learning_rate": 5e-06,
      "loss": 0.792,
      "step": 90
    },
    {
      "epoch": 0.37261294829995345,
      "grad_norm": 0.6720880112699107,
      "learning_rate": 5e-06,
      "loss": 0.7807,
      "step": 100
    },
    {
      "epoch": 0.40987424312994875,
      "grad_norm": 1.0015408115889342,
      "learning_rate": 5e-06,
      "loss": 0.7739,
      "step": 110
    },
    {
      "epoch": 0.4471355379599441,
      "grad_norm": 0.7967270535006579,
      "learning_rate": 5e-06,
      "loss": 0.7753,
      "step": 120
    },
    {
      "epoch": 0.48439683278993945,
      "grad_norm": 0.7992952214381639,
      "learning_rate": 5e-06,
      "loss": 0.7685,
      "step": 130
    },
    {
      "epoch": 0.5216581276199348,
      "grad_norm": 0.822692265972509,
      "learning_rate": 5e-06,
      "loss": 0.7664,
      "step": 140
    },
    {
      "epoch": 0.5589194224499301,
      "grad_norm": 0.6248743169023847,
      "learning_rate": 5e-06,
      "loss": 0.7619,
      "step": 150
    },
    {
      "epoch": 0.5961807172799255,
      "grad_norm": 0.7681721127721153,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 160
    },
    {
      "epoch": 0.6334420121099208,
      "grad_norm": 0.6670980060596134,
      "learning_rate": 5e-06,
      "loss": 0.7594,
      "step": 170
    },
    {
      "epoch": 0.6707033069399162,
      "grad_norm": 1.6054759679741344,
      "learning_rate": 5e-06,
      "loss": 0.7627,
      "step": 180
    },
    {
      "epoch": 0.7079646017699115,
      "grad_norm": 1.0157126085127113,
      "learning_rate": 5e-06,
      "loss": 0.7612,
      "step": 190
    },
    {
      "epoch": 0.7452258965999069,
      "grad_norm": 1.0987509598845462,
      "learning_rate": 5e-06,
      "loss": 0.7566,
      "step": 200
    },
    {
      "epoch": 0.7824871914299022,
      "grad_norm": 0.8874349124844947,
      "learning_rate": 5e-06,
      "loss": 0.7578,
      "step": 210
    },
    {
      "epoch": 0.8197484862598975,
      "grad_norm": 0.8625833938546213,
      "learning_rate": 5e-06,
      "loss": 0.7472,
      "step": 220
    },
    {
      "epoch": 0.8570097810898929,
      "grad_norm": 0.6536315928798102,
      "learning_rate": 5e-06,
      "loss": 0.7499,
      "step": 230
    },
    {
      "epoch": 0.8942710759198882,
      "grad_norm": 0.5976487344814351,
      "learning_rate": 5e-06,
      "loss": 0.7493,
      "step": 240
    },
    {
      "epoch": 0.9315323707498836,
      "grad_norm": 0.6377854255157969,
      "learning_rate": 5e-06,
      "loss": 0.7449,
      "step": 250
    },
    {
      "epoch": 0.9687936655798789,
      "grad_norm": 0.6255963141536746,
      "learning_rate": 5e-06,
      "loss": 0.7463,
      "step": 260
    },
    {
      "epoch": 0.9986027014438752,
      "eval_loss": 0.753118097782135,
      "eval_runtime": 282.6878,
      "eval_samples_per_second": 25.572,
      "eval_steps_per_second": 0.4,
      "step": 268
    },
    {
      "epoch": 1.0060549604098743,
      "grad_norm": 0.7967440184692027,
      "learning_rate": 5e-06,
      "loss": 0.7858,
      "step": 270
    },
    {
      "epoch": 1.0433162552398696,
      "grad_norm": 0.6961321881435635,
      "learning_rate": 5e-06,
      "loss": 0.6975,
      "step": 280
    },
    {
      "epoch": 1.080577550069865,
      "grad_norm": 0.762772210563081,
      "learning_rate": 5e-06,
      "loss": 0.6948,
      "step": 290
    },
    {
      "epoch": 1.1178388448998602,
      "grad_norm": 0.7180309443204612,
      "learning_rate": 5e-06,
      "loss": 0.6976,
      "step": 300
    },
    {
      "epoch": 1.1551001397298557,
      "grad_norm": 0.7681051261580779,
      "learning_rate": 5e-06,
      "loss": 0.6927,
      "step": 310
    },
    {
      "epoch": 1.192361434559851,
      "grad_norm": 0.5509672175600794,
      "learning_rate": 5e-06,
      "loss": 0.6914,
      "step": 320
    },
    {
      "epoch": 1.2296227293898463,
      "grad_norm": 0.6540568092405752,
      "learning_rate": 5e-06,
      "loss": 0.7042,
      "step": 330
    },
    {
      "epoch": 1.2668840242198416,
      "grad_norm": 0.6081452030296887,
      "learning_rate": 5e-06,
      "loss": 0.6964,
      "step": 340
    },
    {
      "epoch": 1.3041453190498369,
      "grad_norm": 0.6548111894829255,
      "learning_rate": 5e-06,
      "loss": 0.6982,
      "step": 350
    },
    {
      "epoch": 1.3414066138798324,
      "grad_norm": 0.6390670197121491,
      "learning_rate": 5e-06,
      "loss": 0.6958,
      "step": 360
    },
    {
      "epoch": 1.3786679087098277,
      "grad_norm": 0.6212133244114497,
      "learning_rate": 5e-06,
      "loss": 0.6966,
      "step": 370
    },
    {
      "epoch": 1.415929203539823,
      "grad_norm": 0.5799521943921166,
      "learning_rate": 5e-06,
      "loss": 0.6891,
      "step": 380
    },
    {
      "epoch": 1.4531904983698183,
      "grad_norm": 0.6895979564472926,
      "learning_rate": 5e-06,
      "loss": 0.6961,
      "step": 390
    },
    {
      "epoch": 1.4904517931998136,
      "grad_norm": 0.7051355249345896,
      "learning_rate": 5e-06,
      "loss": 0.6937,
      "step": 400
    },
    {
      "epoch": 1.527713088029809,
      "grad_norm": 0.5848394801525845,
      "learning_rate": 5e-06,
      "loss": 0.6934,
      "step": 410
    },
    {
      "epoch": 1.5649743828598044,
      "grad_norm": 0.7559716416869524,
      "learning_rate": 5e-06,
      "loss": 0.6895,
      "step": 420
    },
    {
      "epoch": 1.6022356776897997,
      "grad_norm": 0.6232619179980586,
      "learning_rate": 5e-06,
      "loss": 0.6982,
      "step": 430
    },
    {
      "epoch": 1.6394969725197952,
      "grad_norm": 0.5824898687040235,
      "learning_rate": 5e-06,
      "loss": 0.6952,
      "step": 440
    },
    {
      "epoch": 1.6767582673497903,
      "grad_norm": 0.6210814650937644,
      "learning_rate": 5e-06,
      "loss": 0.6973,
      "step": 450
    },
    {
      "epoch": 1.7140195621797858,
      "grad_norm": 0.6197587117855798,
      "learning_rate": 5e-06,
      "loss": 0.6941,
      "step": 460
    },
    {
      "epoch": 1.751280857009781,
      "grad_norm": 0.5610090469048251,
      "learning_rate": 5e-06,
      "loss": 0.6918,
      "step": 470
    },
    {
      "epoch": 1.7885421518397764,
      "grad_norm": 0.6397472245527632,
      "learning_rate": 5e-06,
      "loss": 0.6896,
      "step": 480
    },
    {
      "epoch": 1.825803446669772,
      "grad_norm": 0.8540795729270456,
      "learning_rate": 5e-06,
      "loss": 0.6949,
      "step": 490
    },
    {
      "epoch": 1.863064741499767,
      "grad_norm": 0.5933184699793366,
      "learning_rate": 5e-06,
      "loss": 0.6941,
      "step": 500
    },
    {
      "epoch": 1.9003260363297625,
      "grad_norm": 0.5702682781752477,
      "learning_rate": 5e-06,
      "loss": 0.6947,
      "step": 510
    },
    {
      "epoch": 1.9375873311597578,
      "grad_norm": 0.6485728293989254,
      "learning_rate": 5e-06,
      "loss": 0.6934,
      "step": 520
    },
    {
      "epoch": 1.974848625989753,
      "grad_norm": 0.658819869120822,
      "learning_rate": 5e-06,
      "loss": 0.6935,
      "step": 530
    },
    {
      "epoch": 1.9972054028877504,
      "eval_loss": 0.7391706705093384,
      "eval_runtime": 284.9966,
      "eval_samples_per_second": 25.365,
      "eval_steps_per_second": 0.396,
      "step": 536
    },
    {
      "epoch": 2.0121099208197486,
      "grad_norm": 0.8176370133975521,
      "learning_rate": 5e-06,
      "loss": 0.7119,
      "step": 540
    },
    {
      "epoch": 2.0493712156497437,
      "grad_norm": 1.0394836748871732,
      "learning_rate": 5e-06,
      "loss": 0.6404,
      "step": 550
    },
    {
      "epoch": 2.086632510479739,
      "grad_norm": 0.7102912397553353,
      "learning_rate": 5e-06,
      "loss": 0.6384,
      "step": 560
    },
    {
      "epoch": 2.1238938053097347,
      "grad_norm": 0.6303371150790377,
      "learning_rate": 5e-06,
      "loss": 0.6368,
      "step": 570
    },
    {
      "epoch": 2.16115510013973,
      "grad_norm": 0.7194505328759309,
      "learning_rate": 5e-06,
      "loss": 0.6331,
      "step": 580
    },
    {
      "epoch": 2.1984163949697253,
      "grad_norm": 0.8581082758870584,
      "learning_rate": 5e-06,
      "loss": 0.6414,
      "step": 590
    },
    {
      "epoch": 2.2356776897997204,
      "grad_norm": 0.7882079908607245,
      "learning_rate": 5e-06,
      "loss": 0.642,
      "step": 600
    },
    {
      "epoch": 2.272938984629716,
      "grad_norm": 0.7493026584846839,
      "learning_rate": 5e-06,
      "loss": 0.644,
      "step": 610
    },
    {
      "epoch": 2.3102002794597114,
      "grad_norm": 0.6779697760274137,
      "learning_rate": 5e-06,
      "loss": 0.6446,
      "step": 620
    },
    {
      "epoch": 2.3474615742897065,
      "grad_norm": 0.7469773522523896,
      "learning_rate": 5e-06,
      "loss": 0.6413,
      "step": 630
    },
    {
      "epoch": 2.384722869119702,
      "grad_norm": 0.8234476990672971,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 640
    },
    {
      "epoch": 2.421984163949697,
      "grad_norm": 0.6740528425843195,
      "learning_rate": 5e-06,
      "loss": 0.6403,
      "step": 650
    },
    {
      "epoch": 2.4592454587796926,
      "grad_norm": 0.8699308162958452,
      "learning_rate": 5e-06,
      "loss": 0.6424,
      "step": 660
    },
    {
      "epoch": 2.496506753609688,
      "grad_norm": 0.7201278110102733,
      "learning_rate": 5e-06,
      "loss": 0.645,
      "step": 670
    },
    {
      "epoch": 2.533768048439683,
      "grad_norm": 0.6724034506353075,
      "learning_rate": 5e-06,
      "loss": 0.6432,
      "step": 680
    },
    {
      "epoch": 2.5710293432696787,
      "grad_norm": 0.69360608529311,
      "learning_rate": 5e-06,
      "loss": 0.6396,
      "step": 690
    },
    {
      "epoch": 2.6082906380996738,
      "grad_norm": 0.8274824206163831,
      "learning_rate": 5e-06,
      "loss": 0.6392,
      "step": 700
    },
    {
      "epoch": 2.6455519329296693,
      "grad_norm": 0.6599355927406194,
      "learning_rate": 5e-06,
      "loss": 0.6376,
      "step": 710
    },
    {
      "epoch": 2.682813227759665,
      "grad_norm": 0.6028719649182044,
      "learning_rate": 5e-06,
      "loss": 0.6426,
      "step": 720
    },
    {
      "epoch": 2.72007452258966,
      "grad_norm": 0.8529220528875564,
      "learning_rate": 5e-06,
      "loss": 0.6442,
      "step": 730
    },
    {
      "epoch": 2.7573358174196554,
      "grad_norm": 0.5833740770838279,
      "learning_rate": 5e-06,
      "loss": 0.6435,
      "step": 740
    },
    {
      "epoch": 2.794597112249651,
      "grad_norm": 0.75585676373348,
      "learning_rate": 5e-06,
      "loss": 0.643,
      "step": 750
    },
    {
      "epoch": 2.831858407079646,
      "grad_norm": 0.7316206001732912,
      "learning_rate": 5e-06,
      "loss": 0.645,
      "step": 760
    },
    {
      "epoch": 2.8691197019096415,
      "grad_norm": 0.6022571096523994,
      "learning_rate": 5e-06,
      "loss": 0.643,
      "step": 770
    },
    {
      "epoch": 2.9063809967396366,
      "grad_norm": 0.6980539018883889,
      "learning_rate": 5e-06,
      "loss": 0.6444,
      "step": 780
    },
    {
      "epoch": 2.943642291569632,
      "grad_norm": 0.6379664123364885,
      "learning_rate": 5e-06,
      "loss": 0.6418,
      "step": 790
    },
    {
      "epoch": 2.980903586399627,
      "grad_norm": 0.617861393266688,
      "learning_rate": 5e-06,
      "loss": 0.6473,
      "step": 800
    },
    {
      "epoch": 2.9958081043316254,
      "eval_loss": 0.7408209443092346,
      "eval_runtime": 283.5662,
      "eval_samples_per_second": 25.493,
      "eval_steps_per_second": 0.398,
      "step": 804
    },
    {
      "epoch": 2.9958081043316254,
      "step": 804,
      "total_flos": 1346520565678080.0,
      "train_loss": 0.7105088189466676,
      "train_runtime": 47269.4774,
      "train_samples_per_second": 8.717,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 804,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1346520565678080.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}