{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9902370990237097, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037192003719200374, "grad_norm": 6.881888050242323, "learning_rate": 5e-06, "loss": 1.0402, "step": 10 }, { "epoch": 0.07438400743840075, "grad_norm": 3.5091968880412154, "learning_rate": 5e-06, "loss": 0.9234, "step": 20 }, { "epoch": 0.11157601115760112, "grad_norm": 1.6873942990209014, "learning_rate": 5e-06, "loss": 0.8833, "step": 30 }, { "epoch": 0.1487680148768015, "grad_norm": 2.407800596967581, "learning_rate": 5e-06, "loss": 0.8524, "step": 40 }, { "epoch": 0.18596001859600186, "grad_norm": 2.4319398224790505, "learning_rate": 5e-06, "loss": 0.828, "step": 50 }, { "epoch": 0.22315202231520223, "grad_norm": 1.6767100693953145, "learning_rate": 5e-06, "loss": 0.8117, "step": 60 }, { "epoch": 0.2603440260344026, "grad_norm": 1.1194486400318895, "learning_rate": 5e-06, "loss": 0.8019, "step": 70 }, { "epoch": 0.297536029753603, "grad_norm": 0.7791268668811029, "learning_rate": 5e-06, "loss": 0.7909, "step": 80 }, { "epoch": 0.33472803347280333, "grad_norm": 1.0231329358996892, "learning_rate": 5e-06, "loss": 0.7745, "step": 90 }, { "epoch": 0.3719200371920037, "grad_norm": 1.0119634746718196, "learning_rate": 5e-06, "loss": 0.7768, "step": 100 }, { "epoch": 0.40911204091120407, "grad_norm": 0.7286643555576803, "learning_rate": 5e-06, "loss": 0.7715, "step": 110 }, { "epoch": 0.44630404463040446, "grad_norm": 0.8830873875823773, "learning_rate": 5e-06, "loss": 0.7694, "step": 120 }, { "epoch": 0.48349604834960486, "grad_norm": 1.0262942694467152, "learning_rate": 5e-06, "loss": 0.7639, "step": 130 }, { "epoch": 0.5206880520688052, "grad_norm": 0.8416190902804997, "learning_rate": 5e-06, "loss": 0.7627, "step": 140 }, { "epoch": 0.5578800557880056, "grad_norm": 0.8213144249800172, "learning_rate": 5e-06, "loss": 0.7584, "step": 150 }, { "epoch": 0.595072059507206, "grad_norm": 0.7761932672369115, "learning_rate": 5e-06, "loss": 0.7527, "step": 160 }, { "epoch": 0.6322640632264063, "grad_norm": 0.594310752303111, "learning_rate": 5e-06, "loss": 0.7551, "step": 170 }, { "epoch": 0.6694560669456067, "grad_norm": 0.723753760030973, "learning_rate": 5e-06, "loss": 0.7505, "step": 180 }, { "epoch": 0.7066480706648071, "grad_norm": 0.8255927381229027, "learning_rate": 5e-06, "loss": 0.7537, "step": 190 }, { "epoch": 0.7438400743840075, "grad_norm": 0.6756994735330465, "learning_rate": 5e-06, "loss": 0.7471, "step": 200 }, { "epoch": 0.7810320781032078, "grad_norm": 0.7417685138593264, "learning_rate": 5e-06, "loss": 0.7519, "step": 210 }, { "epoch": 0.8182240818224081, "grad_norm": 0.7199232164518533, "learning_rate": 5e-06, "loss": 0.7445, "step": 220 }, { "epoch": 0.8554160855416085, "grad_norm": 0.5805134046445549, "learning_rate": 5e-06, "loss": 0.7469, "step": 230 }, { "epoch": 0.8926080892608089, "grad_norm": 0.8083557503057547, "learning_rate": 5e-06, "loss": 0.7426, "step": 240 }, { "epoch": 0.9298000929800093, "grad_norm": 0.5958428143365987, "learning_rate": 5e-06, "loss": 0.7429, "step": 250 }, { "epoch": 0.9669920966992097, "grad_norm": 0.7200821647238617, "learning_rate": 5e-06, "loss": 0.7455, "step": 260 }, { "epoch": 0.9967456996745699, "eval_loss": 0.7387776970863342, "eval_runtime": 285.1365, "eval_samples_per_second": 25.402, "eval_steps_per_second": 0.4, "step": 268 }, { "epoch": 1.00418410041841, "grad_norm": 0.9192811448757029, "learning_rate": 5e-06, "loss": 0.7395, "step": 270 }, { "epoch": 1.0413761041376104, "grad_norm": 0.7114550344465745, "learning_rate": 5e-06, "loss": 0.6952, "step": 280 }, { "epoch": 1.0785681078568108, "grad_norm": 0.7773431995256311, "learning_rate": 5e-06, "loss": 0.6894, "step": 290 }, { "epoch": 1.1157601115760112, "grad_norm": 0.8395302785079569, "learning_rate": 5e-06, "loss": 0.6895, "step": 300 }, { "epoch": 1.1529521152952116, "grad_norm": 0.5978839456204598, "learning_rate": 5e-06, "loss": 0.6839, "step": 310 }, { "epoch": 1.190144119014412, "grad_norm": 0.6886905957705741, "learning_rate": 5e-06, "loss": 0.6869, "step": 320 }, { "epoch": 1.2273361227336124, "grad_norm": 0.6062023199694873, "learning_rate": 5e-06, "loss": 0.6861, "step": 330 }, { "epoch": 1.2645281264528125, "grad_norm": 0.6559502974519889, "learning_rate": 5e-06, "loss": 0.6848, "step": 340 }, { "epoch": 1.301720130172013, "grad_norm": 0.6672718286848371, "learning_rate": 5e-06, "loss": 0.6894, "step": 350 }, { "epoch": 1.3389121338912133, "grad_norm": 0.6968811181243937, "learning_rate": 5e-06, "loss": 0.6856, "step": 360 }, { "epoch": 1.3761041376104137, "grad_norm": 0.713126345107699, "learning_rate": 5e-06, "loss": 0.6921, "step": 370 }, { "epoch": 1.4132961413296141, "grad_norm": 0.555101582608444, "learning_rate": 5e-06, "loss": 0.685, "step": 380 }, { "epoch": 1.4504881450488145, "grad_norm": 0.8155799633973349, "learning_rate": 5e-06, "loss": 0.688, "step": 390 }, { "epoch": 1.487680148768015, "grad_norm": 0.7809410682542598, "learning_rate": 5e-06, "loss": 0.6848, "step": 400 }, { "epoch": 1.524872152487215, "grad_norm": 0.859503762107944, "learning_rate": 5e-06, "loss": 0.6842, "step": 410 }, { "epoch": 1.5620641562064157, "grad_norm": 0.7216593694701429, "learning_rate": 5e-06, "loss": 0.6846, "step": 420 }, { "epoch": 1.5992561599256159, "grad_norm": 0.7100718746210283, "learning_rate": 5e-06, "loss": 0.6892, "step": 430 }, { "epoch": 1.6364481636448165, "grad_norm": 0.6713429745171309, "learning_rate": 5e-06, "loss": 0.693, "step": 440 }, { "epoch": 1.6736401673640167, "grad_norm": 0.8718788500797711, "learning_rate": 5e-06, "loss": 0.6814, "step": 450 }, { "epoch": 1.710832171083217, "grad_norm": 0.8350277998981831, "learning_rate": 5e-06, "loss": 0.6845, "step": 460 }, { "epoch": 1.7480241748024175, "grad_norm": 0.6507252875638763, "learning_rate": 5e-06, "loss": 0.6888, "step": 470 }, { "epoch": 1.7852161785216178, "grad_norm": 0.6271601112082867, "learning_rate": 5e-06, "loss": 0.6863, "step": 480 }, { "epoch": 1.8224081822408182, "grad_norm": 0.616595241070034, "learning_rate": 5e-06, "loss": 0.6849, "step": 490 }, { "epoch": 1.8596001859600186, "grad_norm": 0.6956379811592589, "learning_rate": 5e-06, "loss": 0.6821, "step": 500 }, { "epoch": 1.896792189679219, "grad_norm": 0.6804741810216053, "learning_rate": 5e-06, "loss": 0.6808, "step": 510 }, { "epoch": 1.9339841933984192, "grad_norm": 0.6471606933029862, "learning_rate": 5e-06, "loss": 0.6884, "step": 520 }, { "epoch": 1.9711761971176198, "grad_norm": 0.6546763243128592, "learning_rate": 5e-06, "loss": 0.6844, "step": 530 }, { "epoch": 1.9972105997210599, "eval_loss": 0.7266745567321777, "eval_runtime": 286.325, "eval_samples_per_second": 25.296, "eval_steps_per_second": 0.398, "step": 537 }, { "epoch": 2.00836820083682, "grad_norm": 1.0262138186877314, "learning_rate": 5e-06, "loss": 0.6783, "step": 540 }, { "epoch": 2.0455602045560206, "grad_norm": 0.8739740031885926, "learning_rate": 5e-06, "loss": 0.6332, "step": 550 }, { "epoch": 2.082752208275221, "grad_norm": 0.7117432291580535, "learning_rate": 5e-06, "loss": 0.6296, "step": 560 }, { "epoch": 2.1199442119944214, "grad_norm": 0.5899527267147682, "learning_rate": 5e-06, "loss": 0.6285, "step": 570 }, { "epoch": 2.1571362157136216, "grad_norm": 0.6690509984383408, "learning_rate": 5e-06, "loss": 0.6297, "step": 580 }, { "epoch": 2.1943282194328217, "grad_norm": 0.9042165551862072, "learning_rate": 5e-06, "loss": 0.6327, "step": 590 }, { "epoch": 2.2315202231520224, "grad_norm": 0.7099236499244134, "learning_rate": 5e-06, "loss": 0.6313, "step": 600 }, { "epoch": 2.2687122268712225, "grad_norm": 0.7526574548917114, "learning_rate": 5e-06, "loss": 0.6343, "step": 610 }, { "epoch": 2.305904230590423, "grad_norm": 0.5914475009740064, "learning_rate": 5e-06, "loss": 0.6304, "step": 620 }, { "epoch": 2.3430962343096233, "grad_norm": 0.6210916718103259, "learning_rate": 5e-06, "loss": 0.6266, "step": 630 }, { "epoch": 2.380288238028824, "grad_norm": 0.9617962871174769, "learning_rate": 5e-06, "loss": 0.6334, "step": 640 }, { "epoch": 2.417480241748024, "grad_norm": 0.68177525923663, "learning_rate": 5e-06, "loss": 0.6374, "step": 650 }, { "epoch": 2.4546722454672247, "grad_norm": 0.7346141491432675, "learning_rate": 5e-06, "loss": 0.6354, "step": 660 }, { "epoch": 2.491864249186425, "grad_norm": 0.765713000031021, "learning_rate": 5e-06, "loss": 0.632, "step": 670 }, { "epoch": 2.529056252905625, "grad_norm": 0.9254737796398699, "learning_rate": 5e-06, "loss": 0.6355, "step": 680 }, { "epoch": 2.5662482566248257, "grad_norm": 0.5791067801191988, "learning_rate": 5e-06, "loss": 0.635, "step": 690 }, { "epoch": 2.603440260344026, "grad_norm": 0.6114487645448582, "learning_rate": 5e-06, "loss": 0.6338, "step": 700 }, { "epoch": 2.6406322640632265, "grad_norm": 0.6334477556910006, "learning_rate": 5e-06, "loss": 0.6298, "step": 710 }, { "epoch": 2.6778242677824267, "grad_norm": 0.6414121652994028, "learning_rate": 5e-06, "loss": 0.6388, "step": 720 }, { "epoch": 2.7150162715016273, "grad_norm": 0.6324796752766957, "learning_rate": 5e-06, "loss": 0.6365, "step": 730 }, { "epoch": 2.7522082752208274, "grad_norm": 0.7190932899296674, "learning_rate": 5e-06, "loss": 0.6418, "step": 740 }, { "epoch": 2.789400278940028, "grad_norm": 0.5758231899781839, "learning_rate": 5e-06, "loss": 0.636, "step": 750 }, { "epoch": 2.8265922826592282, "grad_norm": 0.6399727518476824, "learning_rate": 5e-06, "loss": 0.6359, "step": 760 }, { "epoch": 2.8637842863784284, "grad_norm": 0.6927434726600722, "learning_rate": 5e-06, "loss": 0.6371, "step": 770 }, { "epoch": 2.900976290097629, "grad_norm": 0.626988233626989, "learning_rate": 5e-06, "loss": 0.6404, "step": 780 }, { "epoch": 2.9381682938168296, "grad_norm": 0.5823110300301132, "learning_rate": 5e-06, "loss": 0.6339, "step": 790 }, { "epoch": 2.97536029753603, "grad_norm": 0.6355859003574489, "learning_rate": 5e-06, "loss": 0.6382, "step": 800 }, { "epoch": 2.9902370990237097, "eval_loss": 0.7294604778289795, "eval_runtime": 287.7675, "eval_samples_per_second": 25.17, "eval_steps_per_second": 0.396, "step": 804 }, { "epoch": 2.9902370990237097, "step": 804, "total_flos": 1346520565678080.0, "train_loss": 0.7033177981922283, "train_runtime": 47430.6382, "train_samples_per_second": 8.704, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1346520565678080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }