{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9902370990237097,
  "eval_steps": 500,
  "global_step": 804,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.037192003719200374,
      "grad_norm": 6.881888050242323,
      "learning_rate": 5e-06,
      "loss": 1.0402,
      "step": 10
    },
    {
      "epoch": 0.07438400743840075,
      "grad_norm": 3.5091968880412154,
      "learning_rate": 5e-06,
      "loss": 0.9234,
      "step": 20
    },
    {
      "epoch": 0.11157601115760112,
      "grad_norm": 1.6873942990209014,
      "learning_rate": 5e-06,
      "loss": 0.8833,
      "step": 30
    },
    {
      "epoch": 0.1487680148768015,
      "grad_norm": 2.407800596967581,
      "learning_rate": 5e-06,
      "loss": 0.8524,
      "step": 40
    },
    {
      "epoch": 0.18596001859600186,
      "grad_norm": 2.4319398224790505,
      "learning_rate": 5e-06,
      "loss": 0.828,
      "step": 50
    },
    {
      "epoch": 0.22315202231520223,
      "grad_norm": 1.6767100693953145,
      "learning_rate": 5e-06,
      "loss": 0.8117,
      "step": 60
    },
    {
      "epoch": 0.2603440260344026,
      "grad_norm": 1.1194486400318895,
      "learning_rate": 5e-06,
      "loss": 0.8019,
      "step": 70
    },
    {
      "epoch": 0.297536029753603,
      "grad_norm": 0.7791268668811029,
      "learning_rate": 5e-06,
      "loss": 0.7909,
      "step": 80
    },
    {
      "epoch": 0.33472803347280333,
      "grad_norm": 1.0231329358996892,
      "learning_rate": 5e-06,
      "loss": 0.7745,
      "step": 90
    },
    {
      "epoch": 0.3719200371920037,
      "grad_norm": 1.0119634746718196,
      "learning_rate": 5e-06,
      "loss": 0.7768,
      "step": 100
    },
    {
      "epoch": 0.40911204091120407,
      "grad_norm": 0.7286643555576803,
      "learning_rate": 5e-06,
      "loss": 0.7715,
      "step": 110
    },
    {
      "epoch": 0.44630404463040446,
      "grad_norm": 0.8830873875823773,
      "learning_rate": 5e-06,
      "loss": 0.7694,
      "step": 120
    },
    {
      "epoch": 0.48349604834960486,
      "grad_norm": 1.0262942694467152,
      "learning_rate": 5e-06,
      "loss": 0.7639,
      "step": 130
    },
    {
      "epoch": 0.5206880520688052,
      "grad_norm": 0.8416190902804997,
      "learning_rate": 5e-06,
      "loss": 0.7627,
      "step": 140
    },
    {
      "epoch": 0.5578800557880056,
      "grad_norm": 0.8213144249800172,
      "learning_rate": 5e-06,
      "loss": 0.7584,
      "step": 150
    },
    {
      "epoch": 0.595072059507206,
      "grad_norm": 0.7761932672369115,
      "learning_rate": 5e-06,
      "loss": 0.7527,
      "step": 160
    },
    {
      "epoch": 0.6322640632264063,
      "grad_norm": 0.594310752303111,
      "learning_rate": 5e-06,
      "loss": 0.7551,
      "step": 170
    },
    {
      "epoch": 0.6694560669456067,
      "grad_norm": 0.723753760030973,
      "learning_rate": 5e-06,
      "loss": 0.7505,
      "step": 180
    },
    {
      "epoch": 0.7066480706648071,
      "grad_norm": 0.8255927381229027,
      "learning_rate": 5e-06,
      "loss": 0.7537,
      "step": 190
    },
    {
      "epoch": 0.7438400743840075,
      "grad_norm": 0.6756994735330465,
      "learning_rate": 5e-06,
      "loss": 0.7471,
      "step": 200
    },
    {
      "epoch": 0.7810320781032078,
      "grad_norm": 0.7417685138593264,
      "learning_rate": 5e-06,
      "loss": 0.7519,
      "step": 210
    },
    {
      "epoch": 0.8182240818224081,
      "grad_norm": 0.7199232164518533,
      "learning_rate": 5e-06,
      "loss": 0.7445,
      "step": 220
    },
    {
      "epoch": 0.8554160855416085,
      "grad_norm": 0.5805134046445549,
      "learning_rate": 5e-06,
      "loss": 0.7469,
      "step": 230
    },
    {
      "epoch": 0.8926080892608089,
      "grad_norm": 0.8083557503057547,
      "learning_rate": 5e-06,
      "loss": 0.7426,
      "step": 240
    },
    {
      "epoch": 0.9298000929800093,
      "grad_norm": 0.5958428143365987,
      "learning_rate": 5e-06,
      "loss": 0.7429,
      "step": 250
    },
    {
      "epoch": 0.9669920966992097,
      "grad_norm": 0.7200821647238617,
      "learning_rate": 5e-06,
      "loss": 0.7455,
      "step": 260
    },
    {
      "epoch": 0.9967456996745699,
      "eval_loss": 0.7387776970863342,
      "eval_runtime": 285.1365,
      "eval_samples_per_second": 25.402,
      "eval_steps_per_second": 0.4,
      "step": 268
    },
    {
      "epoch": 1.00418410041841,
      "grad_norm": 0.9192811448757029,
      "learning_rate": 5e-06,
      "loss": 0.7395,
      "step": 270
    },
    {
      "epoch": 1.0413761041376104,
      "grad_norm": 0.7114550344465745,
      "learning_rate": 5e-06,
      "loss": 0.6952,
      "step": 280
    },
    {
      "epoch": 1.0785681078568108,
      "grad_norm": 0.7773431995256311,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 290
    },
    {
      "epoch": 1.1157601115760112,
      "grad_norm": 0.8395302785079569,
      "learning_rate": 5e-06,
      "loss": 0.6895,
      "step": 300
    },
    {
      "epoch": 1.1529521152952116,
      "grad_norm": 0.5978839456204598,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 310
    },
    {
      "epoch": 1.190144119014412,
      "grad_norm": 0.6886905957705741,
      "learning_rate": 5e-06,
      "loss": 0.6869,
      "step": 320
    },
    {
      "epoch": 1.2273361227336124,
      "grad_norm": 0.6062023199694873,
      "learning_rate": 5e-06,
      "loss": 0.6861,
      "step": 330
    },
    {
      "epoch": 1.2645281264528125,
      "grad_norm": 0.6559502974519889,
      "learning_rate": 5e-06,
      "loss": 0.6848,
      "step": 340
    },
    {
      "epoch": 1.301720130172013,
      "grad_norm": 0.6672718286848371,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 350
    },
    {
      "epoch": 1.3389121338912133,
      "grad_norm": 0.6968811181243937,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 360
    },
    {
      "epoch": 1.3761041376104137,
      "grad_norm": 0.713126345107699,
      "learning_rate": 5e-06,
      "loss": 0.6921,
      "step": 370
    },
    {
      "epoch": 1.4132961413296141,
      "grad_norm": 0.555101582608444,
      "learning_rate": 5e-06,
      "loss": 0.685,
      "step": 380
    },
    {
      "epoch": 1.4504881450488145,
      "grad_norm": 0.8155799633973349,
      "learning_rate": 5e-06,
      "loss": 0.688,
      "step": 390
    },
    {
      "epoch": 1.487680148768015,
      "grad_norm": 0.7809410682542598,
      "learning_rate": 5e-06,
      "loss": 0.6848,
      "step": 400
    },
    {
      "epoch": 1.524872152487215,
      "grad_norm": 0.859503762107944,
      "learning_rate": 5e-06,
      "loss": 0.6842,
      "step": 410
    },
    {
      "epoch": 1.5620641562064157,
      "grad_norm": 0.7216593694701429,
      "learning_rate": 5e-06,
      "loss": 0.6846,
      "step": 420
    },
    {
      "epoch": 1.5992561599256159,
      "grad_norm": 0.7100718746210283,
      "learning_rate": 5e-06,
      "loss": 0.6892,
      "step": 430
    },
    {
      "epoch": 1.6364481636448165,
      "grad_norm": 0.6713429745171309,
      "learning_rate": 5e-06,
      "loss": 0.693,
      "step": 440
    },
    {
      "epoch": 1.6736401673640167,
      "grad_norm": 0.8718788500797711,
      "learning_rate": 5e-06,
      "loss": 0.6814,
      "step": 450
    },
    {
      "epoch": 1.710832171083217,
      "grad_norm": 0.8350277998981831,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 460
    },
    {
      "epoch": 1.7480241748024175,
      "grad_norm": 0.6507252875638763,
      "learning_rate": 5e-06,
      "loss": 0.6888,
      "step": 470
    },
    {
      "epoch": 1.7852161785216178,
      "grad_norm": 0.6271601112082867,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 480
    },
    {
      "epoch": 1.8224081822408182,
      "grad_norm": 0.616595241070034,
      "learning_rate": 5e-06,
      "loss": 0.6849,
      "step": 490
    },
    {
      "epoch": 1.8596001859600186,
      "grad_norm": 0.6956379811592589,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 500
    },
    {
      "epoch": 1.896792189679219,
      "grad_norm": 0.6804741810216053,
      "learning_rate": 5e-06,
      "loss": 0.6808,
      "step": 510
    },
    {
      "epoch": 1.9339841933984192,
      "grad_norm": 0.6471606933029862,
      "learning_rate": 5e-06,
      "loss": 0.6884,
      "step": 520
    },
    {
      "epoch": 1.9711761971176198,
      "grad_norm": 0.6546763243128592,
      "learning_rate": 5e-06,
      "loss": 0.6844,
      "step": 530
    },
    {
      "epoch": 1.9972105997210599,
      "eval_loss": 0.7266745567321777,
      "eval_runtime": 286.325,
      "eval_samples_per_second": 25.296,
      "eval_steps_per_second": 0.398,
      "step": 537
    },
    {
      "epoch": 2.00836820083682,
      "grad_norm": 1.0262138186877314,
      "learning_rate": 5e-06,
      "loss": 0.6783,
      "step": 540
    },
    {
      "epoch": 2.0455602045560206,
      "grad_norm": 0.8739740031885926,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 550
    },
    {
      "epoch": 2.082752208275221,
      "grad_norm": 0.7117432291580535,
      "learning_rate": 5e-06,
      "loss": 0.6296,
      "step": 560
    },
    {
      "epoch": 2.1199442119944214,
      "grad_norm": 0.5899527267147682,
      "learning_rate": 5e-06,
      "loss": 0.6285,
      "step": 570
    },
    {
      "epoch": 2.1571362157136216,
      "grad_norm": 0.6690509984383408,
      "learning_rate": 5e-06,
      "loss": 0.6297,
      "step": 580
    },
    {
      "epoch": 2.1943282194328217,
      "grad_norm": 0.9042165551862072,
      "learning_rate": 5e-06,
      "loss": 0.6327,
      "step": 590
    },
    {
      "epoch": 2.2315202231520224,
      "grad_norm": 0.7099236499244134,
      "learning_rate": 5e-06,
      "loss": 0.6313,
      "step": 600
    },
    {
      "epoch": 2.2687122268712225,
      "grad_norm": 0.7526574548917114,
      "learning_rate": 5e-06,
      "loss": 0.6343,
      "step": 610
    },
    {
      "epoch": 2.305904230590423,
      "grad_norm": 0.5914475009740064,
      "learning_rate": 5e-06,
      "loss": 0.6304,
      "step": 620
    },
    {
      "epoch": 2.3430962343096233,
      "grad_norm": 0.6210916718103259,
      "learning_rate": 5e-06,
      "loss": 0.6266,
      "step": 630
    },
    {
      "epoch": 2.380288238028824,
      "grad_norm": 0.9617962871174769,
      "learning_rate": 5e-06,
      "loss": 0.6334,
      "step": 640
    },
    {
      "epoch": 2.417480241748024,
      "grad_norm": 0.68177525923663,
      "learning_rate": 5e-06,
      "loss": 0.6374,
      "step": 650
    },
    {
      "epoch": 2.4546722454672247,
      "grad_norm": 0.7346141491432675,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 660
    },
    {
      "epoch": 2.491864249186425,
      "grad_norm": 0.765713000031021,
      "learning_rate": 5e-06,
      "loss": 0.632,
      "step": 670
    },
    {
      "epoch": 2.529056252905625,
      "grad_norm": 0.9254737796398699,
      "learning_rate": 5e-06,
      "loss": 0.6355,
      "step": 680
    },
    {
      "epoch": 2.5662482566248257,
      "grad_norm": 0.5791067801191988,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 690
    },
    {
      "epoch": 2.603440260344026,
      "grad_norm": 0.6114487645448582,
      "learning_rate": 5e-06,
      "loss": 0.6338,
      "step": 700
    },
    {
      "epoch": 2.6406322640632265,
      "grad_norm": 0.6334477556910006,
      "learning_rate": 5e-06,
      "loss": 0.6298,
      "step": 710
    },
    {
      "epoch": 2.6778242677824267,
      "grad_norm": 0.6414121652994028,
      "learning_rate": 5e-06,
      "loss": 0.6388,
      "step": 720
    },
    {
      "epoch": 2.7150162715016273,
      "grad_norm": 0.6324796752766957,
      "learning_rate": 5e-06,
      "loss": 0.6365,
      "step": 730
    },
    {
      "epoch": 2.7522082752208274,
      "grad_norm": 0.7190932899296674,
      "learning_rate": 5e-06,
      "loss": 0.6418,
      "step": 740
    },
    {
      "epoch": 2.789400278940028,
      "grad_norm": 0.5758231899781839,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 750
    },
    {
      "epoch": 2.8265922826592282,
      "grad_norm": 0.6399727518476824,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 760
    },
    {
      "epoch": 2.8637842863784284,
      "grad_norm": 0.6927434726600722,
      "learning_rate": 5e-06,
      "loss": 0.6371,
      "step": 770
    },
    {
      "epoch": 2.900976290097629,
      "grad_norm": 0.626988233626989,
      "learning_rate": 5e-06,
      "loss": 0.6404,
      "step": 780
    },
    {
      "epoch": 2.9381682938168296,
      "grad_norm": 0.5823110300301132,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 790
    },
    {
      "epoch": 2.97536029753603,
      "grad_norm": 0.6355859003574489,
      "learning_rate": 5e-06,
      "loss": 0.6382,
      "step": 800
    },
    {
      "epoch": 2.9902370990237097,
      "eval_loss": 0.7294604778289795,
      "eval_runtime": 287.7675,
      "eval_samples_per_second": 25.17,
      "eval_steps_per_second": 0.396,
      "step": 804
    },
    {
      "epoch": 2.9902370990237097,
      "step": 804,
      "total_flos": 1346520565678080.0,
      "train_loss": 0.7033177981922283,
      "train_runtime": 47430.6382,
      "train_samples_per_second": 8.704,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 804,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1346520565678080.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}