| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.993049522154648, |
| "eval_steps": 500, |
| "global_step": 861, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03475238922675934, |
| "grad_norm": 4.204217083914466, |
| "learning_rate": 5e-06, |
| "loss": 1.0615, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06950477845351868, |
| "grad_norm": 1.422784268276639, |
| "learning_rate": 5e-06, |
| "loss": 0.9179, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10425716768027801, |
| "grad_norm": 1.7251757262516607, |
| "learning_rate": 5e-06, |
| "loss": 0.8716, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.13900955690703737, |
| "grad_norm": 1.1547403296199443, |
| "learning_rate": 5e-06, |
| "loss": 0.8538, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1737619461337967, |
| "grad_norm": 1.3147732137734534, |
| "learning_rate": 5e-06, |
| "loss": 0.8315, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.20851433536055602, |
| "grad_norm": 1.2212040088296596, |
| "learning_rate": 5e-06, |
| "loss": 0.8137, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.24326672458731538, |
| "grad_norm": 1.0105503239364757, |
| "learning_rate": 5e-06, |
| "loss": 0.8079, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.27801911381407474, |
| "grad_norm": 1.0559222477057983, |
| "learning_rate": 5e-06, |
| "loss": 0.794, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.31277150304083406, |
| "grad_norm": 0.7128592380359542, |
| "learning_rate": 5e-06, |
| "loss": 0.79, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3475238922675934, |
| "grad_norm": 0.7316521398436672, |
| "learning_rate": 5e-06, |
| "loss": 0.7808, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3822762814943527, |
| "grad_norm": 0.9194116933386736, |
| "learning_rate": 5e-06, |
| "loss": 0.7743, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.41702867072111205, |
| "grad_norm": 0.724708916285198, |
| "learning_rate": 5e-06, |
| "loss": 0.7727, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.45178105994787143, |
| "grad_norm": 0.6304484292299692, |
| "learning_rate": 5e-06, |
| "loss": 0.7708, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.48653344917463076, |
| "grad_norm": 0.8580390763526664, |
| "learning_rate": 5e-06, |
| "loss": 0.7663, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5212858384013901, |
| "grad_norm": 0.680533196209476, |
| "learning_rate": 5e-06, |
| "loss": 0.7666, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5560382276281495, |
| "grad_norm": 0.5754590021742806, |
| "learning_rate": 5e-06, |
| "loss": 0.7677, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5907906168549087, |
| "grad_norm": 0.6990861984775781, |
| "learning_rate": 5e-06, |
| "loss": 0.7586, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6255430060816681, |
| "grad_norm": 0.8182472984882369, |
| "learning_rate": 5e-06, |
| "loss": 0.7581, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6602953953084274, |
| "grad_norm": 0.8231305567479303, |
| "learning_rate": 5e-06, |
| "loss": 0.7578, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6950477845351868, |
| "grad_norm": 0.6513447002925578, |
| "learning_rate": 5e-06, |
| "loss": 0.755, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7298001737619462, |
| "grad_norm": 0.5789033103066399, |
| "learning_rate": 5e-06, |
| "loss": 0.756, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7645525629887054, |
| "grad_norm": 0.6090095361122515, |
| "learning_rate": 5e-06, |
| "loss": 0.7547, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7993049522154648, |
| "grad_norm": 0.5529700900735016, |
| "learning_rate": 5e-06, |
| "loss": 0.7511, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8340573414422241, |
| "grad_norm": 0.6891665664865682, |
| "learning_rate": 5e-06, |
| "loss": 0.7536, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8688097306689835, |
| "grad_norm": 0.8354216869977991, |
| "learning_rate": 5e-06, |
| "loss": 0.7495, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9035621198957429, |
| "grad_norm": 0.750227289167969, |
| "learning_rate": 5e-06, |
| "loss": 0.7473, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9383145091225021, |
| "grad_norm": 0.590848403292903, |
| "learning_rate": 5e-06, |
| "loss": 0.7475, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9730668983492615, |
| "grad_norm": 0.7308589738950354, |
| "learning_rate": 5e-06, |
| "loss": 0.7466, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9973935708079931, |
| "eval_loss": 0.7464137077331543, |
| "eval_runtime": 306.0354, |
| "eval_samples_per_second": 25.33, |
| "eval_steps_per_second": 0.399, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.0082536924413554, |
| "grad_norm": 0.8888839481114714, |
| "learning_rate": 5e-06, |
| "loss": 0.7656, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.0430060816681146, |
| "grad_norm": 0.7736290679704527, |
| "learning_rate": 5e-06, |
| "loss": 0.6927, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.077758470894874, |
| "grad_norm": 0.6086798023669154, |
| "learning_rate": 5e-06, |
| "loss": 0.6937, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1125108601216334, |
| "grad_norm": 0.7277120931979707, |
| "learning_rate": 5e-06, |
| "loss": 0.6956, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.1472632493483927, |
| "grad_norm": 0.6477654174773422, |
| "learning_rate": 5e-06, |
| "loss": 0.6945, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1820156385751521, |
| "grad_norm": 0.6746834402435494, |
| "learning_rate": 5e-06, |
| "loss": 0.6928, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.2167680278019113, |
| "grad_norm": 0.7909338588584797, |
| "learning_rate": 5e-06, |
| "loss": 0.6975, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2515204170286707, |
| "grad_norm": 0.6996994953219825, |
| "learning_rate": 5e-06, |
| "loss": 0.693, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.28627280625543, |
| "grad_norm": 0.7572207346579433, |
| "learning_rate": 5e-06, |
| "loss": 0.6919, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3210251954821894, |
| "grad_norm": 0.6315378764527892, |
| "learning_rate": 5e-06, |
| "loss": 0.7012, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.3557775847089486, |
| "grad_norm": 0.7377025919254079, |
| "learning_rate": 5e-06, |
| "loss": 0.6976, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.390529973935708, |
| "grad_norm": 0.6170787424269338, |
| "learning_rate": 5e-06, |
| "loss": 0.6923, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4252823631624674, |
| "grad_norm": 0.6720319783859334, |
| "learning_rate": 5e-06, |
| "loss": 0.6957, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4600347523892268, |
| "grad_norm": 0.5493772411957378, |
| "learning_rate": 5e-06, |
| "loss": 0.6945, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4947871416159861, |
| "grad_norm": 0.6013975213270655, |
| "learning_rate": 5e-06, |
| "loss": 0.6929, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.5295395308427455, |
| "grad_norm": 0.6939858737269444, |
| "learning_rate": 5e-06, |
| "loss": 0.6926, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.564291920069505, |
| "grad_norm": 0.5880706906541838, |
| "learning_rate": 5e-06, |
| "loss": 0.6922, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.599044309296264, |
| "grad_norm": 0.7548284182746593, |
| "learning_rate": 5e-06, |
| "loss": 0.6902, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.6337966985230234, |
| "grad_norm": 0.6183278356571063, |
| "learning_rate": 5e-06, |
| "loss": 0.6925, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6685490877497828, |
| "grad_norm": 0.9398095529537541, |
| "learning_rate": 5e-06, |
| "loss": 0.6857, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.703301476976542, |
| "grad_norm": 0.6367587092612689, |
| "learning_rate": 5e-06, |
| "loss": 0.689, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.7380538662033014, |
| "grad_norm": 0.7639239518306171, |
| "learning_rate": 5e-06, |
| "loss": 0.6939, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7728062554300608, |
| "grad_norm": 0.706056725045481, |
| "learning_rate": 5e-06, |
| "loss": 0.6916, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.8075586446568201, |
| "grad_norm": 0.7327907255249769, |
| "learning_rate": 5e-06, |
| "loss": 0.6913, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.8423110338835795, |
| "grad_norm": 0.7090737638783119, |
| "learning_rate": 5e-06, |
| "loss": 0.687, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.877063423110339, |
| "grad_norm": 0.641981559236637, |
| "learning_rate": 5e-06, |
| "loss": 0.6871, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.9118158123370983, |
| "grad_norm": 0.6003290685340094, |
| "learning_rate": 5e-06, |
| "loss": 0.6885, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.9465682015638577, |
| "grad_norm": 0.5991204401446062, |
| "learning_rate": 5e-06, |
| "loss": 0.6885, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9813205907906168, |
| "grad_norm": 0.6603357286091912, |
| "learning_rate": 5e-06, |
| "loss": 0.693, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.9986967854039965, |
| "eval_loss": 0.7325075268745422, |
| "eval_runtime": 306.5804, |
| "eval_samples_per_second": 25.285, |
| "eval_steps_per_second": 0.398, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.016507384882711, |
| "grad_norm": 0.9010986091915655, |
| "learning_rate": 5e-06, |
| "loss": 0.694, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.05125977410947, |
| "grad_norm": 0.6963098532911728, |
| "learning_rate": 5e-06, |
| "loss": 0.6416, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.086012163336229, |
| "grad_norm": 0.7981128982665585, |
| "learning_rate": 5e-06, |
| "loss": 0.6347, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.1207645525629886, |
| "grad_norm": 0.7045115982630096, |
| "learning_rate": 5e-06, |
| "loss": 0.6382, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.155516941789748, |
| "grad_norm": 0.6178456961772154, |
| "learning_rate": 5e-06, |
| "loss": 0.6316, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.1902693310165073, |
| "grad_norm": 0.6984364299074612, |
| "learning_rate": 5e-06, |
| "loss": 0.64, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.2250217202432667, |
| "grad_norm": 0.6153473997901224, |
| "learning_rate": 5e-06, |
| "loss": 0.6352, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.259774109470026, |
| "grad_norm": 0.7551289583137311, |
| "learning_rate": 5e-06, |
| "loss": 0.6415, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.2945264986967855, |
| "grad_norm": 1.0727937828194958, |
| "learning_rate": 5e-06, |
| "loss": 0.6409, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.329278887923545, |
| "grad_norm": 0.6961000957869337, |
| "learning_rate": 5e-06, |
| "loss": 0.641, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.3640312771503043, |
| "grad_norm": 1.5380586860018097, |
| "learning_rate": 5e-06, |
| "loss": 0.6343, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.3987836663770636, |
| "grad_norm": 1.473915330175611, |
| "learning_rate": 5e-06, |
| "loss": 0.6424, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.4335360556038226, |
| "grad_norm": 1.3707384922893442, |
| "learning_rate": 5e-06, |
| "loss": 0.6402, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.468288444830582, |
| "grad_norm": 1.2856734997208332, |
| "learning_rate": 5e-06, |
| "loss": 0.6405, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.5030408340573413, |
| "grad_norm": 1.0655758845498153, |
| "learning_rate": 5e-06, |
| "loss": 0.639, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.5377932232841007, |
| "grad_norm": 0.7063563183382034, |
| "learning_rate": 5e-06, |
| "loss": 0.6451, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.57254561251086, |
| "grad_norm": 1.0030690870134182, |
| "learning_rate": 5e-06, |
| "loss": 0.6451, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.6072980017376195, |
| "grad_norm": 0.642378453539176, |
| "learning_rate": 5e-06, |
| "loss": 0.6452, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.642050390964379, |
| "grad_norm": 0.5960942912485575, |
| "learning_rate": 5e-06, |
| "loss": 0.6435, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.6768027801911383, |
| "grad_norm": 0.6526728525085794, |
| "learning_rate": 5e-06, |
| "loss": 0.6393, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.711555169417897, |
| "grad_norm": 0.7230688644533029, |
| "learning_rate": 5e-06, |
| "loss": 0.6416, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.7463075586446566, |
| "grad_norm": 0.7601768679175701, |
| "learning_rate": 5e-06, |
| "loss": 0.6413, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.781059947871416, |
| "grad_norm": 0.9110327754774364, |
| "learning_rate": 5e-06, |
| "loss": 0.6441, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.8158123370981754, |
| "grad_norm": 0.8925826699350575, |
| "learning_rate": 5e-06, |
| "loss": 0.6432, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.8505647263249347, |
| "grad_norm": 0.6237089071073183, |
| "learning_rate": 5e-06, |
| "loss": 0.644, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.885317115551694, |
| "grad_norm": 0.6110859257293438, |
| "learning_rate": 5e-06, |
| "loss": 0.6399, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.9200695047784535, |
| "grad_norm": 0.6421968773513084, |
| "learning_rate": 5e-06, |
| "loss": 0.6454, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.954821894005213, |
| "grad_norm": 0.5712412822442149, |
| "learning_rate": 5e-06, |
| "loss": 0.6416, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.9895742832319723, |
| "grad_norm": 0.5672582221675043, |
| "learning_rate": 5e-06, |
| "loss": 0.644, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.993049522154648, |
| "eval_loss": 0.7338809370994568, |
| "eval_runtime": 308.0146, |
| "eval_samples_per_second": 25.168, |
| "eval_steps_per_second": 0.396, |
| "step": 861 |
| }, |
| { |
| "epoch": 2.993049522154648, |
| "step": 861, |
| "total_flos": 1441997688668160.0, |
| "train_loss": 0.7089061699677843, |
| "train_runtime": 50775.3746, |
| "train_samples_per_second": 8.702, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 861, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1441997688668160.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |