| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9988716502115658, | |
| "eval_steps": 500, | |
| "global_step": 1772, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.022566995768688293, | |
| "grad_norm": 1.1533478871584986, | |
| "learning_rate": 5e-06, | |
| "loss": 0.744, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.045133991537376586, | |
| "grad_norm": 0.964708516920586, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6804, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06770098730606489, | |
| "grad_norm": 0.7414689176853874, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6629, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09026798307475317, | |
| "grad_norm": 0.6402562297823468, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6403, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.11283497884344147, | |
| "grad_norm": 0.6438158764026781, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6334, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13540197461212977, | |
| "grad_norm": 0.6086347979873902, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6331, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.15796897038081806, | |
| "grad_norm": 0.637737303544012, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6285, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.18053596614950634, | |
| "grad_norm": 0.6180779913246882, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6309, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.20310296191819463, | |
| "grad_norm": 0.7628906889304371, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6285, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22566995768688294, | |
| "grad_norm": 0.6560412532230508, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6273, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.24823695345557123, | |
| "grad_norm": 0.5971484737001814, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6183, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.27080394922425954, | |
| "grad_norm": 0.62454415866273, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6169, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2933709449929478, | |
| "grad_norm": 0.9358718838187522, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6184, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3159379407616361, | |
| "grad_norm": 0.6707162435124239, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6134, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3385049365303244, | |
| "grad_norm": 0.607779817667474, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6135, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3610719322990127, | |
| "grad_norm": 0.5866694919013473, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6163, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.383638928067701, | |
| "grad_norm": 0.6395663339457631, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6124, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.40620592383638926, | |
| "grad_norm": 0.5802311797909204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6086, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4287729196050776, | |
| "grad_norm": 0.5655529914842923, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6034, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4513399153737659, | |
| "grad_norm": 0.5978170679918469, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6094, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.47390691114245415, | |
| "grad_norm": 0.5598729087663762, | |
| "learning_rate": 5e-06, | |
| "loss": 0.606, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.49647390691114246, | |
| "grad_norm": 0.6135515117029436, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5985, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5190409026798307, | |
| "grad_norm": 0.5982246973373736, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6074, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5416078984485191, | |
| "grad_norm": 0.634679350959857, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6056, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5641748942172073, | |
| "grad_norm": 0.5848371303406198, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5998, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5867418899858956, | |
| "grad_norm": 0.7237357300883238, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6029, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.609308885754584, | |
| "grad_norm": 0.5758396058045636, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5982, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6318758815232722, | |
| "grad_norm": 0.551647922547292, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6043, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6544428772919605, | |
| "grad_norm": 0.6070055411621582, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5979, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6770098730606487, | |
| "grad_norm": 0.6071346856787038, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6042, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6995768688293371, | |
| "grad_norm": 0.5694826541205662, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5972, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7221438645980254, | |
| "grad_norm": 0.5219595834784061, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6006, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7447108603667136, | |
| "grad_norm": 0.5425184695486164, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5962, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.767277856135402, | |
| "grad_norm": 0.5849183234601607, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5956, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7898448519040903, | |
| "grad_norm": 0.6577639097966262, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5968, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8124118476727785, | |
| "grad_norm": 0.6147316484890941, | |
| "learning_rate": 5e-06, | |
| "loss": 0.596, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8349788434414669, | |
| "grad_norm": 0.5747506292979353, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6027, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8575458392101551, | |
| "grad_norm": 0.5687682085546433, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5999, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8801128349788434, | |
| "grad_norm": 0.616324958523922, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5941, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9026798307475318, | |
| "grad_norm": 0.5900329665323057, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5962, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.92524682651622, | |
| "grad_norm": 0.5841528129448796, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5939, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9478138222849083, | |
| "grad_norm": 0.586906708055177, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5944, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9703808180535967, | |
| "grad_norm": 0.5824744589920343, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5887, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9929478138222849, | |
| "grad_norm": 0.7143042613195987, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5935, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9997179125528914, | |
| "eval_loss": 0.5913009643554688, | |
| "eval_runtime": 359.8373, | |
| "eval_samples_per_second": 33.184, | |
| "eval_steps_per_second": 0.52, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.0155148095909732, | |
| "grad_norm": 0.7349947461827919, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6021, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.0380818053596614, | |
| "grad_norm": 0.5967862073767953, | |
| "learning_rate": 5e-06, | |
| "loss": 0.528, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.0606488011283497, | |
| "grad_norm": 0.5908248473188383, | |
| "learning_rate": 5e-06, | |
| "loss": 0.543, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0832157968970382, | |
| "grad_norm": 0.5783595460572618, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5306, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.1057827926657264, | |
| "grad_norm": 0.6027833555237775, | |
| "learning_rate": 5e-06, | |
| "loss": 0.541, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.1283497884344147, | |
| "grad_norm": 0.5779961111728702, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5393, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.150916784203103, | |
| "grad_norm": 0.609691227257698, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5409, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.1734837799717912, | |
| "grad_norm": 0.6288262969758429, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5336, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.1960507757404795, | |
| "grad_norm": 0.6441493312512533, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5393, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.2186177715091677, | |
| "grad_norm": 0.5801582242360297, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5371, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.2411847672778562, | |
| "grad_norm": 0.6407989788015263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5422, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.2637517630465445, | |
| "grad_norm": 0.6145932063398675, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5438, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.2863187588152327, | |
| "grad_norm": 0.5894609854660963, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5415, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.308885754583921, | |
| "grad_norm": 0.5712716247350705, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5409, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.3314527503526092, | |
| "grad_norm": 0.5545979807457848, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5409, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.3540197461212977, | |
| "grad_norm": 0.6116638284685687, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5463, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.376586741889986, | |
| "grad_norm": 0.603741984926436, | |
| "learning_rate": 5e-06, | |
| "loss": 0.539, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.3991537376586742, | |
| "grad_norm": 0.5576981790641173, | |
| "learning_rate": 5e-06, | |
| "loss": 0.543, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.4217207334273625, | |
| "grad_norm": 0.5585863324687254, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5411, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.4442877291960508, | |
| "grad_norm": 0.6685966084888875, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5407, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.466854724964739, | |
| "grad_norm": 0.563715043671093, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5418, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.4894217207334273, | |
| "grad_norm": 0.664496256459769, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5456, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.5119887165021155, | |
| "grad_norm": 0.5712447353649974, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5405, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.5345557122708038, | |
| "grad_norm": 0.5486833946787922, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5345, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.5571227080394923, | |
| "grad_norm": 0.6092638196410558, | |
| "learning_rate": 5e-06, | |
| "loss": 0.537, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.5796897038081805, | |
| "grad_norm": 0.5716085259389777, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5407, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6022566995768688, | |
| "grad_norm": 0.5372993001292306, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5437, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.6248236953455573, | |
| "grad_norm": 0.5950554022209111, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5516, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.6473906911142455, | |
| "grad_norm": 0.5644998290854271, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5347, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.6699576868829338, | |
| "grad_norm": 0.7194589295267496, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5506, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.692524682651622, | |
| "grad_norm": 0.654147586643372, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5424, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.7150916784203103, | |
| "grad_norm": 0.5858691889312596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5412, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.7376586741889986, | |
| "grad_norm": 0.6698089700161991, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5414, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.7602256699576868, | |
| "grad_norm": 0.5539764143704193, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5372, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.782792665726375, | |
| "grad_norm": 0.5902260571431798, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5411, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.8053596614950633, | |
| "grad_norm": 0.6653827196776052, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5364, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.8279266572637518, | |
| "grad_norm": 0.5629278214932536, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5471, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.85049365303244, | |
| "grad_norm": 0.716393148987963, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5343, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.8730606488011283, | |
| "grad_norm": 0.5869857580731778, | |
| "learning_rate": 5e-06, | |
| "loss": 0.546, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.8956276445698168, | |
| "grad_norm": 0.5932511634770244, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5461, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.918194640338505, | |
| "grad_norm": 0.5548454392387719, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5342, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.9407616361071933, | |
| "grad_norm": 0.6292606426061244, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5391, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.9633286318758816, | |
| "grad_norm": 0.6964512767642522, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5457, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.9858956276445698, | |
| "grad_norm": 0.597052250032102, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5435, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.9994358251057829, | |
| "eval_loss": 0.5871431231498718, | |
| "eval_runtime": 359.7238, | |
| "eval_samples_per_second": 33.195, | |
| "eval_steps_per_second": 0.52, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 2.008462623413258, | |
| "grad_norm": 1.0307272915735466, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5672, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.0310296191819464, | |
| "grad_norm": 0.7570634539626596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4779, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.0535966149506346, | |
| "grad_norm": 0.7952962689694523, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4796, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.076163610719323, | |
| "grad_norm": 0.6214258578581712, | |
| "learning_rate": 5e-06, | |
| "loss": 0.472, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.098730606488011, | |
| "grad_norm": 0.6061682325391837, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4792, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.1212976022566994, | |
| "grad_norm": 0.5893935434044923, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4838, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.143864598025388, | |
| "grad_norm": 0.5983033582412839, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4855, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.1664315937940763, | |
| "grad_norm": 0.6061301651629116, | |
| "learning_rate": 5e-06, | |
| "loss": 0.482, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.1889985895627646, | |
| "grad_norm": 0.7208585763313551, | |
| "learning_rate": 5e-06, | |
| "loss": 0.48, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.211565585331453, | |
| "grad_norm": 0.6130737469375042, | |
| "learning_rate": 5e-06, | |
| "loss": 0.475, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.234132581100141, | |
| "grad_norm": 0.5687127566641503, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4748, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.2566995768688294, | |
| "grad_norm": 0.7034322954282024, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4839, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.2792665726375176, | |
| "grad_norm": 0.5935224377076785, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4907, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.301833568406206, | |
| "grad_norm": 0.7990706799966594, | |
| "learning_rate": 5e-06, | |
| "loss": 0.485, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.324400564174894, | |
| "grad_norm": 0.6024785296050031, | |
| "learning_rate": 5e-06, | |
| "loss": 0.489, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.3469675599435824, | |
| "grad_norm": 0.6533399713336523, | |
| "learning_rate": 5e-06, | |
| "loss": 0.487, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.3695345557122707, | |
| "grad_norm": 0.6341103214680793, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4849, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.392101551480959, | |
| "grad_norm": 0.6145632264645421, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4844, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.414668547249647, | |
| "grad_norm": 0.7153893920420706, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4848, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.4372355430183354, | |
| "grad_norm": 0.7079536947244722, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4883, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.459802538787024, | |
| "grad_norm": 0.5750394845236195, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4863, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.4823695345557124, | |
| "grad_norm": 0.5940313086667759, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4896, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.5049365303244007, | |
| "grad_norm": 0.5915949703706999, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4922, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.527503526093089, | |
| "grad_norm": 0.6599431703449483, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4838, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.550070521861777, | |
| "grad_norm": 0.5987887803366096, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4883, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.5726375176304654, | |
| "grad_norm": 0.79487550387113, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4871, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.5952045133991537, | |
| "grad_norm": 0.712422302635071, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4877, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.617771509167842, | |
| "grad_norm": 0.6372686005869476, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4915, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.64033850493653, | |
| "grad_norm": 0.6258750499046419, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4809, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.6629055007052185, | |
| "grad_norm": 0.6254081356701218, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4913, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.685472496473907, | |
| "grad_norm": 0.5772656389234755, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4919, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.7080394922425954, | |
| "grad_norm": 0.5958356234239867, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4901, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.7306064880112837, | |
| "grad_norm": 0.5845924683927192, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4913, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.753173483779972, | |
| "grad_norm": 0.5627157674386412, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4884, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.77574047954866, | |
| "grad_norm": 0.5821830745956523, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4945, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.7983074753173485, | |
| "grad_norm": 0.6229685304395023, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4934, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.8208744710860367, | |
| "grad_norm": 0.5845914499963646, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4933, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.843441466854725, | |
| "grad_norm": 0.5574457219985247, | |
| "learning_rate": 5e-06, | |
| "loss": 0.495, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.8660084626234132, | |
| "grad_norm": 0.5996150224624698, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4919, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.8885754583921015, | |
| "grad_norm": 0.5752771355797817, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4901, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.9111424541607898, | |
| "grad_norm": 0.6240736677369982, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4938, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.933709449929478, | |
| "grad_norm": 0.614687551114313, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4889, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.9562764456981663, | |
| "grad_norm": 0.6546354143056629, | |
| "learning_rate": 5e-06, | |
| "loss": 0.493, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.9788434414668545, | |
| "grad_norm": 0.5981369023208123, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4899, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.9991537376586743, | |
| "eval_loss": 0.6006776094436646, | |
| "eval_runtime": 359.5275, | |
| "eval_samples_per_second": 33.213, | |
| "eval_steps_per_second": 0.52, | |
| "step": 1329 | |
| }, | |
| { | |
| "epoch": 3.0014104372355432, | |
| "grad_norm": 1.7818284423885802, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5325, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 3.0239774330042315, | |
| "grad_norm": 0.7669244086926804, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4154, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 3.0465444287729198, | |
| "grad_norm": 0.6707220051250778, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4178, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.069111424541608, | |
| "grad_norm": 0.6712211926030488, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4179, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 3.0916784203102963, | |
| "grad_norm": 0.6852453014581671, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4199, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 3.1142454160789845, | |
| "grad_norm": 0.6952511595354455, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4172, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 3.136812411847673, | |
| "grad_norm": 0.6739341752053783, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4196, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 3.159379407616361, | |
| "grad_norm": 0.6810240432279709, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4217, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.1819464033850493, | |
| "grad_norm": 0.6451044056426664, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4228, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 3.2045133991537376, | |
| "grad_norm": 0.7365176770943456, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4264, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 3.227080394922426, | |
| "grad_norm": 0.6889861565739961, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4236, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 3.249647390691114, | |
| "grad_norm": 0.6619155809062343, | |
| "learning_rate": 5e-06, | |
| "loss": 0.429, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 3.272214386459803, | |
| "grad_norm": 0.6938859033729257, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4315, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.294781382228491, | |
| "grad_norm": 0.6269637473408832, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4311, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 3.3173483779971793, | |
| "grad_norm": 0.6871689545622378, | |
| "learning_rate": 5e-06, | |
| "loss": 0.429, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 3.3399153737658676, | |
| "grad_norm": 0.6431121820988837, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4268, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.362482369534556, | |
| "grad_norm": 0.6227148520808465, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4291, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.385049365303244, | |
| "grad_norm": 0.6926627432900843, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4369, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.4076163610719323, | |
| "grad_norm": 0.6753898611704962, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4289, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.4301833568406206, | |
| "grad_norm": 0.6235946356953773, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4285, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.452750352609309, | |
| "grad_norm": 0.6452950491570345, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4339, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.475317348377997, | |
| "grad_norm": 0.6466567653039634, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4298, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.4978843441466854, | |
| "grad_norm": 0.7157977999949227, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4367, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.5204513399153736, | |
| "grad_norm": 0.746147622100308, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4333, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.543018335684062, | |
| "grad_norm": 0.63252883620392, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4381, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.56558533145275, | |
| "grad_norm": 0.645023562996112, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4287, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.5881523272214384, | |
| "grad_norm": 0.6438099493606649, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4288, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.610719322990127, | |
| "grad_norm": 0.6919344965122289, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4322, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.6332863187588154, | |
| "grad_norm": 0.6663827038922323, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4353, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.6558533145275036, | |
| "grad_norm": 0.6633293647142574, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4321, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.678420310296192, | |
| "grad_norm": 0.6568725046800021, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4352, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.70098730606488, | |
| "grad_norm": 0.673099496711102, | |
| "learning_rate": 5e-06, | |
| "loss": 0.436, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.7235543018335684, | |
| "grad_norm": 0.6438662463205753, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4321, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.7461212976022567, | |
| "grad_norm": 0.6523537779817469, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4308, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.768688293370945, | |
| "grad_norm": 0.6785421362215835, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4363, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.791255289139633, | |
| "grad_norm": 0.6620925087943027, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4421, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.8138222849083214, | |
| "grad_norm": 0.6148218995999806, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4332, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.83638928067701, | |
| "grad_norm": 0.7122084725034734, | |
| "learning_rate": 5e-06, | |
| "loss": 0.431, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.8589562764456984, | |
| "grad_norm": 0.6454980786635541, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4398, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.8815232722143866, | |
| "grad_norm": 0.6274368423659055, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4387, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.904090267983075, | |
| "grad_norm": 0.6324679361432074, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4365, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.926657263751763, | |
| "grad_norm": 0.6304335216014254, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4373, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.9492242595204514, | |
| "grad_norm": 0.6460344821987493, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4366, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.9717912552891397, | |
| "grad_norm": 0.6439249065834609, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4362, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.994358251057828, | |
| "grad_norm": 0.6241101368641281, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4359, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.9988716502115658, | |
| "eval_loss": 0.6362190246582031, | |
| "eval_runtime": 359.9366, | |
| "eval_samples_per_second": 33.175, | |
| "eval_steps_per_second": 0.52, | |
| "step": 1772 | |
| }, | |
| { | |
| "epoch": 3.9988716502115658, | |
| "step": 1772, | |
| "total_flos": 6752101308825600.0, | |
| "train_loss": 0.5187085666788351, | |
| "train_runtime": 82722.7317, | |
| "train_samples_per_second": 10.97, | |
| "train_steps_per_second": 0.021 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1772, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6752101308825600.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |