{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9946777054997042,
  "eval_steps": 500,
  "global_step": 1266,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02365464222353637,
      "grad_norm": 2.64785060100586,
      "learning_rate": 5e-06,
      "loss": 0.8881,
      "step": 10
    },
    {
      "epoch": 0.04730928444707274,
      "grad_norm": 4.492435661685993,
      "learning_rate": 5e-06,
      "loss": 0.7888,
      "step": 20
    },
    {
      "epoch": 0.0709639266706091,
      "grad_norm": 1.1116541886300364,
      "learning_rate": 5e-06,
      "loss": 0.7578,
      "step": 30
    },
    {
      "epoch": 0.09461856889414548,
      "grad_norm": 1.218928037085855,
      "learning_rate": 5e-06,
      "loss": 0.7402,
      "step": 40
    },
    {
      "epoch": 0.11827321111768184,
      "grad_norm": 1.020418802489519,
      "learning_rate": 5e-06,
      "loss": 0.7286,
      "step": 50
    },
    {
      "epoch": 0.1419278533412182,
      "grad_norm": 0.962265802957487,
      "learning_rate": 5e-06,
      "loss": 0.7167,
      "step": 60
    },
    {
      "epoch": 0.16558249556475457,
      "grad_norm": 0.7842406295728257,
      "learning_rate": 5e-06,
      "loss": 0.7122,
      "step": 70
    },
    {
      "epoch": 0.18923713778829096,
      "grad_norm": 0.8009401339234955,
      "learning_rate": 5e-06,
      "loss": 0.7019,
      "step": 80
    },
    {
      "epoch": 0.21289178001182732,
      "grad_norm": 0.7411954553666439,
      "learning_rate": 5e-06,
      "loss": 0.6865,
      "step": 90
    },
    {
      "epoch": 0.23654642223536368,
      "grad_norm": 0.6604551640687066,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 100
    },
    {
      "epoch": 0.26020106445890007,
      "grad_norm": 0.5272790934190837,
      "learning_rate": 5e-06,
      "loss": 0.6904,
      "step": 110
    },
    {
      "epoch": 0.2838557066824364,
      "grad_norm": 0.5812760583772804,
      "learning_rate": 5e-06,
      "loss": 0.6776,
      "step": 120
    },
    {
      "epoch": 0.3075103489059728,
      "grad_norm": 0.5288883948662314,
      "learning_rate": 5e-06,
      "loss": 0.6738,
      "step": 130
    },
    {
      "epoch": 0.33116499112950915,
      "grad_norm": 0.63694474679749,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 140
    },
    {
      "epoch": 0.35481963335304556,
      "grad_norm": 0.8869276352236491,
      "learning_rate": 5e-06,
      "loss": 0.6649,
      "step": 150
    },
    {
      "epoch": 0.3784742755765819,
      "grad_norm": 0.518289407793539,
      "learning_rate": 5e-06,
      "loss": 0.6696,
      "step": 160
    },
    {
      "epoch": 0.4021289178001183,
      "grad_norm": 0.5358648478939547,
      "learning_rate": 5e-06,
      "loss": 0.6671,
      "step": 170
    },
    {
      "epoch": 0.42578356002365464,
      "grad_norm": 0.797540982889909,
      "learning_rate": 5e-06,
      "loss": 0.6743,
      "step": 180
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 0.4908769384958429,
      "learning_rate": 5e-06,
      "loss": 0.6704,
      "step": 190
    },
    {
      "epoch": 0.47309284447072736,
      "grad_norm": 0.5763974723432601,
      "learning_rate": 5e-06,
      "loss": 0.6692,
      "step": 200
    },
    {
      "epoch": 0.4967474866942638,
      "grad_norm": 0.9377426988511532,
      "learning_rate": 5e-06,
      "loss": 0.6696,
      "step": 210
    },
    {
      "epoch": 0.5204021289178001,
      "grad_norm": 0.576249879822413,
      "learning_rate": 5e-06,
      "loss": 0.6688,
      "step": 220
    },
    {
      "epoch": 0.5440567711413364,
      "grad_norm": 0.5017180552678443,
      "learning_rate": 5e-06,
      "loss": 0.6605,
      "step": 230
    },
    {
      "epoch": 0.5677114133648729,
      "grad_norm": 0.46598378368467036,
      "learning_rate": 5e-06,
      "loss": 0.6649,
      "step": 240
    },
    {
      "epoch": 0.5913660555884093,
      "grad_norm": 0.44951386010612127,
      "learning_rate": 5e-06,
      "loss": 0.6618,
      "step": 250
    },
    {
      "epoch": 0.6150206978119456,
      "grad_norm": 0.6107309306047823,
      "learning_rate": 5e-06,
      "loss": 0.6525,
      "step": 260
    },
    {
      "epoch": 0.638675340035482,
      "grad_norm": 0.6063385890447376,
      "learning_rate": 5e-06,
      "loss": 0.664,
      "step": 270
    },
    {
      "epoch": 0.6623299822590183,
      "grad_norm": 0.62161938246522,
      "learning_rate": 5e-06,
      "loss": 0.6658,
      "step": 280
    },
    {
      "epoch": 0.6859846244825547,
      "grad_norm": 0.48803716140240055,
      "learning_rate": 5e-06,
      "loss": 0.6599,
      "step": 290
    },
    {
      "epoch": 0.7096392667060911,
      "grad_norm": 0.5643745069932464,
      "learning_rate": 5e-06,
      "loss": 0.6536,
      "step": 300
    },
    {
      "epoch": 0.7332939089296274,
      "grad_norm": 0.45536574485759423,
      "learning_rate": 5e-06,
      "loss": 0.662,
      "step": 310
    },
    {
      "epoch": 0.7569485511531638,
      "grad_norm": 0.5246164914927769,
      "learning_rate": 5e-06,
      "loss": 0.656,
      "step": 320
    },
    {
      "epoch": 0.7806031933767001,
      "grad_norm": 0.5653234939153071,
      "learning_rate": 5e-06,
      "loss": 0.6541,
      "step": 330
    },
    {
      "epoch": 0.8042578356002366,
      "grad_norm": 0.553750057338791,
      "learning_rate": 5e-06,
      "loss": 0.6539,
      "step": 340
    },
    {
      "epoch": 0.8279124778237729,
      "grad_norm": 0.5823597490849275,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 350
    },
    {
      "epoch": 0.8515671200473093,
      "grad_norm": 0.432948815077171,
      "learning_rate": 5e-06,
      "loss": 0.6537,
      "step": 360
    },
    {
      "epoch": 0.8752217622708457,
      "grad_norm": 0.44732587095055915,
      "learning_rate": 5e-06,
      "loss": 0.6571,
      "step": 370
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.45966183440739067,
      "learning_rate": 5e-06,
      "loss": 0.6478,
      "step": 380
    },
    {
      "epoch": 0.9225310467179184,
      "grad_norm": 0.6457761210048674,
      "learning_rate": 5e-06,
      "loss": 0.6552,
      "step": 390
    },
    {
      "epoch": 0.9461856889414547,
      "grad_norm": 0.4724156783274873,
      "learning_rate": 5e-06,
      "loss": 0.6467,
      "step": 400
    },
    {
      "epoch": 0.9698403311649911,
      "grad_norm": 0.4559392408256043,
      "learning_rate": 5e-06,
      "loss": 0.6492,
      "step": 410
    },
    {
      "epoch": 0.9934949733885275,
      "grad_norm": 0.5566904074437511,
      "learning_rate": 5e-06,
      "loss": 0.6483,
      "step": 420
    },
    {
      "epoch": 0.9982259018332348,
      "eval_loss": 0.6504195332527161,
      "eval_runtime": 226.3163,
      "eval_samples_per_second": 50.323,
      "eval_steps_per_second": 0.393,
      "step": 422
    },
    {
      "epoch": 1.0171496156120639,
      "grad_norm": 0.5100430406130203,
      "learning_rate": 5e-06,
      "loss": 0.6374,
      "step": 430
    },
    {
      "epoch": 1.0408042578356003,
      "grad_norm": 0.6182142994254941,
      "learning_rate": 5e-06,
      "loss": 0.6056,
      "step": 440
    },
    {
      "epoch": 1.0644589000591367,
      "grad_norm": 0.612159821355247,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 450
    },
    {
      "epoch": 1.0881135422826729,
      "grad_norm": 0.620568436985301,
      "learning_rate": 5e-06,
      "loss": 0.6125,
      "step": 460
    },
    {
      "epoch": 1.1117681845062093,
      "grad_norm": 0.48353189963783877,
      "learning_rate": 5e-06,
      "loss": 0.6114,
      "step": 470
    },
    {
      "epoch": 1.1354228267297457,
      "grad_norm": 0.553405903828158,
      "learning_rate": 5e-06,
      "loss": 0.6114,
      "step": 480
    },
    {
      "epoch": 1.1590774689532821,
      "grad_norm": 0.48890661916776557,
      "learning_rate": 5e-06,
      "loss": 0.6086,
      "step": 490
    },
    {
      "epoch": 1.1827321111768185,
      "grad_norm": 0.4626708227952605,
      "learning_rate": 5e-06,
      "loss": 0.6048,
      "step": 500
    },
    {
      "epoch": 1.2063867534003547,
      "grad_norm": 0.5562602420898135,
      "learning_rate": 5e-06,
      "loss": 0.6045,
      "step": 510
    },
    {
      "epoch": 1.2300413956238911,
      "grad_norm": 0.5583615267258394,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 520
    },
    {
      "epoch": 1.2536960378474276,
      "grad_norm": 0.5120824375752575,
      "learning_rate": 5e-06,
      "loss": 0.6116,
      "step": 530
    },
    {
      "epoch": 1.277350680070964,
      "grad_norm": 0.5477302994052297,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 540
    },
    {
      "epoch": 1.3010053222945004,
      "grad_norm": 0.6186813978530775,
      "learning_rate": 5e-06,
      "loss": 0.6102,
      "step": 550
    },
    {
      "epoch": 1.3246599645180366,
      "grad_norm": 0.4482998491087788,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 560
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 0.8072488882089637,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 570
    },
    {
      "epoch": 1.3719692489651094,
      "grad_norm": 0.48728798126789036,
      "learning_rate": 5e-06,
      "loss": 0.6118,
      "step": 580
    },
    {
      "epoch": 1.3956238911886458,
      "grad_norm": 0.4366149357515769,
      "learning_rate": 5e-06,
      "loss": 0.6123,
      "step": 590
    },
    {
      "epoch": 1.4192785334121822,
      "grad_norm": 0.5669347135329578,
      "learning_rate": 5e-06,
      "loss": 0.5995,
      "step": 600
    },
    {
      "epoch": 1.4429331756357184,
      "grad_norm": 0.43980097172154653,
      "learning_rate": 5e-06,
      "loss": 0.6012,
      "step": 610
    },
    {
      "epoch": 1.4665878178592548,
      "grad_norm": 0.4546181925945345,
      "learning_rate": 5e-06,
      "loss": 0.6039,
      "step": 620
    },
    {
      "epoch": 1.4902424600827913,
      "grad_norm": 0.4439868785027596,
      "learning_rate": 5e-06,
      "loss": 0.6145,
      "step": 630
    },
    {
      "epoch": 1.5138971023063275,
      "grad_norm": 0.4840991507057293,
      "learning_rate": 5e-06,
      "loss": 0.6128,
      "step": 640
    },
    {
      "epoch": 1.537551744529864,
      "grad_norm": 0.5028857797195146,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 650
    },
    {
      "epoch": 1.5612063867534003,
      "grad_norm": 0.6295444122174948,
      "learning_rate": 5e-06,
      "loss": 0.6081,
      "step": 660
    },
    {
      "epoch": 1.5848610289769367,
      "grad_norm": 0.5574140089154338,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 670
    },
    {
      "epoch": 1.6085156712004731,
      "grad_norm": 0.5278087358672146,
      "learning_rate": 5e-06,
      "loss": 0.609,
      "step": 680
    },
    {
      "epoch": 1.6321703134240093,
      "grad_norm": 0.5407251516890358,
      "learning_rate": 5e-06,
      "loss": 0.6023,
      "step": 690
    },
    {
      "epoch": 1.655824955647546,
      "grad_norm": 0.47669538747864815,
      "learning_rate": 5e-06,
      "loss": 0.6098,
      "step": 700
    },
    {
      "epoch": 1.6794795978710821,
      "grad_norm": 0.5117309364481941,
      "learning_rate": 5e-06,
      "loss": 0.6083,
      "step": 710
    },
    {
      "epoch": 1.7031342400946186,
      "grad_norm": 0.4622792038975577,
      "learning_rate": 5e-06,
      "loss": 0.6145,
      "step": 720
    },
    {
      "epoch": 1.726788882318155,
      "grad_norm": 0.476155429063029,
      "learning_rate": 5e-06,
      "loss": 0.6102,
      "step": 730
    },
    {
      "epoch": 1.7504435245416912,
      "grad_norm": 0.6072162079153387,
      "learning_rate": 5e-06,
      "loss": 0.6047,
      "step": 740
    },
    {
      "epoch": 1.7740981667652278,
      "grad_norm": 0.4580614369890106,
      "learning_rate": 5e-06,
      "loss": 0.6104,
      "step": 750
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 0.48163931069349186,
      "learning_rate": 5e-06,
      "loss": 0.6152,
      "step": 760
    },
    {
      "epoch": 1.8214074512123004,
      "grad_norm": 0.44236726486886196,
      "learning_rate": 5e-06,
      "loss": 0.6159,
      "step": 770
    },
    {
      "epoch": 1.8450620934358368,
      "grad_norm": 0.4140807715487973,
      "learning_rate": 5e-06,
      "loss": 0.6077,
      "step": 780
    },
    {
      "epoch": 1.868716735659373,
      "grad_norm": 0.49794149670638754,
      "learning_rate": 5e-06,
      "loss": 0.6064,
      "step": 790
    },
    {
      "epoch": 1.8923713778829097,
      "grad_norm": 0.48140566757242875,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 800
    },
    {
      "epoch": 1.9160260201064458,
      "grad_norm": 0.4407179231154673,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 810
    },
    {
      "epoch": 1.9396806623299823,
      "grad_norm": 0.41696082412030117,
      "learning_rate": 5e-06,
      "loss": 0.5994,
      "step": 820
    },
    {
      "epoch": 1.9633353045535187,
      "grad_norm": 0.47981289616269807,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 830
    },
    {
      "epoch": 1.9869899467770549,
      "grad_norm": 0.4484580880097136,
      "learning_rate": 5e-06,
      "loss": 0.5968,
      "step": 840
    },
    {
      "epoch": 1.9988172678888232,
      "eval_loss": 0.6400034427642822,
      "eval_runtime": 225.8288,
      "eval_samples_per_second": 50.432,
      "eval_steps_per_second": 0.394,
      "step": 845
    },
    {
      "epoch": 2.0106445890005915,
      "grad_norm": 0.6332180702757985,
      "learning_rate": 5e-06,
      "loss": 0.5995,
      "step": 850
    },
    {
      "epoch": 2.0342992312241277,
      "grad_norm": 0.5223111355986098,
      "learning_rate": 5e-06,
      "loss": 0.5669,
      "step": 860
    },
    {
      "epoch": 2.057953873447664,
      "grad_norm": 0.5560692373029835,
      "learning_rate": 5e-06,
      "loss": 0.5668,
      "step": 870
    },
    {
      "epoch": 2.0816085156712005,
      "grad_norm": 0.5874280616479215,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 880
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.5381432115880647,
      "learning_rate": 5e-06,
      "loss": 0.5615,
      "step": 890
    },
    {
      "epoch": 2.1289178001182734,
      "grad_norm": 0.5340607102638687,
      "learning_rate": 5e-06,
      "loss": 0.567,
      "step": 900
    },
    {
      "epoch": 2.1525724423418096,
      "grad_norm": 0.4709298987555601,
      "learning_rate": 5e-06,
      "loss": 0.5616,
      "step": 910
    },
    {
      "epoch": 2.1762270845653457,
      "grad_norm": 0.47531624539694045,
      "learning_rate": 5e-06,
      "loss": 0.5634,
      "step": 920
    },
    {
      "epoch": 2.1998817267888824,
      "grad_norm": 0.5690095865272987,
      "learning_rate": 5e-06,
      "loss": 0.5627,
      "step": 930
    },
    {
      "epoch": 2.2235363690124186,
      "grad_norm": 0.6002806476085656,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 940
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 0.5868800123289198,
      "learning_rate": 5e-06,
      "loss": 0.5702,
      "step": 950
    },
    {
      "epoch": 2.2708456534594914,
      "grad_norm": 0.545743902564753,
      "learning_rate": 5e-06,
      "loss": 0.564,
      "step": 960
    },
    {
      "epoch": 2.2945002956830276,
      "grad_norm": 0.5069047371924982,
      "learning_rate": 5e-06,
      "loss": 0.5657,
      "step": 970
    },
    {
      "epoch": 2.3181549379065642,
      "grad_norm": 0.5832343780575133,
      "learning_rate": 5e-06,
      "loss": 0.5696,
      "step": 980
    },
    {
      "epoch": 2.3418095801301004,
      "grad_norm": 0.49163574506732044,
      "learning_rate": 5e-06,
      "loss": 0.5673,
      "step": 990
    },
    {
      "epoch": 2.365464222353637,
      "grad_norm": 0.5769534565432972,
      "learning_rate": 5e-06,
      "loss": 0.5674,
      "step": 1000
    },
    {
      "epoch": 2.3891188645771733,
      "grad_norm": 0.4748456336186354,
      "learning_rate": 5e-06,
      "loss": 0.5656,
      "step": 1010
    },
    {
      "epoch": 2.4127735068007095,
      "grad_norm": 0.5571397878943326,
      "learning_rate": 5e-06,
      "loss": 0.5632,
      "step": 1020
    },
    {
      "epoch": 2.436428149024246,
      "grad_norm": 0.4977538685424081,
      "learning_rate": 5e-06,
      "loss": 0.5662,
      "step": 1030
    },
    {
      "epoch": 2.4600827912477823,
      "grad_norm": 0.6212689514507728,
      "learning_rate": 5e-06,
      "loss": 0.5669,
      "step": 1040
    },
    {
      "epoch": 2.483737433471319,
      "grad_norm": 0.5004435631831575,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 1050
    },
    {
      "epoch": 2.507392075694855,
      "grad_norm": 0.5290951851110267,
      "learning_rate": 5e-06,
      "loss": 0.5686,
      "step": 1060
    },
    {
      "epoch": 2.5310467179183913,
      "grad_norm": 0.48790551419012185,
      "learning_rate": 5e-06,
      "loss": 0.5709,
      "step": 1070
    },
    {
      "epoch": 2.554701360141928,
      "grad_norm": 0.5234534909018554,
      "learning_rate": 5e-06,
      "loss": 0.5718,
      "step": 1080
    },
    {
      "epoch": 2.578356002365464,
      "grad_norm": 0.4907774872609342,
      "learning_rate": 5e-06,
      "loss": 0.5661,
      "step": 1090
    },
    {
      "epoch": 2.6020106445890008,
      "grad_norm": 0.44774060132900434,
      "learning_rate": 5e-06,
      "loss": 0.5681,
      "step": 1100
    },
    {
      "epoch": 2.625665286812537,
      "grad_norm": 0.5280954839046109,
      "learning_rate": 5e-06,
      "loss": 0.566,
      "step": 1110
    },
    {
      "epoch": 2.649319929036073,
      "grad_norm": 0.6085042539902774,
      "learning_rate": 5e-06,
      "loss": 0.5773,
      "step": 1120
    },
    {
      "epoch": 2.67297457125961,
      "grad_norm": 0.5793916767265476,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 1130
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 0.452089081498202,
      "learning_rate": 5e-06,
      "loss": 0.5653,
      "step": 1140
    },
    {
      "epoch": 2.7202838557066826,
      "grad_norm": 0.548967392597449,
      "learning_rate": 5e-06,
      "loss": 0.5689,
      "step": 1150
    },
    {
      "epoch": 2.743938497930219,
      "grad_norm": 0.5154350449304891,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 1160
    },
    {
      "epoch": 2.767593140153755,
      "grad_norm": 0.5408130206005414,
      "learning_rate": 5e-06,
      "loss": 0.5659,
      "step": 1170
    },
    {
      "epoch": 2.7912477823772917,
      "grad_norm": 0.448067927236881,
      "learning_rate": 5e-06,
      "loss": 0.5696,
      "step": 1180
    },
    {
      "epoch": 2.814902424600828,
      "grad_norm": 0.46851404269934493,
      "learning_rate": 5e-06,
      "loss": 0.5692,
      "step": 1190
    },
    {
      "epoch": 2.8385570668243645,
      "grad_norm": 0.4742832176992908,
      "learning_rate": 5e-06,
      "loss": 0.5676,
      "step": 1200
    },
    {
      "epoch": 2.8622117090479007,
      "grad_norm": 0.5257873035609077,
      "learning_rate": 5e-06,
      "loss": 0.569,
      "step": 1210
    },
    {
      "epoch": 2.885866351271437,
      "grad_norm": 0.4515773409971667,
      "learning_rate": 5e-06,
      "loss": 0.563,
      "step": 1220
    },
    {
      "epoch": 2.9095209934949735,
      "grad_norm": 0.4991731794016107,
      "learning_rate": 5e-06,
      "loss": 0.5695,
      "step": 1230
    },
    {
      "epoch": 2.9331756357185097,
      "grad_norm": 0.5065264471878053,
      "learning_rate": 5e-06,
      "loss": 0.5706,
      "step": 1240
    },
    {
      "epoch": 2.9568302779420463,
      "grad_norm": 0.44403389629974643,
      "learning_rate": 5e-06,
      "loss": 0.5786,
      "step": 1250
    },
    {
      "epoch": 2.9804849201655825,
      "grad_norm": 0.5042460817434247,
      "learning_rate": 5e-06,
      "loss": 0.572,
      "step": 1260
    },
    {
      "epoch": 2.9946777054997042,
      "eval_loss": 0.6413525938987732,
      "eval_runtime": 226.2058,
      "eval_samples_per_second": 50.348,
      "eval_steps_per_second": 0.393,
      "step": 1266
    },
    {
      "epoch": 2.9946777054997042,
      "step": 1266,
      "total_flos": 2120178393415680.0,
      "train_loss": 0.6193434227887676,
      "train_runtime": 37811.5197,
      "train_samples_per_second": 17.167,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1266,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2120178393415680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}