{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.992903607332939, "eval_steps": 500, "global_step": 1688, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02365464222353637, "grad_norm": 2.022057166002628, "learning_rate": 5e-06, "loss": 0.888, "step": 10 }, { "epoch": 0.04730928444707274, "grad_norm": 4.2838438835520005, "learning_rate": 5e-06, "loss": 0.7936, "step": 20 }, { "epoch": 0.0709639266706091, "grad_norm": 5.767519526095153, "learning_rate": 5e-06, "loss": 0.7721, "step": 30 }, { "epoch": 0.09461856889414548, "grad_norm": 2.248924439813238, "learning_rate": 5e-06, "loss": 0.7537, "step": 40 }, { "epoch": 0.11827321111768184, "grad_norm": 1.2653464134086752, "learning_rate": 5e-06, "loss": 0.7355, "step": 50 }, { "epoch": 0.1419278533412182, "grad_norm": 0.9171723485495338, "learning_rate": 5e-06, "loss": 0.7205, "step": 60 }, { "epoch": 0.16558249556475457, "grad_norm": 0.5551007647195453, "learning_rate": 5e-06, "loss": 0.7141, "step": 70 }, { "epoch": 0.18923713778829096, "grad_norm": 0.6529142931017199, "learning_rate": 5e-06, "loss": 0.703, "step": 80 }, { "epoch": 0.21289178001182732, "grad_norm": 0.827144174510547, "learning_rate": 5e-06, "loss": 0.687, "step": 90 }, { "epoch": 0.23654642223536368, "grad_norm": 0.6269470648770907, "learning_rate": 5e-06, "loss": 0.6856, "step": 100 }, { "epoch": 0.26020106445890007, "grad_norm": 0.5434193508379527, "learning_rate": 5e-06, "loss": 0.6906, "step": 110 }, { "epoch": 0.2838557066824364, "grad_norm": 0.536059739065128, "learning_rate": 5e-06, "loss": 0.6776, "step": 120 }, { "epoch": 0.3075103489059728, "grad_norm": 0.5739273992748293, "learning_rate": 5e-06, "loss": 0.6737, "step": 130 }, { "epoch": 0.33116499112950915, "grad_norm": 0.6849912224993461, "learning_rate": 5e-06, "loss": 0.682, "step": 140 }, { "epoch": 0.35481963335304556, "grad_norm": 0.847605379720015, "learning_rate": 5e-06, "loss": 0.6647, "step": 150 }, { "epoch": 0.3784742755765819, "grad_norm": 0.4574232609702974, "learning_rate": 5e-06, "loss": 0.6693, "step": 160 }, { "epoch": 0.4021289178001183, "grad_norm": 0.5749856137025093, "learning_rate": 5e-06, "loss": 0.6668, "step": 170 }, { "epoch": 0.42578356002365464, "grad_norm": 0.5270300994955561, "learning_rate": 5e-06, "loss": 0.6739, "step": 180 }, { "epoch": 0.449438202247191, "grad_norm": 0.4564112773847777, "learning_rate": 5e-06, "loss": 0.67, "step": 190 }, { "epoch": 0.47309284447072736, "grad_norm": 0.6424018806184643, "learning_rate": 5e-06, "loss": 0.6689, "step": 200 }, { "epoch": 0.4967474866942638, "grad_norm": 0.9633823614289843, "learning_rate": 5e-06, "loss": 0.6693, "step": 210 }, { "epoch": 0.5204021289178001, "grad_norm": 0.5911923837305756, "learning_rate": 5e-06, "loss": 0.6684, "step": 220 }, { "epoch": 0.5440567711413364, "grad_norm": 0.42377038905272263, "learning_rate": 5e-06, "loss": 0.6601, "step": 230 }, { "epoch": 0.5677114133648729, "grad_norm": 0.45358915803398175, "learning_rate": 5e-06, "loss": 0.6645, "step": 240 }, { "epoch": 0.5913660555884093, "grad_norm": 0.5561319066414647, "learning_rate": 5e-06, "loss": 0.6615, "step": 250 }, { "epoch": 0.6150206978119456, "grad_norm": 0.48146552840960954, "learning_rate": 5e-06, "loss": 0.6521, "step": 260 }, { "epoch": 0.638675340035482, "grad_norm": 0.6646986767108664, "learning_rate": 5e-06, "loss": 0.6636, "step": 270 }, { "epoch": 0.6623299822590183, "grad_norm": 0.6286029463531491, "learning_rate": 5e-06, "loss": 0.6653, "step": 280 }, { "epoch": 0.6859846244825547, "grad_norm": 0.47552147839170555, "learning_rate": 5e-06, "loss": 0.6594, "step": 290 }, { "epoch": 0.7096392667060911, "grad_norm": 0.5643579688885435, "learning_rate": 5e-06, "loss": 0.6532, "step": 300 }, { "epoch": 0.7332939089296274, "grad_norm": 0.47432297972285264, "learning_rate": 5e-06, "loss": 0.6615, "step": 310 }, { "epoch": 0.7569485511531638, "grad_norm": 0.4398747123335146, "learning_rate": 5e-06, "loss": 0.6555, "step": 320 }, { "epoch": 0.7806031933767001, "grad_norm": 0.5194948335540115, "learning_rate": 5e-06, "loss": 0.6537, "step": 330 }, { "epoch": 0.8042578356002366, "grad_norm": 0.5361623479539347, "learning_rate": 5e-06, "loss": 0.6535, "step": 340 }, { "epoch": 0.8279124778237729, "grad_norm": 0.5378540604132823, "learning_rate": 5e-06, "loss": 0.6598, "step": 350 }, { "epoch": 0.8515671200473093, "grad_norm": 0.43570116634348305, "learning_rate": 5e-06, "loss": 0.6533, "step": 360 }, { "epoch": 0.8752217622708457, "grad_norm": 0.4760030489635267, "learning_rate": 5e-06, "loss": 0.6567, "step": 370 }, { "epoch": 0.898876404494382, "grad_norm": 0.5221489125969696, "learning_rate": 5e-06, "loss": 0.6475, "step": 380 }, { "epoch": 0.9225310467179184, "grad_norm": 0.6037987178986088, "learning_rate": 5e-06, "loss": 0.6548, "step": 390 }, { "epoch": 0.9461856889414547, "grad_norm": 0.4648850754901681, "learning_rate": 5e-06, "loss": 0.6464, "step": 400 }, { "epoch": 0.9698403311649911, "grad_norm": 0.4722370230360759, "learning_rate": 5e-06, "loss": 0.6489, "step": 410 }, { "epoch": 0.9934949733885275, "grad_norm": 0.506164447733558, "learning_rate": 5e-06, "loss": 0.6479, "step": 420 }, { "epoch": 0.9982259018332348, "eval_loss": 0.6500382423400879, "eval_runtime": 226.0859, "eval_samples_per_second": 50.375, "eval_steps_per_second": 0.394, "step": 422 }, { "epoch": 1.0171496156120639, "grad_norm": 0.5094563043289723, "learning_rate": 5e-06, "loss": 0.6364, "step": 430 }, { "epoch": 1.0408042578356003, "grad_norm": 0.5191261264055049, "learning_rate": 5e-06, "loss": 0.6046, "step": 440 }, { "epoch": 1.0644589000591367, "grad_norm": 0.5080870756726318, "learning_rate": 5e-06, "loss": 0.6028, "step": 450 }, { "epoch": 1.0881135422826729, "grad_norm": 0.5071759369122493, "learning_rate": 5e-06, "loss": 0.6114, "step": 460 }, { "epoch": 1.1117681845062093, "grad_norm": 0.44639682629606836, "learning_rate": 5e-06, "loss": 0.6104, "step": 470 }, { "epoch": 1.1354228267297457, "grad_norm": 0.4671904061290404, "learning_rate": 5e-06, "loss": 0.6105, "step": 480 }, { "epoch": 1.1590774689532821, "grad_norm": 0.47682156648938734, "learning_rate": 5e-06, "loss": 0.6076, "step": 490 }, { "epoch": 1.1827321111768185, "grad_norm": 0.46620317831305, "learning_rate": 5e-06, "loss": 0.6038, "step": 500 }, { "epoch": 1.2063867534003547, "grad_norm": 0.5319219132983622, "learning_rate": 5e-06, "loss": 0.6036, "step": 510 }, { "epoch": 1.2300413956238911, "grad_norm": 0.5376832276402331, "learning_rate": 5e-06, "loss": 0.6095, "step": 520 }, { "epoch": 1.2536960378474276, "grad_norm": 0.5612860356721774, "learning_rate": 5e-06, "loss": 0.6106, "step": 530 }, { "epoch": 1.277350680070964, "grad_norm": 0.5310917873879784, "learning_rate": 5e-06, "loss": 0.611, "step": 540 }, { "epoch": 1.3010053222945004, "grad_norm": 0.5263300247672861, "learning_rate": 5e-06, "loss": 0.6091, "step": 550 }, { "epoch": 1.3246599645180366, "grad_norm": 0.46491345435473175, "learning_rate": 5e-06, "loss": 0.6073, "step": 560 }, { "epoch": 1.348314606741573, "grad_norm": 0.7395109471227356, "learning_rate": 5e-06, "loss": 0.6074, "step": 570 }, { "epoch": 1.3719692489651094, "grad_norm": 0.46119094942392375, "learning_rate": 5e-06, "loss": 0.6109, "step": 580 }, { "epoch": 1.3956238911886458, "grad_norm": 0.4450854938398166, "learning_rate": 5e-06, "loss": 0.6115, "step": 590 }, { "epoch": 1.4192785334121822, "grad_norm": 0.5551565902862219, "learning_rate": 5e-06, "loss": 0.5986, "step": 600 }, { "epoch": 1.4429331756357184, "grad_norm": 0.4018778370206095, "learning_rate": 5e-06, "loss": 0.6004, "step": 610 }, { "epoch": 1.4665878178592548, "grad_norm": 0.44942300591311213, "learning_rate": 5e-06, "loss": 0.603, "step": 620 }, { "epoch": 1.4902424600827913, "grad_norm": 0.4465193449169376, "learning_rate": 5e-06, "loss": 0.6136, "step": 630 }, { "epoch": 1.5138971023063275, "grad_norm": 0.43030425922494153, "learning_rate": 5e-06, "loss": 0.6119, "step": 640 }, { "epoch": 1.537551744529864, "grad_norm": 0.48189491125310013, "learning_rate": 5e-06, "loss": 0.6029, "step": 650 }, { "epoch": 1.5612063867534003, "grad_norm": 0.6004161164250926, "learning_rate": 5e-06, "loss": 0.6071, "step": 660 }, { "epoch": 1.5848610289769367, "grad_norm": 0.5769771492223703, "learning_rate": 5e-06, "loss": 0.6039, "step": 670 }, { "epoch": 1.6085156712004731, "grad_norm": 0.5178134406863251, "learning_rate": 5e-06, "loss": 0.6082, "step": 680 }, { "epoch": 1.6321703134240093, "grad_norm": 0.544609037335345, "learning_rate": 5e-06, "loss": 0.6015, "step": 690 }, { "epoch": 1.655824955647546, "grad_norm": 0.4825223518027102, "learning_rate": 5e-06, "loss": 0.6088, "step": 700 }, { "epoch": 1.6794795978710821, "grad_norm": 0.4914541229221081, "learning_rate": 5e-06, "loss": 0.6074, "step": 710 }, { "epoch": 1.7031342400946186, "grad_norm": 0.5142329564921958, "learning_rate": 5e-06, "loss": 0.6137, "step": 720 }, { "epoch": 1.726788882318155, "grad_norm": 0.4827136360568082, "learning_rate": 5e-06, "loss": 0.6091, "step": 730 }, { "epoch": 1.7504435245416912, "grad_norm": 0.6337370950629847, "learning_rate": 5e-06, "loss": 0.6038, "step": 740 }, { "epoch": 1.7740981667652278, "grad_norm": 0.44154103246732906, "learning_rate": 5e-06, "loss": 0.6095, "step": 750 }, { "epoch": 1.797752808988764, "grad_norm": 0.45443332352805516, "learning_rate": 5e-06, "loss": 0.6142, "step": 760 }, { "epoch": 1.8214074512123004, "grad_norm": 0.4163069710397612, "learning_rate": 5e-06, "loss": 0.615, "step": 770 }, { "epoch": 1.8450620934358368, "grad_norm": 0.4161179540449698, "learning_rate": 5e-06, "loss": 0.6068, "step": 780 }, { "epoch": 1.868716735659373, "grad_norm": 0.5280163972542002, "learning_rate": 5e-06, "loss": 0.6056, "step": 790 }, { "epoch": 1.8923713778829097, "grad_norm": 0.46668538311447527, "learning_rate": 5e-06, "loss": 0.612, "step": 800 }, { "epoch": 1.9160260201064458, "grad_norm": 0.42346894046836875, "learning_rate": 5e-06, "loss": 0.6118, "step": 810 }, { "epoch": 1.9396806623299823, "grad_norm": 0.41896781546766526, "learning_rate": 5e-06, "loss": 0.5986, "step": 820 }, { "epoch": 1.9633353045535187, "grad_norm": 0.4633335148255916, "learning_rate": 5e-06, "loss": 0.6111, "step": 830 }, { "epoch": 1.9869899467770549, "grad_norm": 0.476695749395824, "learning_rate": 5e-06, "loss": 0.596, "step": 840 }, { "epoch": 1.9988172678888232, "eval_loss": 0.6398171186447144, "eval_runtime": 226.8946, "eval_samples_per_second": 50.195, "eval_steps_per_second": 0.392, "step": 845 }, { "epoch": 2.0106445890005915, "grad_norm": 0.6169607703828688, "learning_rate": 5e-06, "loss": 0.5986, "step": 850 }, { "epoch": 2.0342992312241277, "grad_norm": 0.5426446180098695, "learning_rate": 5e-06, "loss": 0.5657, "step": 860 }, { "epoch": 2.057953873447664, "grad_norm": 0.5390790167711137, "learning_rate": 5e-06, "loss": 0.5657, "step": 870 }, { "epoch": 2.0816085156712005, "grad_norm": 0.578360411177351, "learning_rate": 5e-06, "loss": 0.5579, "step": 880 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5197231353776123, "learning_rate": 5e-06, "loss": 0.5602, "step": 890 }, { "epoch": 2.1289178001182734, "grad_norm": 0.4859072638645338, "learning_rate": 5e-06, "loss": 0.5657, "step": 900 }, { "epoch": 2.1525724423418096, "grad_norm": 0.4779934502377749, "learning_rate": 5e-06, "loss": 0.5603, "step": 910 }, { "epoch": 2.1762270845653457, "grad_norm": 0.4708802789385491, "learning_rate": 5e-06, "loss": 0.5622, "step": 920 }, { "epoch": 2.1998817267888824, "grad_norm": 0.5551422131481033, "learning_rate": 5e-06, "loss": 0.5615, "step": 930 }, { "epoch": 2.2235363690124186, "grad_norm": 0.6102253316948937, "learning_rate": 5e-06, "loss": 0.5631, "step": 940 }, { "epoch": 2.247191011235955, "grad_norm": 0.5515235751158143, "learning_rate": 5e-06, "loss": 0.5691, "step": 950 }, { "epoch": 2.2708456534594914, "grad_norm": 0.5435816919940853, "learning_rate": 5e-06, "loss": 0.5628, "step": 960 }, { "epoch": 2.2945002956830276, "grad_norm": 0.5029810305754802, "learning_rate": 5e-06, "loss": 0.5646, "step": 970 }, { "epoch": 2.3181549379065642, "grad_norm": 0.5615005267431546, "learning_rate": 5e-06, "loss": 0.5685, "step": 980 }, { "epoch": 2.3418095801301004, "grad_norm": 0.49168169351440816, "learning_rate": 5e-06, "loss": 0.5662, "step": 990 }, { "epoch": 2.365464222353637, "grad_norm": 0.5771081751524688, "learning_rate": 5e-06, "loss": 0.5665, "step": 1000 }, { "epoch": 2.3891188645771733, "grad_norm": 0.4680061355705797, "learning_rate": 5e-06, "loss": 0.5646, "step": 1010 }, { "epoch": 2.4127735068007095, "grad_norm": 0.564004480266281, "learning_rate": 5e-06, "loss": 0.5621, "step": 1020 }, { "epoch": 2.436428149024246, "grad_norm": 0.4778290387931295, "learning_rate": 5e-06, "loss": 0.5651, "step": 1030 }, { "epoch": 2.4600827912477823, "grad_norm": 0.6235579512913804, "learning_rate": 5e-06, "loss": 0.5658, "step": 1040 }, { "epoch": 2.483737433471319, "grad_norm": 0.5224755229558726, "learning_rate": 5e-06, "loss": 0.5683, "step": 1050 }, { "epoch": 2.507392075694855, "grad_norm": 0.5155806397749756, "learning_rate": 5e-06, "loss": 0.5676, "step": 1060 }, { "epoch": 2.5310467179183913, "grad_norm": 0.4957936150342283, "learning_rate": 5e-06, "loss": 0.5699, "step": 1070 }, { "epoch": 2.554701360141928, "grad_norm": 0.5154987387729463, "learning_rate": 5e-06, "loss": 0.5707, "step": 1080 }, { "epoch": 2.578356002365464, "grad_norm": 0.5044551175485149, "learning_rate": 5e-06, "loss": 0.565, "step": 1090 }, { "epoch": 2.6020106445890008, "grad_norm": 0.4516076390737273, "learning_rate": 5e-06, "loss": 0.5671, "step": 1100 }, { "epoch": 2.625665286812537, "grad_norm": 0.5045929619515908, "learning_rate": 5e-06, "loss": 0.5649, "step": 1110 }, { "epoch": 2.649319929036073, "grad_norm": 0.580462719659177, "learning_rate": 5e-06, "loss": 0.5762, "step": 1120 }, { "epoch": 2.67297457125961, "grad_norm": 0.575884289754489, "learning_rate": 5e-06, "loss": 0.5635, "step": 1130 }, { "epoch": 2.696629213483146, "grad_norm": 0.45389440525574193, "learning_rate": 5e-06, "loss": 0.5642, "step": 1140 }, { "epoch": 2.7202838557066826, "grad_norm": 0.5500836980193272, "learning_rate": 5e-06, "loss": 0.5679, "step": 1150 }, { "epoch": 2.743938497930219, "grad_norm": 0.5317230345652454, "learning_rate": 5e-06, "loss": 0.574, "step": 1160 }, { "epoch": 2.767593140153755, "grad_norm": 0.5423217141987767, "learning_rate": 5e-06, "loss": 0.5647, "step": 1170 }, { "epoch": 2.7912477823772917, "grad_norm": 0.45026276354942224, "learning_rate": 5e-06, "loss": 0.5687, "step": 1180 }, { "epoch": 2.814902424600828, "grad_norm": 0.4729801358724606, "learning_rate": 5e-06, "loss": 0.5681, "step": 1190 }, { "epoch": 2.8385570668243645, "grad_norm": 0.4541256119629826, "learning_rate": 5e-06, "loss": 0.5666, "step": 1200 }, { "epoch": 2.8622117090479007, "grad_norm": 0.5065447399632258, "learning_rate": 5e-06, "loss": 0.568, "step": 1210 }, { "epoch": 2.885866351271437, "grad_norm": 0.46787373309179847, "learning_rate": 5e-06, "loss": 0.562, "step": 1220 }, { "epoch": 2.9095209934949735, "grad_norm": 0.5140600861948555, "learning_rate": 5e-06, "loss": 0.5685, "step": 1230 }, { "epoch": 2.9331756357185097, "grad_norm": 0.4924658786836096, "learning_rate": 5e-06, "loss": 0.5695, "step": 1240 }, { "epoch": 2.9568302779420463, "grad_norm": 0.43768785522082726, "learning_rate": 5e-06, "loss": 0.5776, "step": 1250 }, { "epoch": 2.9804849201655825, "grad_norm": 0.49992037039861137, "learning_rate": 5e-06, "loss": 0.5709, "step": 1260 }, { "epoch": 2.9994086339444115, "eval_loss": 0.6412045359611511, "eval_runtime": 227.4087, "eval_samples_per_second": 50.082, "eval_steps_per_second": 0.391, "step": 1268 }, { "epoch": 3.0041395623891187, "grad_norm": 0.7997038665721562, "learning_rate": 5e-06, "loss": 0.5734, "step": 1270 }, { "epoch": 3.0277942046126554, "grad_norm": 0.7087021533978439, "learning_rate": 5e-06, "loss": 0.5192, "step": 1280 }, { "epoch": 3.0514488468361916, "grad_norm": 0.6369517216502235, "learning_rate": 5e-06, "loss": 0.5252, "step": 1290 }, { "epoch": 3.075103489059728, "grad_norm": 0.5422250793349076, "learning_rate": 5e-06, "loss": 0.518, "step": 1300 }, { "epoch": 3.0987581312832644, "grad_norm": 0.4983982055937172, "learning_rate": 5e-06, "loss": 0.523, "step": 1310 }, { "epoch": 3.1224127735068006, "grad_norm": 0.5243425125520329, "learning_rate": 5e-06, "loss": 0.5206, "step": 1320 }, { "epoch": 3.146067415730337, "grad_norm": 0.4867213185448702, "learning_rate": 5e-06, "loss": 0.5243, "step": 1330 }, { "epoch": 3.1697220579538734, "grad_norm": 0.5758407627499461, "learning_rate": 5e-06, "loss": 0.5252, "step": 1340 }, { "epoch": 3.19337670017741, "grad_norm": 0.5312930409261694, "learning_rate": 5e-06, "loss": 0.5239, "step": 1350 }, { "epoch": 3.2170313424009462, "grad_norm": 0.5655992541782802, "learning_rate": 5e-06, "loss": 0.5284, "step": 1360 }, { "epoch": 3.2406859846244824, "grad_norm": 0.5600254916460519, "learning_rate": 5e-06, "loss": 0.5247, "step": 1370 }, { "epoch": 3.264340626848019, "grad_norm": 0.5493705668694846, "learning_rate": 5e-06, "loss": 0.5256, "step": 1380 }, { "epoch": 3.2879952690715553, "grad_norm": 0.5454062241643639, "learning_rate": 5e-06, "loss": 0.5251, "step": 1390 }, { "epoch": 3.311649911295092, "grad_norm": 0.5317414152397003, "learning_rate": 5e-06, "loss": 0.5264, "step": 1400 }, { "epoch": 3.335304553518628, "grad_norm": 0.522262224626374, "learning_rate": 5e-06, "loss": 0.5246, "step": 1410 }, { "epoch": 3.3589591957421643, "grad_norm": 0.5296232089165864, "learning_rate": 5e-06, "loss": 0.5337, "step": 1420 }, { "epoch": 3.382613837965701, "grad_norm": 0.5272756646376445, "learning_rate": 5e-06, "loss": 0.5293, "step": 1430 }, { "epoch": 3.406268480189237, "grad_norm": 0.55914181485459, "learning_rate": 5e-06, "loss": 0.5237, "step": 1440 }, { "epoch": 3.4299231224127738, "grad_norm": 0.4921288128015139, "learning_rate": 5e-06, "loss": 0.5271, "step": 1450 }, { "epoch": 3.45357776463631, "grad_norm": 0.5365784280337929, "learning_rate": 5e-06, "loss": 0.5257, "step": 1460 }, { "epoch": 3.477232406859846, "grad_norm": 0.5463822390250614, "learning_rate": 5e-06, "loss": 0.5282, "step": 1470 }, { "epoch": 3.5008870490833828, "grad_norm": 0.5296060277501131, "learning_rate": 5e-06, "loss": 0.5307, "step": 1480 }, { "epoch": 3.524541691306919, "grad_norm": 0.5118534830144524, "learning_rate": 5e-06, "loss": 0.529, "step": 1490 }, { "epoch": 3.5481963335304556, "grad_norm": 0.4803120353260472, "learning_rate": 5e-06, "loss": 0.5339, "step": 1500 }, { "epoch": 3.571850975753992, "grad_norm": 0.4786671860417403, "learning_rate": 5e-06, "loss": 0.5304, "step": 1510 }, { "epoch": 3.595505617977528, "grad_norm": 0.5285834688732408, "learning_rate": 5e-06, "loss": 0.5284, "step": 1520 }, { "epoch": 3.619160260201064, "grad_norm": 0.584807508614226, "learning_rate": 5e-06, "loss": 0.5227, "step": 1530 }, { "epoch": 3.642814902424601, "grad_norm": 0.5839419610849975, "learning_rate": 5e-06, "loss": 0.5322, "step": 1540 }, { "epoch": 3.6664695446481375, "grad_norm": 0.4514003865869288, "learning_rate": 5e-06, "loss": 0.5277, "step": 1550 }, { "epoch": 3.6901241868716737, "grad_norm": 0.5330412676715586, "learning_rate": 5e-06, "loss": 0.5334, "step": 1560 }, { "epoch": 3.71377882909521, "grad_norm": 0.49755559197077215, "learning_rate": 5e-06, "loss": 0.536, "step": 1570 }, { "epoch": 3.737433471318746, "grad_norm": 0.5204717016848357, "learning_rate": 5e-06, "loss": 0.534, "step": 1580 }, { "epoch": 3.7610881135422827, "grad_norm": 0.49061356565453146, "learning_rate": 5e-06, "loss": 0.5281, "step": 1590 }, { "epoch": 3.7847427557658193, "grad_norm": 0.6053218200808435, "learning_rate": 5e-06, "loss": 0.5316, "step": 1600 }, { "epoch": 3.8083973979893555, "grad_norm": 0.6606016275554902, "learning_rate": 5e-06, "loss": 0.5353, "step": 1610 }, { "epoch": 3.8320520402128917, "grad_norm": 0.4907218140881002, "learning_rate": 5e-06, "loss": 0.5281, "step": 1620 }, { "epoch": 3.855706682436428, "grad_norm": 0.45979128640427186, "learning_rate": 5e-06, "loss": 0.5304, "step": 1630 }, { "epoch": 3.8793613246599645, "grad_norm": 0.46823925120727844, "learning_rate": 5e-06, "loss": 0.5349, "step": 1640 }, { "epoch": 3.903015966883501, "grad_norm": 0.4817910084064857, "learning_rate": 5e-06, "loss": 0.5306, "step": 1650 }, { "epoch": 3.9266706091070374, "grad_norm": 0.6114087825987051, "learning_rate": 5e-06, "loss": 0.5288, "step": 1660 }, { "epoch": 3.9503252513305736, "grad_norm": 0.48214763063708094, "learning_rate": 5e-06, "loss": 0.5289, "step": 1670 }, { "epoch": 3.9739798935541097, "grad_norm": 0.4984632076912188, "learning_rate": 5e-06, "loss": 0.5291, "step": 1680 }, { "epoch": 3.992903607332939, "eval_loss": 0.6530380845069885, "eval_runtime": 225.9522, "eval_samples_per_second": 50.404, "eval_steps_per_second": 0.394, "step": 1688 }, { "epoch": 3.992903607332939, "step": 1688, "total_flos": 2827044110991360.0, "train_loss": 0.5963167059478036, "train_runtime": 50350.8421, "train_samples_per_second": 17.189, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 1688, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2827044110991360.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }