| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.992903607332939, | |
| "eval_steps": 500, | |
| "global_step": 1688, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02365464222353637, | |
| "grad_norm": 2.022057166002628, | |
| "learning_rate": 5e-06, | |
| "loss": 0.888, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04730928444707274, | |
| "grad_norm": 4.2838438835520005, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7936, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0709639266706091, | |
| "grad_norm": 5.767519526095153, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7721, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09461856889414548, | |
| "grad_norm": 2.248924439813238, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7537, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.11827321111768184, | |
| "grad_norm": 1.2653464134086752, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7355, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1419278533412182, | |
| "grad_norm": 0.9171723485495338, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7205, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.16558249556475457, | |
| "grad_norm": 0.5551007647195453, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7141, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.18923713778829096, | |
| "grad_norm": 0.6529142931017199, | |
| "learning_rate": 5e-06, | |
| "loss": 0.703, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.21289178001182732, | |
| "grad_norm": 0.827144174510547, | |
| "learning_rate": 5e-06, | |
| "loss": 0.687, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.23654642223536368, | |
| "grad_norm": 0.6269470648770907, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6856, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.26020106445890007, | |
| "grad_norm": 0.5434193508379527, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6906, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2838557066824364, | |
| "grad_norm": 0.536059739065128, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6776, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3075103489059728, | |
| "grad_norm": 0.5739273992748293, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6737, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.33116499112950915, | |
| "grad_norm": 0.6849912224993461, | |
| "learning_rate": 5e-06, | |
| "loss": 0.682, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.35481963335304556, | |
| "grad_norm": 0.847605379720015, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6647, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3784742755765819, | |
| "grad_norm": 0.4574232609702974, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6693, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4021289178001183, | |
| "grad_norm": 0.5749856137025093, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6668, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.42578356002365464, | |
| "grad_norm": 0.5270300994955561, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6739, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.449438202247191, | |
| "grad_norm": 0.4564112773847777, | |
| "learning_rate": 5e-06, | |
| "loss": 0.67, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.47309284447072736, | |
| "grad_norm": 0.6424018806184643, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6689, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4967474866942638, | |
| "grad_norm": 0.9633823614289843, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6693, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5204021289178001, | |
| "grad_norm": 0.5911923837305756, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6684, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5440567711413364, | |
| "grad_norm": 0.42377038905272263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6601, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5677114133648729, | |
| "grad_norm": 0.45358915803398175, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6645, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5913660555884093, | |
| "grad_norm": 0.5561319066414647, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6615, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6150206978119456, | |
| "grad_norm": 0.48146552840960954, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6521, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.638675340035482, | |
| "grad_norm": 0.6646986767108664, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6636, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6623299822590183, | |
| "grad_norm": 0.6286029463531491, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6653, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6859846244825547, | |
| "grad_norm": 0.47552147839170555, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6594, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7096392667060911, | |
| "grad_norm": 0.5643579688885435, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6532, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7332939089296274, | |
| "grad_norm": 0.47432297972285264, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6615, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7569485511531638, | |
| "grad_norm": 0.4398747123335146, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6555, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7806031933767001, | |
| "grad_norm": 0.5194948335540115, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6537, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8042578356002366, | |
| "grad_norm": 0.5361623479539347, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6535, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8279124778237729, | |
| "grad_norm": 0.5378540604132823, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6598, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8515671200473093, | |
| "grad_norm": 0.43570116634348305, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6533, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8752217622708457, | |
| "grad_norm": 0.4760030489635267, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6567, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.898876404494382, | |
| "grad_norm": 0.5221489125969696, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6475, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9225310467179184, | |
| "grad_norm": 0.6037987178986088, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6548, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9461856889414547, | |
| "grad_norm": 0.4648850754901681, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6464, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9698403311649911, | |
| "grad_norm": 0.4722370230360759, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6489, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9934949733885275, | |
| "grad_norm": 0.506164447733558, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6479, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9982259018332348, | |
| "eval_loss": 0.6500382423400879, | |
| "eval_runtime": 226.0859, | |
| "eval_samples_per_second": 50.375, | |
| "eval_steps_per_second": 0.394, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.0171496156120639, | |
| "grad_norm": 0.5094563043289723, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.0408042578356003, | |
| "grad_norm": 0.5191261264055049, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6046, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.0644589000591367, | |
| "grad_norm": 0.5080870756726318, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6028, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.0881135422826729, | |
| "grad_norm": 0.5071759369122493, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6114, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.1117681845062093, | |
| "grad_norm": 0.44639682629606836, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6104, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.1354228267297457, | |
| "grad_norm": 0.4671904061290404, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6105, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.1590774689532821, | |
| "grad_norm": 0.47682156648938734, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6076, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.1827321111768185, | |
| "grad_norm": 0.46620317831305, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6038, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2063867534003547, | |
| "grad_norm": 0.5319219132983622, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6036, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.2300413956238911, | |
| "grad_norm": 0.5376832276402331, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6095, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.2536960378474276, | |
| "grad_norm": 0.5612860356721774, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6106, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.277350680070964, | |
| "grad_norm": 0.5310917873879784, | |
| "learning_rate": 5e-06, | |
| "loss": 0.611, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3010053222945004, | |
| "grad_norm": 0.5263300247672861, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6091, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3246599645180366, | |
| "grad_norm": 0.46491345435473175, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6073, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.348314606741573, | |
| "grad_norm": 0.7395109471227356, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6074, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.3719692489651094, | |
| "grad_norm": 0.46119094942392375, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6109, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.3956238911886458, | |
| "grad_norm": 0.4450854938398166, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6115, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.4192785334121822, | |
| "grad_norm": 0.5551565902862219, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5986, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.4429331756357184, | |
| "grad_norm": 0.4018778370206095, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6004, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.4665878178592548, | |
| "grad_norm": 0.44942300591311213, | |
| "learning_rate": 5e-06, | |
| "loss": 0.603, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.4902424600827913, | |
| "grad_norm": 0.4465193449169376, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6136, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.5138971023063275, | |
| "grad_norm": 0.43030425922494153, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6119, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.537551744529864, | |
| "grad_norm": 0.48189491125310013, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6029, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.5612063867534003, | |
| "grad_norm": 0.6004161164250926, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6071, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.5848610289769367, | |
| "grad_norm": 0.5769771492223703, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6039, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.6085156712004731, | |
| "grad_norm": 0.5178134406863251, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6082, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.6321703134240093, | |
| "grad_norm": 0.544609037335345, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6015, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.655824955647546, | |
| "grad_norm": 0.4825223518027102, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6088, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6794795978710821, | |
| "grad_norm": 0.4914541229221081, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6074, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.7031342400946186, | |
| "grad_norm": 0.5142329564921958, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6137, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.726788882318155, | |
| "grad_norm": 0.4827136360568082, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6091, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.7504435245416912, | |
| "grad_norm": 0.6337370950629847, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6038, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.7740981667652278, | |
| "grad_norm": 0.44154103246732906, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6095, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.797752808988764, | |
| "grad_norm": 0.45443332352805516, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6142, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.8214074512123004, | |
| "grad_norm": 0.4163069710397612, | |
| "learning_rate": 5e-06, | |
| "loss": 0.615, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.8450620934358368, | |
| "grad_norm": 0.4161179540449698, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6068, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.868716735659373, | |
| "grad_norm": 0.5280163972542002, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6056, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.8923713778829097, | |
| "grad_norm": 0.46668538311447527, | |
| "learning_rate": 5e-06, | |
| "loss": 0.612, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.9160260201064458, | |
| "grad_norm": 0.42346894046836875, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6118, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.9396806623299823, | |
| "grad_norm": 0.41896781546766526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5986, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.9633353045535187, | |
| "grad_norm": 0.4633335148255916, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6111, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.9869899467770549, | |
| "grad_norm": 0.476695749395824, | |
| "learning_rate": 5e-06, | |
| "loss": 0.596, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.9988172678888232, | |
| "eval_loss": 0.6398171186447144, | |
| "eval_runtime": 226.8946, | |
| "eval_samples_per_second": 50.195, | |
| "eval_steps_per_second": 0.392, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 2.0106445890005915, | |
| "grad_norm": 0.6169607703828688, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5986, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.0342992312241277, | |
| "grad_norm": 0.5426446180098695, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5657, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.057953873447664, | |
| "grad_norm": 0.5390790167711137, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5657, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.0816085156712005, | |
| "grad_norm": 0.578360411177351, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5579, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 0.5197231353776123, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5602, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.1289178001182734, | |
| "grad_norm": 0.4859072638645338, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5657, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.1525724423418096, | |
| "grad_norm": 0.4779934502377749, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5603, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.1762270845653457, | |
| "grad_norm": 0.4708802789385491, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5622, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.1998817267888824, | |
| "grad_norm": 0.5551422131481033, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5615, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.2235363690124186, | |
| "grad_norm": 0.6102253316948937, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5631, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.247191011235955, | |
| "grad_norm": 0.5515235751158143, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5691, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.2708456534594914, | |
| "grad_norm": 0.5435816919940853, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5628, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.2945002956830276, | |
| "grad_norm": 0.5029810305754802, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5646, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.3181549379065642, | |
| "grad_norm": 0.5615005267431546, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5685, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.3418095801301004, | |
| "grad_norm": 0.49168169351440816, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5662, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.365464222353637, | |
| "grad_norm": 0.5771081751524688, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5665, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.3891188645771733, | |
| "grad_norm": 0.4680061355705797, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5646, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.4127735068007095, | |
| "grad_norm": 0.564004480266281, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5621, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.436428149024246, | |
| "grad_norm": 0.4778290387931295, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5651, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.4600827912477823, | |
| "grad_norm": 0.6235579512913804, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5658, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.483737433471319, | |
| "grad_norm": 0.5224755229558726, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5683, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.507392075694855, | |
| "grad_norm": 0.5155806397749756, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5676, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.5310467179183913, | |
| "grad_norm": 0.4957936150342283, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5699, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.554701360141928, | |
| "grad_norm": 0.5154987387729463, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5707, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.578356002365464, | |
| "grad_norm": 0.5044551175485149, | |
| "learning_rate": 5e-06, | |
| "loss": 0.565, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.6020106445890008, | |
| "grad_norm": 0.4516076390737273, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5671, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.625665286812537, | |
| "grad_norm": 0.5045929619515908, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5649, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.649319929036073, | |
| "grad_norm": 0.580462719659177, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5762, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.67297457125961, | |
| "grad_norm": 0.575884289754489, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5635, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.696629213483146, | |
| "grad_norm": 0.45389440525574193, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5642, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.7202838557066826, | |
| "grad_norm": 0.5500836980193272, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5679, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.743938497930219, | |
| "grad_norm": 0.5317230345652454, | |
| "learning_rate": 5e-06, | |
| "loss": 0.574, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.767593140153755, | |
| "grad_norm": 0.5423217141987767, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5647, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.7912477823772917, | |
| "grad_norm": 0.45026276354942224, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5687, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.814902424600828, | |
| "grad_norm": 0.4729801358724606, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5681, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.8385570668243645, | |
| "grad_norm": 0.4541256119629826, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5666, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.8622117090479007, | |
| "grad_norm": 0.5065447399632258, | |
| "learning_rate": 5e-06, | |
| "loss": 0.568, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.885866351271437, | |
| "grad_norm": 0.46787373309179847, | |
| "learning_rate": 5e-06, | |
| "loss": 0.562, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.9095209934949735, | |
| "grad_norm": 0.5140600861948555, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5685, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.9331756357185097, | |
| "grad_norm": 0.4924658786836096, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5695, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.9568302779420463, | |
| "grad_norm": 0.43768785522082726, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5776, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.9804849201655825, | |
| "grad_norm": 0.49992037039861137, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5709, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.9994086339444115, | |
| "eval_loss": 0.6412045359611511, | |
| "eval_runtime": 227.4087, | |
| "eval_samples_per_second": 50.082, | |
| "eval_steps_per_second": 0.391, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 3.0041395623891187, | |
| "grad_norm": 0.7997038665721562, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5734, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 3.0277942046126554, | |
| "grad_norm": 0.7087021533978439, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5192, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 3.0514488468361916, | |
| "grad_norm": 0.6369517216502235, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5252, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 3.075103489059728, | |
| "grad_norm": 0.5422250793349076, | |
| "learning_rate": 5e-06, | |
| "loss": 0.518, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.0987581312832644, | |
| "grad_norm": 0.4983982055937172, | |
| "learning_rate": 5e-06, | |
| "loss": 0.523, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 3.1224127735068006, | |
| "grad_norm": 0.5243425125520329, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5206, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 3.146067415730337, | |
| "grad_norm": 0.4867213185448702, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5243, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 3.1697220579538734, | |
| "grad_norm": 0.5758407627499461, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5252, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 3.19337670017741, | |
| "grad_norm": 0.5312930409261694, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5239, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.2170313424009462, | |
| "grad_norm": 0.5655992541782802, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5284, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 3.2406859846244824, | |
| "grad_norm": 0.5600254916460519, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5247, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 3.264340626848019, | |
| "grad_norm": 0.5493705668694846, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5256, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 3.2879952690715553, | |
| "grad_norm": 0.5454062241643639, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5251, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 3.311649911295092, | |
| "grad_norm": 0.5317414152397003, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5264, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.335304553518628, | |
| "grad_norm": 0.522262224626374, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5246, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 3.3589591957421643, | |
| "grad_norm": 0.5296232089165864, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5337, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 3.382613837965701, | |
| "grad_norm": 0.5272756646376445, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5293, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 3.406268480189237, | |
| "grad_norm": 0.55914181485459, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5237, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 3.4299231224127738, | |
| "grad_norm": 0.4921288128015139, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5271, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.45357776463631, | |
| "grad_norm": 0.5365784280337929, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5257, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 3.477232406859846, | |
| "grad_norm": 0.5463822390250614, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5282, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 3.5008870490833828, | |
| "grad_norm": 0.5296060277501131, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5307, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.524541691306919, | |
| "grad_norm": 0.5118534830144524, | |
| "learning_rate": 5e-06, | |
| "loss": 0.529, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.5481963335304556, | |
| "grad_norm": 0.4803120353260472, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5339, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.571850975753992, | |
| "grad_norm": 0.4786671860417403, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5304, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.595505617977528, | |
| "grad_norm": 0.5285834688732408, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5284, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.619160260201064, | |
| "grad_norm": 0.584807508614226, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5227, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.642814902424601, | |
| "grad_norm": 0.5839419610849975, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5322, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.6664695446481375, | |
| "grad_norm": 0.4514003865869288, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5277, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.6901241868716737, | |
| "grad_norm": 0.5330412676715586, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5334, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.71377882909521, | |
| "grad_norm": 0.49755559197077215, | |
| "learning_rate": 5e-06, | |
| "loss": 0.536, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.737433471318746, | |
| "grad_norm": 0.5204717016848357, | |
| "learning_rate": 5e-06, | |
| "loss": 0.534, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.7610881135422827, | |
| "grad_norm": 0.49061356565453146, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5281, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.7847427557658193, | |
| "grad_norm": 0.6053218200808435, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5316, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.8083973979893555, | |
| "grad_norm": 0.6606016275554902, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5353, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.8320520402128917, | |
| "grad_norm": 0.4907218140881002, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5281, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.855706682436428, | |
| "grad_norm": 0.45979128640427186, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5304, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.8793613246599645, | |
| "grad_norm": 0.46823925120727844, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5349, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.903015966883501, | |
| "grad_norm": 0.4817910084064857, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5306, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.9266706091070374, | |
| "grad_norm": 0.6114087825987051, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5288, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.9503252513305736, | |
| "grad_norm": 0.48214763063708094, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5289, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.9739798935541097, | |
| "grad_norm": 0.4984632076912188, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5291, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.992903607332939, | |
| "eval_loss": 0.6530380845069885, | |
| "eval_runtime": 225.9522, | |
| "eval_samples_per_second": 50.404, | |
| "eval_steps_per_second": 0.394, | |
| "step": 1688 | |
| }, | |
| { | |
| "epoch": 3.992903607332939, | |
| "step": 1688, | |
| "total_flos": 2827044110991360.0, | |
| "train_loss": 0.5963167059478036, | |
| "train_runtime": 50350.8421, | |
| "train_samples_per_second": 17.189, | |
| "train_steps_per_second": 0.034 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1688, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2827044110991360.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |