| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.0625, | |
| "eval_steps": 500, | |
| "global_step": 1160, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.078125, | |
| "grad_norm": 6.125, | |
| "learning_rate": 1e-05, | |
| "loss": 12.2019, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 6.5, | |
| "learning_rate": 2e-05, | |
| "loss": 11.8976, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.234375, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 3e-05, | |
| "loss": 11.1732, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 4e-05, | |
| "loss": 9.8831, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.390625, | |
| "grad_norm": 7.375, | |
| "learning_rate": 5e-05, | |
| "loss": 8.4578, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.46875, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 6e-05, | |
| "loss": 7.2717, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.546875, | |
| "grad_norm": 12.25, | |
| "learning_rate": 7e-05, | |
| "loss": 6.1268, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 4.9462, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.703125, | |
| "grad_norm": 15.375, | |
| "learning_rate": 9e-05, | |
| "loss": 3.5646, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 15.375, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0469, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.859375, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 9.915254237288136e-05, | |
| "loss": 0.6762, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 9.830508474576272e-05, | |
| "loss": 0.3449, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.015625, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 9.745762711864407e-05, | |
| "loss": 0.2995, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.09375, | |
| "grad_norm": 2.375, | |
| "learning_rate": 9.661016949152543e-05, | |
| "loss": 0.2842, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.171875, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 9.576271186440679e-05, | |
| "loss": 0.2791, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 9.491525423728815e-05, | |
| "loss": 0.2459, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.328125, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 9.40677966101695e-05, | |
| "loss": 0.2479, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.40625, | |
| "grad_norm": 1.0, | |
| "learning_rate": 9.322033898305085e-05, | |
| "loss": 0.2573, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.484375, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 9.237288135593221e-05, | |
| "loss": 0.2424, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.5625, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 9.152542372881357e-05, | |
| "loss": 0.2339, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.640625, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 9.067796610169493e-05, | |
| "loss": 0.2326, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.71875, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 8.983050847457629e-05, | |
| "loss": 0.2267, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.796875, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 8.898305084745763e-05, | |
| "loss": 0.2233, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.875, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8.813559322033899e-05, | |
| "loss": 0.2211, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.953125, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 8.728813559322035e-05, | |
| "loss": 0.2287, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.03125, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 8.644067796610171e-05, | |
| "loss": 0.2169, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.109375, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.559322033898305e-05, | |
| "loss": 0.2161, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.1875, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 8.474576271186441e-05, | |
| "loss": 0.2192, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.265625, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 8.389830508474577e-05, | |
| "loss": 0.2136, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 8.305084745762712e-05, | |
| "loss": 0.1974, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.421875, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 8.220338983050848e-05, | |
| "loss": 0.2046, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 8.135593220338983e-05, | |
| "loss": 0.1961, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.578125, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 8.050847457627118e-05, | |
| "loss": 0.196, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.65625, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 7.966101694915254e-05, | |
| "loss": 0.1955, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.734375, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 7.88135593220339e-05, | |
| "loss": 0.1867, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.8125, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 7.796610169491526e-05, | |
| "loss": 0.184, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.890625, | |
| "grad_norm": 0.5, | |
| "learning_rate": 7.711864406779662e-05, | |
| "loss": 0.1911, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.96875, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 7.627118644067796e-05, | |
| "loss": 0.185, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.046875, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 7.542372881355932e-05, | |
| "loss": 0.1682, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 7.457627118644068e-05, | |
| "loss": 0.166, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.203125, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 7.372881355932204e-05, | |
| "loss": 0.1683, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.28125, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 7.288135593220338e-05, | |
| "loss": 0.1672, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.359375, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 7.203389830508474e-05, | |
| "loss": 0.1518, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.4375, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 7.11864406779661e-05, | |
| "loss": 0.1555, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.515625, | |
| "grad_norm": 0.5, | |
| "learning_rate": 7.033898305084746e-05, | |
| "loss": 0.1632, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.59375, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 6.949152542372882e-05, | |
| "loss": 0.1647, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.671875, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 6.864406779661017e-05, | |
| "loss": 0.156, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 6.779661016949152e-05, | |
| "loss": 0.1445, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.828125, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 6.694915254237288e-05, | |
| "loss": 0.125, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.610169491525424e-05, | |
| "loss": 0.1408, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.984375, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 6.52542372881356e-05, | |
| "loss": 0.1295, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.0625, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 6.440677966101695e-05, | |
| "loss": 0.127, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.140625, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 6.35593220338983e-05, | |
| "loss": 0.1191, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.21875, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 6.271186440677966e-05, | |
| "loss": 0.1288, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.296875, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 6.186440677966102e-05, | |
| "loss": 0.105, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.375, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 6.101694915254238e-05, | |
| "loss": 0.1171, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.453125, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 6.016949152542373e-05, | |
| "loss": 0.1279, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.53125, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 5.932203389830509e-05, | |
| "loss": 0.1049, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.609375, | |
| "grad_norm": 31.0, | |
| "learning_rate": 5.8474576271186446e-05, | |
| "loss": 0.1511, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 5.76271186440678e-05, | |
| "loss": 0.1048, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.765625, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 5.677966101694916e-05, | |
| "loss": 0.1087, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.84375, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 5.593220338983051e-05, | |
| "loss": 0.1018, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.921875, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.508474576271186e-05, | |
| "loss": 0.1116, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 5.423728813559322e-05, | |
| "loss": 0.1077, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 5.078125, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 5.338983050847458e-05, | |
| "loss": 0.1047, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.15625, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 5.254237288135594e-05, | |
| "loss": 0.101, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 5.234375, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 5.1694915254237284e-05, | |
| "loss": 0.0999, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 5.3125, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 5.0847457627118643e-05, | |
| "loss": 0.0964, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 5.390625, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 5e-05, | |
| "loss": 0.1011, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 5.46875, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 4.915254237288136e-05, | |
| "loss": 0.1019, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 5.546875, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 4.8305084745762714e-05, | |
| "loss": 0.1089, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 5.625, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 4.745762711864407e-05, | |
| "loss": 0.0998, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 5.703125, | |
| "grad_norm": 10.375, | |
| "learning_rate": 4.6610169491525425e-05, | |
| "loss": 0.1138, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 5.78125, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 4.5762711864406784e-05, | |
| "loss": 0.1189, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 5.859375, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 4.491525423728814e-05, | |
| "loss": 0.1006, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 5.9375, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 4.4067796610169495e-05, | |
| "loss": 0.1049, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 6.015625, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 4.3220338983050854e-05, | |
| "loss": 0.0988, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 6.09375, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 4.2372881355932206e-05, | |
| "loss": 0.1061, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.171875, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 4.152542372881356e-05, | |
| "loss": 0.0999, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 4.067796610169492e-05, | |
| "loss": 0.102, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 6.328125, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 3.983050847457627e-05, | |
| "loss": 0.0933, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 6.40625, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 3.898305084745763e-05, | |
| "loss": 0.0963, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 6.484375, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 3.813559322033898e-05, | |
| "loss": 0.1018, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 6.5625, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 3.728813559322034e-05, | |
| "loss": 0.1038, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 6.640625, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 3.644067796610169e-05, | |
| "loss": 0.0978, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 6.71875, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 3.559322033898305e-05, | |
| "loss": 0.1033, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 6.796875, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 3.474576271186441e-05, | |
| "loss": 0.0998, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 6.875, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 3.389830508474576e-05, | |
| "loss": 0.0943, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 6.953125, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 3.305084745762712e-05, | |
| "loss": 0.1026, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 7.03125, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 3.2203389830508473e-05, | |
| "loss": 0.1009, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 7.109375, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 3.135593220338983e-05, | |
| "loss": 0.0949, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 7.1875, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 3.050847457627119e-05, | |
| "loss": 0.0969, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 7.265625, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 2.9661016949152544e-05, | |
| "loss": 0.0965, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 7.34375, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.88135593220339e-05, | |
| "loss": 0.0929, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 7.421875, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 2.7966101694915255e-05, | |
| "loss": 0.0982, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 2.711864406779661e-05, | |
| "loss": 0.1018, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 7.578125, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 2.627118644067797e-05, | |
| "loss": 0.0926, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 7.65625, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 2.5423728813559322e-05, | |
| "loss": 0.0944, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 7.734375, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.457627118644068e-05, | |
| "loss": 0.0957, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 7.8125, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.3728813559322036e-05, | |
| "loss": 0.0974, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 7.890625, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 2.2881355932203392e-05, | |
| "loss": 0.1005, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 7.96875, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 2.2033898305084748e-05, | |
| "loss": 0.0988, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 8.046875, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 2.1186440677966103e-05, | |
| "loss": 0.0968, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 8.125, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 2.033898305084746e-05, | |
| "loss": 0.0951, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 8.203125, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 1.9491525423728814e-05, | |
| "loss": 0.0925, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 8.28125, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 1.864406779661017e-05, | |
| "loss": 0.0953, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 8.359375, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 1.7796610169491526e-05, | |
| "loss": 0.0966, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 8.4375, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.694915254237288e-05, | |
| "loss": 0.1083, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 8.515625, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 1.6101694915254237e-05, | |
| "loss": 0.0992, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 8.59375, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.5254237288135596e-05, | |
| "loss": 0.11, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 8.671875, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 1.440677966101695e-05, | |
| "loss": 0.1163, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 1.3559322033898305e-05, | |
| "loss": 0.1086, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 8.828125, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.2711864406779661e-05, | |
| "loss": 0.0908, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 8.90625, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.1864406779661018e-05, | |
| "loss": 0.0967, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 8.984375, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 1.1016949152542374e-05, | |
| "loss": 0.0959, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 9.0625, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 1.016949152542373e-05, | |
| "loss": 0.0855, | |
| "step": 1160 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1280, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4466660675659776.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |