| { | |
| "best_metric": 0.2531912922859192, | |
| "best_model_checkpoint": "./ryan_model314/checkpoint-600", | |
| "epoch": 4.0, | |
| "eval_steps": 100, | |
| "global_step": 2500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.1103402376174927, | |
| "learning_rate": 0.00019920000000000002, | |
| "loss": 0.5731, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.8193413019180298, | |
| "learning_rate": 0.0001984, | |
| "loss": 0.4217, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.3536686897277832, | |
| "learning_rate": 0.0001976, | |
| "loss": 0.3709, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.9998810887336731, | |
| "learning_rate": 0.0001968, | |
| "loss": 0.3398, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.5689244270324707, | |
| "learning_rate": 0.000196, | |
| "loss": 0.3346, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.1778826713562012, | |
| "learning_rate": 0.0001952, | |
| "loss": 0.3406, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.3193926811218262, | |
| "learning_rate": 0.0001944, | |
| "loss": 0.2755, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.1302804946899414, | |
| "learning_rate": 0.00019360000000000002, | |
| "loss": 0.3944, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.8255844712257385, | |
| "learning_rate": 0.0001928, | |
| "loss": 0.3473, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.0871790647506714, | |
| "learning_rate": 0.000192, | |
| "loss": 0.3042, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 0.3673088252544403, | |
| "eval_na_accuracy": 0.928, | |
| "eval_ordinal_accuracy": 0.4671280276816609, | |
| "eval_runtime": 110.8646, | |
| "eval_samples_per_second": 9.02, | |
| "eval_steps_per_second": 1.128, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.5816177129745483, | |
| "learning_rate": 0.0001912, | |
| "loss": 0.4058, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.8431822061538696, | |
| "learning_rate": 0.0001904, | |
| "loss": 0.2781, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.0826754570007324, | |
| "learning_rate": 0.0001896, | |
| "loss": 0.2587, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 3.7366294860839844, | |
| "learning_rate": 0.0001888, | |
| "loss": 0.3432, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5233088731765747, | |
| "learning_rate": 0.000188, | |
| "loss": 0.3484, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.8766111135482788, | |
| "learning_rate": 0.00018720000000000002, | |
| "loss": 0.3597, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.0037935972213745, | |
| "learning_rate": 0.00018640000000000003, | |
| "loss": 0.288, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.3281046152114868, | |
| "learning_rate": 0.0001856, | |
| "loss": 0.3207, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.4793013334274292, | |
| "learning_rate": 0.00018480000000000002, | |
| "loss": 0.3372, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.8796727657318115, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 0.2904, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 0.29769936203956604, | |
| "eval_na_accuracy": 0.933, | |
| "eval_ordinal_accuracy": 0.5790080738177624, | |
| "eval_runtime": 39.6284, | |
| "eval_samples_per_second": 25.234, | |
| "eval_steps_per_second": 3.154, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.3502057790756226, | |
| "learning_rate": 0.0001832, | |
| "loss": 0.3519, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.5546174049377441, | |
| "learning_rate": 0.00018240000000000002, | |
| "loss": 0.3243, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7677227854728699, | |
| "learning_rate": 0.00018160000000000002, | |
| "loss": 0.2914, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.1754639148712158, | |
| "learning_rate": 0.0001808, | |
| "loss": 0.3539, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.8472470641136169, | |
| "learning_rate": 0.00018, | |
| "loss": 0.2395, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.1917964220046997, | |
| "learning_rate": 0.00017920000000000002, | |
| "loss": 0.2295, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.7398644685745239, | |
| "learning_rate": 0.0001784, | |
| "loss": 0.2398, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5953208804130554, | |
| "learning_rate": 0.0001776, | |
| "loss": 0.2786, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 2.0648913383483887, | |
| "learning_rate": 0.00017680000000000001, | |
| "loss": 0.3661, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.4048805236816406, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 0.2648, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 0.2830840051174164, | |
| "eval_na_accuracy": 0.944, | |
| "eval_ordinal_accuracy": 0.5940023068050749, | |
| "eval_runtime": 39.7255, | |
| "eval_samples_per_second": 25.173, | |
| "eval_steps_per_second": 3.147, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.8102580904960632, | |
| "learning_rate": 0.0001752, | |
| "loss": 0.2359, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 2.0220913887023926, | |
| "learning_rate": 0.0001744, | |
| "loss": 0.2557, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.2111886739730835, | |
| "learning_rate": 0.00017360000000000002, | |
| "loss": 0.3025, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.788378119468689, | |
| "learning_rate": 0.0001728, | |
| "loss": 0.3067, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.7332974076271057, | |
| "learning_rate": 0.000172, | |
| "loss": 0.2612, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.5220205783843994, | |
| "learning_rate": 0.00017120000000000001, | |
| "loss": 0.2924, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.8991191387176514, | |
| "learning_rate": 0.0001704, | |
| "loss": 0.2379, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.6633837223052979, | |
| "learning_rate": 0.0001696, | |
| "loss": 0.2792, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.9553330540657043, | |
| "learning_rate": 0.0001688, | |
| "loss": 0.2512, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.4544942080974579, | |
| "learning_rate": 0.000168, | |
| "loss": 0.3036, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 0.27759096026420593, | |
| "eval_na_accuracy": 0.949, | |
| "eval_ordinal_accuracy": 0.5870818915801614, | |
| "eval_runtime": 39.7611, | |
| "eval_samples_per_second": 25.15, | |
| "eval_steps_per_second": 3.144, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.2751814126968384, | |
| "learning_rate": 0.0001672, | |
| "loss": 0.3042, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.791074514389038, | |
| "learning_rate": 0.0001664, | |
| "loss": 0.3341, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.9887642860412598, | |
| "learning_rate": 0.0001656, | |
| "loss": 0.2868, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.3511923551559448, | |
| "learning_rate": 0.0001648, | |
| "loss": 0.3763, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.7992609739303589, | |
| "learning_rate": 0.000164, | |
| "loss": 0.2264, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.0241813659667969, | |
| "learning_rate": 0.0001632, | |
| "loss": 0.3018, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.628193736076355, | |
| "learning_rate": 0.00016240000000000002, | |
| "loss": 0.3323, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.8471026420593262, | |
| "learning_rate": 0.00016160000000000002, | |
| "loss": 0.2005, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.1799852848052979, | |
| "learning_rate": 0.0001608, | |
| "loss": 0.2984, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.595058560371399, | |
| "learning_rate": 0.00016, | |
| "loss": 0.2656, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.2846027612686157, | |
| "eval_na_accuracy": 0.931, | |
| "eval_ordinal_accuracy": 0.6101499423298731, | |
| "eval_runtime": 39.9154, | |
| "eval_samples_per_second": 25.053, | |
| "eval_steps_per_second": 3.132, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.9246352910995483, | |
| "learning_rate": 0.00015920000000000002, | |
| "loss": 0.3591, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.9456105828285217, | |
| "learning_rate": 0.00015840000000000003, | |
| "loss": 0.3569, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.111274003982544, | |
| "learning_rate": 0.0001576, | |
| "loss": 0.3243, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": null, | |
| "learning_rate": 0.00015688, | |
| "loss": 0.2911, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.8232502341270447, | |
| "learning_rate": 0.00015616000000000002, | |
| "loss": 0.3236, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.6359846591949463, | |
| "learning_rate": 0.00015536, | |
| "loss": 0.3211, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.545005738735199, | |
| "learning_rate": 0.00015456, | |
| "loss": 0.205, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.6029797196388245, | |
| "learning_rate": 0.00015376000000000002, | |
| "loss": 0.1928, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.7442355155944824, | |
| "learning_rate": 0.00015296000000000003, | |
| "loss": 0.3273, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.6751519441604614, | |
| "learning_rate": 0.00015216, | |
| "loss": 0.2954, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 0.2531912922859192, | |
| "eval_na_accuracy": 0.947, | |
| "eval_ordinal_accuracy": 0.5951557093425606, | |
| "eval_runtime": 39.4037, | |
| "eval_samples_per_second": 25.378, | |
| "eval_steps_per_second": 3.172, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.6347012519836426, | |
| "learning_rate": 0.00015136000000000001, | |
| "loss": 0.2256, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 6.180319309234619, | |
| "learning_rate": 0.00015056000000000002, | |
| "loss": 0.2003, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.8919633626937866, | |
| "learning_rate": 0.00014976, | |
| "loss": 0.198, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.9197341203689575, | |
| "learning_rate": 0.00014896, | |
| "loss": 0.1685, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.37014976143836975, | |
| "learning_rate": 0.00014816000000000002, | |
| "loss": 0.1729, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 0.8919755220413208, | |
| "learning_rate": 0.00014736, | |
| "loss": 0.1993, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.7291600704193115, | |
| "learning_rate": 0.00014656, | |
| "loss": 0.1893, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 2.347400665283203, | |
| "learning_rate": 0.00014576000000000001, | |
| "loss": 0.1799, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.3188568949699402, | |
| "learning_rate": 0.00014496, | |
| "loss": 0.167, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.342278242111206, | |
| "learning_rate": 0.00014416, | |
| "loss": 0.1991, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_loss": 0.26034072041511536, | |
| "eval_na_accuracy": 0.942, | |
| "eval_ordinal_accuracy": 0.6078431372549019, | |
| "eval_runtime": 39.5088, | |
| "eval_samples_per_second": 25.311, | |
| "eval_steps_per_second": 3.164, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.48655757308006287, | |
| "learning_rate": 0.00014336, | |
| "loss": 0.1885, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 0.556333065032959, | |
| "learning_rate": 0.00014256000000000002, | |
| "loss": 0.1449, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.4880894422531128, | |
| "learning_rate": 0.00014176, | |
| "loss": 0.1164, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.599926233291626, | |
| "learning_rate": 0.00014096, | |
| "loss": 0.2113, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.6070149540901184, | |
| "learning_rate": 0.00014016, | |
| "loss": 0.1534, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.7789746522903442, | |
| "learning_rate": 0.00013936, | |
| "loss": 0.1655, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.5523375868797302, | |
| "learning_rate": 0.00013856, | |
| "loss": 0.298, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.4257819652557373, | |
| "learning_rate": 0.00013776, | |
| "loss": 0.2101, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.5729731321334839, | |
| "learning_rate": 0.00013696, | |
| "loss": 0.133, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.4050444960594177, | |
| "learning_rate": 0.00013616, | |
| "loss": 0.1678, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_loss": 0.2904650568962097, | |
| "eval_na_accuracy": 0.942, | |
| "eval_ordinal_accuracy": 0.6332179930795848, | |
| "eval_runtime": 39.7139, | |
| "eval_samples_per_second": 25.18, | |
| "eval_steps_per_second": 3.148, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 4.782747745513916, | |
| "learning_rate": 0.00013536, | |
| "loss": 0.207, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.577669143676758, | |
| "learning_rate": 0.00013455999999999999, | |
| "loss": 0.1818, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 2.8163273334503174, | |
| "learning_rate": 0.00013376, | |
| "loss": 0.1761, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 2.213799238204956, | |
| "learning_rate": 0.00013296, | |
| "loss": 0.2966, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 0.8946444988250732, | |
| "learning_rate": 0.00013216, | |
| "loss": 0.1569, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.6494708061218262, | |
| "learning_rate": 0.00013136000000000002, | |
| "loss": 0.1746, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.0058079957962036, | |
| "learning_rate": 0.00013056000000000002, | |
| "loss": 0.1204, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.1752161979675293, | |
| "learning_rate": 0.00012976, | |
| "loss": 0.2082, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.5655858516693115, | |
| "learning_rate": 0.00012896, | |
| "loss": 0.1971, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.5486743450164795, | |
| "learning_rate": 0.00012816000000000002, | |
| "loss": 0.2514, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_loss": 0.25656750798225403, | |
| "eval_na_accuracy": 0.94, | |
| "eval_ordinal_accuracy": 0.6089965397923875, | |
| "eval_runtime": 39.7194, | |
| "eval_samples_per_second": 25.177, | |
| "eval_steps_per_second": 3.147, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.878511369228363, | |
| "learning_rate": 0.00012736, | |
| "loss": 0.162, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.1985282897949219, | |
| "learning_rate": 0.00012656, | |
| "loss": 0.2268, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 0.521425187587738, | |
| "learning_rate": 0.00012576000000000002, | |
| "loss": 0.1556, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.9773241877555847, | |
| "learning_rate": 0.00012496000000000002, | |
| "loss": 0.1457, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.6476322412490845, | |
| "learning_rate": 0.00012416, | |
| "loss": 0.1913, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.7127236127853394, | |
| "learning_rate": 0.00012336, | |
| "loss": 0.1961, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 4.41243314743042, | |
| "learning_rate": 0.00012256000000000002, | |
| "loss": 0.2061, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.5907992124557495, | |
| "learning_rate": 0.00012176000000000001, | |
| "loss": 0.1299, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.5711427927017212, | |
| "learning_rate": 0.00012096000000000001, | |
| "loss": 0.1755, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.925363302230835, | |
| "learning_rate": 0.00012016, | |
| "loss": 0.2328, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.2884255647659302, | |
| "eval_na_accuracy": 0.94, | |
| "eval_ordinal_accuracy": 0.5617070357554786, | |
| "eval_runtime": 39.4898, | |
| "eval_samples_per_second": 25.323, | |
| "eval_steps_per_second": 3.165, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.1306260824203491, | |
| "learning_rate": 0.00011936000000000001, | |
| "loss": 0.1595, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.8953267335891724, | |
| "learning_rate": 0.00011856, | |
| "loss": 0.2489, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 0.7074128985404968, | |
| "learning_rate": 0.00011776, | |
| "loss": 0.2485, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.7052355408668518, | |
| "learning_rate": 0.00011696, | |
| "loss": 0.2075, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.7830259203910828, | |
| "learning_rate": 0.00011616, | |
| "loss": 0.2346, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.5882430672645569, | |
| "learning_rate": 0.00011536000000000001, | |
| "loss": 0.2136, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.0235962867736816, | |
| "learning_rate": 0.00011456, | |
| "loss": 0.1753, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 0.9401603937149048, | |
| "learning_rate": 0.00011376, | |
| "loss": 0.173, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.0735399723052979, | |
| "learning_rate": 0.00011296, | |
| "loss": 0.1993, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.6592912673950195, | |
| "learning_rate": 0.00011216, | |
| "loss": 0.1826, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_loss": 0.2869604229927063, | |
| "eval_na_accuracy": 0.943, | |
| "eval_ordinal_accuracy": 0.6043829296424452, | |
| "eval_runtime": 39.3069, | |
| "eval_samples_per_second": 25.441, | |
| "eval_steps_per_second": 3.18, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.1649601459503174, | |
| "learning_rate": 0.00011135999999999999, | |
| "loss": 0.221, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.7881801128387451, | |
| "learning_rate": 0.00011056, | |
| "loss": 0.1765, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.3527191877365112, | |
| "learning_rate": 0.00010975999999999999, | |
| "loss": 0.1325, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.7212499976158142, | |
| "learning_rate": 0.00010896, | |
| "loss": 0.2445, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.9492518901824951, | |
| "learning_rate": 0.00010816, | |
| "loss": 0.2704, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 0.4344118535518646, | |
| "learning_rate": 0.00010736000000000002, | |
| "loss": 0.1624, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 0.4115823209285736, | |
| "learning_rate": 0.00010656000000000001, | |
| "loss": 0.2069, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 0.6738015413284302, | |
| "learning_rate": 0.00010576000000000002, | |
| "loss": 0.208, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.9090007543563843, | |
| "learning_rate": 0.00010496000000000001, | |
| "loss": 0.1793, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.1480025053024292, | |
| "learning_rate": 0.00010416000000000002, | |
| "loss": 0.2013, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_loss": 0.29365527629852295, | |
| "eval_na_accuracy": 0.941, | |
| "eval_ordinal_accuracy": 0.5905420991926182, | |
| "eval_runtime": 39.6842, | |
| "eval_samples_per_second": 25.199, | |
| "eval_steps_per_second": 3.15, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.62380051612854, | |
| "learning_rate": 0.00010336000000000001, | |
| "loss": 0.192, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.9949710965156555, | |
| "learning_rate": 0.00010256000000000001, | |
| "loss": 0.226, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 1.0634446144104004, | |
| "learning_rate": 0.00010176000000000002, | |
| "loss": 0.142, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 0.8875225782394409, | |
| "learning_rate": 0.00010096000000000001, | |
| "loss": 0.1729, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.6193259358406067, | |
| "learning_rate": 0.00010016, | |
| "loss": 0.102, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.4042517840862274, | |
| "learning_rate": 9.936000000000001e-05, | |
| "loss": 0.0976, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 0.4051195979118347, | |
| "learning_rate": 9.856e-05, | |
| "loss": 0.1448, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 0.46061789989471436, | |
| "learning_rate": 9.776000000000001e-05, | |
| "loss": 0.0768, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.5934004783630371, | |
| "learning_rate": 9.696000000000001e-05, | |
| "loss": 0.1404, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.6819984316825867, | |
| "learning_rate": 9.616e-05, | |
| "loss": 0.0663, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_loss": 0.2954486608505249, | |
| "eval_na_accuracy": 0.938, | |
| "eval_ordinal_accuracy": 0.6251441753171857, | |
| "eval_runtime": 39.658, | |
| "eval_samples_per_second": 25.216, | |
| "eval_steps_per_second": 3.152, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.5849266052246094, | |
| "learning_rate": 9.536000000000001e-05, | |
| "loss": 0.1574, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.9393780827522278, | |
| "learning_rate": 9.456e-05, | |
| "loss": 0.0979, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.47529059648513794, | |
| "learning_rate": 9.376e-05, | |
| "loss": 0.1073, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.4079722464084625, | |
| "learning_rate": 9.296e-05, | |
| "loss": 0.0868, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.7292589545249939, | |
| "learning_rate": 9.216e-05, | |
| "loss": 0.1446, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.9205511212348938, | |
| "learning_rate": 9.136e-05, | |
| "loss": 0.0907, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.9218105673789978, | |
| "learning_rate": 9.056e-05, | |
| "loss": 0.1387, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.5730422139167786, | |
| "learning_rate": 8.976e-05, | |
| "loss": 0.0882, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.6922823190689087, | |
| "learning_rate": 8.896e-05, | |
| "loss": 0.0741, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.1872971057891846, | |
| "learning_rate": 8.816000000000001e-05, | |
| "loss": 0.1503, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_loss": 0.3187769651412964, | |
| "eval_na_accuracy": 0.937, | |
| "eval_ordinal_accuracy": 0.5986159169550173, | |
| "eval_runtime": 39.7243, | |
| "eval_samples_per_second": 25.173, | |
| "eval_steps_per_second": 3.147, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 2.3350443840026855, | |
| "learning_rate": 8.736e-05, | |
| "loss": 0.1083, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 0.8266046643257141, | |
| "learning_rate": 8.656000000000001e-05, | |
| "loss": 0.0684, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.41480687260627747, | |
| "learning_rate": 8.576e-05, | |
| "loss": 0.0809, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.4657377600669861, | |
| "learning_rate": 8.496e-05, | |
| "loss": 0.0718, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.5419800877571106, | |
| "learning_rate": 8.416000000000001e-05, | |
| "loss": 0.1322, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.167611837387085, | |
| "learning_rate": 8.336e-05, | |
| "loss": 0.1017, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.449034184217453, | |
| "learning_rate": 8.256000000000001e-05, | |
| "loss": 0.0636, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.6716451048851013, | |
| "learning_rate": 8.176e-05, | |
| "loss": 0.1109, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 4.306596755981445, | |
| "learning_rate": 8.096e-05, | |
| "loss": 0.0898, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.41288742423057556, | |
| "learning_rate": 8.016e-05, | |
| "loss": 0.0611, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 0.33932703733444214, | |
| "eval_na_accuracy": 0.945, | |
| "eval_ordinal_accuracy": 0.5997693194925029, | |
| "eval_runtime": 39.236, | |
| "eval_samples_per_second": 25.487, | |
| "eval_steps_per_second": 3.186, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.7951626181602478, | |
| "learning_rate": 7.936e-05, | |
| "loss": 0.0799, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.0197049379348755, | |
| "learning_rate": 7.856000000000001e-05, | |
| "loss": 0.0928, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.6486759185791016, | |
| "learning_rate": 7.776e-05, | |
| "loss": 0.0964, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.0220657587051392, | |
| "learning_rate": 7.696e-05, | |
| "loss": 0.1736, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.3006441593170166, | |
| "learning_rate": 7.616e-05, | |
| "loss": 0.0993, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.4701670706272125, | |
| "learning_rate": 7.536000000000001e-05, | |
| "loss": 0.0779, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.641832172870636, | |
| "learning_rate": 7.456e-05, | |
| "loss": 0.0586, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.5836305618286133, | |
| "learning_rate": 7.376000000000001e-05, | |
| "loss": 0.053, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.6500815153121948, | |
| "learning_rate": 7.296e-05, | |
| "loss": 0.0779, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.5682386755943298, | |
| "learning_rate": 7.216e-05, | |
| "loss": 0.0743, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_loss": 0.3182476758956909, | |
| "eval_na_accuracy": 0.942, | |
| "eval_ordinal_accuracy": 0.6482122260668973, | |
| "eval_runtime": 40.5507, | |
| "eval_samples_per_second": 24.66, | |
| "eval_steps_per_second": 3.083, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.5527540445327759, | |
| "learning_rate": 7.136000000000001e-05, | |
| "loss": 0.094, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.8710426092147827, | |
| "learning_rate": 7.056e-05, | |
| "loss": 0.0839, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.9312260746955872, | |
| "learning_rate": 6.976000000000001e-05, | |
| "loss": 0.155, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.48695412278175354, | |
| "learning_rate": 6.896e-05, | |
| "loss": 0.0606, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.525652289390564, | |
| "learning_rate": 6.816e-05, | |
| "loss": 0.0715, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.7670960426330566, | |
| "learning_rate": 6.736e-05, | |
| "loss": 0.1108, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 1.0041375160217285, | |
| "learning_rate": 6.656e-05, | |
| "loss": 0.1257, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.3819805681705475, | |
| "learning_rate": 6.576e-05, | |
| "loss": 0.1004, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.5372006893157959, | |
| "learning_rate": 6.496e-05, | |
| "loss": 0.0825, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.5835949182510376, | |
| "learning_rate": 6.416e-05, | |
| "loss": 0.0908, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "eval_loss": 0.3332485854625702, | |
| "eval_na_accuracy": 0.942, | |
| "eval_ordinal_accuracy": 0.6482122260668973, | |
| "eval_runtime": 39.9642, | |
| "eval_samples_per_second": 25.022, | |
| "eval_steps_per_second": 3.128, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.677947461605072, | |
| "learning_rate": 6.336e-05, | |
| "loss": 0.1086, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.7373325228691101, | |
| "learning_rate": 6.256000000000001e-05, | |
| "loss": 0.0698, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.7738047242164612, | |
| "learning_rate": 6.176e-05, | |
| "loss": 0.1118, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 2.052891254425049, | |
| "learning_rate": 6.0960000000000006e-05, | |
| "loss": 0.1002, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.26311352849006653, | |
| "learning_rate": 6.016000000000001e-05, | |
| "loss": 0.0944, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.8190409541130066, | |
| "learning_rate": 5.936000000000001e-05, | |
| "loss": 0.0717, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.5824436545372009, | |
| "learning_rate": 5.856e-05, | |
| "loss": 0.0746, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.5489352941513062, | |
| "learning_rate": 5.776e-05, | |
| "loss": 0.1063, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.656225323677063, | |
| "learning_rate": 5.6960000000000004e-05, | |
| "loss": 0.0763, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.8495000600814819, | |
| "learning_rate": 5.6160000000000004e-05, | |
| "loss": 0.1108, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_loss": 0.32561835646629333, | |
| "eval_na_accuracy": 0.943, | |
| "eval_ordinal_accuracy": 0.6459054209919262, | |
| "eval_runtime": 39.4673, | |
| "eval_samples_per_second": 25.337, | |
| "eval_steps_per_second": 3.167, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 0.617258608341217, | |
| "learning_rate": 5.536e-05, | |
| "loss": 0.1203, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.4484919011592865, | |
| "learning_rate": 5.456e-05, | |
| "loss": 0.0573, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 0.533388614654541, | |
| "learning_rate": 5.376e-05, | |
| "loss": 0.0762, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.4078121483325958, | |
| "learning_rate": 5.296e-05, | |
| "loss": 0.0643, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.5678732395172119, | |
| "learning_rate": 5.2159999999999995e-05, | |
| "loss": 0.1126, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 0.6543716192245483, | |
| "learning_rate": 5.1359999999999996e-05, | |
| "loss": 0.0763, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 0.6005885601043701, | |
| "learning_rate": 5.056000000000001e-05, | |
| "loss": 0.0949, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 0.4467845559120178, | |
| "learning_rate": 4.976e-05, | |
| "loss": 0.0481, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.48746606707572937, | |
| "learning_rate": 4.896e-05, | |
| "loss": 0.0415, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.7011713981628418, | |
| "learning_rate": 4.816e-05, | |
| "loss": 0.0786, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_loss": 0.3222349286079407, | |
| "eval_na_accuracy": 0.944, | |
| "eval_ordinal_accuracy": 0.6539792387543253, | |
| "eval_runtime": 39.2271, | |
| "eval_samples_per_second": 25.493, | |
| "eval_steps_per_second": 3.187, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.5153183341026306, | |
| "learning_rate": 4.736000000000001e-05, | |
| "loss": 0.0387, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 0.43751129508018494, | |
| "learning_rate": 4.656e-05, | |
| "loss": 0.0364, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.7584701776504517, | |
| "learning_rate": 4.576e-05, | |
| "loss": 0.0397, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 0.20170661807060242, | |
| "learning_rate": 4.496e-05, | |
| "loss": 0.0288, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.2583639919757843, | |
| "learning_rate": 4.4160000000000004e-05, | |
| "loss": 0.0302, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 3.9720704555511475, | |
| "learning_rate": 4.336e-05, | |
| "loss": 0.0484, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.3367606997489929, | |
| "learning_rate": 4.256e-05, | |
| "loss": 0.0387, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 0.7610962986946106, | |
| "learning_rate": 4.176000000000001e-05, | |
| "loss": 0.0424, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 0.6901140213012695, | |
| "learning_rate": 4.096e-05, | |
| "loss": 0.035, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.2873363196849823, | |
| "learning_rate": 4.016e-05, | |
| "loss": 0.043, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 0.35012441873550415, | |
| "eval_na_accuracy": 0.941, | |
| "eval_ordinal_accuracy": 0.6482122260668973, | |
| "eval_runtime": 39.427, | |
| "eval_samples_per_second": 25.363, | |
| "eval_steps_per_second": 3.17, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.5277103781700134, | |
| "learning_rate": 3.936e-05, | |
| "loss": 0.0423, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 0.3088182508945465, | |
| "learning_rate": 3.8560000000000004e-05, | |
| "loss": 0.0305, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 0.3621159791946411, | |
| "learning_rate": 3.776e-05, | |
| "loss": 0.0398, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.6761226654052734, | |
| "learning_rate": 3.696e-05, | |
| "loss": 0.0606, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.9860779047012329, | |
| "learning_rate": 3.616e-05, | |
| "loss": 0.0437, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 0.6743874549865723, | |
| "learning_rate": 3.536000000000001e-05, | |
| "loss": 0.0328, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 1.5928354263305664, | |
| "learning_rate": 3.456e-05, | |
| "loss": 0.0261, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.9067389965057373, | |
| "learning_rate": 3.376e-05, | |
| "loss": 0.0268, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.5733221173286438, | |
| "learning_rate": 3.296e-05, | |
| "loss": 0.0335, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.7042862772941589, | |
| "learning_rate": 3.2160000000000004e-05, | |
| "loss": 0.0472, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "eval_loss": 0.34554365277290344, | |
| "eval_na_accuracy": 0.943, | |
| "eval_ordinal_accuracy": 0.6608996539792388, | |
| "eval_runtime": 40.1655, | |
| "eval_samples_per_second": 24.897, | |
| "eval_steps_per_second": 3.112, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.8036783933639526, | |
| "learning_rate": 3.136e-05, | |
| "loss": 0.0402, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 0.4863825738430023, | |
| "learning_rate": 3.056e-05, | |
| "loss": 0.0507, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.5171158313751221, | |
| "learning_rate": 2.976e-05, | |
| "loss": 0.0333, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 0.21965381503105164, | |
| "learning_rate": 2.8960000000000004e-05, | |
| "loss": 0.0277, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.20841450989246368, | |
| "learning_rate": 2.816e-05, | |
| "loss": 0.0259, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 0.5015869736671448, | |
| "learning_rate": 2.7360000000000002e-05, | |
| "loss": 0.0316, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 0.7938678860664368, | |
| "learning_rate": 2.6560000000000003e-05, | |
| "loss": 0.0301, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 0.44840845465660095, | |
| "learning_rate": 2.576e-05, | |
| "loss": 0.0406, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.35510167479515076, | |
| "learning_rate": 2.496e-05, | |
| "loss": 0.0268, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.37328681349754333, | |
| "learning_rate": 2.4160000000000002e-05, | |
| "loss": 0.032, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "eval_loss": 0.35616353154182434, | |
| "eval_na_accuracy": 0.94, | |
| "eval_ordinal_accuracy": 0.6516724336793541, | |
| "eval_runtime": 40.3825, | |
| "eval_samples_per_second": 24.763, | |
| "eval_steps_per_second": 3.095, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 0.24070143699645996, | |
| "learning_rate": 2.336e-05, | |
| "loss": 0.0262, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 1.0428861379623413, | |
| "learning_rate": 2.256e-05, | |
| "loss": 0.04, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 0.626348614692688, | |
| "learning_rate": 2.176e-05, | |
| "loss": 0.0275, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 0.47826460003852844, | |
| "learning_rate": 2.0960000000000003e-05, | |
| "loss": 0.0379, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 2.685340166091919, | |
| "learning_rate": 2.016e-05, | |
| "loss": 0.047, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.1495877057313919, | |
| "learning_rate": 1.936e-05, | |
| "loss": 0.0389, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 0.5789759755134583, | |
| "learning_rate": 1.856e-05, | |
| "loss": 0.0337, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.4255303144454956, | |
| "learning_rate": 1.7760000000000003e-05, | |
| "loss": 0.0282, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 0.7483348846435547, | |
| "learning_rate": 1.696e-05, | |
| "loss": 0.0668, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.23885439336299896, | |
| "learning_rate": 1.616e-05, | |
| "loss": 0.0434, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "eval_loss": 0.34990155696868896, | |
| "eval_na_accuracy": 0.94, | |
| "eval_ordinal_accuracy": 0.6597462514417531, | |
| "eval_runtime": 40.407, | |
| "eval_samples_per_second": 24.748, | |
| "eval_steps_per_second": 3.094, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.3341818153858185, | |
| "learning_rate": 1.536e-05, | |
| "loss": 0.0358, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 0.6008884310722351, | |
| "learning_rate": 1.4560000000000001e-05, | |
| "loss": 0.0394, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 0.3966546654701233, | |
| "learning_rate": 1.376e-05, | |
| "loss": 0.0346, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 0.46933791041374207, | |
| "learning_rate": 1.296e-05, | |
| "loss": 0.0227, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.6652282476425171, | |
| "learning_rate": 1.216e-05, | |
| "loss": 0.0393, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.23938482999801636, | |
| "learning_rate": 1.1360000000000001e-05, | |
| "loss": 0.0267, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 0.6050881147384644, | |
| "learning_rate": 1.056e-05, | |
| "loss": 0.0287, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 0.22671189904212952, | |
| "learning_rate": 9.760000000000001e-06, | |
| "loss": 0.0491, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 0.5296955704689026, | |
| "learning_rate": 8.96e-06, | |
| "loss": 0.0266, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.5424560308456421, | |
| "learning_rate": 8.160000000000001e-06, | |
| "loss": 0.0341, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "eval_loss": 0.3610887825489044, | |
| "eval_na_accuracy": 0.94, | |
| "eval_ordinal_accuracy": 0.6482122260668973, | |
| "eval_runtime": 40.741, | |
| "eval_samples_per_second": 24.545, | |
| "eval_steps_per_second": 3.068, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 0.4790421724319458, | |
| "learning_rate": 7.36e-06, | |
| "loss": 0.0319, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 0.4021483063697815, | |
| "learning_rate": 6.560000000000001e-06, | |
| "loss": 0.0551, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 0.43051794171333313, | |
| "learning_rate": 5.76e-06, | |
| "loss": 0.0281, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.23781944811344147, | |
| "learning_rate": 4.96e-06, | |
| "loss": 0.0306, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.6060004234313965, | |
| "learning_rate": 4.16e-06, | |
| "loss": 0.0326, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.5149852633476257, | |
| "learning_rate": 3.36e-06, | |
| "loss": 0.0266, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 0.579931914806366, | |
| "learning_rate": 2.56e-06, | |
| "loss": 0.0236, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 0.14379101991653442, | |
| "learning_rate": 1.76e-06, | |
| "loss": 0.0221, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 0.5184658765792847, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0281, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.5299363732337952, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0305, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.36354970932006836, | |
| "eval_na_accuracy": 0.939, | |
| "eval_ordinal_accuracy": 0.6608996539792388, | |
| "eval_runtime": 40.233, | |
| "eval_samples_per_second": 24.855, | |
| "eval_steps_per_second": 3.107, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "step": 2500, | |
| "total_flos": 3.0997907103744e+18, | |
| "train_loss": 0.15650403581261635, | |
| "train_runtime": 3981.9696, | |
| "train_samples_per_second": 10.045, | |
| "train_steps_per_second": 0.628 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 100, | |
| "total_flos": 3.0997907103744e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |