{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 6250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 0.24006609618663788,
      "learning_rate": 0.0001984,
      "loss": 1.0243,
      "step": 50
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.21576859056949615,
      "learning_rate": 0.0001968,
      "loss": 0.9669,
      "step": 100
    },
    {
      "epoch": 0.024,
      "grad_norm": 0.20258210599422455,
      "learning_rate": 0.0001952,
      "loss": 0.9397,
      "step": 150
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.2684886157512665,
      "learning_rate": 0.00019360000000000002,
      "loss": 0.8793,
      "step": 200
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.22625429928302765,
      "learning_rate": 0.000192,
      "loss": 0.9209,
      "step": 250
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.25748205184936523,
      "learning_rate": 0.0001904,
      "loss": 0.8553,
      "step": 300
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.24440611898899078,
      "learning_rate": 0.0001888,
      "loss": 0.9141,
      "step": 350
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.24657492339611053,
      "learning_rate": 0.00018720000000000002,
      "loss": 0.8973,
      "step": 400
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.21331937611103058,
      "learning_rate": 0.0001856,
      "loss": 0.8836,
      "step": 450
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.23378247022628784,
      "learning_rate": 0.00018400000000000003,
      "loss": 0.8992,
      "step": 500
    },
    {
      "epoch": 0.088,
      "grad_norm": 0.21668757498264313,
      "learning_rate": 0.00018240000000000002,
      "loss": 0.8479,
      "step": 550
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.22808493673801422,
      "learning_rate": 0.0001808,
      "loss": 0.8805,
      "step": 600
    },
    {
      "epoch": 0.104,
      "grad_norm": 0.2394104301929474,
      "learning_rate": 0.00017920000000000002,
      "loss": 0.8633,
      "step": 650
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.22561711072921753,
      "learning_rate": 0.0001776,
      "loss": 0.8766,
      "step": 700
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.29312923550605774,
      "learning_rate": 0.00017600000000000002,
      "loss": 0.891,
      "step": 750
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.21683369576931,
      "learning_rate": 0.0001744,
      "loss": 0.8473,
      "step": 800
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.2582944929599762,
      "learning_rate": 0.0001728,
      "loss": 0.8949,
      "step": 850
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.2305222898721695,
      "learning_rate": 0.00017120000000000001,
      "loss": 0.8535,
      "step": 900
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.2317700982093811,
      "learning_rate": 0.0001696,
      "loss": 0.8677,
      "step": 950
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.2534327208995819,
      "learning_rate": 0.000168,
      "loss": 0.8662,
      "step": 1000
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.2503647208213806,
      "learning_rate": 0.0001664,
      "loss": 0.8606,
      "step": 1050
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.2579597234725952,
      "learning_rate": 0.0001648,
      "loss": 0.833,
      "step": 1100
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.2581159472465515,
      "learning_rate": 0.0001632,
      "loss": 0.8415,
      "step": 1150
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.24251115322113037,
      "learning_rate": 0.00016160000000000002,
      "loss": 0.8484,
      "step": 1200
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.28298044204711914,
      "learning_rate": 0.00016,
      "loss": 0.8405,
      "step": 1250
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.2423146665096283,
      "learning_rate": 0.00015840000000000003,
      "loss": 0.8315,
      "step": 1300
    },
    {
      "epoch": 0.216,
      "grad_norm": 0.22693276405334473,
      "learning_rate": 0.00015680000000000002,
      "loss": 0.8757,
      "step": 1350
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.24105730652809143,
      "learning_rate": 0.0001552,
      "loss": 0.8485,
      "step": 1400
    },
    {
      "epoch": 0.232,
      "grad_norm": 0.22538623213768005,
      "learning_rate": 0.00015360000000000002,
      "loss": 0.8946,
      "step": 1450
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.2406863272190094,
      "learning_rate": 0.000152,
      "loss": 0.8389,
      "step": 1500
    },
    {
      "epoch": 0.248,
      "grad_norm": 0.2680400311946869,
      "learning_rate": 0.0001504,
      "loss": 0.8283,
      "step": 1550
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.2538231909275055,
      "learning_rate": 0.0001488,
      "loss": 0.8384,
      "step": 1600
    },
    {
      "epoch": 0.264,
      "grad_norm": 0.26007387042045593,
      "learning_rate": 0.0001472,
      "loss": 0.8589,
      "step": 1650
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.2680321931838989,
      "learning_rate": 0.00014560000000000002,
      "loss": 0.8872,
      "step": 1700
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.2379782646894455,
      "learning_rate": 0.000144,
      "loss": 0.8483,
      "step": 1750
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.2729997932910919,
      "learning_rate": 0.0001424,
      "loss": 0.8381,
      "step": 1800
    },
    {
      "epoch": 0.296,
      "grad_norm": 0.24824947118759155,
      "learning_rate": 0.0001408,
      "loss": 0.8274,
      "step": 1850
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.26121681928634644,
      "learning_rate": 0.0001392,
      "loss": 0.8491,
      "step": 1900
    },
    {
      "epoch": 0.312,
      "grad_norm": 0.24737513065338135,
      "learning_rate": 0.00013759999999999998,
      "loss": 0.8416,
      "step": 1950
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.2495298683643341,
      "learning_rate": 0.00013600000000000003,
      "loss": 0.829,
      "step": 2000
    },
    {
      "epoch": 0.328,
      "grad_norm": 0.25478219985961914,
      "learning_rate": 0.00013440000000000001,
      "loss": 0.8247,
      "step": 2050
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.24906663596630096,
      "learning_rate": 0.0001328,
      "loss": 0.853,
      "step": 2100
    },
    {
      "epoch": 0.344,
      "grad_norm": 0.2297120839357376,
      "learning_rate": 0.00013120000000000002,
      "loss": 0.8289,
      "step": 2150
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.2441694438457489,
      "learning_rate": 0.0001296,
      "loss": 0.8592,
      "step": 2200
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.2818741202354431,
      "learning_rate": 0.00012800000000000002,
      "loss": 0.8065,
      "step": 2250
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.2782455086708069,
      "learning_rate": 0.0001264,
      "loss": 0.8333,
      "step": 2300
    },
    {
      "epoch": 0.376,
      "grad_norm": 0.3333749771118164,
      "learning_rate": 0.0001248,
      "loss": 0.8714,
      "step": 2350
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.38234880566596985,
      "learning_rate": 0.0001232,
      "loss": 0.8206,
      "step": 2400
    },
    {
      "epoch": 0.392,
      "grad_norm": 0.2502688765525818,
      "learning_rate": 0.0001216,
      "loss": 0.8561,
      "step": 2450
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.2759045958518982,
      "learning_rate": 0.00012,
      "loss": 0.858,
      "step": 2500
    },
    {
      "epoch": 0.408,
      "grad_norm": 0.3138297200202942,
      "learning_rate": 0.0001184,
      "loss": 0.8148,
      "step": 2550
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.27893930673599243,
      "learning_rate": 0.00011679999999999999,
      "loss": 0.8181,
      "step": 2600
    },
    {
      "epoch": 0.424,
      "grad_norm": 0.2226714789867401,
      "learning_rate": 0.0001152,
      "loss": 0.8326,
      "step": 2650
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.23976224660873413,
      "learning_rate": 0.0001136,
      "loss": 0.8205,
      "step": 2700
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.23596003651618958,
      "learning_rate": 0.00011200000000000001,
      "loss": 0.8336,
      "step": 2750
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.2597508430480957,
      "learning_rate": 0.00011040000000000001,
      "loss": 0.8202,
      "step": 2800
    },
    {
      "epoch": 0.456,
      "grad_norm": 0.2530119717121124,
      "learning_rate": 0.00010880000000000002,
      "loss": 0.8193,
      "step": 2850
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.267026424407959,
      "learning_rate": 0.00010720000000000002,
      "loss": 0.8272,
      "step": 2900
    },
    {
      "epoch": 0.472,
      "grad_norm": 0.3313164710998535,
      "learning_rate": 0.0001056,
      "loss": 0.829,
      "step": 2950
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.24725987017154694,
      "learning_rate": 0.00010400000000000001,
      "loss": 0.8204,
      "step": 3000
    },
    {
      "epoch": 0.488,
      "grad_norm": 0.3102189302444458,
      "learning_rate": 0.00010240000000000001,
      "loss": 0.8344,
      "step": 3050
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.2404993623495102,
      "learning_rate": 0.00010080000000000001,
      "loss": 0.8172,
      "step": 3100
    },
    {
      "epoch": 0.504,
      "grad_norm": 0.24549129605293274,
      "learning_rate": 9.92e-05,
      "loss": 0.833,
      "step": 3150
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.2698763906955719,
      "learning_rate": 9.76e-05,
      "loss": 0.8206,
      "step": 3200
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.26306065917015076,
      "learning_rate": 9.6e-05,
      "loss": 0.8517,
      "step": 3250
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.2710007429122925,
      "learning_rate": 9.44e-05,
      "loss": 0.8115,
      "step": 3300
    },
    {
      "epoch": 0.536,
      "grad_norm": 0.2536482810974121,
      "learning_rate": 9.28e-05,
      "loss": 0.8196,
      "step": 3350
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.28047412633895874,
      "learning_rate": 9.120000000000001e-05,
      "loss": 0.8134,
      "step": 3400
    },
    {
      "epoch": 0.552,
      "grad_norm": 0.2554551661014557,
      "learning_rate": 8.960000000000001e-05,
      "loss": 0.8085,
      "step": 3450
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.2920052707195282,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.8155,
      "step": 3500
    },
    {
      "epoch": 0.568,
      "grad_norm": 0.2649592161178589,
      "learning_rate": 8.64e-05,
      "loss": 0.8123,
      "step": 3550
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.2757607102394104,
      "learning_rate": 8.48e-05,
      "loss": 0.8182,
      "step": 3600
    },
    {
      "epoch": 0.584,
      "grad_norm": 0.2552868127822876,
      "learning_rate": 8.32e-05,
      "loss": 0.8155,
      "step": 3650
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.24389928579330444,
      "learning_rate": 8.16e-05,
      "loss": 0.7927,
      "step": 3700
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.2750285565853119,
      "learning_rate": 8e-05,
      "loss": 0.8362,
      "step": 3750
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.27113068103790283,
      "learning_rate": 7.840000000000001e-05,
      "loss": 0.8225,
      "step": 3800
    },
    {
      "epoch": 0.616,
      "grad_norm": 0.25137266516685486,
      "learning_rate": 7.680000000000001e-05,
      "loss": 0.8049,
      "step": 3850
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.26775482296943665,
      "learning_rate": 7.52e-05,
      "loss": 0.8059,
      "step": 3900
    },
    {
      "epoch": 0.632,
      "grad_norm": 0.24626025557518005,
      "learning_rate": 7.36e-05,
      "loss": 0.8036,
      "step": 3950
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.2576785385608673,
      "learning_rate": 7.2e-05,
      "loss": 0.828,
      "step": 4000
    },
    {
      "epoch": 0.648,
      "grad_norm": 0.28320208191871643,
      "learning_rate": 7.04e-05,
      "loss": 0.8015,
      "step": 4050
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.292799711227417,
      "learning_rate": 6.879999999999999e-05,
      "loss": 0.8165,
      "step": 4100
    },
    {
      "epoch": 0.664,
      "grad_norm": 0.2561057507991791,
      "learning_rate": 6.720000000000001e-05,
      "loss": 0.8035,
      "step": 4150
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.318015456199646,
      "learning_rate": 6.560000000000001e-05,
      "loss": 0.7909,
      "step": 4200
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.2850799858570099,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.8111,
      "step": 4250
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.3318442404270172,
      "learning_rate": 6.24e-05,
      "loss": 0.8216,
      "step": 4300
    },
    {
      "epoch": 0.696,
      "grad_norm": 0.24898533523082733,
      "learning_rate": 6.08e-05,
      "loss": 0.79,
      "step": 4350
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.2550944685935974,
      "learning_rate": 5.92e-05,
      "loss": 0.7977,
      "step": 4400
    },
    {
      "epoch": 0.712,
      "grad_norm": 0.33275166153907776,
      "learning_rate": 5.76e-05,
      "loss": 0.808,
      "step": 4450
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.2537584900856018,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 0.8177,
      "step": 4500
    },
    {
      "epoch": 0.728,
      "grad_norm": 0.2836528420448303,
      "learning_rate": 5.440000000000001e-05,
      "loss": 0.797,
      "step": 4550
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.24697652459144592,
      "learning_rate": 5.28e-05,
      "loss": 0.8047,
      "step": 4600
    },
    {
      "epoch": 0.744,
      "grad_norm": 0.2920159101486206,
      "learning_rate": 5.1200000000000004e-05,
      "loss": 0.793,
      "step": 4650
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.24589219689369202,
      "learning_rate": 4.96e-05,
      "loss": 0.8218,
      "step": 4700
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.28507670760154724,
      "learning_rate": 4.8e-05,
      "loss": 0.8143,
      "step": 4750
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.22233684360980988,
      "learning_rate": 4.64e-05,
      "loss": 0.8108,
      "step": 4800
    },
    {
      "epoch": 0.776,
      "grad_norm": 0.24076737463474274,
      "learning_rate": 4.4800000000000005e-05,
      "loss": 0.8077,
      "step": 4850
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.24421195685863495,
      "learning_rate": 4.32e-05,
      "loss": 0.8102,
      "step": 4900
    },
    {
      "epoch": 0.792,
      "grad_norm": 0.26924577355384827,
      "learning_rate": 4.16e-05,
      "loss": 0.827,
      "step": 4950
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3362084627151489,
      "learning_rate": 4e-05,
      "loss": 0.8222,
      "step": 5000
    },
    {
      "epoch": 0.808,
      "grad_norm": 0.28752413392066956,
      "learning_rate": 3.8400000000000005e-05,
      "loss": 0.8209,
      "step": 5050
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.3482488989830017,
      "learning_rate": 3.68e-05,
      "loss": 0.8077,
      "step": 5100
    },
    {
      "epoch": 0.824,
      "grad_norm": 0.2573867738246918,
      "learning_rate": 3.52e-05,
      "loss": 0.8089,
      "step": 5150
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.27762147784233093,
      "learning_rate": 3.3600000000000004e-05,
      "loss": 0.7974,
      "step": 5200
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.30096402764320374,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.8051,
      "step": 5250
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.25133249163627625,
      "learning_rate": 3.04e-05,
      "loss": 0.8179,
      "step": 5300
    },
    {
      "epoch": 0.856,
      "grad_norm": 0.26828518509864807,
      "learning_rate": 2.88e-05,
      "loss": 0.8136,
      "step": 5350
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.2636822462081909,
      "learning_rate": 2.7200000000000004e-05,
      "loss": 0.8122,
      "step": 5400
    },
    {
      "epoch": 0.872,
      "grad_norm": 0.29320311546325684,
      "learning_rate": 2.5600000000000002e-05,
      "loss": 0.8294,
      "step": 5450
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.25704631209373474,
      "learning_rate": 2.4e-05,
      "loss": 0.8278,
      "step": 5500
    },
    {
      "epoch": 0.888,
      "grad_norm": 0.3402651250362396,
      "learning_rate": 2.2400000000000002e-05,
      "loss": 0.8084,
      "step": 5550
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.24089041352272034,
      "learning_rate": 2.08e-05,
      "loss": 0.8014,
      "step": 5600
    },
    {
      "epoch": 0.904,
      "grad_norm": 0.3702704608440399,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.804,
      "step": 5650
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.295099675655365,
      "learning_rate": 1.76e-05,
      "loss": 0.8098,
      "step": 5700
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.24626977741718292,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.8051,
      "step": 5750
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.27932465076446533,
      "learning_rate": 1.44e-05,
      "loss": 0.8031,
      "step": 5800
    },
    {
      "epoch": 0.936,
      "grad_norm": 0.27001097798347473,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 0.8068,
      "step": 5850
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.2695982754230499,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.8047,
      "step": 5900
    },
    {
      "epoch": 0.952,
      "grad_norm": 0.27733317017555237,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.8238,
      "step": 5950
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.25620317459106445,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.78,
      "step": 6000
    },
    {
      "epoch": 0.968,
      "grad_norm": 0.24671310186386108,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.8233,
      "step": 6050
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.2986748516559601,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.8183,
      "step": 6100
    },
    {
      "epoch": 0.984,
      "grad_norm": 0.28886911273002625,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.7884,
      "step": 6150
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.27886465191841125,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.7885,
      "step": 6200
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.260232537984848,
      "learning_rate": 0.0,
      "loss": 0.7775,
      "step": 6250
    }
  ],
  "logging_steps": 50,
  "max_steps": 6250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.2752831434633216e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}