| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 31250, | |
| "global_step": 125000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.3911694288253784, | |
| "learning_rate": 0.0002988, | |
| "loss": 0.1122, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.48731762170791626, | |
| "learning_rate": 0.00029759999999999997, | |
| "loss": 0.1165, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.6664834022521973, | |
| "learning_rate": 0.0002964, | |
| "loss": 0.1364, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.37042468786239624, | |
| "learning_rate": 0.00029519999999999997, | |
| "loss": 0.1335, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.42679572105407715, | |
| "learning_rate": 0.000294, | |
| "loss": 0.1434, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.6656017899513245, | |
| "learning_rate": 0.00029279999999999996, | |
| "loss": 0.1353, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.6870421171188354, | |
| "learning_rate": 0.0002916, | |
| "loss": 0.1316, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.993772566318512, | |
| "learning_rate": 0.00029039999999999996, | |
| "loss": 0.1354, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.8672360181808472, | |
| "learning_rate": 0.0002892, | |
| "loss": 0.1536, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.8261292576789856, | |
| "learning_rate": 0.00028799999999999995, | |
| "loss": 0.1469, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.0536205768585205, | |
| "learning_rate": 0.0002868, | |
| "loss": 0.1446, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.5625632405281067, | |
| "learning_rate": 0.00028559999999999995, | |
| "loss": 0.1454, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.6709353923797607, | |
| "learning_rate": 0.0002844, | |
| "loss": 0.1441, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.0651500225067139, | |
| "learning_rate": 0.00028319999999999994, | |
| "loss": 0.1427, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.2373555898666382, | |
| "learning_rate": 0.00028199999999999997, | |
| "loss": 0.1439, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.6624026298522949, | |
| "learning_rate": 0.0002808, | |
| "loss": 0.152, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.4510903358459473, | |
| "learning_rate": 0.00027959999999999997, | |
| "loss": 0.1508, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.47858643531799316, | |
| "learning_rate": 0.0002784, | |
| "loss": 0.1599, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.5771904587745667, | |
| "learning_rate": 0.0002772, | |
| "loss": 0.1519, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.6509794592857361, | |
| "learning_rate": 0.000276, | |
| "loss": 0.1568, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.7169878482818604, | |
| "learning_rate": 0.0002748, | |
| "loss": 0.1531, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.7649238109588623, | |
| "learning_rate": 0.0002736, | |
| "loss": 0.1545, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.1057517528533936, | |
| "learning_rate": 0.0002724, | |
| "loss": 0.1523, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.4265155792236328, | |
| "learning_rate": 0.0002712, | |
| "loss": 0.1496, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.9195236563682556, | |
| "learning_rate": 0.00027, | |
| "loss": 0.156, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.6371893882751465, | |
| "learning_rate": 0.0002688, | |
| "loss": 0.1538, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.3511596620082855, | |
| "learning_rate": 0.0002676, | |
| "loss": 0.1535, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.599895715713501, | |
| "learning_rate": 0.00026639999999999997, | |
| "loss": 0.1512, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.9206880927085876, | |
| "learning_rate": 0.0002652, | |
| "loss": 0.1611, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.49088287353515625, | |
| "learning_rate": 0.00026399999999999997, | |
| "loss": 0.1459, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.42937585711479187, | |
| "learning_rate": 0.0002628, | |
| "loss": 0.1605, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.413435161113739, | |
| "learning_rate": 0.00026159999999999996, | |
| "loss": 0.1482, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.9658032655715942, | |
| "learning_rate": 0.0002604, | |
| "loss": 0.1462, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.7052133083343506, | |
| "learning_rate": 0.00025919999999999996, | |
| "loss": 0.1494, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.18123334646224976, | |
| "learning_rate": 0.000258, | |
| "loss": 0.1517, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.4006165862083435, | |
| "learning_rate": 0.00025679999999999995, | |
| "loss": 0.1533, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.49044716358184814, | |
| "learning_rate": 0.0002556, | |
| "loss": 0.1616, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.7895859479904175, | |
| "learning_rate": 0.00025439999999999995, | |
| "loss": 0.1575, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.7281160950660706, | |
| "learning_rate": 0.0002532, | |
| "loss": 0.1519, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.84168541431427, | |
| "learning_rate": 0.00025199999999999995, | |
| "loss": 0.1513, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.320218563079834, | |
| "learning_rate": 0.00025079999999999997, | |
| "loss": 0.1512, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.6346606612205505, | |
| "learning_rate": 0.00024959999999999994, | |
| "loss": 0.1512, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.6091498136520386, | |
| "learning_rate": 0.00024839999999999997, | |
| "loss": 0.1528, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.37541925907135, | |
| "learning_rate": 0.0002472, | |
| "loss": 0.1472, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.41054484248161316, | |
| "learning_rate": 0.00024599999999999996, | |
| "loss": 0.1548, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.3798141479492188, | |
| "learning_rate": 0.0002448, | |
| "loss": 0.1471, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.5459690690040588, | |
| "learning_rate": 0.00024359999999999999, | |
| "loss": 0.1426, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.6976202726364136, | |
| "learning_rate": 0.00024239999999999998, | |
| "loss": 0.1552, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.4615379869937897, | |
| "learning_rate": 0.00024119999999999998, | |
| "loss": 0.1625, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.9051592350006104, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 0.1626, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.3928375840187073, | |
| "learning_rate": 0.0002388, | |
| "loss": 0.1532, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.570114254951477, | |
| "learning_rate": 0.0002376, | |
| "loss": 0.1519, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.360300898551941, | |
| "learning_rate": 0.0002364, | |
| "loss": 0.1571, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.0905797481536865, | |
| "learning_rate": 0.0002352, | |
| "loss": 0.1544, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.8806708455085754, | |
| "learning_rate": 0.000234, | |
| "loss": 0.1483, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.876801073551178, | |
| "learning_rate": 0.0002328, | |
| "loss": 0.1533, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.6989384293556213, | |
| "learning_rate": 0.0002316, | |
| "loss": 0.1478, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.1341756582260132, | |
| "learning_rate": 0.0002304, | |
| "loss": 0.1525, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5615713000297546, | |
| "learning_rate": 0.0002292, | |
| "loss": 0.1442, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.6419674754142761, | |
| "learning_rate": 0.00022799999999999999, | |
| "loss": 0.1546, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.48240768909454346, | |
| "learning_rate": 0.00022679999999999998, | |
| "loss": 0.1526, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.5017668008804321, | |
| "learning_rate": 0.00022559999999999998, | |
| "loss": 0.1522, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_loss": 0.1550917774438858, | |
| "eval_runtime": 203.798, | |
| "eval_samples_per_second": 98.136, | |
| "eval_steps_per_second": 24.534, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.9850152134895325, | |
| "learning_rate": 0.00022439999999999998, | |
| "loss": 0.1478, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5590870976448059, | |
| "learning_rate": 0.00022319999999999998, | |
| "loss": 0.1501, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.128318428993225, | |
| "learning_rate": 0.00022199999999999998, | |
| "loss": 0.156, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5681318044662476, | |
| "learning_rate": 0.00022079999999999997, | |
| "loss": 0.1492, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.7050376534461975, | |
| "learning_rate": 0.00021959999999999997, | |
| "loss": 0.1632, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.3747576177120209, | |
| "learning_rate": 0.00021839999999999997, | |
| "loss": 0.1492, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.44152018427848816, | |
| "learning_rate": 0.00021719999999999997, | |
| "loss": 0.1505, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.40513116121292114, | |
| "learning_rate": 0.00021599999999999996, | |
| "loss": 0.1484, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.7306041121482849, | |
| "learning_rate": 0.00021479999999999996, | |
| "loss": 0.1448, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.8463369011878967, | |
| "learning_rate": 0.00021359999999999996, | |
| "loss": 0.1534, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.9181163311004639, | |
| "learning_rate": 0.00021239999999999996, | |
| "loss": 0.1493, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.0870494842529297, | |
| "learning_rate": 0.00021119999999999996, | |
| "loss": 0.1572, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.41071075201034546, | |
| "learning_rate": 0.00020999999999999998, | |
| "loss": 0.1542, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.1432327032089233, | |
| "learning_rate": 0.00020879999999999998, | |
| "loss": 0.1448, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.465071439743042, | |
| "learning_rate": 0.00020759999999999998, | |
| "loss": 0.1452, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.7455276250839233, | |
| "learning_rate": 0.00020639999999999998, | |
| "loss": 0.1491, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6694576144218445, | |
| "learning_rate": 0.0002052, | |
| "loss": 0.1439, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6440109014511108, | |
| "learning_rate": 0.000204, | |
| "loss": 0.1497, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6777958273887634, | |
| "learning_rate": 0.0002028, | |
| "loss": 0.1489, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.4175439476966858, | |
| "learning_rate": 0.0002016, | |
| "loss": 0.1549, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.4022385776042938, | |
| "learning_rate": 0.0002004, | |
| "loss": 0.1519, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.1414192914962769, | |
| "learning_rate": 0.0001992, | |
| "loss": 0.1478, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.484553575515747, | |
| "learning_rate": 0.000198, | |
| "loss": 0.1571, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.7761896848678589, | |
| "learning_rate": 0.00019679999999999999, | |
| "loss": 0.1539, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.6768946051597595, | |
| "learning_rate": 0.00019559999999999998, | |
| "loss": 0.1501, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 2.1360421180725098, | |
| "learning_rate": 0.00019439999999999998, | |
| "loss": 0.1531, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.5634905099868774, | |
| "learning_rate": 0.00019319999999999998, | |
| "loss": 0.1452, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.3370414972305298, | |
| "learning_rate": 0.00019199999999999998, | |
| "loss": 0.1486, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.3636302947998047, | |
| "learning_rate": 0.00019079999999999998, | |
| "loss": 0.1506, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.5136697292327881, | |
| "learning_rate": 0.00018959999999999997, | |
| "loss": 0.1453, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.4970598518848419, | |
| "learning_rate": 0.00018839999999999997, | |
| "loss": 0.1447, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.8702558279037476, | |
| "learning_rate": 0.0001872, | |
| "loss": 0.1505, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.3983975648880005, | |
| "learning_rate": 0.000186, | |
| "loss": 0.157, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.6854807734489441, | |
| "learning_rate": 0.0001848, | |
| "loss": 0.1482, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.45792344212532043, | |
| "learning_rate": 0.0001836, | |
| "loss": 0.1522, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.8602573871612549, | |
| "learning_rate": 0.0001824, | |
| "loss": 0.1466, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.4061354100704193, | |
| "learning_rate": 0.00018119999999999999, | |
| "loss": 0.143, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.1822609901428223, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 0.1377, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.3173510730266571, | |
| "learning_rate": 0.00017879999999999998, | |
| "loss": 0.1445, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.9117286205291748, | |
| "learning_rate": 0.00017759999999999998, | |
| "loss": 0.148, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.5054415464401245, | |
| "learning_rate": 0.00017639999999999998, | |
| "loss": 0.1397, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.496282935142517, | |
| "learning_rate": 0.00017519999999999998, | |
| "loss": 0.1487, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.48316746950149536, | |
| "learning_rate": 0.00017399999999999997, | |
| "loss": 0.1588, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.00754976272583, | |
| "learning_rate": 0.00017279999999999997, | |
| "loss": 0.1559, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.4439372420310974, | |
| "learning_rate": 0.00017159999999999997, | |
| "loss": 0.1422, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.2646929025650024, | |
| "learning_rate": 0.00017039999999999997, | |
| "loss": 0.144, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.8087254762649536, | |
| "learning_rate": 0.00016919999999999997, | |
| "loss": 0.1538, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.2770575284957886, | |
| "learning_rate": 0.000168, | |
| "loss": 0.1454, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.0025547742843628, | |
| "learning_rate": 0.0001668, | |
| "loss": 0.149, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5146603584289551, | |
| "learning_rate": 0.0001656, | |
| "loss": 0.1512, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5651877522468567, | |
| "learning_rate": 0.0001644, | |
| "loss": 0.1494, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.3547286987304688, | |
| "learning_rate": 0.0001632, | |
| "loss": 0.1435, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.29811692237854004, | |
| "learning_rate": 0.000162, | |
| "loss": 0.1536, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.111806869506836, | |
| "learning_rate": 0.0001608, | |
| "loss": 0.1468, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6532767415046692, | |
| "learning_rate": 0.0001596, | |
| "loss": 0.1525, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.31857454776763916, | |
| "learning_rate": 0.0001584, | |
| "loss": 0.139, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.8610038757324219, | |
| "learning_rate": 0.0001572, | |
| "loss": 0.1503, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.5853925347328186, | |
| "learning_rate": 0.000156, | |
| "loss": 0.1448, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.6353251934051514, | |
| "learning_rate": 0.0001548, | |
| "loss": 0.1538, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 1.0176310539245605, | |
| "learning_rate": 0.0001536, | |
| "loss": 0.1443, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.3258388638496399, | |
| "learning_rate": 0.0001524, | |
| "loss": 0.1386, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5362368822097778, | |
| "learning_rate": 0.0001512, | |
| "loss": 0.1465, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.46234962344169617, | |
| "learning_rate": 0.00015, | |
| "loss": 0.1449, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.1452270895242691, | |
| "eval_runtime": 201.7763, | |
| "eval_samples_per_second": 99.12, | |
| "eval_steps_per_second": 24.78, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.3215588331222534, | |
| "learning_rate": 0.00014879999999999998, | |
| "loss": 0.1436, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.9984625577926636, | |
| "learning_rate": 0.00014759999999999998, | |
| "loss": 0.1453, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.3026807308197021, | |
| "learning_rate": 0.00014639999999999998, | |
| "loss": 0.145, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.2961193323135376, | |
| "learning_rate": 0.00014519999999999998, | |
| "loss": 0.1458, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.6397677659988403, | |
| "learning_rate": 0.00014399999999999998, | |
| "loss": 0.1442, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.2444109916687012, | |
| "learning_rate": 0.00014279999999999997, | |
| "loss": 0.1533, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.3235061764717102, | |
| "learning_rate": 0.00014159999999999997, | |
| "loss": 0.1473, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.508976936340332, | |
| "learning_rate": 0.0001404, | |
| "loss": 0.1455, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.44948485493659973, | |
| "learning_rate": 0.0001392, | |
| "loss": 0.1473, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.8090860247612, | |
| "learning_rate": 0.000138, | |
| "loss": 0.144, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7624330520629883, | |
| "learning_rate": 0.0001368, | |
| "loss": 0.1513, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.3432750105857849, | |
| "learning_rate": 0.0001356, | |
| "loss": 0.1518, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.5248464345932007, | |
| "learning_rate": 0.0001344, | |
| "loss": 0.1481, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.5486373901367188, | |
| "learning_rate": 0.00013319999999999999, | |
| "loss": 0.1493, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.1218228340148926, | |
| "learning_rate": 0.00013199999999999998, | |
| "loss": 0.1513, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.6330295205116272, | |
| "learning_rate": 0.00013079999999999998, | |
| "loss": 0.1426, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.347390055656433, | |
| "learning_rate": 0.00012959999999999998, | |
| "loss": 0.1481, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.37359416484832764, | |
| "learning_rate": 0.00012839999999999998, | |
| "loss": 0.147, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.5048085451126099, | |
| "learning_rate": 0.00012719999999999997, | |
| "loss": 0.1454, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.4849306344985962, | |
| "learning_rate": 0.00012599999999999997, | |
| "loss": 0.1519, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.022958755493164, | |
| "learning_rate": 0.00012479999999999997, | |
| "loss": 0.1514, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.671461284160614, | |
| "learning_rate": 0.0001236, | |
| "loss": 0.143, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6191059947013855, | |
| "learning_rate": 0.0001224, | |
| "loss": 0.1509, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.8567871451377869, | |
| "learning_rate": 0.00012119999999999999, | |
| "loss": 0.1505, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.5671202540397644, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 0.1472, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.9271816611289978, | |
| "learning_rate": 0.0001188, | |
| "loss": 0.154, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.8567375540733337, | |
| "learning_rate": 0.0001176, | |
| "loss": 0.1458, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.40565225481987, | |
| "learning_rate": 0.0001164, | |
| "loss": 0.1529, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.1839731931686401, | |
| "learning_rate": 0.0001152, | |
| "loss": 0.1416, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.0027508735656738, | |
| "learning_rate": 0.00011399999999999999, | |
| "loss": 0.1494, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.7692805528640747, | |
| "learning_rate": 0.00011279999999999999, | |
| "loss": 0.1426, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.5682281255722046, | |
| "learning_rate": 0.00011159999999999999, | |
| "loss": 0.1496, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.4132564961910248, | |
| "learning_rate": 0.00011039999999999999, | |
| "loss": 0.1537, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.06421959400177, | |
| "learning_rate": 0.00010919999999999998, | |
| "loss": 0.148, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.649330198764801, | |
| "learning_rate": 0.00010799999999999998, | |
| "loss": 0.1485, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.5109010338783264, | |
| "learning_rate": 0.00010679999999999998, | |
| "loss": 0.1512, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.40312623977661133, | |
| "learning_rate": 0.00010559999999999998, | |
| "loss": 0.1533, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.678674578666687, | |
| "learning_rate": 0.00010439999999999999, | |
| "loss": 0.1522, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.4731667935848236, | |
| "learning_rate": 0.00010319999999999999, | |
| "loss": 0.154, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.742722749710083, | |
| "learning_rate": 0.000102, | |
| "loss": 0.1511, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.5631982684135437, | |
| "learning_rate": 0.0001008, | |
| "loss": 0.1442, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.3675552010536194, | |
| "learning_rate": 9.96e-05, | |
| "loss": 0.1448, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.697551429271698, | |
| "learning_rate": 9.839999999999999e-05, | |
| "loss": 0.1434, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.7331182956695557, | |
| "learning_rate": 9.719999999999999e-05, | |
| "loss": 0.1459, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.344137042760849, | |
| "learning_rate": 9.599999999999999e-05, | |
| "loss": 0.1537, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.1674726009368896, | |
| "learning_rate": 9.479999999999999e-05, | |
| "loss": 0.1515, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.5482670664787292, | |
| "learning_rate": 9.36e-05, | |
| "loss": 0.1487, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.8054832220077515, | |
| "learning_rate": 9.24e-05, | |
| "loss": 0.1545, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.1015689373016357, | |
| "learning_rate": 9.12e-05, | |
| "loss": 0.1475, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.3283549547195435, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 0.1546, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.41955164074897766, | |
| "learning_rate": 8.879999999999999e-05, | |
| "loss": 0.1457, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.647566556930542, | |
| "learning_rate": 8.759999999999999e-05, | |
| "loss": 0.149, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.6643834710121155, | |
| "learning_rate": 8.639999999999999e-05, | |
| "loss": 0.1531, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.21100400388240814, | |
| "learning_rate": 8.519999999999998e-05, | |
| "loss": 0.1452, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.537507951259613, | |
| "learning_rate": 8.4e-05, | |
| "loss": 0.1441, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.9034580588340759, | |
| "learning_rate": 8.28e-05, | |
| "loss": 0.1513, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.2073426246643066, | |
| "learning_rate": 8.16e-05, | |
| "loss": 0.1534, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.3272952735424042, | |
| "learning_rate": 8.04e-05, | |
| "loss": 0.1343, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.18572063744068146, | |
| "learning_rate": 7.92e-05, | |
| "loss": 0.1525, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6784604787826538, | |
| "learning_rate": 7.8e-05, | |
| "loss": 0.1498, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6597641110420227, | |
| "learning_rate": 7.68e-05, | |
| "loss": 0.1482, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.2626825273036957, | |
| "learning_rate": 7.56e-05, | |
| "loss": 0.1483, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "eval_loss": 0.13335928320884705, | |
| "eval_runtime": 202.0526, | |
| "eval_samples_per_second": 98.984, | |
| "eval_steps_per_second": 24.746, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.8486179113388062, | |
| "learning_rate": 7.439999999999999e-05, | |
| "loss": 0.1397, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.38229885697364807, | |
| "learning_rate": 7.319999999999999e-05, | |
| "loss": 0.1599, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6869509220123291, | |
| "learning_rate": 7.199999999999999e-05, | |
| "loss": 0.1467, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.5972515344619751, | |
| "learning_rate": 7.079999999999999e-05, | |
| "loss": 0.1465, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.3241545557975769, | |
| "learning_rate": 6.96e-05, | |
| "loss": 0.1539, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.406515508890152, | |
| "learning_rate": 6.84e-05, | |
| "loss": 0.1464, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.8518499135971069, | |
| "learning_rate": 6.72e-05, | |
| "loss": 0.1481, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.6099169254302979, | |
| "learning_rate": 6.599999999999999e-05, | |
| "loss": 0.1492, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.002388834953308, | |
| "learning_rate": 6.479999999999999e-05, | |
| "loss": 0.152, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.734794557094574, | |
| "learning_rate": 6.359999999999999e-05, | |
| "loss": 0.1443, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0499773025512695, | |
| "learning_rate": 6.239999999999999e-05, | |
| "loss": 0.1572, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.2162612676620483, | |
| "learning_rate": 6.12e-05, | |
| "loss": 0.1523, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.9487478733062744, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 0.1563, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.7570409178733826, | |
| "learning_rate": 5.88e-05, | |
| "loss": 0.1555, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.8198479413986206, | |
| "learning_rate": 5.76e-05, | |
| "loss": 0.1529, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.0460014343261719, | |
| "learning_rate": 5.6399999999999995e-05, | |
| "loss": 0.1447, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.6016435027122498, | |
| "learning_rate": 5.519999999999999e-05, | |
| "loss": 0.1527, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.0050832033157349, | |
| "learning_rate": 5.399999999999999e-05, | |
| "loss": 0.1482, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.5460426807403564, | |
| "learning_rate": 5.279999999999999e-05, | |
| "loss": 0.1616, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.7548345327377319, | |
| "learning_rate": 5.1599999999999994e-05, | |
| "loss": 0.1477, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.590357780456543, | |
| "learning_rate": 5.04e-05, | |
| "loss": 0.156, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.5072462558746338, | |
| "learning_rate": 4.9199999999999997e-05, | |
| "loss": 0.1597, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.5731006264686584, | |
| "learning_rate": 4.7999999999999994e-05, | |
| "loss": 0.1463, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.1715773344039917, | |
| "learning_rate": 4.68e-05, | |
| "loss": 0.1499, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.20436818897724152, | |
| "learning_rate": 4.56e-05, | |
| "loss": 0.1547, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.46894416213035583, | |
| "learning_rate": 4.4399999999999995e-05, | |
| "loss": 0.1579, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.5912017822265625, | |
| "learning_rate": 4.319999999999999e-05, | |
| "loss": 0.1428, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.9041949510574341, | |
| "learning_rate": 4.2e-05, | |
| "loss": 0.15, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.5391335487365723, | |
| "learning_rate": 4.08e-05, | |
| "loss": 0.1634, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.38163840770721436, | |
| "learning_rate": 3.96e-05, | |
| "loss": 0.1578, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.4478495419025421, | |
| "learning_rate": 3.84e-05, | |
| "loss": 0.1588, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.6843103766441345, | |
| "learning_rate": 3.7199999999999996e-05, | |
| "loss": 0.1507, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.832815408706665, | |
| "learning_rate": 3.5999999999999994e-05, | |
| "loss": 0.1588, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.35042092204093933, | |
| "learning_rate": 3.48e-05, | |
| "loss": 0.1581, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.49709609150886536, | |
| "learning_rate": 3.36e-05, | |
| "loss": 0.1546, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.85088711977005, | |
| "learning_rate": 3.2399999999999995e-05, | |
| "loss": 0.1639, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.7669118642807007, | |
| "learning_rate": 3.119999999999999e-05, | |
| "loss": 0.1585, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.6558898091316223, | |
| "learning_rate": 2.9999999999999997e-05, | |
| "loss": 0.1552, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.9164992570877075, | |
| "learning_rate": 2.88e-05, | |
| "loss": 0.1552, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.1778265237808228, | |
| "learning_rate": 2.7599999999999997e-05, | |
| "loss": 0.1603, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.4928775727748871, | |
| "learning_rate": 2.6399999999999995e-05, | |
| "loss": 0.1558, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.10056832432746887, | |
| "learning_rate": 2.52e-05, | |
| "loss": 0.1581, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.104854054749012, | |
| "learning_rate": 2.3999999999999997e-05, | |
| "loss": 0.1584, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.1790800094604492, | |
| "learning_rate": 2.28e-05, | |
| "loss": 0.1634, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.7926896214485168, | |
| "learning_rate": 2.1599999999999996e-05, | |
| "loss": 0.1546, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.6564472317695618, | |
| "learning_rate": 2.04e-05, | |
| "loss": 0.1628, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.5036044716835022, | |
| "learning_rate": 1.92e-05, | |
| "loss": 0.1684, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.9592491984367371, | |
| "learning_rate": 1.7999999999999997e-05, | |
| "loss": 0.1628, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.5925645232200623, | |
| "learning_rate": 1.68e-05, | |
| "loss": 0.1658, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.5361761450767517, | |
| "learning_rate": 1.5599999999999996e-05, | |
| "loss": 0.1639, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.6238356828689575, | |
| "learning_rate": 1.44e-05, | |
| "loss": 0.1506, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.3824297189712524, | |
| "learning_rate": 1.3199999999999997e-05, | |
| "loss": 0.1642, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.9775289297103882, | |
| "learning_rate": 1.1999999999999999e-05, | |
| "loss": 0.1631, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.29735296964645386, | |
| "learning_rate": 1.0799999999999998e-05, | |
| "loss": 0.1693, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.5140572786331177, | |
| "learning_rate": 9.6e-06, | |
| "loss": 0.1651, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.5154848098754883, | |
| "learning_rate": 8.4e-06, | |
| "loss": 0.1639, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.21747368574142456, | |
| "learning_rate": 7.2e-06, | |
| "loss": 0.1679, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.51055908203125, | |
| "learning_rate": 5.999999999999999e-06, | |
| "loss": 0.1723, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.451370120048523, | |
| "learning_rate": 4.8e-06, | |
| "loss": 0.1694, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.4937916398048401, | |
| "learning_rate": 3.6e-06, | |
| "loss": 0.1545, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.46715646982192993, | |
| "learning_rate": 2.4e-06, | |
| "loss": 0.1748, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.35662218928337097, | |
| "learning_rate": 1.2e-06, | |
| "loss": 0.1627, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.8029049634933472, | |
| "learning_rate": 0.0, | |
| "loss": 0.1636, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.12450261414051056, | |
| "eval_runtime": 202.3721, | |
| "eval_samples_per_second": 98.828, | |
| "eval_steps_per_second": 24.707, | |
| "step": 125000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 125000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 125000, | |
| "total_flos": 6.527143521828864e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |