| { | |
| "best_metric": 0.26116234064102173, | |
| "best_model_checkpoint": "outputs/checkpoint-108", | |
| "epoch": 5.0, | |
| "eval_steps": 6, | |
| "global_step": 110, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.045454545454545456, | |
| "grad_norm": 12.375, | |
| "learning_rate": 4e-05, | |
| "loss": 8.7425, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.09090909090909091, | |
| "grad_norm": 17.125, | |
| "learning_rate": 8e-05, | |
| "loss": 8.6536, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.13636363636363635, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 0.00012, | |
| "loss": 8.602, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.18181818181818182, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 0.00016, | |
| "loss": 8.3064, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.22727272727272727, | |
| "grad_norm": 10.125, | |
| "learning_rate": 0.0002, | |
| "loss": 7.7739, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.2727272727272727, | |
| "grad_norm": 7.25, | |
| "learning_rate": 0.00024, | |
| "loss": 7.1622, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.2727272727272727, | |
| "eval_loss": 6.408970832824707, | |
| "eval_runtime": 1.4799, | |
| "eval_samples_per_second": 95.274, | |
| "eval_steps_per_second": 12.163, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.3181818181818182, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 0.00028000000000000003, | |
| "loss": 6.4402, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.00032, | |
| "loss": 5.616, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.4090909090909091, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 0.00035999999999999997, | |
| "loss": 4.8702, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 0.0004, | |
| "loss": 4.2606, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 0.00044, | |
| "loss": 3.7622, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.5454545454545454, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 0.00048, | |
| "loss": 3.5397, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.5454545454545454, | |
| "eval_loss": 2.9970808029174805, | |
| "eval_runtime": 1.4818, | |
| "eval_samples_per_second": 95.155, | |
| "eval_steps_per_second": 12.147, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.5909090909090909, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 0.0005200000000000001, | |
| "loss": 2.9788, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.6363636363636364, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 0.0005600000000000001, | |
| "loss": 2.5801, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.6818181818181818, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 0.0006, | |
| "loss": 2.3461, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 0.00064, | |
| "loss": 2.0174, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.7727272727272727, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 0.00068, | |
| "loss": 1.889, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.8181818181818182, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 0.0007199999999999999, | |
| "loss": 1.7954, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.8181818181818182, | |
| "eval_loss": 1.589725136756897, | |
| "eval_runtime": 1.4857, | |
| "eval_samples_per_second": 94.907, | |
| "eval_steps_per_second": 12.116, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.8636363636363636, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 0.00076, | |
| "loss": 1.6871, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 0.0008, | |
| "loss": 1.5787, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.9545454545454546, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 0.00084, | |
| "loss": 1.4167, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 0.00088, | |
| "loss": 1.3882, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 1.0454545454545454, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 0.00092, | |
| "loss": 1.2626, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 1.0909090909090908, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 0.00096, | |
| "loss": 1.1743, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.0909090909090908, | |
| "eval_loss": 1.088915467262268, | |
| "eval_runtime": 1.4822, | |
| "eval_samples_per_second": 95.126, | |
| "eval_steps_per_second": 12.144, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.1363636363636362, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 0.001, | |
| "loss": 1.1278, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.1818181818181819, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 0.0009996585300715115, | |
| "loss": 1.1023, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.2272727272727273, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 0.0009986345866928941, | |
| "loss": 1.0403, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.2727272727272727, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.000996929568447637, | |
| "loss": 1.0496, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.3181818181818181, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 0.000994545804185573, | |
| "loss": 0.8593, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 0.000991486549841951, | |
| "loss": 0.9413, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "eval_loss": 0.7773878574371338, | |
| "eval_runtime": 1.491, | |
| "eval_samples_per_second": 94.566, | |
| "eval_steps_per_second": 12.072, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.4090909090909092, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 0.0009877559839902184, | |
| "loss": 0.7758, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.4545454545454546, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 0.0009833592021345938, | |
| "loss": 0.8344, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 0.0009783022097502204, | |
| "loss": 0.6183, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.5454545454545454, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 0.0009725919140804099, | |
| "loss": 0.7497, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.5909090909090908, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 0.0009662361147021779, | |
| "loss": 0.7042, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.6363636363636362, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 0.0009592434928729616, | |
| "loss": 0.7236, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.6363636363636362, | |
| "eval_loss": 0.6186583042144775, | |
| "eval_runtime": 1.4853, | |
| "eval_samples_per_second": 94.931, | |
| "eval_steps_per_second": 12.119, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.6818181818181817, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 0.0009516235996730644, | |
| "loss": 0.6119, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.7272727272727273, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 0.0009433868429600309, | |
| "loss": 0.606, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 1.7727272727272727, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.0009345444731527642, | |
| "loss": 0.6787, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 0.0009251085678648072, | |
| "loss": 0.6607, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.8636363636363638, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 0.0009150920154077753, | |
| "loss": 0.6514, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 1.9090909090909092, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0009045084971874737, | |
| "loss": 0.6084, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.9090909090909092, | |
| "eval_loss": 0.555232048034668, | |
| "eval_runtime": 1.4852, | |
| "eval_samples_per_second": 94.934, | |
| "eval_steps_per_second": 12.119, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.9545454545454546, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 0.0008933724690167416, | |
| "loss": 0.5991, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 0.0008816991413705516, | |
| "loss": 0.6085, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 2.0454545454545454, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 0.0008695044586103295, | |
| "loss": 0.4946, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 2.090909090909091, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 0.0008568050772058762, | |
| "loss": 0.4987, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 2.1363636363636362, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0008436183429846313, | |
| "loss": 0.4656, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 2.1818181818181817, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 0.0008299622674393614, | |
| "loss": 0.5454, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 2.1818181818181817, | |
| "eval_loss": 0.4745166301727295, | |
| "eval_runtime": 1.4854, | |
| "eval_samples_per_second": 94.921, | |
| "eval_steps_per_second": 12.118, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 2.227272727272727, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0008158555031266255, | |
| "loss": 0.4058, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 2.2727272727272725, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 0.0008013173181896282, | |
| "loss": 0.5267, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 2.3181818181818183, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 0.0007863675700402526, | |
| "loss": 0.524, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 2.3636363636363638, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.0007710266782362247, | |
| "loss": 0.5331, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 2.409090909090909, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 0.0007553155965904535, | |
| "loss": 0.4235, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 2.4545454545454546, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 0.0007392557845506433, | |
| "loss": 0.5147, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 2.4545454545454546, | |
| "eval_loss": 0.437049001455307, | |
| "eval_runtime": 1.4804, | |
| "eval_samples_per_second": 95.243, | |
| "eval_steps_per_second": 12.159, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0007228691778882692, | |
| "loss": 0.4376, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.5454545454545454, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.0007061781587369518, | |
| "loss": 0.4396, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 2.590909090909091, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 0.0006892055250211552, | |
| "loss": 0.4257, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 2.6363636363636362, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0006719744593169641, | |
| "loss": 0.4447, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 2.6818181818181817, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.0006545084971874737, | |
| "loss": 0.4591, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 0.0006368314950360416, | |
| "loss": 0.4645, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "eval_loss": 0.3943060338497162, | |
| "eval_runtime": 1.4805, | |
| "eval_samples_per_second": 95.235, | |
| "eval_steps_per_second": 12.158, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.7727272727272725, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0006189675975213093, | |
| "loss": 0.4733, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 2.8181818181818183, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0006009412045785051, | |
| "loss": 0.4227, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 2.8636363636363638, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.000582776938092065, | |
| "loss": 0.485, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 2.909090909090909, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.0005644996082651017, | |
| "loss": 0.4154, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 2.9545454545454546, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.000546134179731651, | |
| "loss": 0.4602, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.000527705737457985, | |
| "loss": 0.4371, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.35816648602485657, | |
| "eval_runtime": 1.4795, | |
| "eval_samples_per_second": 95.3, | |
| "eval_steps_per_second": 12.166, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 3.0454545454545454, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.000509239452479565, | |
| "loss": 0.3674, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 3.090909090909091, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.0004907605475204352, | |
| "loss": 0.3405, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 3.1363636363636362, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.00047229426254201504, | |
| "loss": 0.3669, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 3.1818181818181817, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.00045386582026834903, | |
| "loss": 0.3333, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 3.227272727272727, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.0004355003917348985, | |
| "loss": 0.3032, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 3.2727272727272725, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.000417223061907935, | |
| "loss": 0.3557, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 3.2727272727272725, | |
| "eval_loss": 0.3237670361995697, | |
| "eval_runtime": 1.4942, | |
| "eval_samples_per_second": 94.367, | |
| "eval_steps_per_second": 12.047, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 3.3181818181818183, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.000399058795421495, | |
| "loss": 0.3774, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 3.3636363636363638, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00038103240247869074, | |
| "loss": 0.3433, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 3.409090909090909, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.0003631685049639586, | |
| "loss": 0.3872, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 3.4545454545454546, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.00034549150281252633, | |
| "loss": 0.3675, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0003280255406830359, | |
| "loss": 0.3581, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 3.5454545454545454, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.00031079447497884486, | |
| "loss": 0.3062, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 3.5454545454545454, | |
| "eval_loss": 0.3086094558238983, | |
| "eval_runtime": 1.4971, | |
| "eval_samples_per_second": 94.182, | |
| "eval_steps_per_second": 12.023, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 3.590909090909091, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.00029382184126304836, | |
| "loss": 0.3324, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 3.6363636363636362, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.0002771308221117309, | |
| "loss": 0.338, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 3.6818181818181817, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0002607442154493568, | |
| "loss": 0.3319, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 3.7272727272727275, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0002446844034095466, | |
| "loss": 0.3577, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 3.7727272727272725, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.00022897332176377528, | |
| "loss": 0.3463, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 3.8181818181818183, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00021363242995974742, | |
| "loss": 0.3065, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 3.8181818181818183, | |
| "eval_loss": 0.2896404266357422, | |
| "eval_runtime": 1.4869, | |
| "eval_samples_per_second": 94.829, | |
| "eval_steps_per_second": 12.106, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 3.8636363636363638, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00019868268181037185, | |
| "loss": 0.339, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 3.909090909090909, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.00018414449687337466, | |
| "loss": 0.3104, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 3.9545454545454546, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.0001700377325606388, | |
| "loss": 0.3248, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 0.00015638165701536866, | |
| "loss": 0.3155, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 4.045454545454546, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.00014319492279412388, | |
| "loss": 0.2769, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 4.090909090909091, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.0001304955413896705, | |
| "loss": 0.2873, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 4.090909090909091, | |
| "eval_loss": 0.274143785238266, | |
| "eval_runtime": 1.5006, | |
| "eval_samples_per_second": 93.962, | |
| "eval_steps_per_second": 11.995, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 4.136363636363637, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 0.00011830085862944851, | |
| "loss": 0.2952, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 4.181818181818182, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.00010662753098325839, | |
| "loss": 0.2559, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 4.2272727272727275, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 9.549150281252633e-05, | |
| "loss": 0.2737, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 4.2727272727272725, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 8.490798459222476e-05, | |
| "loss": 0.2822, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 4.318181818181818, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 7.489143213519301e-05, | |
| "loss": 0.3014, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 4.363636363636363, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 6.545552684723583e-05, | |
| "loss": 0.2827, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 4.363636363636363, | |
| "eval_loss": 0.26471802592277527, | |
| "eval_runtime": 1.4885, | |
| "eval_samples_per_second": 94.724, | |
| "eval_steps_per_second": 12.092, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 4.409090909090909, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 5.6613157039969057e-05, | |
| "loss": 0.2638, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 4.454545454545454, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 4.8376400326935575e-05, | |
| "loss": 0.2592, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 4.075650712703849e-05, | |
| "loss": 0.298, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 4.545454545454545, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 3.376388529782215e-05, | |
| "loss": 0.2632, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 4.590909090909091, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 2.7408085919590266e-05, | |
| "loss": 0.2404, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 4.636363636363637, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 2.1697790249779635e-05, | |
| "loss": 0.265, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 4.636363636363637, | |
| "eval_loss": 0.26171576976776123, | |
| "eval_runtime": 1.4803, | |
| "eval_samples_per_second": 95.248, | |
| "eval_steps_per_second": 12.159, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 4.681818181818182, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 1.6640797865406288e-05, | |
| "loss": 0.3012, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 4.7272727272727275, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 1.22440160097817e-05, | |
| "loss": 0.3019, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 4.7727272727272725, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 8.513450158049108e-06, | |
| "loss": 0.2667, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 4.818181818181818, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 5.454195814427021e-06, | |
| "loss": 0.2781, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 4.863636363636363, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 3.0704315523631954e-06, | |
| "loss": 0.281, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 4.909090909090909, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.3654133071059894e-06, | |
| "loss": 0.2935, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 4.909090909090909, | |
| "eval_loss": 0.26116234064102173, | |
| "eval_runtime": 1.4806, | |
| "eval_samples_per_second": 95.232, | |
| "eval_steps_per_second": 12.157, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 4.954545454545455, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 3.4146992848854695e-07, | |
| "loss": 0.2806, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.0, | |
| "loss": 0.2668, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 110, | |
| "total_flos": 5704372783549440.0, | |
| "train_loss": 1.2665127342397517, | |
| "train_runtime": 277.3258, | |
| "train_samples_per_second": 25.349, | |
| "train_steps_per_second": 0.397 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 110, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 6, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 5, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5704372783549440.0, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |