| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.998745294855709, |
| "eval_steps": 100, |
| "global_step": 398, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 0.1427878886461258, |
| "learning_rate": 2e-05, |
| "loss": 1.3359, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0, |
| "eval_loss": 1.324710488319397, |
| "eval_runtime": 82.0652, |
| "eval_samples_per_second": 31.67, |
| "eval_steps_per_second": 31.67, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.13537771999835968, |
| "learning_rate": 4e-05, |
| "loss": 1.2865, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.14623422920703888, |
| "learning_rate": 6e-05, |
| "loss": 1.3192, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.15388095378875732, |
| "learning_rate": 8e-05, |
| "loss": 1.3244, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.1628686636686325, |
| "learning_rate": 0.0001, |
| "loss": 1.3, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.20623335242271423, |
| "learning_rate": 0.00012, |
| "loss": 1.244, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.1510678231716156, |
| "learning_rate": 0.00014, |
| "loss": 1.2799, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.15237094461917877, |
| "learning_rate": 0.00016, |
| "loss": 1.2979, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.15166334807872772, |
| "learning_rate": 0.00018, |
| "loss": 1.28, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.17794868350028992, |
| "learning_rate": 0.0002, |
| "loss": 1.2298, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.25811436772346497, |
| "learning_rate": 0.0001999998028228211, |
| "loss": 1.1909, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.19142530858516693, |
| "learning_rate": 0.000199999211292062, |
| "loss": 1.178, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.1891462802886963, |
| "learning_rate": 0.00019999822541005537, |
| "loss": 1.1173, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.17077742516994476, |
| "learning_rate": 0.00019999684518068916, |
| "loss": 1.2092, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.15135815739631653, |
| "learning_rate": 0.00019999507060940625, |
| "loss": 1.1439, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.1767009049654007, |
| "learning_rate": 0.00019999290170320485, |
| "loss": 1.1408, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.1310850977897644, |
| "learning_rate": 0.00019999033847063811, |
| "loss": 1.2369, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.12432192265987396, |
| "learning_rate": 0.00019998738092181421, |
| "loss": 1.152, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.12430022656917572, |
| "learning_rate": 0.00019998402906839643, |
| "loss": 1.2111, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.12175025045871735, |
| "learning_rate": 0.00019998028292360286, |
| "loss": 1.1686, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.11878372728824615, |
| "learning_rate": 0.0001999761425022067, |
| "loss": 1.2452, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.11329779773950577, |
| "learning_rate": 0.00019997160782053578, |
| "loss": 1.0964, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.11987729370594025, |
| "learning_rate": 0.00019996667889647288, |
| "loss": 1.1809, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.12245086580514908, |
| "learning_rate": 0.00019996135574945544, |
| "loss": 1.1138, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.1399640142917633, |
| "learning_rate": 0.00019995563840047542, |
| "loss": 1.184, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.13597123324871063, |
| "learning_rate": 0.00019994952687207954, |
| "loss": 1.1872, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.13976556062698364, |
| "learning_rate": 0.00019994302118836883, |
| "loss": 1.1685, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.13106240332126617, |
| "learning_rate": 0.00019993612137499876, |
| "loss": 1.1872, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.12896399199962616, |
| "learning_rate": 0.00019992882745917902, |
| "loss": 1.1462, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.13873620331287384, |
| "learning_rate": 0.00019992113946967353, |
| "loss": 1.1742, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.14103546738624573, |
| "learning_rate": 0.00019991305743680013, |
| "loss": 1.1245, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.1377720981836319, |
| "learning_rate": 0.00019990458139243077, |
| "loss": 1.2045, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.13191157579421997, |
| "learning_rate": 0.000199895711369991, |
| "loss": 1.1716, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.13426551222801208, |
| "learning_rate": 0.00019988644740446022, |
| "loss": 1.1382, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.13733097910881042, |
| "learning_rate": 0.00019987678953237127, |
| "loss": 1.1677, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.12618272006511688, |
| "learning_rate": 0.00019986673779181033, |
| "loss": 1.2195, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.13636991381645203, |
| "learning_rate": 0.00019985629222241694, |
| "loss": 1.1577, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.13234035670757294, |
| "learning_rate": 0.0001998454528653836, |
| "loss": 1.1089, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.1395445317029953, |
| "learning_rate": 0.00019983421976345586, |
| "loss": 1.139, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.1284484714269638, |
| "learning_rate": 0.0001998225929609319, |
| "loss": 1.117, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.13304275274276733, |
| "learning_rate": 0.00019981057250366253, |
| "loss": 1.161, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.13184913992881775, |
| "learning_rate": 0.00019979815843905097, |
| "loss": 1.1826, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.12830235064029694, |
| "learning_rate": 0.0001997853508160526, |
| "loss": 1.0739, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.1346379965543747, |
| "learning_rate": 0.0001997721496851748, |
| "loss": 1.191, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.13036642968654633, |
| "learning_rate": 0.00019975855509847686, |
| "loss": 1.1361, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.12707848846912384, |
| "learning_rate": 0.00019974456710956964, |
| "loss": 1.101, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.12984970211982727, |
| "learning_rate": 0.00019973018577361536, |
| "loss": 1.1085, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.12627972662448883, |
| "learning_rate": 0.00019971541114732741, |
| "loss": 1.1607, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.13074152171611786, |
| "learning_rate": 0.00019970024328897022, |
| "loss": 1.1004, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.1309152990579605, |
| "learning_rate": 0.0001996846822583589, |
| "loss": 1.1378, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.1303664743900299, |
| "learning_rate": 0.000199668728116859, |
| "loss": 1.0956, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.13290388882160187, |
| "learning_rate": 0.00019965238092738643, |
| "loss": 1.1264, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.12805409729480743, |
| "learning_rate": 0.00019963564075440703, |
| "loss": 1.183, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.1399564892053604, |
| "learning_rate": 0.0001996185076639364, |
| "loss": 1.1102, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.12978173792362213, |
| "learning_rate": 0.00019960098172353962, |
| "loss": 1.1634, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.13925811648368835, |
| "learning_rate": 0.00019958306300233098, |
| "loss": 1.0636, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.13258852064609528, |
| "learning_rate": 0.00019956475157097378, |
| "loss": 1.1428, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.1285356879234314, |
| "learning_rate": 0.00019954604750167993, |
| "loss": 1.1664, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.1321210116147995, |
| "learning_rate": 0.00019952695086820975, |
| "loss": 1.1419, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.14086973667144775, |
| "learning_rate": 0.00019950746174587163, |
| "loss": 1.1827, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.1311366856098175, |
| "learning_rate": 0.0001994875802115218, |
| "loss": 1.1971, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.14063993096351624, |
| "learning_rate": 0.0001994673063435639, |
| "loss": 1.1945, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.12695981562137604, |
| "learning_rate": 0.00019944664022194885, |
| "loss": 1.0385, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.14170674979686737, |
| "learning_rate": 0.0001994255819281744, |
| "loss": 1.0883, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.13162197172641754, |
| "learning_rate": 0.0001994041315452849, |
| "loss": 1.153, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.1326906979084015, |
| "learning_rate": 0.0001993822891578708, |
| "loss": 1.1186, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.13306689262390137, |
| "learning_rate": 0.00019936005485206851, |
| "loss": 1.1587, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.13625258207321167, |
| "learning_rate": 0.00019933742871556, |
| "loss": 1.1339, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.13773800432682037, |
| "learning_rate": 0.00019931441083757245, |
| "loss": 1.1944, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.15291447937488556, |
| "learning_rate": 0.00019929100130887782, |
| "loss": 1.1028, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.15140767395496368, |
| "learning_rate": 0.0001992672002217926, |
| "loss": 1.1896, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.1344233751296997, |
| "learning_rate": 0.0001992430076701775, |
| "loss": 1.0561, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.13877920806407928, |
| "learning_rate": 0.0001992184237494368, |
| "loss": 1.1108, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.1359027922153473, |
| "learning_rate": 0.00019919344855651833, |
| "loss": 1.1563, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.14610135555267334, |
| "learning_rate": 0.0001991680821899128, |
| "loss": 1.1299, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.14259958267211914, |
| "learning_rate": 0.00019914232474965365, |
| "loss": 1.1021, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.14158602058887482, |
| "learning_rate": 0.00019911617633731638, |
| "loss": 1.0787, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.1418074518442154, |
| "learning_rate": 0.00019908963705601846, |
| "loss": 1.1359, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.12850767374038696, |
| "learning_rate": 0.0001990627070104187, |
| "loss": 1.1373, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.1312914341688156, |
| "learning_rate": 0.0001990353863067169, |
| "loss": 1.0832, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.13280583918094635, |
| "learning_rate": 0.0001990076750526534, |
| "loss": 1.0462, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.13617292046546936, |
| "learning_rate": 0.00019897957335750878, |
| "loss": 1.1059, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.15030132234096527, |
| "learning_rate": 0.00019895108133210335, |
| "loss": 1.0761, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.14291270077228546, |
| "learning_rate": 0.00019892219908879653, |
| "loss": 1.1217, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.1685461699962616, |
| "learning_rate": 0.00019889292674148682, |
| "loss": 1.1607, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13756121695041656, |
| "learning_rate": 0.00019886326440561093, |
| "loss": 1.0914, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13901358842849731, |
| "learning_rate": 0.0001988332121981436, |
| "loss": 1.1234, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13816247880458832, |
| "learning_rate": 0.00019880277023759702, |
| "loss": 1.1583, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13309679925441742, |
| "learning_rate": 0.00019877193864402038, |
| "loss": 1.163, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.13356180489063263, |
| "learning_rate": 0.0001987407175389994, |
| "loss": 1.1301, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.1388397067785263, |
| "learning_rate": 0.00019870910704565588, |
| "loss": 1.1326, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.13303454220294952, |
| "learning_rate": 0.0001986771072886472, |
| "loss": 1.0779, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.1316283941268921, |
| "learning_rate": 0.00019864471839416576, |
| "loss": 1.0935, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.1348309963941574, |
| "learning_rate": 0.00019861194048993863, |
| "loss": 1.1816, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.1341564655303955, |
| "learning_rate": 0.00019857877370522685, |
| "loss": 1.1187, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.13689687848091125, |
| "learning_rate": 0.0001985452181708251, |
| "loss": 1.1637, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.13348707556724548, |
| "learning_rate": 0.0001985112740190611, |
| "loss": 1.1026, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.13700643181800842, |
| "learning_rate": 0.00019847694138379506, |
| "loss": 1.1508, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.13654476404190063, |
| "learning_rate": 0.00019844222040041928, |
| "loss": 1.1668, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.15331624448299408, |
| "learning_rate": 0.0001984071112058574, |
| "loss": 1.1121, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.25, |
| "eval_loss": 1.1294280290603638, |
| "eval_runtime": 81.6595, |
| "eval_samples_per_second": 31.827, |
| "eval_steps_per_second": 31.827, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.14425526559352875, |
| "learning_rate": 0.0001983716139385641, |
| "loss": 1.1447, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.13741208612918854, |
| "learning_rate": 0.00019833572873852444, |
| "loss": 1.1001, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.1282232254743576, |
| "learning_rate": 0.0001982994557472532, |
| "loss": 1.1199, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.13605354726314545, |
| "learning_rate": 0.00019826279510779454, |
| "loss": 1.154, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.13503985106945038, |
| "learning_rate": 0.00019822574696472126, |
| "loss": 1.0565, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.13878273963928223, |
| "learning_rate": 0.00019818831146413434, |
| "loss": 1.106, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.141740083694458, |
| "learning_rate": 0.00019815048875366234, |
| "loss": 1.0848, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.13799507915973663, |
| "learning_rate": 0.0001981122789824607, |
| "loss": 1.1582, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.1441466212272644, |
| "learning_rate": 0.0001980736823012114, |
| "loss": 1.0787, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1377534121274948, |
| "learning_rate": 0.0001980346988621221, |
| "loss": 1.1092, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1400901973247528, |
| "learning_rate": 0.00019799532881892564, |
| "loss": 1.0549, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.13621239364147186, |
| "learning_rate": 0.00019795557232687956, |
| "loss": 1.0991, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1324262171983719, |
| "learning_rate": 0.0001979154295427653, |
| "loss": 1.0583, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.13273654878139496, |
| "learning_rate": 0.0001978749006248877, |
| "loss": 1.1504, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.14279481768608093, |
| "learning_rate": 0.00019783398573307428, |
| "loss": 1.0941, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.1432316154241562, |
| "learning_rate": 0.00019779268502867473, |
| "loss": 1.1111, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.14505276083946228, |
| "learning_rate": 0.00019775099867456013, |
| "loss": 1.0941, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.13935014605522156, |
| "learning_rate": 0.0001977089268351225, |
| "loss": 1.0597, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.14532430469989777, |
| "learning_rate": 0.0001976664696762739, |
| "loss": 1.1116, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.14096760749816895, |
| "learning_rate": 0.00019762362736544607, |
| "loss": 1.1381, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.1470746099948883, |
| "learning_rate": 0.00019758040007158948, |
| "loss": 1.1215, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.13610850274562836, |
| "learning_rate": 0.00019753678796517282, |
| "loss": 1.136, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.1399529129266739, |
| "learning_rate": 0.00019749279121818235, |
| "loss": 1.1035, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.13626012206077576, |
| "learning_rate": 0.00019744841000412123, |
| "loss": 1.1248, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.13053762912750244, |
| "learning_rate": 0.0001974036444980086, |
| "loss": 1.1286, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.14427675306797028, |
| "learning_rate": 0.00019735849487637929, |
| "loss": 1.2792, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.14464688301086426, |
| "learning_rate": 0.0001973129613172827, |
| "loss": 1.1091, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.12712322175502777, |
| "learning_rate": 0.0001972670440002825, |
| "loss": 1.1219, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.13343971967697144, |
| "learning_rate": 0.00019722074310645553, |
| "loss": 1.1296, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.15525247156620026, |
| "learning_rate": 0.00019717405881839145, |
| "loss": 1.159, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.12908953428268433, |
| "learning_rate": 0.0001971269913201918, |
| "loss": 1.0821, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.24165280163288116, |
| "learning_rate": 0.00019707954079746927, |
| "loss": 1.1388, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.1432817280292511, |
| "learning_rate": 0.00019703170743734706, |
| "loss": 1.1184, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.14007362723350525, |
| "learning_rate": 0.00019698349142845814, |
| "loss": 1.1576, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.14235983788967133, |
| "learning_rate": 0.00019693489296094443, |
| "loss": 1.0847, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.1430092453956604, |
| "learning_rate": 0.00019688591222645607, |
| "loss": 1.1562, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.13986627757549286, |
| "learning_rate": 0.00019683654941815077, |
| "loss": 1.124, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.13933469355106354, |
| "learning_rate": 0.00019678680473069293, |
| "loss": 1.1001, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.13476844131946564, |
| "learning_rate": 0.00019673667836025283, |
| "loss": 1.1186, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.13418316841125488, |
| "learning_rate": 0.00019668617050450603, |
| "loss": 1.1309, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.12794847786426544, |
| "learning_rate": 0.00019663528136263246, |
| "loss": 1.1142, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.1326293647289276, |
| "learning_rate": 0.00019658401113531565, |
| "loss": 1.0503, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.14793147146701813, |
| "learning_rate": 0.000196532360024742, |
| "loss": 1.2104, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.13718444108963013, |
| "learning_rate": 0.00019648032823459994, |
| "loss": 1.1685, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.14404018223285675, |
| "learning_rate": 0.00019642791597007902, |
| "loss": 1.09, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.14241506159305573, |
| "learning_rate": 0.00019637512343786937, |
| "loss": 1.1355, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.14581352472305298, |
| "learning_rate": 0.00019632195084616063, |
| "loss": 1.1005, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.14792676270008087, |
| "learning_rate": 0.00019626839840464119, |
| "loss": 1.1168, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.1484677940607071, |
| "learning_rate": 0.00019621446632449744, |
| "loss": 1.1138, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.15315671265125275, |
| "learning_rate": 0.0001961601548184129, |
| "loss": 1.1636, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.14746810495853424, |
| "learning_rate": 0.0001961054641005674, |
| "loss": 1.0881, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.1407732516527176, |
| "learning_rate": 0.00019605039438663614, |
| "loss": 1.0347, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.14150719344615936, |
| "learning_rate": 0.0001959949458937889, |
| "loss": 1.1112, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.16782569885253906, |
| "learning_rate": 0.0001959391188406893, |
| "loss": 1.0496, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.1452791690826416, |
| "learning_rate": 0.0001958829134474937, |
| "loss": 1.1185, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.145284965634346, |
| "learning_rate": 0.00019582632993585052, |
| "loss": 1.1431, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.15500612556934357, |
| "learning_rate": 0.00019576936852889936, |
| "loss": 1.1679, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.1416521966457367, |
| "learning_rate": 0.00019571202945126994, |
| "loss": 1.1322, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.1465340405702591, |
| "learning_rate": 0.00019565431292908146, |
| "loss": 1.0693, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.13601765036582947, |
| "learning_rate": 0.0001955962191899415, |
| "loss": 1.0676, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.14759162068367004, |
| "learning_rate": 0.0001955377484629453, |
| "loss": 1.0506, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.14839032292366028, |
| "learning_rate": 0.00019547890097867468, |
| "loss": 1.1245, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.1440214365720749, |
| "learning_rate": 0.0001954196769691973, |
| "loss": 1.1672, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.1372719258069992, |
| "learning_rate": 0.00019536007666806556, |
| "loss": 1.1084, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.14372558891773224, |
| "learning_rate": 0.00019530010031031586, |
| "loss": 1.1679, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.13789264857769012, |
| "learning_rate": 0.00019523974813246767, |
| "loss": 1.1253, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.14368915557861328, |
| "learning_rate": 0.0001951790203725223, |
| "loss": 1.085, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.1380469799041748, |
| "learning_rate": 0.00019511791726996243, |
| "loss": 1.1379, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.13288158178329468, |
| "learning_rate": 0.00019505643906575073, |
| "loss": 1.113, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.1390606164932251, |
| "learning_rate": 0.0001949945860023292, |
| "loss": 1.095, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.14271940290927887, |
| "learning_rate": 0.0001949323583236181, |
| "loss": 1.1063, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.13795693218708038, |
| "learning_rate": 0.00019486975627501502, |
| "loss": 1.0628, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.14073535799980164, |
| "learning_rate": 0.0001948067801033938, |
| "loss": 1.1192, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.138822540640831, |
| "learning_rate": 0.0001947434300571038, |
| "loss": 1.1299, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.13592712581157684, |
| "learning_rate": 0.0001946797063859686, |
| "loss": 1.0868, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.1379610300064087, |
| "learning_rate": 0.00019461560934128533, |
| "loss": 1.069, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.14286787807941437, |
| "learning_rate": 0.00019455113917582346, |
| "loss": 1.139, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.14168201386928558, |
| "learning_rate": 0.0001944862961438239, |
| "loss": 1.1405, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.1345077008008957, |
| "learning_rate": 0.000194421080500998, |
| "loss": 1.1039, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.1363426297903061, |
| "learning_rate": 0.00019435549250452645, |
| "loss": 1.1056, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.14109478890895844, |
| "learning_rate": 0.00019428953241305838, |
| "loss": 1.0927, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.14332321286201477, |
| "learning_rate": 0.0001942232004867103, |
| "loss": 1.0305, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.15956294536590576, |
| "learning_rate": 0.00019415649698706507, |
| "loss": 1.1245, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.14164718985557556, |
| "learning_rate": 0.0001940894221771708, |
| "loss": 1.0963, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.14296875894069672, |
| "learning_rate": 0.00019402197632153992, |
| "loss": 1.0853, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.12994709610939026, |
| "learning_rate": 0.00019395415968614813, |
| "loss": 1.0503, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.1399766504764557, |
| "learning_rate": 0.00019388597253843334, |
| "loss": 1.0623, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.14874404668807983, |
| "learning_rate": 0.00019381741514729443, |
| "loss": 1.0885, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.1453857719898224, |
| "learning_rate": 0.00019374848778309055, |
| "loss": 1.1702, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.14976643025875092, |
| "learning_rate": 0.0001936791907176397, |
| "loss": 1.0834, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.1418897956609726, |
| "learning_rate": 0.00019360952422421793, |
| "loss": 1.0918, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.14602817595005035, |
| "learning_rate": 0.00019353948857755803, |
| "loss": 1.0825, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.14669157564640045, |
| "learning_rate": 0.00019346908405384867, |
| "loss": 1.0973, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.14327263832092285, |
| "learning_rate": 0.00019339831093073318, |
| "loss": 1.1191, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.13806897401809692, |
| "learning_rate": 0.0001933271694873084, |
| "loss": 1.1504, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.13992969691753387, |
| "learning_rate": 0.00019325566000412376, |
| "loss": 1.0865, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.14395759999752045, |
| "learning_rate": 0.00019318378276318, |
| "loss": 1.1204, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.1409691572189331, |
| "learning_rate": 0.0001931115380479281, |
| "loss": 1.0766, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.1448824405670166, |
| "learning_rate": 0.00019303892614326836, |
| "loss": 1.1741, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.142364963889122, |
| "learning_rate": 0.00019296594733554892, |
| "loss": 1.1716, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 1.109603762626648, |
| "eval_runtime": 81.7249, |
| "eval_samples_per_second": 31.802, |
| "eval_steps_per_second": 31.802, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.1372615098953247, |
| "learning_rate": 0.00019289260191256483, |
| "loss": 1.1084, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.13863563537597656, |
| "learning_rate": 0.0001928188901635571, |
| "loss": 1.0546, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.13055531680583954, |
| "learning_rate": 0.00019274481237921114, |
| "loss": 1.018, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.14135099947452545, |
| "learning_rate": 0.00019267036885165588, |
| "loss": 1.1131, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.14308464527130127, |
| "learning_rate": 0.0001925955598744627, |
| "loss": 1.0723, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.13907764852046967, |
| "learning_rate": 0.00019252038574264405, |
| "loss": 1.1607, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.13771073520183563, |
| "learning_rate": 0.00019244484675265232, |
| "loss": 1.172, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.13774815201759338, |
| "learning_rate": 0.00019236894320237894, |
| "loss": 1.0622, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.1426474153995514, |
| "learning_rate": 0.0001922926753911527, |
| "loss": 1.0368, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.1380661278963089, |
| "learning_rate": 0.00019221604361973919, |
| "loss": 1.0873, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.14044702053070068, |
| "learning_rate": 0.00019213904819033903, |
| "loss": 1.0901, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.1415887176990509, |
| "learning_rate": 0.00019206168940658712, |
| "loss": 1.1061, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.1580592840909958, |
| "learning_rate": 0.00019198396757355118, |
| "loss": 1.1073, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.14094668626785278, |
| "learning_rate": 0.00019190588299773062, |
| "loss": 1.1781, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.14229640364646912, |
| "learning_rate": 0.00019182743598705542, |
| "loss": 1.1095, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.140314981341362, |
| "learning_rate": 0.00019174862685088472, |
| "loss": 1.1534, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.160028338432312, |
| "learning_rate": 0.00019166945590000584, |
| "loss": 1.087, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.14278572797775269, |
| "learning_rate": 0.0001915899234466328, |
| "loss": 1.1583, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.13695856928825378, |
| "learning_rate": 0.0001915100298044054, |
| "loss": 1.1151, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.14235751330852509, |
| "learning_rate": 0.00019142977528838762, |
| "loss": 1.1111, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.15174664556980133, |
| "learning_rate": 0.00019134916021506666, |
| "loss": 1.1438, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.15249325335025787, |
| "learning_rate": 0.0001912681849023516, |
| "loss": 1.1575, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.14303787052631378, |
| "learning_rate": 0.00019118684966957207, |
| "loss": 1.1302, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.1405183970928192, |
| "learning_rate": 0.00019110515483747716, |
| "loss": 1.1157, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.1475205421447754, |
| "learning_rate": 0.00019102310072823393, |
| "loss": 1.1175, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.14406634867191315, |
| "learning_rate": 0.0001909406876654264, |
| "loss": 1.0578, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.13999773561954498, |
| "learning_rate": 0.00019085791597405404, |
| "loss": 1.0865, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.1409848928451538, |
| "learning_rate": 0.00019077478598053063, |
| "loss": 1.1297, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.14548417925834656, |
| "learning_rate": 0.00019069129801268294, |
| "loss": 1.1524, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.13622736930847168, |
| "learning_rate": 0.00019060745239974936, |
| "loss": 1.0744, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.14302954077720642, |
| "learning_rate": 0.0001905232494723788, |
| "loss": 1.1469, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.15202221274375916, |
| "learning_rate": 0.0001904386895626291, |
| "loss": 1.0693, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.14072120189666748, |
| "learning_rate": 0.00019035377300396597, |
| "loss": 1.0584, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.13941141963005066, |
| "learning_rate": 0.00019026850013126157, |
| "loss": 1.1257, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.1389845460653305, |
| "learning_rate": 0.0001901828712807932, |
| "loss": 1.0003, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.1431329846382141, |
| "learning_rate": 0.0001900968867902419, |
| "loss": 1.0795, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.15022633969783783, |
| "learning_rate": 0.00019001054699869133, |
| "loss": 1.1427, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.1578160673379898, |
| "learning_rate": 0.00018992385224662623, |
| "loss": 1.13, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.13778769969940186, |
| "learning_rate": 0.00018983680287593105, |
| "loss": 1.0739, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.1454969048500061, |
| "learning_rate": 0.00018974939922988883, |
| "loss": 1.0864, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.13545964658260345, |
| "learning_rate": 0.00018966164165317966, |
| "loss": 1.0169, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.13648608326911926, |
| "learning_rate": 0.00018957353049187936, |
| "loss": 1.0732, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.14080677926540375, |
| "learning_rate": 0.00018948506609345813, |
| "loss": 1.0579, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.14503297209739685, |
| "learning_rate": 0.00018939624880677918, |
| "loss": 1.0755, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.15316741168498993, |
| "learning_rate": 0.00018930707898209733, |
| "loss": 1.0885, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.14839263260364532, |
| "learning_rate": 0.0001892175569710577, |
| "loss": 1.121, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.13919925689697266, |
| "learning_rate": 0.00018912768312669424, |
| "loss": 1.1039, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.13975974917411804, |
| "learning_rate": 0.00018903745780342839, |
| "loss": 1.1454, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.13851100206375122, |
| "learning_rate": 0.0001889468813570676, |
| "loss": 1.0905, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.14839564263820648, |
| "learning_rate": 0.00018885595414480405, |
| "loss": 1.1002, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.1421942263841629, |
| "learning_rate": 0.00018876467652521317, |
| "loss": 1.093, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.14453786611557007, |
| "learning_rate": 0.0001886730488582522, |
| "loss": 1.0278, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.13856688141822815, |
| "learning_rate": 0.0001885810715052589, |
| "loss": 1.079, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.14092479646205902, |
| "learning_rate": 0.00018848874482894993, |
| "loss": 1.0608, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.14616413414478302, |
| "learning_rate": 0.0001883960691934196, |
| "loss": 1.1097, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.1410474181175232, |
| "learning_rate": 0.00018830304496413822, |
| "loss": 1.0577, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.15473878383636475, |
| "learning_rate": 0.000188209672507951, |
| "loss": 1.1453, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.14370983839035034, |
| "learning_rate": 0.00018811595219307622, |
| "loss": 1.1732, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.14861780405044556, |
| "learning_rate": 0.00018802188438910405, |
| "loss": 1.1471, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.1523188352584839, |
| "learning_rate": 0.000187927469466995, |
| "loss": 1.129, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.14366289973258972, |
| "learning_rate": 0.00018783270779907838, |
| "loss": 1.0792, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.1363295018672943, |
| "learning_rate": 0.00018773759975905098, |
| "loss": 0.9848, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.1438857764005661, |
| "learning_rate": 0.00018764214572197552, |
| "loss": 1.1371, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.13751162588596344, |
| "learning_rate": 0.00018754634606427914, |
| "loss": 1.0557, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.1384708732366562, |
| "learning_rate": 0.00018745020116375197, |
| "loss": 1.0664, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.14196960628032684, |
| "learning_rate": 0.00018735371139954558, |
| "loss": 1.0828, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.15374121069908142, |
| "learning_rate": 0.00018725687715217163, |
| "loss": 1.073, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.14955537021160126, |
| "learning_rate": 0.0001871596988035001, |
| "loss": 1.1444, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.13760650157928467, |
| "learning_rate": 0.00018706217673675811, |
| "loss": 1.088, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.17072008550167084, |
| "learning_rate": 0.00018696431133652817, |
| "loss": 1.07, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.14745061099529266, |
| "learning_rate": 0.00018686610298874676, |
| "loss": 1.1105, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.14695587754249573, |
| "learning_rate": 0.00018676755208070275, |
| "loss": 1.0612, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.15686020255088806, |
| "learning_rate": 0.00018666865900103597, |
| "loss": 1.0933, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.14162233471870422, |
| "learning_rate": 0.00018656942413973555, |
| "loss": 1.0832, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.14662939310073853, |
| "learning_rate": 0.00018646984788813856, |
| "loss": 1.1175, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.13886839151382446, |
| "learning_rate": 0.0001863699306389282, |
| "loss": 1.1221, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.13897326588630676, |
| "learning_rate": 0.00018626967278613253, |
| "loss": 1.0767, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.13283655047416687, |
| "learning_rate": 0.0001861690747251228, |
| "loss": 1.1397, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.14036604762077332, |
| "learning_rate": 0.0001860681368526118, |
| "loss": 1.0965, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.1449379026889801, |
| "learning_rate": 0.00018596685956665245, |
| "loss": 1.1262, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.14264287054538727, |
| "learning_rate": 0.00018586524326663615, |
| "loss": 1.1317, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.14677459001541138, |
| "learning_rate": 0.00018576328835329117, |
| "loss": 1.0785, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.14834077656269073, |
| "learning_rate": 0.00018566099522868119, |
| "loss": 1.0892, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.15325355529785156, |
| "learning_rate": 0.00018555836429620358, |
| "loss": 1.0843, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.14825651049613953, |
| "learning_rate": 0.00018545539596058795, |
| "loss": 1.1288, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.14722499251365662, |
| "learning_rate": 0.00018535209062789433, |
| "loss": 1.1391, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.14388781785964966, |
| "learning_rate": 0.00018524844870551185, |
| "loss": 1.1013, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.1455835998058319, |
| "learning_rate": 0.00018514447060215698, |
| "loss": 1.0811, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.14625433087348938, |
| "learning_rate": 0.00018504015672787184, |
| "loss": 1.0854, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.13978470861911774, |
| "learning_rate": 0.00018493550749402278, |
| "loss": 1.1398, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.1447162628173828, |
| "learning_rate": 0.00018483052331329857, |
| "loss": 1.0553, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.13894303143024445, |
| "learning_rate": 0.00018472520459970898, |
| "loss": 1.0305, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.1372181624174118, |
| "learning_rate": 0.00018461955176858285, |
| "loss": 1.021, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.14599645137786865, |
| "learning_rate": 0.0001845135652365668, |
| "loss": 1.0808, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.1599220335483551, |
| "learning_rate": 0.00018440724542162328, |
| "loss": 1.1143, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.1450476050376892, |
| "learning_rate": 0.00018430059274302917, |
| "loss": 1.0508, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1439283937215805, |
| "learning_rate": 0.00018419360762137395, |
| "loss": 1.0592, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1410531848669052, |
| "learning_rate": 0.00018408629047855804, |
| "loss": 1.0632, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1468774974346161, |
| "learning_rate": 0.00018397864173779133, |
| "loss": 1.056, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1467033177614212, |
| "learning_rate": 0.00018387066182359133, |
| "loss": 1.1122, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.75, |
| "eval_loss": 1.0955116748809814, |
| "eval_runtime": 81.7775, |
| "eval_samples_per_second": 31.781, |
| "eval_steps_per_second": 31.781, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.14950688183307648, |
| "learning_rate": 0.00018376235116178148, |
| "loss": 1.0698, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.142381951212883, |
| "learning_rate": 0.00018365371017948964, |
| "loss": 1.0528, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.1410701423883438, |
| "learning_rate": 0.0001835447393051463, |
| "loss": 1.0785, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.14708860218524933, |
| "learning_rate": 0.00018343543896848273, |
| "loss": 1.0142, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.1467617303133011, |
| "learning_rate": 0.00018332580960052965, |
| "loss": 1.0973, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.15761792659759521, |
| "learning_rate": 0.00018321585163361527, |
| "loss": 1.1745, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.13972119987010956, |
| "learning_rate": 0.00018310556550136357, |
| "loss": 1.0832, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.1481141895055771, |
| "learning_rate": 0.00018299495163869275, |
| "loss": 1.1573, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.14397870004177094, |
| "learning_rate": 0.0001828840104818134, |
| "loss": 1.171, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.14765049517154694, |
| "learning_rate": 0.0001827727424682268, |
| "loss": 1.0544, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.14956365525722504, |
| "learning_rate": 0.00018266114803672318, |
| "loss": 1.1755, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.15122386813163757, |
| "learning_rate": 0.00018254922762738008, |
| "loss": 1.1547, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14254115521907806, |
| "learning_rate": 0.00018243698168156054, |
| "loss": 1.1075, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14294452965259552, |
| "learning_rate": 0.00018232441064191125, |
| "loss": 1.1419, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14777772128582, |
| "learning_rate": 0.0001822115149523611, |
| "loss": 1.1662, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14944781363010406, |
| "learning_rate": 0.0001820982950581191, |
| "loss": 1.1497, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.1466801017522812, |
| "learning_rate": 0.00018198475140567287, |
| "loss": 1.1374, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.15346656739711761, |
| "learning_rate": 0.00018187088444278674, |
| "loss": 1.1356, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.15271005034446716, |
| "learning_rate": 0.00018175669461850005, |
| "loss": 1.0845, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.14452996850013733, |
| "learning_rate": 0.00018164218238312535, |
| "loss": 1.1162, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14632536470890045, |
| "learning_rate": 0.00018152734818824658, |
| "loss": 1.0187, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14935997128486633, |
| "learning_rate": 0.00018141219248671745, |
| "loss": 1.1167, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14043933153152466, |
| "learning_rate": 0.0001812967157326595, |
| "loss": 1.0044, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14850106835365295, |
| "learning_rate": 0.00018118091838146029, |
| "loss": 1.1226, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.14655061066150665, |
| "learning_rate": 0.00018106480088977172, |
| "loss": 1.0508, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.14721763134002686, |
| "learning_rate": 0.00018094836371550824, |
| "loss": 1.0659, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.1433349996805191, |
| "learning_rate": 0.00018083160731784486, |
| "loss": 1.147, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.13528144359588623, |
| "learning_rate": 0.00018071453215721554, |
| "loss": 1.0388, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.15466062724590302, |
| "learning_rate": 0.0001805971386953113, |
| "loss": 1.0649, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.15163114666938782, |
| "learning_rate": 0.00018047942739507836, |
| "loss": 1.1454, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.14693276584148407, |
| "learning_rate": 0.0001803613987207163, |
| "loss": 1.1137, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.14229321479797363, |
| "learning_rate": 0.00018024305313767646, |
| "loss": 1.0153, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.13863018155097961, |
| "learning_rate": 0.00018012439111265974, |
| "loss": 1.0491, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.1422068327665329, |
| "learning_rate": 0.000180005413113615, |
| "loss": 1.0952, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.1419857293367386, |
| "learning_rate": 0.00017988611960973713, |
| "loss": 1.0532, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.1446901261806488, |
| "learning_rate": 0.00017976651107146533, |
| "loss": 1.0477, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.14558811485767365, |
| "learning_rate": 0.00017964658797048108, |
| "loss": 1.1481, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.15488363802433014, |
| "learning_rate": 0.0001795263507797063, |
| "loss": 1.1302, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.14942613244056702, |
| "learning_rate": 0.00017940579997330165, |
| "loss": 1.0698, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.14417564868927002, |
| "learning_rate": 0.00017928493602666445, |
| "loss": 1.0867, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.14839497208595276, |
| "learning_rate": 0.0001791637594164269, |
| "loss": 1.0124, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.1415972113609314, |
| "learning_rate": 0.00017904227062045437, |
| "loss": 1.0958, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.143202543258667, |
| "learning_rate": 0.00017892047011784312, |
| "loss": 1.0808, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.14291773736476898, |
| "learning_rate": 0.00017879835838891875, |
| "loss": 1.1386, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.1504325121641159, |
| "learning_rate": 0.00017867593591523422, |
| "loss": 1.0804, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.1444767862558365, |
| "learning_rate": 0.00017855320317956784, |
| "loss": 1.1207, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.14493699371814728, |
| "learning_rate": 0.00017843016066592158, |
| "loss": 1.0954, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.14571166038513184, |
| "learning_rate": 0.00017830680885951887, |
| "loss": 1.0676, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.14583171904087067, |
| "learning_rate": 0.000178183148246803, |
| "loss": 1.0674, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.15080390870571136, |
| "learning_rate": 0.00017805917931543492, |
| "loss": 1.0757, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.14790864288806915, |
| "learning_rate": 0.00017793490255429157, |
| "loss": 1.1005, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.14861677587032318, |
| "learning_rate": 0.00017781031845346375, |
| "loss": 1.0645, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.15099036693572998, |
| "learning_rate": 0.00017768542750425426, |
| "loss": 1.1306, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.14353971183300018, |
| "learning_rate": 0.00017756023019917607, |
| "loss": 1.0834, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.14582550525665283, |
| "learning_rate": 0.00017743472703195015, |
| "loss": 1.0722, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.14268234372138977, |
| "learning_rate": 0.00017730891849750377, |
| "loss": 1.092, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.1424105316400528, |
| "learning_rate": 0.00017718280509196828, |
| "loss": 1.1355, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.13972117006778717, |
| "learning_rate": 0.0001770563873126775, |
| "loss": 1.0318, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.14622163772583008, |
| "learning_rate": 0.00017692966565816532, |
| "loss": 1.0985, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.13956372439861298, |
| "learning_rate": 0.0001768026406281642, |
| "loss": 1.102, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14042189717292786, |
| "learning_rate": 0.0001766753127236029, |
| "loss": 1.0284, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14376944303512573, |
| "learning_rate": 0.00017654768244660448, |
| "loss": 1.1452, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14055544137954712, |
| "learning_rate": 0.00017641975030048454, |
| "loss": 1.0306, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14599303901195526, |
| "learning_rate": 0.00017629151678974907, |
| "loss": 1.0838, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.1528831571340561, |
| "learning_rate": 0.00017616298242009251, |
| "loss": 1.1293, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.1404455453157425, |
| "learning_rate": 0.00017603414769839577, |
| "loss": 1.0425, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.14992842078208923, |
| "learning_rate": 0.00017590501313272415, |
| "loss": 1.0928, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.14540541172027588, |
| "learning_rate": 0.00017577557923232546, |
| "loss": 1.0366, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.1451583057641983, |
| "learning_rate": 0.00017564584650762793, |
| "loss": 1.1108, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.155447855591774, |
| "learning_rate": 0.00017551581547023819, |
| "loss": 1.1394, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.1441376656293869, |
| "learning_rate": 0.0001753854866329393, |
| "loss": 1.0264, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.13875485956668854, |
| "learning_rate": 0.00017525486050968875, |
| "loss": 1.0672, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.14158080518245697, |
| "learning_rate": 0.00017512393761561632, |
| "loss": 1.053, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.15505361557006836, |
| "learning_rate": 0.00017499271846702213, |
| "loss": 1.0713, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.14172373712062836, |
| "learning_rate": 0.0001748612035813747, |
| "loss": 1.0544, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.14016349613666534, |
| "learning_rate": 0.00017472939347730856, |
| "loss": 1.0382, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.15148378908634186, |
| "learning_rate": 0.00017459728867462275, |
| "loss": 1.1218, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.1416306346654892, |
| "learning_rate": 0.0001744648896942782, |
| "loss": 1.0895, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.14276988804340363, |
| "learning_rate": 0.00017433219705839616, |
| "loss": 1.0991, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.13922327756881714, |
| "learning_rate": 0.00017419921129025576, |
| "loss": 1.0883, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.1479676216840744, |
| "learning_rate": 0.00017406593291429217, |
| "loss": 1.1083, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.14659778773784637, |
| "learning_rate": 0.0001739323624560945, |
| "loss": 1.0863, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.14685633778572083, |
| "learning_rate": 0.00017379850044240368, |
| "loss": 1.1075, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.14316044747829437, |
| "learning_rate": 0.00017366434740111037, |
| "loss": 1.0584, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.14292864501476288, |
| "learning_rate": 0.00017352990386125292, |
| "loss": 1.1002, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.14412067830562592, |
| "learning_rate": 0.00017339517035301532, |
| "loss": 1.0671, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.14292089641094208, |
| "learning_rate": 0.000173260147407725, |
| "loss": 1.0958, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.1490335911512375, |
| "learning_rate": 0.00017312483555785086, |
| "loss": 1.1074, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14249826967716217, |
| "learning_rate": 0.00017298923533700107, |
| "loss": 1.1546, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14555396139621735, |
| "learning_rate": 0.000172853347279921, |
| "loss": 1.076, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14374902844429016, |
| "learning_rate": 0.00017271717192249116, |
| "loss": 1.0767, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14903804659843445, |
| "learning_rate": 0.00017258070980172494, |
| "loss": 1.0969, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.1533229798078537, |
| "learning_rate": 0.00017244396145576672, |
| "loss": 1.1206, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.14720167219638824, |
| "learning_rate": 0.0001723069274238895, |
| "loss": 1.0655, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.14380764961242676, |
| "learning_rate": 0.00017216960824649303, |
| "loss": 1.0123, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.14513961970806122, |
| "learning_rate": 0.0001720320044651014, |
| "loss": 1.0196, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.14310909807682037, |
| "learning_rate": 0.0001718941166223612, |
| "loss": 1.0278, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.14312389492988586, |
| "learning_rate": 0.00017175594526203905, |
| "loss": 1.0649, |
| "step": 398 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1592, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 398, |
| "total_flos": 2.96912159961514e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|