| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9836888331242157, |
| "eval_steps": 100, |
| "global_step": 796, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0, |
| "grad_norm": 0.1427878886461258, |
| "learning_rate": 2e-05, |
| "loss": 1.3359, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0, |
| "eval_loss": 1.324710488319397, |
| "eval_runtime": 82.0652, |
| "eval_samples_per_second": 31.67, |
| "eval_steps_per_second": 31.67, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.13537771999835968, |
| "learning_rate": 4e-05, |
| "loss": 1.2865, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.14623422920703888, |
| "learning_rate": 6e-05, |
| "loss": 1.3192, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.15388095378875732, |
| "learning_rate": 8e-05, |
| "loss": 1.3244, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.1628686636686325, |
| "learning_rate": 0.0001, |
| "loss": 1.3, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.20623335242271423, |
| "learning_rate": 0.00012, |
| "loss": 1.244, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.1510678231716156, |
| "learning_rate": 0.00014, |
| "loss": 1.2799, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.15237094461917877, |
| "learning_rate": 0.00016, |
| "loss": 1.2979, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.15166334807872772, |
| "learning_rate": 0.00018, |
| "loss": 1.28, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.17794868350028992, |
| "learning_rate": 0.0002, |
| "loss": 1.2298, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.25811436772346497, |
| "learning_rate": 0.0001999998028228211, |
| "loss": 1.1909, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.19142530858516693, |
| "learning_rate": 0.000199999211292062, |
| "loss": 1.178, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.1891462802886963, |
| "learning_rate": 0.00019999822541005537, |
| "loss": 1.1173, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.17077742516994476, |
| "learning_rate": 0.00019999684518068916, |
| "loss": 1.2092, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.15135815739631653, |
| "learning_rate": 0.00019999507060940625, |
| "loss": 1.1439, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.1767009049654007, |
| "learning_rate": 0.00019999290170320485, |
| "loss": 1.1408, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.1310850977897644, |
| "learning_rate": 0.00019999033847063811, |
| "loss": 1.2369, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.12432192265987396, |
| "learning_rate": 0.00019998738092181421, |
| "loss": 1.152, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.12430022656917572, |
| "learning_rate": 0.00019998402906839643, |
| "loss": 1.2111, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.12175025045871735, |
| "learning_rate": 0.00019998028292360286, |
| "loss": 1.1686, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.11878372728824615, |
| "learning_rate": 0.0001999761425022067, |
| "loss": 1.2452, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.11329779773950577, |
| "learning_rate": 0.00019997160782053578, |
| "loss": 1.0964, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.11987729370594025, |
| "learning_rate": 0.00019996667889647288, |
| "loss": 1.1809, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.12245086580514908, |
| "learning_rate": 0.00019996135574945544, |
| "loss": 1.1138, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.1399640142917633, |
| "learning_rate": 0.00019995563840047542, |
| "loss": 1.184, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.13597123324871063, |
| "learning_rate": 0.00019994952687207954, |
| "loss": 1.1872, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.13976556062698364, |
| "learning_rate": 0.00019994302118836883, |
| "loss": 1.1685, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.13106240332126617, |
| "learning_rate": 0.00019993612137499876, |
| "loss": 1.1872, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.12896399199962616, |
| "learning_rate": 0.00019992882745917902, |
| "loss": 1.1462, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.13873620331287384, |
| "learning_rate": 0.00019992113946967353, |
| "loss": 1.1742, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.14103546738624573, |
| "learning_rate": 0.00019991305743680013, |
| "loss": 1.1245, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.1377720981836319, |
| "learning_rate": 0.00019990458139243077, |
| "loss": 1.2045, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.13191157579421997, |
| "learning_rate": 0.000199895711369991, |
| "loss": 1.1716, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.13426551222801208, |
| "learning_rate": 0.00019988644740446022, |
| "loss": 1.1382, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.13733097910881042, |
| "learning_rate": 0.00019987678953237127, |
| "loss": 1.1677, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.12618272006511688, |
| "learning_rate": 0.00019986673779181033, |
| "loss": 1.2195, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.13636991381645203, |
| "learning_rate": 0.00019985629222241694, |
| "loss": 1.1577, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.13234035670757294, |
| "learning_rate": 0.0001998454528653836, |
| "loss": 1.1089, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.1395445317029953, |
| "learning_rate": 0.00019983421976345586, |
| "loss": 1.139, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.1284484714269638, |
| "learning_rate": 0.0001998225929609319, |
| "loss": 1.117, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.13304275274276733, |
| "learning_rate": 0.00019981057250366253, |
| "loss": 1.161, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.13184913992881775, |
| "learning_rate": 0.00019979815843905097, |
| "loss": 1.1826, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.12830235064029694, |
| "learning_rate": 0.0001997853508160526, |
| "loss": 1.0739, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.1346379965543747, |
| "learning_rate": 0.0001997721496851748, |
| "loss": 1.191, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.13036642968654633, |
| "learning_rate": 0.00019975855509847686, |
| "loss": 1.1361, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.12707848846912384, |
| "learning_rate": 0.00019974456710956964, |
| "loss": 1.101, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.12984970211982727, |
| "learning_rate": 0.00019973018577361536, |
| "loss": 1.1085, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.12627972662448883, |
| "learning_rate": 0.00019971541114732741, |
| "loss": 1.1607, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.13074152171611786, |
| "learning_rate": 0.00019970024328897022, |
| "loss": 1.1004, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.1309152990579605, |
| "learning_rate": 0.0001996846822583589, |
| "loss": 1.1378, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.1303664743900299, |
| "learning_rate": 0.000199668728116859, |
| "loss": 1.0956, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.13290388882160187, |
| "learning_rate": 0.00019965238092738643, |
| "loss": 1.1264, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.12805409729480743, |
| "learning_rate": 0.00019963564075440703, |
| "loss": 1.183, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.1399564892053604, |
| "learning_rate": 0.0001996185076639364, |
| "loss": 1.1102, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.12978173792362213, |
| "learning_rate": 0.00019960098172353962, |
| "loss": 1.1634, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.13925811648368835, |
| "learning_rate": 0.00019958306300233098, |
| "loss": 1.0636, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.13258852064609528, |
| "learning_rate": 0.00019956475157097378, |
| "loss": 1.1428, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.1285356879234314, |
| "learning_rate": 0.00019954604750167993, |
| "loss": 1.1664, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.1321210116147995, |
| "learning_rate": 0.00019952695086820975, |
| "loss": 1.1419, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.14086973667144775, |
| "learning_rate": 0.00019950746174587163, |
| "loss": 1.1827, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.1311366856098175, |
| "learning_rate": 0.0001994875802115218, |
| "loss": 1.1971, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.14063993096351624, |
| "learning_rate": 0.0001994673063435639, |
| "loss": 1.1945, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.12695981562137604, |
| "learning_rate": 0.00019944664022194885, |
| "loss": 1.0385, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.14170674979686737, |
| "learning_rate": 0.0001994255819281744, |
| "loss": 1.0883, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.13162197172641754, |
| "learning_rate": 0.0001994041315452849, |
| "loss": 1.153, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.1326906979084015, |
| "learning_rate": 0.0001993822891578708, |
| "loss": 1.1186, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.13306689262390137, |
| "learning_rate": 0.00019936005485206851, |
| "loss": 1.1587, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.13625258207321167, |
| "learning_rate": 0.00019933742871556, |
| "loss": 1.1339, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.13773800432682037, |
| "learning_rate": 0.00019931441083757245, |
| "loss": 1.1944, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.15291447937488556, |
| "learning_rate": 0.00019929100130887782, |
| "loss": 1.1028, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.15140767395496368, |
| "learning_rate": 0.0001992672002217926, |
| "loss": 1.1896, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.1344233751296997, |
| "learning_rate": 0.0001992430076701775, |
| "loss": 1.0561, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.13877920806407928, |
| "learning_rate": 0.0001992184237494368, |
| "loss": 1.1108, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.1359027922153473, |
| "learning_rate": 0.00019919344855651833, |
| "loss": 1.1563, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.14610135555267334, |
| "learning_rate": 0.0001991680821899128, |
| "loss": 1.1299, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.14259958267211914, |
| "learning_rate": 0.00019914232474965365, |
| "loss": 1.1021, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.14158602058887482, |
| "learning_rate": 0.00019911617633731638, |
| "loss": 1.0787, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.1418074518442154, |
| "learning_rate": 0.00019908963705601846, |
| "loss": 1.1359, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.12850767374038696, |
| "learning_rate": 0.0001990627070104187, |
| "loss": 1.1373, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.1312914341688156, |
| "learning_rate": 0.0001990353863067169, |
| "loss": 1.0832, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.13280583918094635, |
| "learning_rate": 0.0001990076750526534, |
| "loss": 1.0462, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.13617292046546936, |
| "learning_rate": 0.00019897957335750878, |
| "loss": 1.1059, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.15030132234096527, |
| "learning_rate": 0.00019895108133210335, |
| "loss": 1.0761, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.14291270077228546, |
| "learning_rate": 0.00019892219908879653, |
| "loss": 1.1217, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.1685461699962616, |
| "learning_rate": 0.00019889292674148682, |
| "loss": 1.1607, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13756121695041656, |
| "learning_rate": 0.00019886326440561093, |
| "loss": 1.0914, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13901358842849731, |
| "learning_rate": 0.0001988332121981436, |
| "loss": 1.1234, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13816247880458832, |
| "learning_rate": 0.00019880277023759702, |
| "loss": 1.1583, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.13309679925441742, |
| "learning_rate": 0.00019877193864402038, |
| "loss": 1.163, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.13356180489063263, |
| "learning_rate": 0.0001987407175389994, |
| "loss": 1.1301, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.1388397067785263, |
| "learning_rate": 0.00019870910704565588, |
| "loss": 1.1326, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.13303454220294952, |
| "learning_rate": 0.0001986771072886472, |
| "loss": 1.0779, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.1316283941268921, |
| "learning_rate": 0.00019864471839416576, |
| "loss": 1.0935, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.1348309963941574, |
| "learning_rate": 0.00019861194048993863, |
| "loss": 1.1816, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.1341564655303955, |
| "learning_rate": 0.00019857877370522685, |
| "loss": 1.1187, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.13689687848091125, |
| "learning_rate": 0.0001985452181708251, |
| "loss": 1.1637, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.13348707556724548, |
| "learning_rate": 0.0001985112740190611, |
| "loss": 1.1026, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.13700643181800842, |
| "learning_rate": 0.00019847694138379506, |
| "loss": 1.1508, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.13654476404190063, |
| "learning_rate": 0.00019844222040041928, |
| "loss": 1.1668, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.15331624448299408, |
| "learning_rate": 0.0001984071112058574, |
| "loss": 1.1121, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.25, |
| "eval_loss": 1.1294280290603638, |
| "eval_runtime": 81.6595, |
| "eval_samples_per_second": 31.827, |
| "eval_steps_per_second": 31.827, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.14425526559352875, |
| "learning_rate": 0.0001983716139385641, |
| "loss": 1.1447, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.13741208612918854, |
| "learning_rate": 0.00019833572873852444, |
| "loss": 1.1001, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.1282232254743576, |
| "learning_rate": 0.0001982994557472532, |
| "loss": 1.1199, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.13605354726314545, |
| "learning_rate": 0.00019826279510779454, |
| "loss": 1.154, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.13503985106945038, |
| "learning_rate": 0.00019822574696472126, |
| "loss": 1.0565, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.13878273963928223, |
| "learning_rate": 0.00019818831146413434, |
| "loss": 1.106, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.141740083694458, |
| "learning_rate": 0.00019815048875366234, |
| "loss": 1.0848, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.13799507915973663, |
| "learning_rate": 0.0001981122789824607, |
| "loss": 1.1582, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.1441466212272644, |
| "learning_rate": 0.0001980736823012114, |
| "loss": 1.0787, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1377534121274948, |
| "learning_rate": 0.0001980346988621221, |
| "loss": 1.1092, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1400901973247528, |
| "learning_rate": 0.00019799532881892564, |
| "loss": 1.0549, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.13621239364147186, |
| "learning_rate": 0.00019795557232687956, |
| "loss": 1.0991, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1324262171983719, |
| "learning_rate": 0.0001979154295427653, |
| "loss": 1.0583, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.13273654878139496, |
| "learning_rate": 0.0001978749006248877, |
| "loss": 1.1504, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.14279481768608093, |
| "learning_rate": 0.00019783398573307428, |
| "loss": 1.0941, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.1432316154241562, |
| "learning_rate": 0.00019779268502867473, |
| "loss": 1.1111, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.14505276083946228, |
| "learning_rate": 0.00019775099867456013, |
| "loss": 1.0941, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.13935014605522156, |
| "learning_rate": 0.0001977089268351225, |
| "loss": 1.0597, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.14532430469989777, |
| "learning_rate": 0.0001976664696762739, |
| "loss": 1.1116, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.14096760749816895, |
| "learning_rate": 0.00019762362736544607, |
| "loss": 1.1381, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.1470746099948883, |
| "learning_rate": 0.00019758040007158948, |
| "loss": 1.1215, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.13610850274562836, |
| "learning_rate": 0.00019753678796517282, |
| "loss": 1.136, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.1399529129266739, |
| "learning_rate": 0.00019749279121818235, |
| "loss": 1.1035, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.13626012206077576, |
| "learning_rate": 0.00019744841000412123, |
| "loss": 1.1248, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.13053762912750244, |
| "learning_rate": 0.0001974036444980086, |
| "loss": 1.1286, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.14427675306797028, |
| "learning_rate": 0.00019735849487637929, |
| "loss": 1.2792, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.14464688301086426, |
| "learning_rate": 0.0001973129613172827, |
| "loss": 1.1091, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.12712322175502777, |
| "learning_rate": 0.0001972670440002825, |
| "loss": 1.1219, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.13343971967697144, |
| "learning_rate": 0.00019722074310645553, |
| "loss": 1.1296, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.15525247156620026, |
| "learning_rate": 0.00019717405881839145, |
| "loss": 1.159, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.12908953428268433, |
| "learning_rate": 0.0001971269913201918, |
| "loss": 1.0821, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.24165280163288116, |
| "learning_rate": 0.00019707954079746927, |
| "loss": 1.1388, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.1432817280292511, |
| "learning_rate": 0.00019703170743734706, |
| "loss": 1.1184, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.14007362723350525, |
| "learning_rate": 0.00019698349142845814, |
| "loss": 1.1576, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.14235983788967133, |
| "learning_rate": 0.00019693489296094443, |
| "loss": 1.0847, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.1430092453956604, |
| "learning_rate": 0.00019688591222645607, |
| "loss": 1.1562, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.13986627757549286, |
| "learning_rate": 0.00019683654941815077, |
| "loss": 1.124, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.13933469355106354, |
| "learning_rate": 0.00019678680473069293, |
| "loss": 1.1001, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.13476844131946564, |
| "learning_rate": 0.00019673667836025283, |
| "loss": 1.1186, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.13418316841125488, |
| "learning_rate": 0.00019668617050450603, |
| "loss": 1.1309, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.12794847786426544, |
| "learning_rate": 0.00019663528136263246, |
| "loss": 1.1142, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.1326293647289276, |
| "learning_rate": 0.00019658401113531565, |
| "loss": 1.0503, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.14793147146701813, |
| "learning_rate": 0.000196532360024742, |
| "loss": 1.2104, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.13718444108963013, |
| "learning_rate": 0.00019648032823459994, |
| "loss": 1.1685, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.14404018223285675, |
| "learning_rate": 0.00019642791597007902, |
| "loss": 1.09, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.14241506159305573, |
| "learning_rate": 0.00019637512343786937, |
| "loss": 1.1355, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.14581352472305298, |
| "learning_rate": 0.00019632195084616063, |
| "loss": 1.1005, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.14792676270008087, |
| "learning_rate": 0.00019626839840464119, |
| "loss": 1.1168, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.1484677940607071, |
| "learning_rate": 0.00019621446632449744, |
| "loss": 1.1138, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.15315671265125275, |
| "learning_rate": 0.0001961601548184129, |
| "loss": 1.1636, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.14746810495853424, |
| "learning_rate": 0.0001961054641005674, |
| "loss": 1.0881, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.1407732516527176, |
| "learning_rate": 0.00019605039438663614, |
| "loss": 1.0347, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.14150719344615936, |
| "learning_rate": 0.0001959949458937889, |
| "loss": 1.1112, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.16782569885253906, |
| "learning_rate": 0.0001959391188406893, |
| "loss": 1.0496, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.1452791690826416, |
| "learning_rate": 0.0001958829134474937, |
| "loss": 1.1185, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.145284965634346, |
| "learning_rate": 0.00019582632993585052, |
| "loss": 1.1431, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.15500612556934357, |
| "learning_rate": 0.00019576936852889936, |
| "loss": 1.1679, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.1416521966457367, |
| "learning_rate": 0.00019571202945126994, |
| "loss": 1.1322, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.1465340405702591, |
| "learning_rate": 0.00019565431292908146, |
| "loss": 1.0693, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.13601765036582947, |
| "learning_rate": 0.0001955962191899415, |
| "loss": 1.0676, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.14759162068367004, |
| "learning_rate": 0.0001955377484629453, |
| "loss": 1.0506, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.14839032292366028, |
| "learning_rate": 0.00019547890097867468, |
| "loss": 1.1245, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.1440214365720749, |
| "learning_rate": 0.0001954196769691973, |
| "loss": 1.1672, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.1372719258069992, |
| "learning_rate": 0.00019536007666806556, |
| "loss": 1.1084, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.14372558891773224, |
| "learning_rate": 0.00019530010031031586, |
| "loss": 1.1679, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.13789264857769012, |
| "learning_rate": 0.00019523974813246767, |
| "loss": 1.1253, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.14368915557861328, |
| "learning_rate": 0.0001951790203725223, |
| "loss": 1.085, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.1380469799041748, |
| "learning_rate": 0.00019511791726996243, |
| "loss": 1.1379, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.13288158178329468, |
| "learning_rate": 0.00019505643906575073, |
| "loss": 1.113, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.1390606164932251, |
| "learning_rate": 0.0001949945860023292, |
| "loss": 1.095, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.14271940290927887, |
| "learning_rate": 0.0001949323583236181, |
| "loss": 1.1063, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.13795693218708038, |
| "learning_rate": 0.00019486975627501502, |
| "loss": 1.0628, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.14073535799980164, |
| "learning_rate": 0.0001948067801033938, |
| "loss": 1.1192, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.138822540640831, |
| "learning_rate": 0.0001947434300571038, |
| "loss": 1.1299, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.13592712581157684, |
| "learning_rate": 0.0001946797063859686, |
| "loss": 1.0868, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.1379610300064087, |
| "learning_rate": 0.00019461560934128533, |
| "loss": 1.069, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.14286787807941437, |
| "learning_rate": 0.00019455113917582346, |
| "loss": 1.139, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.14168201386928558, |
| "learning_rate": 0.0001944862961438239, |
| "loss": 1.1405, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.1345077008008957, |
| "learning_rate": 0.000194421080500998, |
| "loss": 1.1039, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.1363426297903061, |
| "learning_rate": 0.00019435549250452645, |
| "loss": 1.1056, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.14109478890895844, |
| "learning_rate": 0.00019428953241305838, |
| "loss": 1.0927, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.14332321286201477, |
| "learning_rate": 0.0001942232004867103, |
| "loss": 1.0305, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.15956294536590576, |
| "learning_rate": 0.00019415649698706507, |
| "loss": 1.1245, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.14164718985557556, |
| "learning_rate": 0.0001940894221771708, |
| "loss": 1.0963, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.14296875894069672, |
| "learning_rate": 0.00019402197632153992, |
| "loss": 1.0853, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.12994709610939026, |
| "learning_rate": 0.00019395415968614813, |
| "loss": 1.0503, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.1399766504764557, |
| "learning_rate": 0.00019388597253843334, |
| "loss": 1.0623, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.14874404668807983, |
| "learning_rate": 0.00019381741514729443, |
| "loss": 1.0885, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.1453857719898224, |
| "learning_rate": 0.00019374848778309055, |
| "loss": 1.1702, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.14976643025875092, |
| "learning_rate": 0.0001936791907176397, |
| "loss": 1.0834, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.1418897956609726, |
| "learning_rate": 0.00019360952422421793, |
| "loss": 1.0918, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.14602817595005035, |
| "learning_rate": 0.00019353948857755803, |
| "loss": 1.0825, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.14669157564640045, |
| "learning_rate": 0.00019346908405384867, |
| "loss": 1.0973, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.14327263832092285, |
| "learning_rate": 0.00019339831093073318, |
| "loss": 1.1191, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.13806897401809692, |
| "learning_rate": 0.0001933271694873084, |
| "loss": 1.1504, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.13992969691753387, |
| "learning_rate": 0.00019325566000412376, |
| "loss": 1.0865, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.14395759999752045, |
| "learning_rate": 0.00019318378276318, |
| "loss": 1.1204, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.1409691572189331, |
| "learning_rate": 0.0001931115380479281, |
| "loss": 1.0766, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.1448824405670166, |
| "learning_rate": 0.00019303892614326836, |
| "loss": 1.1741, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.142364963889122, |
| "learning_rate": 0.00019296594733554892, |
| "loss": 1.1716, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 1.109603762626648, |
| "eval_runtime": 81.7249, |
| "eval_samples_per_second": 31.802, |
| "eval_steps_per_second": 31.802, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.1372615098953247, |
| "learning_rate": 0.00019289260191256483, |
| "loss": 1.1084, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.13863563537597656, |
| "learning_rate": 0.0001928188901635571, |
| "loss": 1.0546, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.13055531680583954, |
| "learning_rate": 0.00019274481237921114, |
| "loss": 1.018, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.14135099947452545, |
| "learning_rate": 0.00019267036885165588, |
| "loss": 1.1131, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.14308464527130127, |
| "learning_rate": 0.0001925955598744627, |
| "loss": 1.0723, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.13907764852046967, |
| "learning_rate": 0.00019252038574264405, |
| "loss": 1.1607, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.13771073520183563, |
| "learning_rate": 0.00019244484675265232, |
| "loss": 1.172, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.13774815201759338, |
| "learning_rate": 0.00019236894320237894, |
| "loss": 1.0622, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.1426474153995514, |
| "learning_rate": 0.0001922926753911527, |
| "loss": 1.0368, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.1380661278963089, |
| "learning_rate": 0.00019221604361973919, |
| "loss": 1.0873, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.14044702053070068, |
| "learning_rate": 0.00019213904819033903, |
| "loss": 1.0901, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.1415887176990509, |
| "learning_rate": 0.00019206168940658712, |
| "loss": 1.1061, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.1580592840909958, |
| "learning_rate": 0.00019198396757355118, |
| "loss": 1.1073, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.14094668626785278, |
| "learning_rate": 0.00019190588299773062, |
| "loss": 1.1781, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.14229640364646912, |
| "learning_rate": 0.00019182743598705542, |
| "loss": 1.1095, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.140314981341362, |
| "learning_rate": 0.00019174862685088472, |
| "loss": 1.1534, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.160028338432312, |
| "learning_rate": 0.00019166945590000584, |
| "loss": 1.087, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.14278572797775269, |
| "learning_rate": 0.0001915899234466328, |
| "loss": 1.1583, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.13695856928825378, |
| "learning_rate": 0.0001915100298044054, |
| "loss": 1.1151, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.14235751330852509, |
| "learning_rate": 0.00019142977528838762, |
| "loss": 1.1111, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.15174664556980133, |
| "learning_rate": 0.00019134916021506666, |
| "loss": 1.1438, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.15249325335025787, |
| "learning_rate": 0.0001912681849023516, |
| "loss": 1.1575, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.14303787052631378, |
| "learning_rate": 0.00019118684966957207, |
| "loss": 1.1302, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.1405183970928192, |
| "learning_rate": 0.00019110515483747716, |
| "loss": 1.1157, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.1475205421447754, |
| "learning_rate": 0.00019102310072823393, |
| "loss": 1.1175, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.14406634867191315, |
| "learning_rate": 0.0001909406876654264, |
| "loss": 1.0578, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.13999773561954498, |
| "learning_rate": 0.00019085791597405404, |
| "loss": 1.0865, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.1409848928451538, |
| "learning_rate": 0.00019077478598053063, |
| "loss": 1.1297, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.14548417925834656, |
| "learning_rate": 0.00019069129801268294, |
| "loss": 1.1524, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.13622736930847168, |
| "learning_rate": 0.00019060745239974936, |
| "loss": 1.0744, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.14302954077720642, |
| "learning_rate": 0.0001905232494723788, |
| "loss": 1.1469, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.15202221274375916, |
| "learning_rate": 0.0001904386895626291, |
| "loss": 1.0693, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.14072120189666748, |
| "learning_rate": 0.00019035377300396597, |
| "loss": 1.0584, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.13941141963005066, |
| "learning_rate": 0.00019026850013126157, |
| "loss": 1.1257, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.1389845460653305, |
| "learning_rate": 0.0001901828712807932, |
| "loss": 1.0003, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.1431329846382141, |
| "learning_rate": 0.0001900968867902419, |
| "loss": 1.0795, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.15022633969783783, |
| "learning_rate": 0.00019001054699869133, |
| "loss": 1.1427, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.1578160673379898, |
| "learning_rate": 0.00018992385224662623, |
| "loss": 1.13, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.13778769969940186, |
| "learning_rate": 0.00018983680287593105, |
| "loss": 1.0739, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.1454969048500061, |
| "learning_rate": 0.00018974939922988883, |
| "loss": 1.0864, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.13545964658260345, |
| "learning_rate": 0.00018966164165317966, |
| "loss": 1.0169, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.13648608326911926, |
| "learning_rate": 0.00018957353049187936, |
| "loss": 1.0732, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.14080677926540375, |
| "learning_rate": 0.00018948506609345813, |
| "loss": 1.0579, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.14503297209739685, |
| "learning_rate": 0.00018939624880677918, |
| "loss": 1.0755, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.15316741168498993, |
| "learning_rate": 0.00018930707898209733, |
| "loss": 1.0885, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.14839263260364532, |
| "learning_rate": 0.0001892175569710577, |
| "loss": 1.121, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.13919925689697266, |
| "learning_rate": 0.00018912768312669424, |
| "loss": 1.1039, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.13975974917411804, |
| "learning_rate": 0.00018903745780342839, |
| "loss": 1.1454, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.13851100206375122, |
| "learning_rate": 0.0001889468813570676, |
| "loss": 1.0905, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.14839564263820648, |
| "learning_rate": 0.00018885595414480405, |
| "loss": 1.1002, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.1421942263841629, |
| "learning_rate": 0.00018876467652521317, |
| "loss": 1.093, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.14453786611557007, |
| "learning_rate": 0.0001886730488582522, |
| "loss": 1.0278, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.13856688141822815, |
| "learning_rate": 0.0001885810715052589, |
| "loss": 1.079, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.14092479646205902, |
| "learning_rate": 0.00018848874482894993, |
| "loss": 1.0608, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.14616413414478302, |
| "learning_rate": 0.0001883960691934196, |
| "loss": 1.1097, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.1410474181175232, |
| "learning_rate": 0.00018830304496413822, |
| "loss": 1.0577, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.15473878383636475, |
| "learning_rate": 0.000188209672507951, |
| "loss": 1.1453, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.14370983839035034, |
| "learning_rate": 0.00018811595219307622, |
| "loss": 1.1732, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.14861780405044556, |
| "learning_rate": 0.00018802188438910405, |
| "loss": 1.1471, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.1523188352584839, |
| "learning_rate": 0.000187927469466995, |
| "loss": 1.129, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.14366289973258972, |
| "learning_rate": 0.00018783270779907838, |
| "loss": 1.0792, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.1363295018672943, |
| "learning_rate": 0.00018773759975905098, |
| "loss": 0.9848, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.1438857764005661, |
| "learning_rate": 0.00018764214572197552, |
| "loss": 1.1371, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.13751162588596344, |
| "learning_rate": 0.00018754634606427914, |
| "loss": 1.0557, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.1384708732366562, |
| "learning_rate": 0.00018745020116375197, |
| "loss": 1.0664, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.14196960628032684, |
| "learning_rate": 0.00018735371139954558, |
| "loss": 1.0828, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.15374121069908142, |
| "learning_rate": 0.00018725687715217163, |
| "loss": 1.073, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 0.14955537021160126, |
| "learning_rate": 0.0001871596988035001, |
| "loss": 1.1444, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.13760650157928467, |
| "learning_rate": 0.00018706217673675811, |
| "loss": 1.088, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.17072008550167084, |
| "learning_rate": 0.00018696431133652817, |
| "loss": 1.07, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.14745061099529266, |
| "learning_rate": 0.00018686610298874676, |
| "loss": 1.1105, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.14695587754249573, |
| "learning_rate": 0.00018676755208070275, |
| "loss": 1.0612, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.15686020255088806, |
| "learning_rate": 0.00018666865900103597, |
| "loss": 1.0933, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.14162233471870422, |
| "learning_rate": 0.00018656942413973555, |
| "loss": 1.0832, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.14662939310073853, |
| "learning_rate": 0.00018646984788813856, |
| "loss": 1.1175, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.13886839151382446, |
| "learning_rate": 0.0001863699306389282, |
| "loss": 1.1221, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.13897326588630676, |
| "learning_rate": 0.00018626967278613253, |
| "loss": 1.0767, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.13283655047416687, |
| "learning_rate": 0.0001861690747251228, |
| "loss": 1.1397, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.14036604762077332, |
| "learning_rate": 0.0001860681368526118, |
| "loss": 1.0965, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.1449379026889801, |
| "learning_rate": 0.00018596685956665245, |
| "loss": 1.1262, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.14264287054538727, |
| "learning_rate": 0.00018586524326663615, |
| "loss": 1.1317, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.14677459001541138, |
| "learning_rate": 0.00018576328835329117, |
| "loss": 1.0785, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.14834077656269073, |
| "learning_rate": 0.00018566099522868119, |
| "loss": 1.0892, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.15325355529785156, |
| "learning_rate": 0.00018555836429620358, |
| "loss": 1.0843, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.14825651049613953, |
| "learning_rate": 0.00018545539596058795, |
| "loss": 1.1288, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.14722499251365662, |
| "learning_rate": 0.00018535209062789433, |
| "loss": 1.1391, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.14388781785964966, |
| "learning_rate": 0.00018524844870551185, |
| "loss": 1.1013, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.1455835998058319, |
| "learning_rate": 0.00018514447060215698, |
| "loss": 1.0811, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.14625433087348938, |
| "learning_rate": 0.00018504015672787184, |
| "loss": 1.0854, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.13978470861911774, |
| "learning_rate": 0.00018493550749402278, |
| "loss": 1.1398, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.1447162628173828, |
| "learning_rate": 0.00018483052331329857, |
| "loss": 1.0553, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.13894303143024445, |
| "learning_rate": 0.00018472520459970898, |
| "loss": 1.0305, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.1372181624174118, |
| "learning_rate": 0.00018461955176858285, |
| "loss": 1.021, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.14599645137786865, |
| "learning_rate": 0.0001845135652365668, |
| "loss": 1.0808, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.1599220335483551, |
| "learning_rate": 0.00018440724542162328, |
| "loss": 1.1143, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 0.1450476050376892, |
| "learning_rate": 0.00018430059274302917, |
| "loss": 1.0508, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1439283937215805, |
| "learning_rate": 0.00018419360762137395, |
| "loss": 1.0592, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1410531848669052, |
| "learning_rate": 0.00018408629047855804, |
| "loss": 1.0632, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1468774974346161, |
| "learning_rate": 0.00018397864173779133, |
| "loss": 1.056, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.1467033177614212, |
| "learning_rate": 0.00018387066182359133, |
| "loss": 1.1122, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.75, |
| "eval_loss": 1.0955116748809814, |
| "eval_runtime": 81.7775, |
| "eval_samples_per_second": 31.781, |
| "eval_steps_per_second": 31.781, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.14950688183307648, |
| "learning_rate": 0.00018376235116178148, |
| "loss": 1.0698, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.142381951212883, |
| "learning_rate": 0.00018365371017948964, |
| "loss": 1.0528, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.1410701423883438, |
| "learning_rate": 0.0001835447393051463, |
| "loss": 1.0785, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.14708860218524933, |
| "learning_rate": 0.00018343543896848273, |
| "loss": 1.0142, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.1467617303133011, |
| "learning_rate": 0.00018332580960052965, |
| "loss": 1.0973, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.15761792659759521, |
| "learning_rate": 0.00018321585163361527, |
| "loss": 1.1745, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.13972119987010956, |
| "learning_rate": 0.00018310556550136357, |
| "loss": 1.0832, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 0.1481141895055771, |
| "learning_rate": 0.00018299495163869275, |
| "loss": 1.1573, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.14397870004177094, |
| "learning_rate": 0.0001828840104818134, |
| "loss": 1.171, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.14765049517154694, |
| "learning_rate": 0.0001827727424682268, |
| "loss": 1.0544, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.14956365525722504, |
| "learning_rate": 0.00018266114803672318, |
| "loss": 1.1755, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.15122386813163757, |
| "learning_rate": 0.00018254922762738008, |
| "loss": 1.1547, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14254115521907806, |
| "learning_rate": 0.00018243698168156054, |
| "loss": 1.1075, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14294452965259552, |
| "learning_rate": 0.00018232441064191125, |
| "loss": 1.1419, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14777772128582, |
| "learning_rate": 0.0001822115149523611, |
| "loss": 1.1662, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.14944781363010406, |
| "learning_rate": 0.0001820982950581191, |
| "loss": 1.1497, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.1466801017522812, |
| "learning_rate": 0.00018198475140567287, |
| "loss": 1.1374, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.15346656739711761, |
| "learning_rate": 0.00018187088444278674, |
| "loss": 1.1356, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.15271005034446716, |
| "learning_rate": 0.00018175669461850005, |
| "loss": 1.0845, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.14452996850013733, |
| "learning_rate": 0.00018164218238312535, |
| "loss": 1.1162, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14632536470890045, |
| "learning_rate": 0.00018152734818824658, |
| "loss": 1.0187, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14935997128486633, |
| "learning_rate": 0.00018141219248671745, |
| "loss": 1.1167, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14043933153152466, |
| "learning_rate": 0.0001812967157326595, |
| "loss": 1.0044, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.14850106835365295, |
| "learning_rate": 0.00018118091838146029, |
| "loss": 1.1226, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.14655061066150665, |
| "learning_rate": 0.00018106480088977172, |
| "loss": 1.0508, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.14721763134002686, |
| "learning_rate": 0.00018094836371550824, |
| "loss": 1.0659, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.1433349996805191, |
| "learning_rate": 0.00018083160731784486, |
| "loss": 1.147, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.13528144359588623, |
| "learning_rate": 0.00018071453215721554, |
| "loss": 1.0388, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.15466062724590302, |
| "learning_rate": 0.0001805971386953113, |
| "loss": 1.0649, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.15163114666938782, |
| "learning_rate": 0.00018047942739507836, |
| "loss": 1.1454, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.14693276584148407, |
| "learning_rate": 0.0001803613987207163, |
| "loss": 1.1137, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.14229321479797363, |
| "learning_rate": 0.00018024305313767646, |
| "loss": 1.0153, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.13863018155097961, |
| "learning_rate": 0.00018012439111265974, |
| "loss": 1.0491, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.1422068327665329, |
| "learning_rate": 0.000180005413113615, |
| "loss": 1.0952, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.1419857293367386, |
| "learning_rate": 0.00017988611960973713, |
| "loss": 1.0532, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.1446901261806488, |
| "learning_rate": 0.00017976651107146533, |
| "loss": 1.0477, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.14558811485767365, |
| "learning_rate": 0.00017964658797048108, |
| "loss": 1.1481, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.15488363802433014, |
| "learning_rate": 0.0001795263507797063, |
| "loss": 1.1302, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.14942613244056702, |
| "learning_rate": 0.00017940579997330165, |
| "loss": 1.0698, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.14417564868927002, |
| "learning_rate": 0.00017928493602666445, |
| "loss": 1.0867, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.14839497208595276, |
| "learning_rate": 0.0001791637594164269, |
| "loss": 1.0124, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.1415972113609314, |
| "learning_rate": 0.00017904227062045437, |
| "loss": 1.0958, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.143202543258667, |
| "learning_rate": 0.00017892047011784312, |
| "loss": 1.0808, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.14291773736476898, |
| "learning_rate": 0.00017879835838891875, |
| "loss": 1.1386, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.1504325121641159, |
| "learning_rate": 0.00017867593591523422, |
| "loss": 1.0804, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.1444767862558365, |
| "learning_rate": 0.00017855320317956784, |
| "loss": 1.1207, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.14493699371814728, |
| "learning_rate": 0.00017843016066592158, |
| "loss": 1.0954, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 0.14571166038513184, |
| "learning_rate": 0.00017830680885951887, |
| "loss": 1.0676, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.14583171904087067, |
| "learning_rate": 0.000178183148246803, |
| "loss": 1.0674, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.15080390870571136, |
| "learning_rate": 0.00017805917931543492, |
| "loss": 1.0757, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.14790864288806915, |
| "learning_rate": 0.00017793490255429157, |
| "loss": 1.1005, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.14861677587032318, |
| "learning_rate": 0.00017781031845346375, |
| "loss": 1.0645, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.15099036693572998, |
| "learning_rate": 0.00017768542750425426, |
| "loss": 1.1306, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.14353971183300018, |
| "learning_rate": 0.00017756023019917607, |
| "loss": 1.0834, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.14582550525665283, |
| "learning_rate": 0.00017743472703195015, |
| "loss": 1.0722, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.14268234372138977, |
| "learning_rate": 0.00017730891849750377, |
| "loss": 1.092, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.1424105316400528, |
| "learning_rate": 0.00017718280509196828, |
| "loss": 1.1355, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.13972117006778717, |
| "learning_rate": 0.0001770563873126775, |
| "loss": 1.0318, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.14622163772583008, |
| "learning_rate": 0.00017692966565816532, |
| "loss": 1.0985, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.13956372439861298, |
| "learning_rate": 0.0001768026406281642, |
| "loss": 1.102, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14042189717292786, |
| "learning_rate": 0.0001766753127236029, |
| "loss": 1.0284, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14376944303512573, |
| "learning_rate": 0.00017654768244660448, |
| "loss": 1.1452, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14055544137954712, |
| "learning_rate": 0.00017641975030048454, |
| "loss": 1.0306, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.14599303901195526, |
| "learning_rate": 0.00017629151678974907, |
| "loss": 1.0838, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.1528831571340561, |
| "learning_rate": 0.00017616298242009251, |
| "loss": 1.1293, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.1404455453157425, |
| "learning_rate": 0.00017603414769839577, |
| "loss": 1.0425, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.14992842078208923, |
| "learning_rate": 0.00017590501313272415, |
| "loss": 1.0928, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.14540541172027588, |
| "learning_rate": 0.00017577557923232546, |
| "loss": 1.0366, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.1451583057641983, |
| "learning_rate": 0.00017564584650762793, |
| "loss": 1.1108, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.155447855591774, |
| "learning_rate": 0.00017551581547023819, |
| "loss": 1.1394, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.1441376656293869, |
| "learning_rate": 0.0001753854866329393, |
| "loss": 1.0264, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 0.13875485956668854, |
| "learning_rate": 0.00017525486050968875, |
| "loss": 1.0672, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.14158080518245697, |
| "learning_rate": 0.00017512393761561632, |
| "loss": 1.053, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.15505361557006836, |
| "learning_rate": 0.00017499271846702213, |
| "loss": 1.0713, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.14172373712062836, |
| "learning_rate": 0.0001748612035813747, |
| "loss": 1.0544, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.14016349613666534, |
| "learning_rate": 0.00017472939347730856, |
| "loss": 1.0382, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.15148378908634186, |
| "learning_rate": 0.00017459728867462275, |
| "loss": 1.1218, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.1416306346654892, |
| "learning_rate": 0.0001744648896942782, |
| "loss": 1.0895, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.14276988804340363, |
| "learning_rate": 0.00017433219705839616, |
| "loss": 1.0991, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.13922327756881714, |
| "learning_rate": 0.00017419921129025576, |
| "loss": 1.0883, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.1479676216840744, |
| "learning_rate": 0.00017406593291429217, |
| "loss": 1.1083, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.14659778773784637, |
| "learning_rate": 0.0001739323624560945, |
| "loss": 1.0863, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.14685633778572083, |
| "learning_rate": 0.00017379850044240368, |
| "loss": 1.1075, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.14316044747829437, |
| "learning_rate": 0.00017366434740111037, |
| "loss": 1.0584, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.14292864501476288, |
| "learning_rate": 0.00017352990386125292, |
| "loss": 1.1002, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.14412067830562592, |
| "learning_rate": 0.00017339517035301532, |
| "loss": 1.0671, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.14292089641094208, |
| "learning_rate": 0.000173260147407725, |
| "loss": 1.0958, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.1490335911512375, |
| "learning_rate": 0.00017312483555785086, |
| "loss": 1.1074, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14249826967716217, |
| "learning_rate": 0.00017298923533700107, |
| "loss": 1.1546, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14555396139621735, |
| "learning_rate": 0.000172853347279921, |
| "loss": 1.076, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14374902844429016, |
| "learning_rate": 0.00017271717192249116, |
| "loss": 1.0767, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.14903804659843445, |
| "learning_rate": 0.00017258070980172494, |
| "loss": 1.0969, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.1533229798078537, |
| "learning_rate": 0.00017244396145576672, |
| "loss": 1.1206, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.14720167219638824, |
| "learning_rate": 0.0001723069274238895, |
| "loss": 1.0655, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.14380764961242676, |
| "learning_rate": 0.00017216960824649303, |
| "loss": 1.0123, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.14513961970806122, |
| "learning_rate": 0.0001720320044651014, |
| "loss": 1.0196, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.14310909807682037, |
| "learning_rate": 0.0001718941166223612, |
| "loss": 1.0278, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.14312389492988586, |
| "learning_rate": 0.00017175594526203905, |
| "loss": 1.0649, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.1408112645149231, |
| "learning_rate": 0.00017161749092901984, |
| "loss": 1.0793, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.14593806862831116, |
| "learning_rate": 0.00017147875416930416, |
| "loss": 1.0474, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.083612322807312, |
| "eval_runtime": 81.6893, |
| "eval_samples_per_second": 31.816, |
| "eval_steps_per_second": 31.816, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.14213843643665314, |
| "learning_rate": 0.00017133973553000654, |
| "loss": 1.0476, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.14211952686309814, |
| "learning_rate": 0.00017120043555935298, |
| "loss": 1.0386, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.15638479590415955, |
| "learning_rate": 0.00017106085480667903, |
| "loss": 1.1145, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.1525896191596985, |
| "learning_rate": 0.00017092099382242748, |
| "loss": 1.1124, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.13780884444713593, |
| "learning_rate": 0.0001707808531581462, |
| "loss": 1.0208, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.13917113840579987, |
| "learning_rate": 0.00017064043336648599, |
| "loss": 1.0143, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.14122170209884644, |
| "learning_rate": 0.00017049973500119845, |
| "loss": 0.9977, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.14243052899837494, |
| "learning_rate": 0.0001703587586171337, |
| "loss": 0.9933, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.14186780154705048, |
| "learning_rate": 0.0001702175047702382, |
| "loss": 0.9567, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.1524883359670639, |
| "learning_rate": 0.00017007597401755276, |
| "loss": 0.9874, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.15759988129138947, |
| "learning_rate": 0.00016993416691720998, |
| "loss": 1.0292, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.15617264807224274, |
| "learning_rate": 0.00016979208402843237, |
| "loss": 1.0168, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.15921927988529205, |
| "learning_rate": 0.00016964972591153, |
| "loss": 1.0209, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.1540677845478058, |
| "learning_rate": 0.00016950709312789833, |
| "loss": 1.0013, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.156731516122818, |
| "learning_rate": 0.00016936418624001592, |
| "loss": 1.0171, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.15679331123828888, |
| "learning_rate": 0.00016922100581144228, |
| "loss": 1.0137, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 0.15117546916007996, |
| "learning_rate": 0.00016907755240681577, |
| "loss": 0.9041, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.1581723839044571, |
| "learning_rate": 0.00016893382659185105, |
| "loss": 0.9891, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.15231919288635254, |
| "learning_rate": 0.00016878982893333717, |
| "loss": 0.9626, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.15532514452934265, |
| "learning_rate": 0.00016864555999913518, |
| "loss": 0.9639, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.16158603131771088, |
| "learning_rate": 0.00016850102035817588, |
| "loss": 1.0156, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.16860714554786682, |
| "learning_rate": 0.0001683562105804577, |
| "loss": 1.0279, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.1704617142677307, |
| "learning_rate": 0.00016821113123704424, |
| "loss": 1.0261, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.16520226001739502, |
| "learning_rate": 0.00016806578290006225, |
| "loss": 1.0307, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.16199736297130585, |
| "learning_rate": 0.00016792016614269924, |
| "loss": 0.9764, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.16184571385383606, |
| "learning_rate": 0.0001677742815392012, |
| "loss": 0.9958, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.16386933624744415, |
| "learning_rate": 0.00016762812966487044, |
| "loss": 1.0221, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.17046724259853363, |
| "learning_rate": 0.00016748171109606328, |
| "loss": 1.029, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 0.1638820469379425, |
| "learning_rate": 0.00016733502641018766, |
| "loss": 1.0175, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.16480222344398499, |
| "learning_rate": 0.00016718807618570106, |
| "loss": 1.033, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.1661783903837204, |
| "learning_rate": 0.00016704086100210815, |
| "loss": 0.9379, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.15570427477359772, |
| "learning_rate": 0.00016689338143995833, |
| "loss": 0.9877, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.170819491147995, |
| "learning_rate": 0.00016674563808084377, |
| "loss": 1.0738, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.16349053382873535, |
| "learning_rate": 0.00016659763150739677, |
| "loss": 0.9474, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.1703306883573532, |
| "learning_rate": 0.0001664493623032877, |
| "loss": 1.054, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.1705269068479538, |
| "learning_rate": 0.00016630083105322266, |
| "loss": 1.0175, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.15883858501911163, |
| "learning_rate": 0.00016615203834294119, |
| "loss": 1.0414, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 0.17120327055454254, |
| "learning_rate": 0.00016600298475921365, |
| "loss": 1.0222, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 0.1668461114168167, |
| "learning_rate": 0.00016585367088983946, |
| "loss": 0.9212, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 0.178915336728096, |
| "learning_rate": 0.00016570409732364437, |
| "loss": 1.0167, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 0.171407088637352, |
| "learning_rate": 0.00016555426465047823, |
| "loss": 0.9693, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.1687992811203003, |
| "learning_rate": 0.0001654041734612127, |
| "loss": 1.0257, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.17136409878730774, |
| "learning_rate": 0.00016525382434773894, |
| "loss": 0.9874, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.1806887686252594, |
| "learning_rate": 0.00016510321790296525, |
| "loss": 1.0684, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.17648373544216156, |
| "learning_rate": 0.00016495235472081468, |
| "loss": 0.9867, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.17426486313343048, |
| "learning_rate": 0.00016480123539622281, |
| "loss": 1.0439, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.17550793290138245, |
| "learning_rate": 0.0001646498605251352, |
| "loss": 1.0127, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.1805875450372696, |
| "learning_rate": 0.00016449823070450531, |
| "loss": 1.0317, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.17466574907302856, |
| "learning_rate": 0.00016434634653229199, |
| "loss": 0.9713, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.16918793320655823, |
| "learning_rate": 0.00016419420860745699, |
| "loss": 1.0376, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.16672617197036743, |
| "learning_rate": 0.00016404181752996289, |
| "loss": 0.9211, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.17270368337631226, |
| "learning_rate": 0.00016388917390077054, |
| "loss": 0.987, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.16792818903923035, |
| "learning_rate": 0.0001637362783218368, |
| "loss": 0.9782, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.1800449639558792, |
| "learning_rate": 0.00016358313139611195, |
| "loss": 0.9747, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.17128407955169678, |
| "learning_rate": 0.0001634297337275376, |
| "loss": 1.0312, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.17059966921806335, |
| "learning_rate": 0.0001632760859210442, |
| "loss": 1.0075, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 0.18244986236095428, |
| "learning_rate": 0.0001631221885825485, |
| "loss": 1.0161, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.17219580709934235, |
| "learning_rate": 0.00016296804231895142, |
| "loss": 1.0105, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.1736789494752884, |
| "learning_rate": 0.0001628136477381354, |
| "loss": 1.0128, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.2108864039182663, |
| "learning_rate": 0.00016265900544896225, |
| "loss": 0.9926, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.16976673901081085, |
| "learning_rate": 0.00016250411606127054, |
| "loss": 0.9633, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.1719416379928589, |
| "learning_rate": 0.00016234898018587337, |
| "loss": 1.0222, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.17205439507961273, |
| "learning_rate": 0.00016219359843455577, |
| "loss": 1.0328, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.17340464890003204, |
| "learning_rate": 0.0001620379714200725, |
| "loss": 0.9781, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.17654834687709808, |
| "learning_rate": 0.00016188209975614542, |
| "loss": 1.0151, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 0.17264829576015472, |
| "learning_rate": 0.00016172598405746124, |
| "loss": 0.9525, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 0.16847053170204163, |
| "learning_rate": 0.00016156962493966908, |
| "loss": 0.9202, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 0.18013043701648712, |
| "learning_rate": 0.00016141302301937786, |
| "loss": 1.0383, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 0.17866036295890808, |
| "learning_rate": 0.0001612561789141541, |
| "loss": 0.9682, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.17272624373435974, |
| "learning_rate": 0.0001610990932425194, |
| "loss": 1.0254, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.18053527176380157, |
| "learning_rate": 0.00016094176662394792, |
| "loss": 1.0435, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.17645591497421265, |
| "learning_rate": 0.00016078419967886402, |
| "loss": 0.9929, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.17896148562431335, |
| "learning_rate": 0.00016062639302863986, |
| "loss": 0.9597, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.1784675121307373, |
| "learning_rate": 0.0001604683472955928, |
| "loss": 0.9877, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.18384787440299988, |
| "learning_rate": 0.00016031006310298306, |
| "loss": 0.98, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.17336387932300568, |
| "learning_rate": 0.00016015154107501133, |
| "loss": 0.9813, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.1778045892715454, |
| "learning_rate": 0.00015999278183681604, |
| "loss": 0.9327, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.17641645669937134, |
| "learning_rate": 0.00015983378601447127, |
| "loss": 0.9955, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.18100661039352417, |
| "learning_rate": 0.00015967455423498387, |
| "loss": 1.0304, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.17939269542694092, |
| "learning_rate": 0.0001595150871262914, |
| "loss": 0.9129, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 0.18178121745586395, |
| "learning_rate": 0.00015935538531725927, |
| "loss": 1.0567, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.18156662583351135, |
| "learning_rate": 0.00015919544943767856, |
| "loss": 0.9731, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.18265368044376373, |
| "learning_rate": 0.00015903528011826335, |
| "loss": 1.0253, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.16867631673812866, |
| "learning_rate": 0.00015887487799064838, |
| "loss": 0.967, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.181188702583313, |
| "learning_rate": 0.0001587142436873864, |
| "loss": 1.0113, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.17186175286769867, |
| "learning_rate": 0.00015855337784194577, |
| "loss": 0.9987, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.16855312883853912, |
| "learning_rate": 0.000158392281088708, |
| "loss": 0.9623, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.1724013239145279, |
| "learning_rate": 0.00015823095406296514, |
| "loss": 0.922, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.18288518488407135, |
| "learning_rate": 0.00015806939740091734, |
| "loss": 0.9884, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.17419768869876862, |
| "learning_rate": 0.00015790761173967036, |
| "loss": 0.9246, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.1798882633447647, |
| "learning_rate": 0.00015774559771723298, |
| "loss": 0.9276, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.18484486639499664, |
| "learning_rate": 0.00015758335597251458, |
| "loss": 0.9967, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.17431318759918213, |
| "learning_rate": 0.00015742088714532247, |
| "loss": 0.9672, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.1722385287284851, |
| "learning_rate": 0.00015725819187635968, |
| "loss": 0.9561, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.19427751004695892, |
| "learning_rate": 0.00015709527080722202, |
| "loss": 0.969, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.1689085215330124, |
| "learning_rate": 0.00015693212458039584, |
| "loss": 0.9618, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 0.1696721762418747, |
| "learning_rate": 0.00015676875383925534, |
| "loss": 0.9686, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.17037516832351685, |
| "learning_rate": 0.00015660515922806027, |
| "loss": 0.956, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.17930398881435394, |
| "learning_rate": 0.000156441341391953, |
| "loss": 0.983, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.18172559142112732, |
| "learning_rate": 0.00015627730097695638, |
| "loss": 1.0447, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.24, |
| "eval_loss": 1.0872775316238403, |
| "eval_runtime": 81.628, |
| "eval_samples_per_second": 31.84, |
| "eval_steps_per_second": 31.84, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.179900661110878, |
| "learning_rate": 0.0001561130386299709, |
| "loss": 0.9864, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.1860770583152771, |
| "learning_rate": 0.0001559485549987723, |
| "loss": 0.9963, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.17942041158676147, |
| "learning_rate": 0.00015578385073200895, |
| "loss": 1.0004, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.17420290410518646, |
| "learning_rate": 0.0001556189264791992, |
| "loss": 1.002, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.17478443682193756, |
| "learning_rate": 0.00015545378289072922, |
| "loss": 0.9624, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 0.18624065816402435, |
| "learning_rate": 0.0001552884206178498, |
| "loss": 1.0315, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 0.17450089752674103, |
| "learning_rate": 0.00015512284031267437, |
| "loss": 0.9906, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 0.1746608465909958, |
| "learning_rate": 0.00015495704262817597, |
| "loss": 0.9898, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 0.17796628177165985, |
| "learning_rate": 0.00015479102821818507, |
| "loss": 1.0194, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.17470288276672363, |
| "learning_rate": 0.0001546247977373867, |
| "loss": 0.9309, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.17829464375972748, |
| "learning_rate": 0.000154458351841318, |
| "loss": 1.0141, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.17732754349708557, |
| "learning_rate": 0.00015429169118636566, |
| "loss": 0.9817, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.1795651614665985, |
| "learning_rate": 0.00015412481642976318, |
| "loss": 0.9709, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.17974676191806793, |
| "learning_rate": 0.00015395772822958845, |
| "loss": 1.0243, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.18511098623275757, |
| "learning_rate": 0.0001537904272447611, |
| "loss": 1.0001, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.1780577152967453, |
| "learning_rate": 0.00015362291413503984, |
| "loss": 0.9829, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.17798136174678802, |
| "learning_rate": 0.0001534551895610199, |
| "loss": 0.9659, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 0.1870565563440323, |
| "learning_rate": 0.00015328725418413045, |
| "loss": 0.9749, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 0.18744368851184845, |
| "learning_rate": 0.00015311910866663196, |
| "loss": 1.015, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 0.18052896857261658, |
| "learning_rate": 0.00015295075367161367, |
| "loss": 1.0313, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 0.1779204159975052, |
| "learning_rate": 0.00015278218986299074, |
| "loss": 0.9496, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.1824800670146942, |
| "learning_rate": 0.00015261341790550196, |
| "loss": 1.0281, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.19057531654834747, |
| "learning_rate": 0.0001524444384647069, |
| "loss": 1.0271, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.19244614243507385, |
| "learning_rate": 0.0001522752522069833, |
| "loss": 0.9907, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.17696735262870789, |
| "learning_rate": 0.0001521058597995246, |
| "loss": 0.9331, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 0.17268431186676025, |
| "learning_rate": 0.00015193626191033712, |
| "loss": 0.9427, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 0.18662290275096893, |
| "learning_rate": 0.0001517664592082375, |
| "loss": 1.0074, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 0.17090214788913727, |
| "learning_rate": 0.0001515964523628501, |
| "loss": 0.9608, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 0.1795254349708557, |
| "learning_rate": 0.00015142624204460435, |
| "loss": 0.9439, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 0.18272066116333008, |
| "learning_rate": 0.00015125582892473204, |
| "loss": 0.9828, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 0.2021034061908722, |
| "learning_rate": 0.00015108521367526479, |
| "loss": 1.0375, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 0.18685071170330048, |
| "learning_rate": 0.00015091439696903115, |
| "loss": 1.0026, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 0.17936167120933533, |
| "learning_rate": 0.00015074337947965435, |
| "loss": 0.9296, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 0.18303433060646057, |
| "learning_rate": 0.00015057216188154928, |
| "loss": 0.9416, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 0.18212522566318512, |
| "learning_rate": 0.00015040074484992, |
| "loss": 0.9812, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 0.17352260649204254, |
| "learning_rate": 0.00015022912906075702, |
| "loss": 0.9766, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 0.17948494851589203, |
| "learning_rate": 0.0001500573151908347, |
| "loss": 1.006, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.18391214311122894, |
| "learning_rate": 0.00014988530391770856, |
| "loss": 1.0484, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.1719055324792862, |
| "learning_rate": 0.00014971309591971252, |
| "loss": 0.964, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.1985386312007904, |
| "learning_rate": 0.00014954069187595633, |
| "loss": 1.0035, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.18530823290348053, |
| "learning_rate": 0.0001493680924663228, |
| "loss": 1.0089, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.18150845170021057, |
| "learning_rate": 0.00014919529837146528, |
| "loss": 1.0586, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.19130894541740417, |
| "learning_rate": 0.00014902231027280486, |
| "loss": 1.0152, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.1798924058675766, |
| "learning_rate": 0.0001488491288525275, |
| "loss": 0.9548, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.17213404178619385, |
| "learning_rate": 0.0001486757547935818, |
| "loss": 1.0226, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.36, |
| "grad_norm": 0.18383356928825378, |
| "learning_rate": 0.0001485021887796759, |
| "loss": 1.0291, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.36, |
| "grad_norm": 0.19143284857273102, |
| "learning_rate": 0.0001483284314952749, |
| "loss": 1.0055, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.36, |
| "grad_norm": 0.19124020636081696, |
| "learning_rate": 0.00014815448362559826, |
| "loss": 1.0231, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.36, |
| "grad_norm": 0.18096496164798737, |
| "learning_rate": 0.00014798034585661695, |
| "loss": 1.0152, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 0.17621304094791412, |
| "learning_rate": 0.00014780601887505088, |
| "loss": 0.9718, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 0.18995219469070435, |
| "learning_rate": 0.00014763150336836604, |
| "loss": 1.0052, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 0.19126906991004944, |
| "learning_rate": 0.00014745680002477203, |
| "loss": 0.9409, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 0.17537294328212738, |
| "learning_rate": 0.00014728190953321903, |
| "loss": 1.0021, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 0.18963244557380676, |
| "learning_rate": 0.00014710683258339536, |
| "loss": 1.0154, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 0.17940685153007507, |
| "learning_rate": 0.00014693156986572456, |
| "loss": 0.9898, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 0.19598953425884247, |
| "learning_rate": 0.0001467561220713628, |
| "loss": 1.0479, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 0.18346156179904938, |
| "learning_rate": 0.00014658048989219614, |
| "loss": 1.0076, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 0.17553867399692535, |
| "learning_rate": 0.0001464046740208377, |
| "loss": 0.9696, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 0.1788376122713089, |
| "learning_rate": 0.00014622867515062503, |
| "loss": 0.9788, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 0.17731797695159912, |
| "learning_rate": 0.00014605249397561736, |
| "loss": 1.003, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.39, |
| "grad_norm": 0.17706608772277832, |
| "learning_rate": 0.00014587613119059284, |
| "loss": 1.0055, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.168448805809021, |
| "learning_rate": 0.00014569958749104575, |
| "loss": 0.9516, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.18675707280635834, |
| "learning_rate": 0.0001455228635731839, |
| "loss": 0.9837, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.17538242042064667, |
| "learning_rate": 0.00014534596013392575, |
| "loss": 1.0367, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.17501141130924225, |
| "learning_rate": 0.00014516887787089774, |
| "loss": 0.9733, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 0.1874341070652008, |
| "learning_rate": 0.00014499161748243147, |
| "loss": 1.0206, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 0.1980811208486557, |
| "learning_rate": 0.00014481417966756102, |
| "loss": 1.0289, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 0.18807095289230347, |
| "learning_rate": 0.0001446365651260201, |
| "loss": 1.0205, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 0.1855577528476715, |
| "learning_rate": 0.00014445877455823946, |
| "loss": 1.0497, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 0.18725629150867462, |
| "learning_rate": 0.00014428080866534396, |
| "loss": 1.0326, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 0.19902606308460236, |
| "learning_rate": 0.0001441026681491498, |
| "loss": 1.0252, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 0.19441325962543488, |
| "learning_rate": 0.00014392435371216185, |
| "loss": 1.0191, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 0.18167538940906525, |
| "learning_rate": 0.00014374586605757095, |
| "loss": 1.029, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 0.1809268742799759, |
| "learning_rate": 0.0001435672058892509, |
| "loss": 0.975, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 0.18132343888282776, |
| "learning_rate": 0.00014338837391175582, |
| "loss": 0.9688, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 0.1733206808567047, |
| "learning_rate": 0.00014320937083031748, |
| "loss": 0.958, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 0.1799648404121399, |
| "learning_rate": 0.00014303019735084226, |
| "loss": 0.9842, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 0.1771499365568161, |
| "learning_rate": 0.0001428508541799086, |
| "loss": 1.0048, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 0.1818363070487976, |
| "learning_rate": 0.00014267134202476417, |
| "loss": 1.0374, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 0.1858426034450531, |
| "learning_rate": 0.0001424916615933229, |
| "loss": 0.9952, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 0.19056333601474762, |
| "learning_rate": 0.00014231181359416247, |
| "loss": 1.0125, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.179644376039505, |
| "learning_rate": 0.00014213179873652127, |
| "loss": 0.9194, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.177077516913414, |
| "learning_rate": 0.0001419516177302957, |
| "loss": 0.991, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.18390731513500214, |
| "learning_rate": 0.00014177127128603745, |
| "loss": 0.9921, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.1845334768295288, |
| "learning_rate": 0.00014159076011495061, |
| "loss": 0.993, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 0.1941182017326355, |
| "learning_rate": 0.0001414100849288888, |
| "loss": 0.9864, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 0.17679093778133392, |
| "learning_rate": 0.00014122924644035249, |
| "loss": 1.0078, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 0.1847458928823471, |
| "learning_rate": 0.00014104824536248614, |
| "loss": 1.0043, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 0.1811904013156891, |
| "learning_rate": 0.00014086708240907542, |
| "loss": 0.9493, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 0.18393242359161377, |
| "learning_rate": 0.00014068575829454436, |
| "loss": 1.0019, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 0.17711445689201355, |
| "learning_rate": 0.0001405042737339524, |
| "loss": 0.9666, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 0.18920022249221802, |
| "learning_rate": 0.00014032262944299194, |
| "loss": 0.9579, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 0.18185077607631683, |
| "learning_rate": 0.00014014082613798503, |
| "loss": 1.0523, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 0.19337935745716095, |
| "learning_rate": 0.00013995886453588104, |
| "loss": 0.9841, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 0.1859455108642578, |
| "learning_rate": 0.00013977674535425337, |
| "loss": 1.0389, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 0.17890392243862152, |
| "learning_rate": 0.00013959446931129704, |
| "loss": 1.0308, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 0.1741844266653061, |
| "learning_rate": 0.00013941203712582553, |
| "loss": 1.0466, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 0.19279837608337402, |
| "learning_rate": 0.0001392294495172681, |
| "loss": 0.9952, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 0.19602486491203308, |
| "learning_rate": 0.00013904670720566698, |
| "loss": 1.0273, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 0.18000701069831848, |
| "learning_rate": 0.0001388638109116744, |
| "loss": 1.0131, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.49, |
| "eval_loss": 1.080866813659668, |
| "eval_runtime": 81.6407, |
| "eval_samples_per_second": 31.835, |
| "eval_steps_per_second": 31.835, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 0.18183240294456482, |
| "learning_rate": 0.0001386807613565499, |
| "loss": 0.9962, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.1762516349554062, |
| "learning_rate": 0.00013849755926215735, |
| "loss": 1.0288, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.17683060467243195, |
| "learning_rate": 0.00013831420535096223, |
| "loss": 0.9464, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.1796884983778, |
| "learning_rate": 0.00013813070034602863, |
| "loss": 1.0294, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.1921350210905075, |
| "learning_rate": 0.00013794704497101655, |
| "loss": 1.0216, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 0.18306772410869598, |
| "learning_rate": 0.00013776323995017898, |
| "loss": 1.0552, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 0.18202297389507294, |
| "learning_rate": 0.000137579286008359, |
| "loss": 0.9735, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 0.18103723227977753, |
| "learning_rate": 0.00013739518387098705, |
| "loss": 0.9673, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 0.17903882265090942, |
| "learning_rate": 0.0001372109342640779, |
| "loss": 0.9405, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 0.18169891834259033, |
| "learning_rate": 0.0001370265379142279, |
| "loss": 0.9595, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 0.18569333851337433, |
| "learning_rate": 0.00013684199554861207, |
| "loss": 0.9859, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 0.18026390671730042, |
| "learning_rate": 0.0001366573078949813, |
| "loss": 0.9804, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 0.18330590426921844, |
| "learning_rate": 0.00013647247568165938, |
| "loss": 0.9623, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 0.18787868320941925, |
| "learning_rate": 0.00013628749963754026, |
| "loss": 0.977, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 0.17502212524414062, |
| "learning_rate": 0.00013610238049208495, |
| "loss": 0.9615, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 0.18354558944702148, |
| "learning_rate": 0.0001359171189753189, |
| "loss": 0.9493, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 0.18860042095184326, |
| "learning_rate": 0.00013573171581782897, |
| "loss": 1.0698, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 0.1900940239429474, |
| "learning_rate": 0.00013554617175076062, |
| "loss": 0.961, |
| "step": 618 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 0.18823568522930145, |
| "learning_rate": 0.00013536048750581494, |
| "loss": 0.9106, |
| "step": 619 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 0.18658524751663208, |
| "learning_rate": 0.0001351746638152458, |
| "loss": 0.9161, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 0.18179596960544586, |
| "learning_rate": 0.00013498870141185712, |
| "loss": 0.9394, |
| "step": 621 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.18801775574684143, |
| "learning_rate": 0.00013480260102899966, |
| "loss": 0.9827, |
| "step": 622 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.18649117648601532, |
| "learning_rate": 0.00013461636340056843, |
| "loss": 0.9565, |
| "step": 623 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.1857774257659912, |
| "learning_rate": 0.0001344299892609996, |
| "loss": 1.0292, |
| "step": 624 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.1910741627216339, |
| "learning_rate": 0.00013424347934526772, |
| "loss": 1.0411, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 0.19100044667720795, |
| "learning_rate": 0.00013405683438888282, |
| "loss": 1.0071, |
| "step": 626 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 0.17907825112342834, |
| "learning_rate": 0.00013387005512788733, |
| "loss": 1.0374, |
| "step": 627 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 0.1795564442873001, |
| "learning_rate": 0.00013368314229885347, |
| "loss": 1.0094, |
| "step": 628 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 0.17529642581939697, |
| "learning_rate": 0.00013349609663888015, |
| "loss": 0.9316, |
| "step": 629 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 0.18285749852657318, |
| "learning_rate": 0.00013330891888559002, |
| "loss": 0.9878, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 0.18477262556552887, |
| "learning_rate": 0.00013312160977712668, |
| "loss": 1.0027, |
| "step": 631 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 0.1869228482246399, |
| "learning_rate": 0.00013293417005215188, |
| "loss": 1.0269, |
| "step": 632 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 0.19262288510799408, |
| "learning_rate": 0.00013274660044984224, |
| "loss": 1.0839, |
| "step": 633 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 0.18182508647441864, |
| "learning_rate": 0.0001325589017098867, |
| "loss": 0.9953, |
| "step": 634 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 0.21832676231861115, |
| "learning_rate": 0.0001323710745724834, |
| "loss": 1.028, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 0.18413691222667694, |
| "learning_rate": 0.00013218311977833687, |
| "loss": 1.0081, |
| "step": 636 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 0.182253897190094, |
| "learning_rate": 0.00013199503806865504, |
| "loss": 0.9492, |
| "step": 637 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 0.19804389774799347, |
| "learning_rate": 0.0001318068301851463, |
| "loss": 0.9859, |
| "step": 638 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 0.1846335232257843, |
| "learning_rate": 0.00013161849687001666, |
| "loss": 0.9594, |
| "step": 639 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 0.18544115126132965, |
| "learning_rate": 0.00013143003886596669, |
| "loss": 1.0116, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 0.1846534013748169, |
| "learning_rate": 0.00013124145691618884, |
| "loss": 1.0081, |
| "step": 641 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.17868997156620026, |
| "learning_rate": 0.0001310527517643642, |
| "loss": 0.9044, |
| "step": 642 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.18729160726070404, |
| "learning_rate": 0.00013086392415465972, |
| "loss": 0.9888, |
| "step": 643 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.1919986605644226, |
| "learning_rate": 0.00013067497483172538, |
| "loss": 1.0277, |
| "step": 644 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.20795708894729614, |
| "learning_rate": 0.00013048590454069108, |
| "loss": 0.8709, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 0.19611623883247375, |
| "learning_rate": 0.00013029671402716366, |
| "loss": 0.984, |
| "step": 646 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 0.19515739381313324, |
| "learning_rate": 0.0001301074040372242, |
| "loss": 0.9985, |
| "step": 647 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 0.1995517462491989, |
| "learning_rate": 0.00012991797531742492, |
| "loss": 1.034, |
| "step": 648 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 0.18805646896362305, |
| "learning_rate": 0.00012972842861478618, |
| "loss": 0.9625, |
| "step": 649 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 0.19192944467067719, |
| "learning_rate": 0.00012953876467679373, |
| "loss": 1.0583, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 0.19570088386535645, |
| "learning_rate": 0.0001293489842513955, |
| "loss": 0.9634, |
| "step": 651 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 0.19576574862003326, |
| "learning_rate": 0.00012915908808699893, |
| "loss": 1.0172, |
| "step": 652 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 0.17955078184604645, |
| "learning_rate": 0.0001289690769324678, |
| "loss": 0.9849, |
| "step": 653 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.18549513816833496, |
| "learning_rate": 0.00012877895153711935, |
| "loss": 0.9527, |
| "step": 654 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.19443288445472717, |
| "learning_rate": 0.0001285887126507214, |
| "loss": 1.0151, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.17947880923748016, |
| "learning_rate": 0.00012839836102348926, |
| "loss": 0.9655, |
| "step": 656 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.18537116050720215, |
| "learning_rate": 0.00012820789740608293, |
| "loss": 0.9429, |
| "step": 657 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 0.19015100598335266, |
| "learning_rate": 0.00012801732254960388, |
| "loss": 1.0355, |
| "step": 658 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 0.18511660397052765, |
| "learning_rate": 0.00012782663720559246, |
| "loss": 1.0473, |
| "step": 659 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 0.18822525441646576, |
| "learning_rate": 0.00012763584212602453, |
| "loss": 0.9671, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 0.18707570433616638, |
| "learning_rate": 0.0001274449380633089, |
| "loss": 1.0481, |
| "step": 661 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.1918199360370636, |
| "learning_rate": 0.00012725392577028402, |
| "loss": 1.0062, |
| "step": 662 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.19667948782444, |
| "learning_rate": 0.00012706280600021522, |
| "loss": 0.9817, |
| "step": 663 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.1822723001241684, |
| "learning_rate": 0.0001268715795067916, |
| "loss": 0.9716, |
| "step": 664 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.1914030760526657, |
| "learning_rate": 0.00012668024704412317, |
| "loss": 1.0209, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 0.187057226896286, |
| "learning_rate": 0.00012648880936673787, |
| "loss": 1.0381, |
| "step": 666 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 0.18619103729724884, |
| "learning_rate": 0.00012629726722957846, |
| "loss": 1.0432, |
| "step": 667 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 0.19731828570365906, |
| "learning_rate": 0.00012610562138799978, |
| "loss": 1.0611, |
| "step": 668 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 0.1894959807395935, |
| "learning_rate": 0.00012591387259776551, |
| "loss": 0.9914, |
| "step": 669 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 0.1772470325231552, |
| "learning_rate": 0.00012572202161504543, |
| "loss": 0.9843, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 0.18182332813739777, |
| "learning_rate": 0.00012553006919641214, |
| "loss": 0.949, |
| "step": 671 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 0.1846974790096283, |
| "learning_rate": 0.00012533801609883842, |
| "loss": 0.9959, |
| "step": 672 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 0.18767496943473816, |
| "learning_rate": 0.0001251458630796941, |
| "loss": 0.9466, |
| "step": 673 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 0.18881787359714508, |
| "learning_rate": 0.00012495361089674285, |
| "loss": 0.9637, |
| "step": 674 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 0.1902247816324234, |
| "learning_rate": 0.00012476126030813963, |
| "loss": 0.9985, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.68, |
| "grad_norm": 0.18302756547927856, |
| "learning_rate": 0.00012456881207242732, |
| "loss": 0.95, |
| "step": 676 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 0.18244938552379608, |
| "learning_rate": 0.000124376266948534, |
| "loss": 0.9918, |
| "step": 677 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 0.19507256150245667, |
| "learning_rate": 0.00012418362569576965, |
| "loss": 1.0055, |
| "step": 678 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 0.19234226644039154, |
| "learning_rate": 0.0001239908890738235, |
| "loss": 1.0511, |
| "step": 679 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 0.19556111097335815, |
| "learning_rate": 0.00012379805784276082, |
| "loss": 0.9981, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.19322308897972107, |
| "learning_rate": 0.00012360513276301997, |
| "loss": 0.9603, |
| "step": 681 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.1905602067708969, |
| "learning_rate": 0.0001234121145954094, |
| "loss": 0.9937, |
| "step": 682 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.19340857863426208, |
| "learning_rate": 0.00012321900410110464, |
| "loss": 0.9996, |
| "step": 683 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.181385800242424, |
| "learning_rate": 0.00012302580204164541, |
| "loss": 0.9563, |
| "step": 684 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 0.19400039315223694, |
| "learning_rate": 0.00012283250917893244, |
| "loss": 1.0732, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 0.1877606064081192, |
| "learning_rate": 0.0001226391262752245, |
| "loss": 1.0057, |
| "step": 686 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 0.18977177143096924, |
| "learning_rate": 0.00012244565409313547, |
| "loss": 0.9898, |
| "step": 687 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 0.19174890220165253, |
| "learning_rate": 0.00012225209339563145, |
| "loss": 1.0449, |
| "step": 688 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 0.18353353440761566, |
| "learning_rate": 0.0001220584449460274, |
| "loss": 0.9952, |
| "step": 689 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 0.18639762699604034, |
| "learning_rate": 0.00012186470950798445, |
| "loss": 0.9693, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 0.1900029480457306, |
| "learning_rate": 0.00012167088784550673, |
| "loss": 0.9574, |
| "step": 691 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 0.18529686331748962, |
| "learning_rate": 0.00012147698072293842, |
| "loss": 1.0299, |
| "step": 692 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 0.1907936930656433, |
| "learning_rate": 0.00012128298890496072, |
| "loss": 0.9557, |
| "step": 693 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 0.1865403652191162, |
| "learning_rate": 0.00012108891315658879, |
| "loss": 0.946, |
| "step": 694 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 0.18556007742881775, |
| "learning_rate": 0.00012089475424316883, |
| "loss": 1.0129, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 0.1845078021287918, |
| "learning_rate": 0.00012070051293037492, |
| "loss": 0.9436, |
| "step": 696 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 0.18208341300487518, |
| "learning_rate": 0.00012050618998420624, |
| "loss": 0.9985, |
| "step": 697 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 0.19252164661884308, |
| "learning_rate": 0.00012031178617098371, |
| "loss": 1.0147, |
| "step": 698 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 0.1972821056842804, |
| "learning_rate": 0.00012011730225734723, |
| "loss": 1.0548, |
| "step": 699 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 0.18477863073349, |
| "learning_rate": 0.00011992273901025269, |
| "loss": 0.9847, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.74, |
| "eval_loss": 1.0762046575546265, |
| "eval_runtime": 81.6492, |
| "eval_samples_per_second": 31.831, |
| "eval_steps_per_second": 31.831, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.19482113420963287, |
| "learning_rate": 0.00011972809719696864, |
| "loss": 0.9685, |
| "step": 701 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.19040922820568085, |
| "learning_rate": 0.0001195333775850736, |
| "loss": 1.0528, |
| "step": 702 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.19116735458374023, |
| "learning_rate": 0.00011933858094245281, |
| "loss": 0.983, |
| "step": 703 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.17496508359909058, |
| "learning_rate": 0.00011914370803729533, |
| "loss": 0.936, |
| "step": 704 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.1774684637784958, |
| "learning_rate": 0.00011894875963809098, |
| "loss": 1.001, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.1926085203886032, |
| "learning_rate": 0.00011875373651362727, |
| "loss": 1.0406, |
| "step": 706 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.18313874304294586, |
| "learning_rate": 0.00011855863943298631, |
| "loss": 0.9501, |
| "step": 707 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.18082866072654724, |
| "learning_rate": 0.00011836346916554205, |
| "loss": 0.9773, |
| "step": 708 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 0.1892704963684082, |
| "learning_rate": 0.00011816822648095687, |
| "loss": 0.9879, |
| "step": 709 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 0.1928127110004425, |
| "learning_rate": 0.00011797291214917881, |
| "loss": 1.0106, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 0.191785529255867, |
| "learning_rate": 0.00011777752694043849, |
| "loss": 0.9633, |
| "step": 711 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 0.18815581500530243, |
| "learning_rate": 0.00011758207162524598, |
| "loss": 1.0087, |
| "step": 712 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.19140002131462097, |
| "learning_rate": 0.00011738654697438782, |
| "loss": 1.022, |
| "step": 713 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.18412011861801147, |
| "learning_rate": 0.00011719095375892396, |
| "loss": 0.9177, |
| "step": 714 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.19803179800510406, |
| "learning_rate": 0.00011699529275018484, |
| "loss": 1.056, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.18873557448387146, |
| "learning_rate": 0.00011679956471976814, |
| "loss": 0.9664, |
| "step": 716 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.1954958438873291, |
| "learning_rate": 0.00011660377043953588, |
| "loss": 0.9837, |
| "step": 717 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.1911032795906067, |
| "learning_rate": 0.0001164079106816113, |
| "loss": 1.0281, |
| "step": 718 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.19415371119976044, |
| "learning_rate": 0.00011621198621837593, |
| "loss": 0.9596, |
| "step": 719 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.1977900266647339, |
| "learning_rate": 0.00011601599782246646, |
| "loss": 0.9503, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.1874951422214508, |
| "learning_rate": 0.0001158199462667716, |
| "loss": 1.0024, |
| "step": 721 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.1944780796766281, |
| "learning_rate": 0.00011562383232442926, |
| "loss": 0.9805, |
| "step": 722 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.18960687518119812, |
| "learning_rate": 0.00011542765676882325, |
| "loss": 1.0155, |
| "step": 723 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.1834162324666977, |
| "learning_rate": 0.0001152314203735805, |
| "loss": 0.9558, |
| "step": 724 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 0.1892080008983612, |
| "learning_rate": 0.00011503512391256776, |
| "loss": 1.0202, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 0.19285555183887482, |
| "learning_rate": 0.00011483876815988867, |
| "loss": 0.986, |
| "step": 726 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 0.1912676841020584, |
| "learning_rate": 0.00011464235388988067, |
| "loss": 1.0215, |
| "step": 727 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 0.18774007260799408, |
| "learning_rate": 0.00011444588187711205, |
| "loss": 0.9133, |
| "step": 728 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 0.18041113018989563, |
| "learning_rate": 0.0001142493528963787, |
| "loss": 0.9651, |
| "step": 729 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 0.18634317815303802, |
| "learning_rate": 0.00011405276772270126, |
| "loss": 1.0167, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 0.18424159288406372, |
| "learning_rate": 0.0001138561271313219, |
| "loss": 0.9602, |
| "step": 731 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 0.18384714424610138, |
| "learning_rate": 0.0001136594318977014, |
| "loss": 0.9298, |
| "step": 732 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 0.19117358326911926, |
| "learning_rate": 0.00011346268279751595, |
| "loss": 0.9123, |
| "step": 733 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 0.18405017256736755, |
| "learning_rate": 0.0001132658806066542, |
| "loss": 0.9986, |
| "step": 734 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 0.1914985477924347, |
| "learning_rate": 0.00011306902610121419, |
| "loss": 0.9518, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 0.1904747486114502, |
| "learning_rate": 0.00011287212005750024, |
| "loss": 0.9891, |
| "step": 736 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 0.1916552037000656, |
| "learning_rate": 0.00011267516325201985, |
| "loss": 0.9616, |
| "step": 737 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 0.18625429272651672, |
| "learning_rate": 0.00011247815646148087, |
| "loss": 0.9592, |
| "step": 738 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 0.1944790482521057, |
| "learning_rate": 0.00011228110046278808, |
| "loss": 0.9469, |
| "step": 739 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 0.20122645795345306, |
| "learning_rate": 0.00011208399603304047, |
| "loss": 0.9849, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.19067947566509247, |
| "learning_rate": 0.00011188684394952789, |
| "loss": 1.0099, |
| "step": 741 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.18489985167980194, |
| "learning_rate": 0.00011168964498972818, |
| "loss": 0.9669, |
| "step": 742 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.1892281025648117, |
| "learning_rate": 0.00011149239993130403, |
| "loss": 0.9674, |
| "step": 743 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.1811356395483017, |
| "learning_rate": 0.00011129510955209996, |
| "loss": 1.0119, |
| "step": 744 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 0.19581769406795502, |
| "learning_rate": 0.00011109777463013915, |
| "loss": 0.9978, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 0.19298292696475983, |
| "learning_rate": 0.00011090039594362045, |
| "loss": 0.9971, |
| "step": 746 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 0.1880626529455185, |
| "learning_rate": 0.00011070297427091534, |
| "loss": 1.0108, |
| "step": 747 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 0.1833215206861496, |
| "learning_rate": 0.00011050551039056479, |
| "loss": 0.9353, |
| "step": 748 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 0.18261606991291046, |
| "learning_rate": 0.0001103080050812762, |
| "loss": 0.9607, |
| "step": 749 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 0.1790233999490738, |
| "learning_rate": 0.00011011045912192035, |
| "loss": 0.9579, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 0.20333704352378845, |
| "learning_rate": 0.00010991287329152838, |
| "loss": 1.0136, |
| "step": 751 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 0.18839126825332642, |
| "learning_rate": 0.0001097152483692886, |
| "loss": 0.992, |
| "step": 752 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 0.1932857632637024, |
| "learning_rate": 0.00010951758513454351, |
| "loss": 0.9098, |
| "step": 753 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 0.19326822459697723, |
| "learning_rate": 0.00010931988436678666, |
| "loss": 0.9718, |
| "step": 754 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 0.19290626049041748, |
| "learning_rate": 0.00010912214684565967, |
| "loss": 0.9569, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 0.1982078105211258, |
| "learning_rate": 0.00010892437335094912, |
| "loss": 0.929, |
| "step": 756 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 0.18881501257419586, |
| "learning_rate": 0.00010872656466258328, |
| "loss": 1.0139, |
| "step": 757 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 0.18985024094581604, |
| "learning_rate": 0.00010852872156062946, |
| "loss": 0.9946, |
| "step": 758 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 0.19749155640602112, |
| "learning_rate": 0.00010833084482529048, |
| "loss": 1.0356, |
| "step": 759 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 0.19211384654045105, |
| "learning_rate": 0.00010813293523690191, |
| "loss": 0.9779, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.19262412190437317, |
| "learning_rate": 0.0001079349935759288, |
| "loss": 0.9665, |
| "step": 761 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.18871724605560303, |
| "learning_rate": 0.00010773702062296273, |
| "loss": 0.9511, |
| "step": 762 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.18119603395462036, |
| "learning_rate": 0.00010753901715871866, |
| "loss": 0.9482, |
| "step": 763 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.18349209427833557, |
| "learning_rate": 0.00010734098396403192, |
| "loss": 0.9386, |
| "step": 764 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 0.19208337366580963, |
| "learning_rate": 0.00010714292181985498, |
| "loss": 0.9473, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 0.18588630855083466, |
| "learning_rate": 0.00010694483150725458, |
| "loss": 1.0278, |
| "step": 766 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 0.18634718656539917, |
| "learning_rate": 0.00010674671380740851, |
| "loss": 1.0387, |
| "step": 767 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 0.18514113128185272, |
| "learning_rate": 0.00010654856950160253, |
| "loss": 0.9557, |
| "step": 768 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.18085001409053802, |
| "learning_rate": 0.00010635039937122733, |
| "loss": 0.9689, |
| "step": 769 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.18852289021015167, |
| "learning_rate": 0.00010615220419777548, |
| "loss": 1.0444, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.19260498881340027, |
| "learning_rate": 0.00010595398476283827, |
| "loss": 0.9204, |
| "step": 771 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.19677571952342987, |
| "learning_rate": 0.00010575574184810269, |
| "loss": 1.0183, |
| "step": 772 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 0.19709721207618713, |
| "learning_rate": 0.00010555747623534831, |
| "loss": 1.011, |
| "step": 773 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 0.18773804605007172, |
| "learning_rate": 0.0001053591887064442, |
| "loss": 0.9834, |
| "step": 774 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 0.19036594033241272, |
| "learning_rate": 0.0001051608800433459, |
| "loss": 0.9657, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 0.1866806596517563, |
| "learning_rate": 0.00010496255102809223, |
| "loss": 0.9609, |
| "step": 776 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 0.1847977638244629, |
| "learning_rate": 0.00010476420244280232, |
| "loss": 0.9814, |
| "step": 777 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 0.19136272370815277, |
| "learning_rate": 0.00010456583506967248, |
| "loss": 1.0256, |
| "step": 778 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 0.22890682518482208, |
| "learning_rate": 0.00010436744969097306, |
| "loss": 0.9979, |
| "step": 779 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 0.19637508690357208, |
| "learning_rate": 0.00010416904708904548, |
| "loss": 0.9841, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.1934499442577362, |
| "learning_rate": 0.000103970628046299, |
| "loss": 0.9251, |
| "step": 781 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.1859968602657318, |
| "learning_rate": 0.00010377219334520783, |
| "loss": 0.9702, |
| "step": 782 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.18776066601276398, |
| "learning_rate": 0.00010357374376830775, |
| "loss": 0.95, |
| "step": 783 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.19182752072811127, |
| "learning_rate": 0.00010337528009819344, |
| "loss": 0.9476, |
| "step": 784 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 0.19188746809959412, |
| "learning_rate": 0.00010317680311751496, |
| "loss": 1.0165, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 0.18225421011447906, |
| "learning_rate": 0.00010297831360897492, |
| "loss": 0.9593, |
| "step": 786 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 0.1944630891084671, |
| "learning_rate": 0.00010277981235532541, |
| "loss": 0.9439, |
| "step": 787 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 0.1944238543510437, |
| "learning_rate": 0.00010258130013936474, |
| "loss": 1.0166, |
| "step": 788 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 0.18848265707492828, |
| "learning_rate": 0.00010238277774393448, |
| "loss": 0.9808, |
| "step": 789 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 0.1884046196937561, |
| "learning_rate": 0.00010218424595191631, |
| "loss": 1.0332, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 0.1906522959470749, |
| "learning_rate": 0.00010198570554622909, |
| "loss": 0.9361, |
| "step": 791 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 0.1847391128540039, |
| "learning_rate": 0.00010178715730982549, |
| "loss": 0.9522, |
| "step": 792 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 0.18664193153381348, |
| "learning_rate": 0.00010158860202568916, |
| "loss": 0.9834, |
| "step": 793 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 0.19117935001850128, |
| "learning_rate": 0.00010139004047683151, |
| "loss": 0.9931, |
| "step": 794 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 0.1847536265850067, |
| "learning_rate": 0.0001011914734462887, |
| "loss": 1.0131, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 0.18716172873973846, |
| "learning_rate": 0.00010099290171711841, |
| "loss": 0.948, |
| "step": 796 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1592, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 398, |
| "total_flos": 5.93824319923028e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|