{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.997496871088861,
  "eval_steps": 500,
  "global_step": 1197,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025031289111389236,
      "grad_norm": 2.131367059642943,
      "learning_rate": 5e-06,
      "loss": 1.0783,
      "step": 10
    },
    {
      "epoch": 0.05006257822277847,
      "grad_norm": 5.240110415023845,
      "learning_rate": 5e-06,
      "loss": 0.9676,
      "step": 20
    },
    {
      "epoch": 0.07509386733416772,
      "grad_norm": 3.8068595828161884,
      "learning_rate": 5e-06,
      "loss": 0.9339,
      "step": 30
    },
    {
      "epoch": 0.10012515644555695,
      "grad_norm": 1.6005118869442196,
      "learning_rate": 5e-06,
      "loss": 0.9069,
      "step": 40
    },
    {
      "epoch": 0.1251564455569462,
      "grad_norm": 1.144980360790589,
      "learning_rate": 5e-06,
      "loss": 0.8873,
      "step": 50
    },
    {
      "epoch": 0.15018773466833543,
      "grad_norm": 0.9988315731444815,
      "learning_rate": 5e-06,
      "loss": 0.8743,
      "step": 60
    },
    {
      "epoch": 0.17521902377972465,
      "grad_norm": 1.1160785754418054,
      "learning_rate": 5e-06,
      "loss": 0.8694,
      "step": 70
    },
    {
      "epoch": 0.2002503128911139,
      "grad_norm": 0.6502972385224608,
      "learning_rate": 5e-06,
      "loss": 0.8554,
      "step": 80
    },
    {
      "epoch": 0.22528160200250313,
      "grad_norm": 1.1342296445805713,
      "learning_rate": 5e-06,
      "loss": 0.8509,
      "step": 90
    },
    {
      "epoch": 0.2503128911138924,
      "grad_norm": 1.7350477074788984,
      "learning_rate": 5e-06,
      "loss": 0.8447,
      "step": 100
    },
    {
      "epoch": 0.2753441802252816,
      "grad_norm": 1.4021943431412354,
      "learning_rate": 5e-06,
      "loss": 0.8397,
      "step": 110
    },
    {
      "epoch": 0.30037546933667086,
      "grad_norm": 0.7783764690541898,
      "learning_rate": 5e-06,
      "loss": 0.8363,
      "step": 120
    },
    {
      "epoch": 0.32540675844806005,
      "grad_norm": 1.054120463662093,
      "learning_rate": 5e-06,
      "loss": 0.8331,
      "step": 130
    },
    {
      "epoch": 0.3504380475594493,
      "grad_norm": 0.7025904608947156,
      "learning_rate": 5e-06,
      "loss": 0.8319,
      "step": 140
    },
    {
      "epoch": 0.37546933667083854,
      "grad_norm": 0.625073338332613,
      "learning_rate": 5e-06,
      "loss": 0.8228,
      "step": 150
    },
    {
      "epoch": 0.4005006257822278,
      "grad_norm": 0.68586294875638,
      "learning_rate": 5e-06,
      "loss": 0.8273,
      "step": 160
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 0.7166203381918367,
      "learning_rate": 5e-06,
      "loss": 0.8271,
      "step": 170
    },
    {
      "epoch": 0.45056320400500627,
      "grad_norm": 0.7727657584916029,
      "learning_rate": 5e-06,
      "loss": 0.8201,
      "step": 180
    },
    {
      "epoch": 0.4755944931163955,
      "grad_norm": 0.6165452607139709,
      "learning_rate": 5e-06,
      "loss": 0.8165,
      "step": 190
    },
    {
      "epoch": 0.5006257822277848,
      "grad_norm": 0.5941915725497845,
      "learning_rate": 5e-06,
      "loss": 0.8165,
      "step": 200
    },
    {
      "epoch": 0.5256570713391739,
      "grad_norm": 0.9044617312024994,
      "learning_rate": 5e-06,
      "loss": 0.814,
      "step": 210
    },
    {
      "epoch": 0.5506883604505632,
      "grad_norm": 0.9023052529604444,
      "learning_rate": 5e-06,
      "loss": 0.8158,
      "step": 220
    },
    {
      "epoch": 0.5757196495619524,
      "grad_norm": 0.8778852029110713,
      "learning_rate": 5e-06,
      "loss": 0.8126,
      "step": 230
    },
    {
      "epoch": 0.6007509386733417,
      "grad_norm": 0.7745768192218916,
      "learning_rate": 5e-06,
      "loss": 0.8094,
      "step": 240
    },
    {
      "epoch": 0.6257822277847309,
      "grad_norm": 0.676919868236649,
      "learning_rate": 5e-06,
      "loss": 0.8081,
      "step": 250
    },
    {
      "epoch": 0.6508135168961201,
      "grad_norm": 0.6457111096534787,
      "learning_rate": 5e-06,
      "loss": 0.8063,
      "step": 260
    },
    {
      "epoch": 0.6758448060075094,
      "grad_norm": 0.6026716036669796,
      "learning_rate": 5e-06,
      "loss": 0.8086,
      "step": 270
    },
    {
      "epoch": 0.7008760951188986,
      "grad_norm": 0.6933525279767419,
      "learning_rate": 5e-06,
      "loss": 0.8006,
      "step": 280
    },
    {
      "epoch": 0.7259073842302879,
      "grad_norm": 0.9470644278270366,
      "learning_rate": 5e-06,
      "loss": 0.8006,
      "step": 290
    },
    {
      "epoch": 0.7509386733416771,
      "grad_norm": 0.9427280269205052,
      "learning_rate": 5e-06,
      "loss": 0.7961,
      "step": 300
    },
    {
      "epoch": 0.7759699624530664,
      "grad_norm": 0.7264873730589139,
      "learning_rate": 5e-06,
      "loss": 0.7989,
      "step": 310
    },
    {
      "epoch": 0.8010012515644556,
      "grad_norm": 0.6143483178764919,
      "learning_rate": 5e-06,
      "loss": 0.7958,
      "step": 320
    },
    {
      "epoch": 0.8260325406758448,
      "grad_norm": 0.8137195556401391,
      "learning_rate": 5e-06,
      "loss": 0.798,
      "step": 330
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.6352119054411591,
      "learning_rate": 5e-06,
      "loss": 0.7979,
      "step": 340
    },
    {
      "epoch": 0.8760951188986232,
      "grad_norm": 0.6463859124240815,
      "learning_rate": 5e-06,
      "loss": 0.796,
      "step": 350
    },
    {
      "epoch": 0.9011264080100125,
      "grad_norm": 0.60316586739445,
      "learning_rate": 5e-06,
      "loss": 0.7927,
      "step": 360
    },
    {
      "epoch": 0.9261576971214017,
      "grad_norm": 0.893894099723059,
      "learning_rate": 5e-06,
      "loss": 0.7954,
      "step": 370
    },
    {
      "epoch": 0.951188986232791,
      "grad_norm": 0.5680880106145626,
      "learning_rate": 5e-06,
      "loss": 0.7892,
      "step": 380
    },
    {
      "epoch": 0.9762202753441802,
      "grad_norm": 0.6238008753559559,
      "learning_rate": 5e-06,
      "loss": 0.7876,
      "step": 390
    },
    {
      "epoch": 0.9987484355444305,
      "eval_loss": 0.7894856929779053,
      "eval_runtime": 272.7742,
      "eval_samples_per_second": 39.461,
      "eval_steps_per_second": 0.62,
      "step": 399
    },
    {
      "epoch": 1.0018773466833542,
      "grad_norm": 1.4020823851876998,
      "learning_rate": 5e-06,
      "loss": 0.8446,
      "step": 400
    },
    {
      "epoch": 1.0269086357947435,
      "grad_norm": 0.9263533629153153,
      "learning_rate": 5e-06,
      "loss": 0.7458,
      "step": 410
    },
    {
      "epoch": 1.0519399249061328,
      "grad_norm": 0.7694418549908636,
      "learning_rate": 5e-06,
      "loss": 0.7468,
      "step": 420
    },
    {
      "epoch": 1.0769712140175218,
      "grad_norm": 0.6223863756395531,
      "learning_rate": 5e-06,
      "loss": 0.7482,
      "step": 430
    },
    {
      "epoch": 1.1020025031289111,
      "grad_norm": 0.6462188205588866,
      "learning_rate": 5e-06,
      "loss": 0.7462,
      "step": 440
    },
    {
      "epoch": 1.1270337922403004,
      "grad_norm": 0.8428401134288817,
      "learning_rate": 5e-06,
      "loss": 0.7451,
      "step": 450
    },
    {
      "epoch": 1.1520650813516897,
      "grad_norm": 0.6127601124586929,
      "learning_rate": 5e-06,
      "loss": 0.7455,
      "step": 460
    },
    {
      "epoch": 1.1770963704630788,
      "grad_norm": 0.6867077491456403,
      "learning_rate": 5e-06,
      "loss": 0.7464,
      "step": 470
    },
    {
      "epoch": 1.202127659574468,
      "grad_norm": 0.655478438583233,
      "learning_rate": 5e-06,
      "loss": 0.7467,
      "step": 480
    },
    {
      "epoch": 1.2271589486858574,
      "grad_norm": 0.6163393565591452,
      "learning_rate": 5e-06,
      "loss": 0.7466,
      "step": 490
    },
    {
      "epoch": 1.2521902377972465,
      "grad_norm": 0.593653632025471,
      "learning_rate": 5e-06,
      "loss": 0.7472,
      "step": 500
    },
    {
      "epoch": 1.2772215269086358,
      "grad_norm": 0.7027928119588218,
      "learning_rate": 5e-06,
      "loss": 0.7483,
      "step": 510
    },
    {
      "epoch": 1.302252816020025,
      "grad_norm": 0.6854728377454722,
      "learning_rate": 5e-06,
      "loss": 0.7435,
      "step": 520
    },
    {
      "epoch": 1.3272841051314144,
      "grad_norm": 1.0781928345187066,
      "learning_rate": 5e-06,
      "loss": 0.7466,
      "step": 530
    },
    {
      "epoch": 1.3523153942428034,
      "grad_norm": 0.6327512087924364,
      "learning_rate": 5e-06,
      "loss": 0.7448,
      "step": 540
    },
    {
      "epoch": 1.3773466833541927,
      "grad_norm": 0.6100108550845114,
      "learning_rate": 5e-06,
      "loss": 0.744,
      "step": 550
    },
    {
      "epoch": 1.402377972465582,
      "grad_norm": 0.7365288494797091,
      "learning_rate": 5e-06,
      "loss": 0.7478,
      "step": 560
    },
    {
      "epoch": 1.4274092615769711,
      "grad_norm": 0.5377128504214334,
      "learning_rate": 5e-06,
      "loss": 0.7439,
      "step": 570
    },
    {
      "epoch": 1.4524405506883604,
      "grad_norm": 0.6741574190963199,
      "learning_rate": 5e-06,
      "loss": 0.7409,
      "step": 580
    },
    {
      "epoch": 1.4774718397997497,
      "grad_norm": 0.6592122959853899,
      "learning_rate": 5e-06,
      "loss": 0.7426,
      "step": 590
    },
    {
      "epoch": 1.502503128911139,
      "grad_norm": 0.8235710348347305,
      "learning_rate": 5e-06,
      "loss": 0.7411,
      "step": 600
    },
    {
      "epoch": 1.5275344180225283,
      "grad_norm": 0.7239324585893679,
      "learning_rate": 5e-06,
      "loss": 0.7443,
      "step": 610
    },
    {
      "epoch": 1.5525657071339174,
      "grad_norm": 0.6151649317588374,
      "learning_rate": 5e-06,
      "loss": 0.745,
      "step": 620
    },
    {
      "epoch": 1.5775969962453065,
      "grad_norm": 0.750130072011022,
      "learning_rate": 5e-06,
      "loss": 0.7431,
      "step": 630
    },
    {
      "epoch": 1.6026282853566958,
      "grad_norm": 0.6703845951317265,
      "learning_rate": 5e-06,
      "loss": 0.745,
      "step": 640
    },
    {
      "epoch": 1.627659574468085,
      "grad_norm": 0.5817463764362418,
      "learning_rate": 5e-06,
      "loss": 0.745,
      "step": 650
    },
    {
      "epoch": 1.6526908635794744,
      "grad_norm": 0.6421167763606337,
      "learning_rate": 5e-06,
      "loss": 0.7413,
      "step": 660
    },
    {
      "epoch": 1.6777221526908637,
      "grad_norm": 0.60677554331151,
      "learning_rate": 5e-06,
      "loss": 0.746,
      "step": 670
    },
    {
      "epoch": 1.702753441802253,
      "grad_norm": 0.6934469874354038,
      "learning_rate": 5e-06,
      "loss": 0.7434,
      "step": 680
    },
    {
      "epoch": 1.727784730913642,
      "grad_norm": 0.6181233971516625,
      "learning_rate": 5e-06,
      "loss": 0.7426,
      "step": 690
    },
    {
      "epoch": 1.7528160200250313,
      "grad_norm": 1.2959559737613207,
      "learning_rate": 5e-06,
      "loss": 0.7397,
      "step": 700
    },
    {
      "epoch": 1.7778473091364204,
      "grad_norm": 0.5691879884263149,
      "learning_rate": 5e-06,
      "loss": 0.7425,
      "step": 710
    },
    {
      "epoch": 1.8028785982478097,
      "grad_norm": 0.6574433826038453,
      "learning_rate": 5e-06,
      "loss": 0.7415,
      "step": 720
    },
    {
      "epoch": 1.827909887359199,
      "grad_norm": 0.6537064387599367,
      "learning_rate": 5e-06,
      "loss": 0.7409,
      "step": 730
    },
    {
      "epoch": 1.8529411764705883,
      "grad_norm": 0.5954863570531691,
      "learning_rate": 5e-06,
      "loss": 0.741,
      "step": 740
    },
    {
      "epoch": 1.8779724655819776,
      "grad_norm": 0.6362794802654446,
      "learning_rate": 5e-06,
      "loss": 0.7398,
      "step": 750
    },
    {
      "epoch": 1.9030037546933667,
      "grad_norm": 0.5986786317956753,
      "learning_rate": 5e-06,
      "loss": 0.7375,
      "step": 760
    },
    {
      "epoch": 1.928035043804756,
      "grad_norm": 0.7083210186967874,
      "learning_rate": 5e-06,
      "loss": 0.7398,
      "step": 770
    },
    {
      "epoch": 1.953066332916145,
      "grad_norm": 0.5556104219419278,
      "learning_rate": 5e-06,
      "loss": 0.738,
      "step": 780
    },
    {
      "epoch": 1.9780976220275344,
      "grad_norm": 0.5820311052189984,
      "learning_rate": 5e-06,
      "loss": 0.7419,
      "step": 790
    },
    {
      "epoch": 1.9981226533166458,
      "eval_loss": 0.7726743817329407,
      "eval_runtime": 279.978,
      "eval_samples_per_second": 38.446,
      "eval_steps_per_second": 0.604,
      "step": 798
    },
    {
      "epoch": 2.0037546933667083,
      "grad_norm": 1.1635293256781625,
      "learning_rate": 5e-06,
      "loss": 0.7865,
      "step": 800
    },
    {
      "epoch": 2.0287859824780976,
      "grad_norm": 0.7276790303807796,
      "learning_rate": 5e-06,
      "loss": 0.699,
      "step": 810
    },
    {
      "epoch": 2.053817271589487,
      "grad_norm": 0.7288566025250091,
      "learning_rate": 5e-06,
      "loss": 0.6925,
      "step": 820
    },
    {
      "epoch": 2.078848560700876,
      "grad_norm": 0.7172705630929458,
      "learning_rate": 5e-06,
      "loss": 0.6919,
      "step": 830
    },
    {
      "epoch": 2.1038798498122655,
      "grad_norm": 0.6776212393282559,
      "learning_rate": 5e-06,
      "loss": 0.6934,
      "step": 840
    },
    {
      "epoch": 2.1289111389236544,
      "grad_norm": 0.6439197802971096,
      "learning_rate": 5e-06,
      "loss": 0.6927,
      "step": 850
    },
    {
      "epoch": 2.1539424280350437,
      "grad_norm": 0.6173323306558143,
      "learning_rate": 5e-06,
      "loss": 0.6937,
      "step": 860
    },
    {
      "epoch": 2.178973717146433,
      "grad_norm": 0.7564554823831022,
      "learning_rate": 5e-06,
      "loss": 0.6945,
      "step": 870
    },
    {
      "epoch": 2.2040050062578223,
      "grad_norm": 0.6620302857887288,
      "learning_rate": 5e-06,
      "loss": 0.6944,
      "step": 880
    },
    {
      "epoch": 2.2290362953692116,
      "grad_norm": 0.6377986146944816,
      "learning_rate": 5e-06,
      "loss": 0.6964,
      "step": 890
    },
    {
      "epoch": 2.254067584480601,
      "grad_norm": 0.5556020070065616,
      "learning_rate": 5e-06,
      "loss": 0.6935,
      "step": 900
    },
    {
      "epoch": 2.27909887359199,
      "grad_norm": 0.7076075517084951,
      "learning_rate": 5e-06,
      "loss": 0.6993,
      "step": 910
    },
    {
      "epoch": 2.3041301627033794,
      "grad_norm": 0.5973458431773908,
      "learning_rate": 5e-06,
      "loss": 0.6986,
      "step": 920
    },
    {
      "epoch": 2.3291614518147683,
      "grad_norm": 0.7640330738397633,
      "learning_rate": 5e-06,
      "loss": 0.6979,
      "step": 930
    },
    {
      "epoch": 2.3541927409261576,
      "grad_norm": 0.6544037662816962,
      "learning_rate": 5e-06,
      "loss": 0.6985,
      "step": 940
    },
    {
      "epoch": 2.379224030037547,
      "grad_norm": 0.643258238595077,
      "learning_rate": 5e-06,
      "loss": 0.6994,
      "step": 950
    },
    {
      "epoch": 2.404255319148936,
      "grad_norm": 0.622397334096867,
      "learning_rate": 5e-06,
      "loss": 0.6953,
      "step": 960
    },
    {
      "epoch": 2.4292866082603255,
      "grad_norm": 0.7595102567708334,
      "learning_rate": 5e-06,
      "loss": 0.6977,
      "step": 970
    },
    {
      "epoch": 2.454317897371715,
      "grad_norm": 0.6430300856113221,
      "learning_rate": 5e-06,
      "loss": 0.6936,
      "step": 980
    },
    {
      "epoch": 2.4793491864831037,
      "grad_norm": 0.6649001597729085,
      "learning_rate": 5e-06,
      "loss": 0.6966,
      "step": 990
    },
    {
      "epoch": 2.504380475594493,
      "grad_norm": 0.7807773547782566,
      "learning_rate": 5e-06,
      "loss": 0.6977,
      "step": 1000
    },
    {
      "epoch": 2.5294117647058822,
      "grad_norm": 0.7685637796964472,
      "learning_rate": 5e-06,
      "loss": 0.7037,
      "step": 1010
    },
    {
      "epoch": 2.5544430538172715,
      "grad_norm": 0.6087836172391723,
      "learning_rate": 5e-06,
      "loss": 0.695,
      "step": 1020
    },
    {
      "epoch": 2.579474342928661,
      "grad_norm": 0.6126786752740975,
      "learning_rate": 5e-06,
      "loss": 0.702,
      "step": 1030
    },
    {
      "epoch": 2.60450563204005,
      "grad_norm": 0.5692342336245159,
      "learning_rate": 5e-06,
      "loss": 0.6965,
      "step": 1040
    },
    {
      "epoch": 2.6295369211514394,
      "grad_norm": 0.5461330132492804,
      "learning_rate": 5e-06,
      "loss": 0.6958,
      "step": 1050
    },
    {
      "epoch": 2.6545682102628287,
      "grad_norm": 0.5974455044275497,
      "learning_rate": 5e-06,
      "loss": 0.6977,
      "step": 1060
    },
    {
      "epoch": 2.679599499374218,
      "grad_norm": 0.6678395373217543,
      "learning_rate": 5e-06,
      "loss": 0.6992,
      "step": 1070
    },
    {
      "epoch": 2.704630788485607,
      "grad_norm": 0.6068475995182039,
      "learning_rate": 5e-06,
      "loss": 0.6977,
      "step": 1080
    },
    {
      "epoch": 2.729662077596996,
      "grad_norm": 0.6224775601315933,
      "learning_rate": 5e-06,
      "loss": 0.6994,
      "step": 1090
    },
    {
      "epoch": 2.7546933667083855,
      "grad_norm": 0.5607628869873644,
      "learning_rate": 5e-06,
      "loss": 0.698,
      "step": 1100
    },
    {
      "epoch": 2.779724655819775,
      "grad_norm": 1.0197508263783457,
      "learning_rate": 5e-06,
      "loss": 0.699,
      "step": 1110
    },
    {
      "epoch": 2.804755944931164,
      "grad_norm": 0.6699677843683128,
      "learning_rate": 5e-06,
      "loss": 0.6979,
      "step": 1120
    },
    {
      "epoch": 2.829787234042553,
      "grad_norm": 0.6628279862395011,
      "learning_rate": 5e-06,
      "loss": 0.7011,
      "step": 1130
    },
    {
      "epoch": 2.8548185231539422,
      "grad_norm": 0.5823164374883697,
      "learning_rate": 5e-06,
      "loss": 0.6988,
      "step": 1140
    },
    {
      "epoch": 2.8798498122653315,
      "grad_norm": 0.6607195364082912,
      "learning_rate": 5e-06,
      "loss": 0.6968,
      "step": 1150
    },
    {
      "epoch": 2.904881101376721,
      "grad_norm": 0.577549838302954,
      "learning_rate": 5e-06,
      "loss": 0.7005,
      "step": 1160
    },
    {
      "epoch": 2.92991239048811,
      "grad_norm": 0.6569021208672484,
      "learning_rate": 5e-06,
      "loss": 0.7034,
      "step": 1170
    },
    {
      "epoch": 2.9549436795994994,
      "grad_norm": 0.5809953224848247,
      "learning_rate": 5e-06,
      "loss": 0.6951,
      "step": 1180
    },
    {
      "epoch": 2.9799749687108887,
      "grad_norm": 0.5986204550852365,
      "learning_rate": 5e-06,
      "loss": 0.6976,
      "step": 1190
    },
    {
      "epoch": 2.997496871088861,
      "eval_loss": 0.770167887210846,
      "eval_runtime": 267.216,
      "eval_samples_per_second": 40.282,
      "eval_steps_per_second": 0.632,
      "step": 1197
    },
    {
      "epoch": 2.997496871088861,
      "step": 1197,
      "total_flos": 2004810203136000.0,
      "train_loss": 0.7593094730934902,
      "train_runtime": 39147.1454,
      "train_samples_per_second": 15.671,
      "train_steps_per_second": 0.031
    }
  ],
  "logging_steps": 10,
  "max_steps": 1197,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2004810203136000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}