{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997496871088861, "eval_steps": 500, "global_step": 1197, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025031289111389236, "grad_norm": 2.131367059642943, "learning_rate": 5e-06, "loss": 1.0783, "step": 10 }, { "epoch": 0.05006257822277847, "grad_norm": 5.240110415023845, "learning_rate": 5e-06, "loss": 0.9676, "step": 20 }, { "epoch": 0.07509386733416772, "grad_norm": 3.8068595828161884, "learning_rate": 5e-06, "loss": 0.9339, "step": 30 }, { "epoch": 0.10012515644555695, "grad_norm": 1.6005118869442196, "learning_rate": 5e-06, "loss": 0.9069, "step": 40 }, { "epoch": 0.1251564455569462, "grad_norm": 1.144980360790589, "learning_rate": 5e-06, "loss": 0.8873, "step": 50 }, { "epoch": 0.15018773466833543, "grad_norm": 0.9988315731444815, "learning_rate": 5e-06, "loss": 0.8743, "step": 60 }, { "epoch": 0.17521902377972465, "grad_norm": 1.1160785754418054, "learning_rate": 5e-06, "loss": 0.8694, "step": 70 }, { "epoch": 0.2002503128911139, "grad_norm": 0.6502972385224608, "learning_rate": 5e-06, "loss": 0.8554, "step": 80 }, { "epoch": 0.22528160200250313, "grad_norm": 1.1342296445805713, "learning_rate": 5e-06, "loss": 0.8509, "step": 90 }, { "epoch": 0.2503128911138924, "grad_norm": 1.7350477074788984, "learning_rate": 5e-06, "loss": 0.8447, "step": 100 }, { "epoch": 0.2753441802252816, "grad_norm": 1.4021943431412354, "learning_rate": 5e-06, "loss": 0.8397, "step": 110 }, { "epoch": 0.30037546933667086, "grad_norm": 0.7783764690541898, "learning_rate": 5e-06, "loss": 0.8363, "step": 120 }, { "epoch": 0.32540675844806005, "grad_norm": 1.054120463662093, "learning_rate": 5e-06, "loss": 0.8331, "step": 130 }, { "epoch": 0.3504380475594493, "grad_norm": 0.7025904608947156, "learning_rate": 5e-06, "loss": 0.8319, "step": 140 }, { "epoch": 0.37546933667083854, "grad_norm": 0.625073338332613, "learning_rate": 5e-06, "loss": 0.8228, "step": 150 }, { "epoch": 0.4005006257822278, "grad_norm": 0.68586294875638, "learning_rate": 5e-06, "loss": 0.8273, "step": 160 }, { "epoch": 0.425531914893617, "grad_norm": 0.7166203381918367, "learning_rate": 5e-06, "loss": 0.8271, "step": 170 }, { "epoch": 0.45056320400500627, "grad_norm": 0.7727657584916029, "learning_rate": 5e-06, "loss": 0.8201, "step": 180 }, { "epoch": 0.4755944931163955, "grad_norm": 0.6165452607139709, "learning_rate": 5e-06, "loss": 0.8165, "step": 190 }, { "epoch": 0.5006257822277848, "grad_norm": 0.5941915725497845, "learning_rate": 5e-06, "loss": 0.8165, "step": 200 }, { "epoch": 0.5256570713391739, "grad_norm": 0.9044617312024994, "learning_rate": 5e-06, "loss": 0.814, "step": 210 }, { "epoch": 0.5506883604505632, "grad_norm": 0.9023052529604444, "learning_rate": 5e-06, "loss": 0.8158, "step": 220 }, { "epoch": 0.5757196495619524, "grad_norm": 0.8778852029110713, "learning_rate": 5e-06, "loss": 0.8126, "step": 230 }, { "epoch": 0.6007509386733417, "grad_norm": 0.7745768192218916, "learning_rate": 5e-06, "loss": 0.8094, "step": 240 }, { "epoch": 0.6257822277847309, "grad_norm": 0.676919868236649, "learning_rate": 5e-06, "loss": 0.8081, "step": 250 }, { "epoch": 0.6508135168961201, "grad_norm": 0.6457111096534787, "learning_rate": 5e-06, "loss": 0.8063, "step": 260 }, { "epoch": 0.6758448060075094, "grad_norm": 0.6026716036669796, "learning_rate": 5e-06, "loss": 0.8086, "step": 270 }, { "epoch": 0.7008760951188986, "grad_norm": 0.6933525279767419, "learning_rate": 5e-06, "loss": 0.8006, "step": 280 }, { "epoch": 0.7259073842302879, "grad_norm": 0.9470644278270366, "learning_rate": 5e-06, "loss": 0.8006, "step": 290 }, { "epoch": 0.7509386733416771, "grad_norm": 0.9427280269205052, "learning_rate": 5e-06, "loss": 0.7961, "step": 300 }, { "epoch": 0.7759699624530664, "grad_norm": 0.7264873730589139, "learning_rate": 5e-06, "loss": 0.7989, "step": 310 }, { "epoch": 0.8010012515644556, "grad_norm": 0.6143483178764919, "learning_rate": 5e-06, "loss": 0.7958, "step": 320 }, { "epoch": 0.8260325406758448, "grad_norm": 0.8137195556401391, "learning_rate": 5e-06, "loss": 0.798, "step": 330 }, { "epoch": 0.851063829787234, "grad_norm": 0.6352119054411591, "learning_rate": 5e-06, "loss": 0.7979, "step": 340 }, { "epoch": 0.8760951188986232, "grad_norm": 0.6463859124240815, "learning_rate": 5e-06, "loss": 0.796, "step": 350 }, { "epoch": 0.9011264080100125, "grad_norm": 0.60316586739445, "learning_rate": 5e-06, "loss": 0.7927, "step": 360 }, { "epoch": 0.9261576971214017, "grad_norm": 0.893894099723059, "learning_rate": 5e-06, "loss": 0.7954, "step": 370 }, { "epoch": 0.951188986232791, "grad_norm": 0.5680880106145626, "learning_rate": 5e-06, "loss": 0.7892, "step": 380 }, { "epoch": 0.9762202753441802, "grad_norm": 0.6238008753559559, "learning_rate": 5e-06, "loss": 0.7876, "step": 390 }, { "epoch": 0.9987484355444305, "eval_loss": 0.7894856929779053, "eval_runtime": 272.7742, "eval_samples_per_second": 39.461, "eval_steps_per_second": 0.62, "step": 399 }, { "epoch": 1.0018773466833542, "grad_norm": 1.4020823851876998, "learning_rate": 5e-06, "loss": 0.8446, "step": 400 }, { "epoch": 1.0269086357947435, "grad_norm": 0.9263533629153153, "learning_rate": 5e-06, "loss": 0.7458, "step": 410 }, { "epoch": 1.0519399249061328, "grad_norm": 0.7694418549908636, "learning_rate": 5e-06, "loss": 0.7468, "step": 420 }, { "epoch": 1.0769712140175218, "grad_norm": 0.6223863756395531, "learning_rate": 5e-06, "loss": 0.7482, "step": 430 }, { "epoch": 1.1020025031289111, "grad_norm": 0.6462188205588866, "learning_rate": 5e-06, "loss": 0.7462, "step": 440 }, { "epoch": 1.1270337922403004, "grad_norm": 0.8428401134288817, "learning_rate": 5e-06, "loss": 0.7451, "step": 450 }, { "epoch": 1.1520650813516897, "grad_norm": 0.6127601124586929, "learning_rate": 5e-06, "loss": 0.7455, "step": 460 }, { "epoch": 1.1770963704630788, "grad_norm": 0.6867077491456403, "learning_rate": 5e-06, "loss": 0.7464, "step": 470 }, { "epoch": 1.202127659574468, "grad_norm": 0.655478438583233, "learning_rate": 5e-06, "loss": 0.7467, "step": 480 }, { "epoch": 1.2271589486858574, "grad_norm": 0.6163393565591452, "learning_rate": 5e-06, "loss": 0.7466, "step": 490 }, { "epoch": 1.2521902377972465, "grad_norm": 0.593653632025471, "learning_rate": 5e-06, "loss": 0.7472, "step": 500 }, { "epoch": 1.2772215269086358, "grad_norm": 0.7027928119588218, "learning_rate": 5e-06, "loss": 0.7483, "step": 510 }, { "epoch": 1.302252816020025, "grad_norm": 0.6854728377454722, "learning_rate": 5e-06, "loss": 0.7435, "step": 520 }, { "epoch": 1.3272841051314144, "grad_norm": 1.0781928345187066, "learning_rate": 5e-06, "loss": 0.7466, "step": 530 }, { "epoch": 1.3523153942428034, "grad_norm": 0.6327512087924364, "learning_rate": 5e-06, "loss": 0.7448, "step": 540 }, { "epoch": 1.3773466833541927, "grad_norm": 0.6100108550845114, "learning_rate": 5e-06, "loss": 0.744, "step": 550 }, { "epoch": 1.402377972465582, "grad_norm": 0.7365288494797091, "learning_rate": 5e-06, "loss": 0.7478, "step": 560 }, { "epoch": 1.4274092615769711, "grad_norm": 0.5377128504214334, "learning_rate": 5e-06, "loss": 0.7439, "step": 570 }, { "epoch": 1.4524405506883604, "grad_norm": 0.6741574190963199, "learning_rate": 5e-06, "loss": 0.7409, "step": 580 }, { "epoch": 1.4774718397997497, "grad_norm": 0.6592122959853899, "learning_rate": 5e-06, "loss": 0.7426, "step": 590 }, { "epoch": 1.502503128911139, "grad_norm": 0.8235710348347305, "learning_rate": 5e-06, "loss": 0.7411, "step": 600 }, { "epoch": 1.5275344180225283, "grad_norm": 0.7239324585893679, "learning_rate": 5e-06, "loss": 0.7443, "step": 610 }, { "epoch": 1.5525657071339174, "grad_norm": 0.6151649317588374, "learning_rate": 5e-06, "loss": 0.745, "step": 620 }, { "epoch": 1.5775969962453065, "grad_norm": 0.750130072011022, "learning_rate": 5e-06, "loss": 0.7431, "step": 630 }, { "epoch": 1.6026282853566958, "grad_norm": 0.6703845951317265, "learning_rate": 5e-06, "loss": 0.745, "step": 640 }, { "epoch": 1.627659574468085, "grad_norm": 0.5817463764362418, "learning_rate": 5e-06, "loss": 0.745, "step": 650 }, { "epoch": 1.6526908635794744, "grad_norm": 0.6421167763606337, "learning_rate": 5e-06, "loss": 0.7413, "step": 660 }, { "epoch": 1.6777221526908637, "grad_norm": 0.60677554331151, "learning_rate": 5e-06, "loss": 0.746, "step": 670 }, { "epoch": 1.702753441802253, "grad_norm": 0.6934469874354038, "learning_rate": 5e-06, "loss": 0.7434, "step": 680 }, { "epoch": 1.727784730913642, "grad_norm": 0.6181233971516625, "learning_rate": 5e-06, "loss": 0.7426, "step": 690 }, { "epoch": 1.7528160200250313, "grad_norm": 1.2959559737613207, "learning_rate": 5e-06, "loss": 0.7397, "step": 700 }, { "epoch": 1.7778473091364204, "grad_norm": 0.5691879884263149, "learning_rate": 5e-06, "loss": 0.7425, "step": 710 }, { "epoch": 1.8028785982478097, "grad_norm": 0.6574433826038453, "learning_rate": 5e-06, "loss": 0.7415, "step": 720 }, { "epoch": 1.827909887359199, "grad_norm": 0.6537064387599367, "learning_rate": 5e-06, "loss": 0.7409, "step": 730 }, { "epoch": 1.8529411764705883, "grad_norm": 0.5954863570531691, "learning_rate": 5e-06, "loss": 0.741, "step": 740 }, { "epoch": 1.8779724655819776, "grad_norm": 0.6362794802654446, "learning_rate": 5e-06, "loss": 0.7398, "step": 750 }, { "epoch": 1.9030037546933667, "grad_norm": 0.5986786317956753, "learning_rate": 5e-06, "loss": 0.7375, "step": 760 }, { "epoch": 1.928035043804756, "grad_norm": 0.7083210186967874, "learning_rate": 5e-06, "loss": 0.7398, "step": 770 }, { "epoch": 1.953066332916145, "grad_norm": 0.5556104219419278, "learning_rate": 5e-06, "loss": 0.738, "step": 780 }, { "epoch": 1.9780976220275344, "grad_norm": 0.5820311052189984, "learning_rate": 5e-06, "loss": 0.7419, "step": 790 }, { "epoch": 1.9981226533166458, "eval_loss": 0.7726743817329407, "eval_runtime": 279.978, "eval_samples_per_second": 38.446, "eval_steps_per_second": 0.604, "step": 798 }, { "epoch": 2.0037546933667083, "grad_norm": 1.1635293256781625, "learning_rate": 5e-06, "loss": 0.7865, "step": 800 }, { "epoch": 2.0287859824780976, "grad_norm": 0.7276790303807796, "learning_rate": 5e-06, "loss": 0.699, "step": 810 }, { "epoch": 2.053817271589487, "grad_norm": 0.7288566025250091, "learning_rate": 5e-06, "loss": 0.6925, "step": 820 }, { "epoch": 2.078848560700876, "grad_norm": 0.7172705630929458, "learning_rate": 5e-06, "loss": 0.6919, "step": 830 }, { "epoch": 2.1038798498122655, "grad_norm": 0.6776212393282559, "learning_rate": 5e-06, "loss": 0.6934, "step": 840 }, { "epoch": 2.1289111389236544, "grad_norm": 0.6439197802971096, "learning_rate": 5e-06, "loss": 0.6927, "step": 850 }, { "epoch": 2.1539424280350437, "grad_norm": 0.6173323306558143, "learning_rate": 5e-06, "loss": 0.6937, "step": 860 }, { "epoch": 2.178973717146433, "grad_norm": 0.7564554823831022, "learning_rate": 5e-06, "loss": 0.6945, "step": 870 }, { "epoch": 2.2040050062578223, "grad_norm": 0.6620302857887288, "learning_rate": 5e-06, "loss": 0.6944, "step": 880 }, { "epoch": 2.2290362953692116, "grad_norm": 0.6377986146944816, "learning_rate": 5e-06, "loss": 0.6964, "step": 890 }, { "epoch": 2.254067584480601, "grad_norm": 0.5556020070065616, "learning_rate": 5e-06, "loss": 0.6935, "step": 900 }, { "epoch": 2.27909887359199, "grad_norm": 0.7076075517084951, "learning_rate": 5e-06, "loss": 0.6993, "step": 910 }, { "epoch": 2.3041301627033794, "grad_norm": 0.5973458431773908, "learning_rate": 5e-06, "loss": 0.6986, "step": 920 }, { "epoch": 2.3291614518147683, "grad_norm": 0.7640330738397633, "learning_rate": 5e-06, "loss": 0.6979, "step": 930 }, { "epoch": 2.3541927409261576, "grad_norm": 0.6544037662816962, "learning_rate": 5e-06, "loss": 0.6985, "step": 940 }, { "epoch": 2.379224030037547, "grad_norm": 0.643258238595077, "learning_rate": 5e-06, "loss": 0.6994, "step": 950 }, { "epoch": 2.404255319148936, "grad_norm": 0.622397334096867, "learning_rate": 5e-06, "loss": 0.6953, "step": 960 }, { "epoch": 2.4292866082603255, "grad_norm": 0.7595102567708334, "learning_rate": 5e-06, "loss": 0.6977, "step": 970 }, { "epoch": 2.454317897371715, "grad_norm": 0.6430300856113221, "learning_rate": 5e-06, "loss": 0.6936, "step": 980 }, { "epoch": 2.4793491864831037, "grad_norm": 0.6649001597729085, "learning_rate": 5e-06, "loss": 0.6966, "step": 990 }, { "epoch": 2.504380475594493, "grad_norm": 0.7807773547782566, "learning_rate": 5e-06, "loss": 0.6977, "step": 1000 }, { "epoch": 2.5294117647058822, "grad_norm": 0.7685637796964472, "learning_rate": 5e-06, "loss": 0.7037, "step": 1010 }, { "epoch": 2.5544430538172715, "grad_norm": 0.6087836172391723, "learning_rate": 5e-06, "loss": 0.695, "step": 1020 }, { "epoch": 2.579474342928661, "grad_norm": 0.6126786752740975, "learning_rate": 5e-06, "loss": 0.702, "step": 1030 }, { "epoch": 2.60450563204005, "grad_norm": 0.5692342336245159, "learning_rate": 5e-06, "loss": 0.6965, "step": 1040 }, { "epoch": 2.6295369211514394, "grad_norm": 0.5461330132492804, "learning_rate": 5e-06, "loss": 0.6958, "step": 1050 }, { "epoch": 2.6545682102628287, "grad_norm": 0.5974455044275497, "learning_rate": 5e-06, "loss": 0.6977, "step": 1060 }, { "epoch": 2.679599499374218, "grad_norm": 0.6678395373217543, "learning_rate": 5e-06, "loss": 0.6992, "step": 1070 }, { "epoch": 2.704630788485607, "grad_norm": 0.6068475995182039, "learning_rate": 5e-06, "loss": 0.6977, "step": 1080 }, { "epoch": 2.729662077596996, "grad_norm": 0.6224775601315933, "learning_rate": 5e-06, "loss": 0.6994, "step": 1090 }, { "epoch": 2.7546933667083855, "grad_norm": 0.5607628869873644, "learning_rate": 5e-06, "loss": 0.698, "step": 1100 }, { "epoch": 2.779724655819775, "grad_norm": 1.0197508263783457, "learning_rate": 5e-06, "loss": 0.699, "step": 1110 }, { "epoch": 2.804755944931164, "grad_norm": 0.6699677843683128, "learning_rate": 5e-06, "loss": 0.6979, "step": 1120 }, { "epoch": 2.829787234042553, "grad_norm": 0.6628279862395011, "learning_rate": 5e-06, "loss": 0.7011, "step": 1130 }, { "epoch": 2.8548185231539422, "grad_norm": 0.5823164374883697, "learning_rate": 5e-06, "loss": 0.6988, "step": 1140 }, { "epoch": 2.8798498122653315, "grad_norm": 0.6607195364082912, "learning_rate": 5e-06, "loss": 0.6968, "step": 1150 }, { "epoch": 2.904881101376721, "grad_norm": 0.577549838302954, "learning_rate": 5e-06, "loss": 0.7005, "step": 1160 }, { "epoch": 2.92991239048811, "grad_norm": 0.6569021208672484, "learning_rate": 5e-06, "loss": 0.7034, "step": 1170 }, { "epoch": 2.9549436795994994, "grad_norm": 0.5809953224848247, "learning_rate": 5e-06, "loss": 0.6951, "step": 1180 }, { "epoch": 2.9799749687108887, "grad_norm": 0.5986204550852365, "learning_rate": 5e-06, "loss": 0.6976, "step": 1190 }, { "epoch": 2.997496871088861, "eval_loss": 0.770167887210846, "eval_runtime": 267.216, "eval_samples_per_second": 40.282, "eval_steps_per_second": 0.632, "step": 1197 }, { "epoch": 2.997496871088861, "step": 1197, "total_flos": 2004810203136000.0, "train_loss": 0.7593094730934902, "train_runtime": 39147.1454, "train_samples_per_second": 15.671, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 1197, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2004810203136000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }