{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08030285648732362, "eval_steps": 10, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001147183664104623, "eval_loss": 1.873344898223877, "eval_runtime": 12.7647, "eval_samples_per_second": 515.797, "eval_steps_per_second": 8.069, "step": 10 }, { "epoch": 0.002294367328209246, "eval_loss": 1.8726389408111572, "eval_runtime": 12.8667, "eval_samples_per_second": 511.709, "eval_steps_per_second": 8.005, "step": 20 }, { "epoch": 0.0034415509923138693, "eval_loss": 1.8714078664779663, "eval_runtime": 12.9103, "eval_samples_per_second": 509.979, "eval_steps_per_second": 7.978, "step": 30 }, { "epoch": 0.004588734656418492, "eval_loss": 1.8696790933609009, "eval_runtime": 12.947, "eval_samples_per_second": 508.534, "eval_steps_per_second": 7.955, "step": 40 }, { "epoch": 0.0057359183205231154, "eval_loss": 1.8675329685211182, "eval_runtime": 12.9458, "eval_samples_per_second": 508.582, "eval_steps_per_second": 7.956, "step": 50 }, { "epoch": 0.006883101984627739, "eval_loss": 1.8649154901504517, "eval_runtime": 13.0432, "eval_samples_per_second": 504.785, "eval_steps_per_second": 7.897, "step": 60 }, { "epoch": 0.008030285648732363, "eval_loss": 1.8619294166564941, "eval_runtime": 13.0638, "eval_samples_per_second": 503.988, "eval_steps_per_second": 7.884, "step": 70 }, { "epoch": 0.009177469312836984, "eval_loss": 1.8583979606628418, "eval_runtime": 13.0482, "eval_samples_per_second": 504.592, "eval_steps_per_second": 7.894, "step": 80 }, { "epoch": 0.010324652976941608, "eval_loss": 1.85438871383667, "eval_runtime": 13.0615, "eval_samples_per_second": 504.075, "eval_steps_per_second": 7.886, "step": 90 }, { "epoch": 0.011471836641046231, "grad_norm": 9.938580513000488, "learning_rate": 3.8226299694189603e-07, "loss": 3.1046, "step": 100 }, { "epoch": 0.011471836641046231, "eval_loss": 1.849947214126587, "eval_runtime": 13.0663, "eval_samples_per_second": 503.89, "eval_steps_per_second": 7.883, "step": 100 }, { "epoch": 0.012619020305150854, "eval_loss": 1.8451412916183472, "eval_runtime": 12.9771, "eval_samples_per_second": 507.357, "eval_steps_per_second": 7.937, "step": 110 }, { "epoch": 0.013766203969255477, "eval_loss": 1.8399487733840942, "eval_runtime": 13.0209, "eval_samples_per_second": 505.648, "eval_steps_per_second": 7.91, "step": 120 }, { "epoch": 0.0149133876333601, "eval_loss": 1.8342881202697754, "eval_runtime": 13.0369, "eval_samples_per_second": 505.028, "eval_steps_per_second": 7.901, "step": 130 }, { "epoch": 0.016060571297464726, "eval_loss": 1.8283486366271973, "eval_runtime": 13.0149, "eval_samples_per_second": 505.88, "eval_steps_per_second": 7.914, "step": 140 }, { "epoch": 0.017207754961569347, "eval_loss": 1.822334885597229, "eval_runtime": 13.0213, "eval_samples_per_second": 505.632, "eval_steps_per_second": 7.91, "step": 150 }, { "epoch": 0.01835493862567397, "eval_loss": 1.8158738613128662, "eval_runtime": 13.0599, "eval_samples_per_second": 504.14, "eval_steps_per_second": 7.887, "step": 160 }, { "epoch": 0.019502122289778594, "eval_loss": 1.8090614080429077, "eval_runtime": 13.034, "eval_samples_per_second": 505.14, "eval_steps_per_second": 7.902, "step": 170 }, { "epoch": 0.020649305953883215, "eval_loss": 1.8015782833099365, "eval_runtime": 13.0665, "eval_samples_per_second": 503.885, "eval_steps_per_second": 7.883, "step": 180 }, { "epoch": 0.02179648961798784, "eval_loss": 1.793796420097351, "eval_runtime": 13.0555, "eval_samples_per_second": 504.31, "eval_steps_per_second": 7.889, "step": 190 }, { "epoch": 0.022943673282092462, "grad_norm": 4.906337738037109, "learning_rate": 7.645259938837921e-07, "loss": 3.0303, "step": 200 }, { "epoch": 0.022943673282092462, "eval_loss": 1.785815715789795, "eval_runtime": 12.9925, "eval_samples_per_second": 506.754, "eval_steps_per_second": 7.928, "step": 200 }, { "epoch": 0.024090856946197087, "eval_loss": 1.7775053977966309, "eval_runtime": 13.0639, "eval_samples_per_second": 503.986, "eval_steps_per_second": 7.884, "step": 210 }, { "epoch": 0.025238040610301708, "eval_loss": 1.7692992687225342, "eval_runtime": 13.0129, "eval_samples_per_second": 505.96, "eval_steps_per_second": 7.915, "step": 220 }, { "epoch": 0.026385224274406333, "eval_loss": 1.760453224182129, "eval_runtime": 13.0078, "eval_samples_per_second": 506.158, "eval_steps_per_second": 7.918, "step": 230 }, { "epoch": 0.027532407938510955, "eval_loss": 1.751396656036377, "eval_runtime": 12.9957, "eval_samples_per_second": 506.628, "eval_steps_per_second": 7.926, "step": 240 }, { "epoch": 0.02867959160261558, "eval_loss": 1.7417218685150146, "eval_runtime": 12.9774, "eval_samples_per_second": 507.344, "eval_steps_per_second": 7.937, "step": 250 }, { "epoch": 0.0298267752667202, "eval_loss": 1.7319914102554321, "eval_runtime": 13.0219, "eval_samples_per_second": 505.611, "eval_steps_per_second": 7.91, "step": 260 }, { "epoch": 0.030973958930824826, "eval_loss": 1.7227253913879395, "eval_runtime": 13.0026, "eval_samples_per_second": 506.361, "eval_steps_per_second": 7.922, "step": 270 }, { "epoch": 0.03212114259492945, "eval_loss": 1.7133797407150269, "eval_runtime": 12.9757, "eval_samples_per_second": 507.409, "eval_steps_per_second": 7.938, "step": 280 }, { "epoch": 0.03326832625903407, "eval_loss": 1.704041600227356, "eval_runtime": 12.9845, "eval_samples_per_second": 507.065, "eval_steps_per_second": 7.933, "step": 290 }, { "epoch": 0.034415509923138694, "grad_norm": 4.665822505950928, "learning_rate": 1.1467889908256882e-06, "loss": 2.9459, "step": 300 }, { "epoch": 0.034415509923138694, "eval_loss": 1.6940686702728271, "eval_runtime": 13.0019, "eval_samples_per_second": 506.387, "eval_steps_per_second": 7.922, "step": 300 }, { "epoch": 0.035562693587243316, "eval_loss": 1.683342695236206, "eval_runtime": 13.0065, "eval_samples_per_second": 506.209, "eval_steps_per_second": 7.919, "step": 310 }, { "epoch": 0.03670987725134794, "eval_loss": 1.6724653244018555, "eval_runtime": 13.0129, "eval_samples_per_second": 505.96, "eval_steps_per_second": 7.915, "step": 320 }, { "epoch": 0.037857060915452566, "eval_loss": 1.6614341735839844, "eval_runtime": 12.9921, "eval_samples_per_second": 506.769, "eval_steps_per_second": 7.928, "step": 330 }, { "epoch": 0.03900424457955719, "eval_loss": 1.6510112285614014, "eval_runtime": 13.0242, "eval_samples_per_second": 505.52, "eval_steps_per_second": 7.908, "step": 340 }, { "epoch": 0.04015142824366181, "eval_loss": 1.6401513814926147, "eval_runtime": 12.9214, "eval_samples_per_second": 509.542, "eval_steps_per_second": 7.971, "step": 350 }, { "epoch": 0.04129861190776643, "eval_loss": 1.6295816898345947, "eval_runtime": 12.9563, "eval_samples_per_second": 508.171, "eval_steps_per_second": 7.95, "step": 360 }, { "epoch": 0.04244579557187106, "eval_loss": 1.6187150478363037, "eval_runtime": 12.9758, "eval_samples_per_second": 507.405, "eval_steps_per_second": 7.938, "step": 370 }, { "epoch": 0.04359297923597568, "eval_loss": 1.607272982597351, "eval_runtime": 12.9876, "eval_samples_per_second": 506.947, "eval_steps_per_second": 7.931, "step": 380 }, { "epoch": 0.0447401629000803, "eval_loss": 1.5961676836013794, "eval_runtime": 12.9782, "eval_samples_per_second": 507.313, "eval_steps_per_second": 7.936, "step": 390 }, { "epoch": 0.045887346564184923, "grad_norm": 4.870114326477051, "learning_rate": 1.5290519877675841e-06, "loss": 2.7813, "step": 400 }, { "epoch": 0.045887346564184923, "eval_loss": 1.5848218202590942, "eval_runtime": 12.9783, "eval_samples_per_second": 507.309, "eval_steps_per_second": 7.936, "step": 400 }, { "epoch": 0.04703453022828955, "eval_loss": 1.5734797716140747, "eval_runtime": 12.9739, "eval_samples_per_second": 507.482, "eval_steps_per_second": 7.939, "step": 410 }, { "epoch": 0.04818171389239417, "eval_loss": 1.562021255493164, "eval_runtime": 12.9388, "eval_samples_per_second": 508.855, "eval_steps_per_second": 7.961, "step": 420 }, { "epoch": 0.049328897556498795, "eval_loss": 1.5495364665985107, "eval_runtime": 12.9412, "eval_samples_per_second": 508.764, "eval_steps_per_second": 7.959, "step": 430 }, { "epoch": 0.050476081220603417, "eval_loss": 1.5375314950942993, "eval_runtime": 12.9686, "eval_samples_per_second": 507.687, "eval_steps_per_second": 7.942, "step": 440 }, { "epoch": 0.051623264884708045, "eval_loss": 1.525598168373108, "eval_runtime": 12.9695, "eval_samples_per_second": 507.651, "eval_steps_per_second": 7.942, "step": 450 }, { "epoch": 0.052770448548812667, "eval_loss": 1.5132672786712646, "eval_runtime": 12.8961, "eval_samples_per_second": 510.543, "eval_steps_per_second": 7.987, "step": 460 }, { "epoch": 0.05391763221291729, "eval_loss": 1.5012215375900269, "eval_runtime": 12.9428, "eval_samples_per_second": 508.7, "eval_steps_per_second": 7.958, "step": 470 }, { "epoch": 0.05506481587702191, "eval_loss": 1.4892219305038452, "eval_runtime": 12.9208, "eval_samples_per_second": 509.567, "eval_steps_per_second": 7.972, "step": 480 }, { "epoch": 0.05621199954112653, "eval_loss": 1.4768636226654053, "eval_runtime": 12.9423, "eval_samples_per_second": 508.721, "eval_steps_per_second": 7.958, "step": 490 }, { "epoch": 0.05735918320523116, "grad_norm": 4.155641555786133, "learning_rate": 1.9113149847094803e-06, "loss": 2.6308, "step": 500 }, { "epoch": 0.05735918320523116, "eval_loss": 1.4640088081359863, "eval_runtime": 12.8729, "eval_samples_per_second": 511.462, "eval_steps_per_second": 8.001, "step": 500 }, { "epoch": 0.05850636686933578, "eval_loss": 1.4513096809387207, "eval_runtime": 12.9653, "eval_samples_per_second": 507.817, "eval_steps_per_second": 7.944, "step": 510 }, { "epoch": 0.0596535505334404, "eval_loss": 1.439149260520935, "eval_runtime": 12.9443, "eval_samples_per_second": 508.639, "eval_steps_per_second": 7.957, "step": 520 }, { "epoch": 0.060800734197545024, "eval_loss": 1.426237940788269, "eval_runtime": 12.9496, "eval_samples_per_second": 508.433, "eval_steps_per_second": 7.954, "step": 530 }, { "epoch": 0.06194791786164965, "eval_loss": 1.4129557609558105, "eval_runtime": 12.9822, "eval_samples_per_second": 507.157, "eval_steps_per_second": 7.934, "step": 540 }, { "epoch": 0.06309510152575427, "eval_loss": 1.3997886180877686, "eval_runtime": 12.9979, "eval_samples_per_second": 506.542, "eval_steps_per_second": 7.924, "step": 550 }, { "epoch": 0.0642422851898589, "eval_loss": 1.3873906135559082, "eval_runtime": 12.9378, "eval_samples_per_second": 508.895, "eval_steps_per_second": 7.961, "step": 560 }, { "epoch": 0.06538946885396352, "eval_loss": 1.3751789331436157, "eval_runtime": 12.9624, "eval_samples_per_second": 507.932, "eval_steps_per_second": 7.946, "step": 570 }, { "epoch": 0.06653665251806815, "eval_loss": 1.3620370626449585, "eval_runtime": 12.9498, "eval_samples_per_second": 508.426, "eval_steps_per_second": 7.954, "step": 580 }, { "epoch": 0.06768383618217276, "eval_loss": 1.3485124111175537, "eval_runtime": 12.9759, "eval_samples_per_second": 507.403, "eval_steps_per_second": 7.938, "step": 590 }, { "epoch": 0.06883101984627739, "grad_norm": 5.262124061584473, "learning_rate": 2.2935779816513764e-06, "loss": 2.4452, "step": 600 }, { "epoch": 0.06883101984627739, "eval_loss": 1.3349775075912476, "eval_runtime": 12.9631, "eval_samples_per_second": 507.902, "eval_steps_per_second": 7.946, "step": 600 }, { "epoch": 0.06997820351038202, "eval_loss": 1.3213400840759277, "eval_runtime": 12.9619, "eval_samples_per_second": 507.951, "eval_steps_per_second": 7.946, "step": 610 }, { "epoch": 0.07112538717448663, "eval_loss": 1.308822512626648, "eval_runtime": 12.9652, "eval_samples_per_second": 507.82, "eval_steps_per_second": 7.944, "step": 620 }, { "epoch": 0.07227257083859126, "eval_loss": 1.296485185623169, "eval_runtime": 13.0441, "eval_samples_per_second": 504.75, "eval_steps_per_second": 7.896, "step": 630 }, { "epoch": 0.07341975450269587, "eval_loss": 1.283867597579956, "eval_runtime": 12.9822, "eval_samples_per_second": 507.154, "eval_steps_per_second": 7.934, "step": 640 }, { "epoch": 0.0745669381668005, "eval_loss": 1.2713148593902588, "eval_runtime": 12.9775, "eval_samples_per_second": 507.338, "eval_steps_per_second": 7.937, "step": 650 }, { "epoch": 0.07571412183090513, "eval_loss": 1.2591922283172607, "eval_runtime": 13.0227, "eval_samples_per_second": 505.578, "eval_steps_per_second": 7.909, "step": 660 }, { "epoch": 0.07686130549500975, "eval_loss": 1.246610164642334, "eval_runtime": 13.014, "eval_samples_per_second": 505.917, "eval_steps_per_second": 7.915, "step": 670 }, { "epoch": 0.07800848915911437, "eval_loss": 1.2331972122192383, "eval_runtime": 12.9634, "eval_samples_per_second": 507.891, "eval_steps_per_second": 7.945, "step": 680 }, { "epoch": 0.079155672823219, "eval_loss": 1.2203081846237183, "eval_runtime": 12.9664, "eval_samples_per_second": 507.775, "eval_steps_per_second": 7.944, "step": 690 }, { "epoch": 0.08030285648732362, "grad_norm": 3.824066400527954, "learning_rate": 2.6758409785932725e-06, "loss": 2.2626, "step": 700 }, { "epoch": 0.08030285648732362, "eval_loss": 1.207729697227478, "eval_runtime": 13.037, "eval_samples_per_second": 505.025, "eval_steps_per_second": 7.901, "step": 700 } ], "logging_steps": 100, "max_steps": 26151, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }