{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 698, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014336917562724014, "grad_norm": 5.102053642272949, "learning_rate": 0.0, "loss": 1.1358120441436768, "step": 1 }, { "epoch": 0.014336917562724014, "grad_norm": 0.22924242913722992, "learning_rate": 5.142857142857142e-06, "loss": 0.5612610710991753, "step": 10 }, { "epoch": 0.02867383512544803, "grad_norm": 0.12333694845438004, "learning_rate": 1.0857142857142858e-05, "loss": 0.18545818328857422, "step": 20 }, { "epoch": 0.043010752688172046, "grad_norm": 0.14276181161403656, "learning_rate": 1.6571428571428574e-05, "loss": 0.16717807054519654, "step": 30 }, { "epoch": 0.05734767025089606, "grad_norm": 0.10264057666063309, "learning_rate": 1.9998203820754213e-05, "loss": 0.17055926322937012, "step": 40 }, { "epoch": 0.07168458781362007, "grad_norm": 0.104465052485466, "learning_rate": 1.9978004213822736e-05, "loss": 0.1625239610671997, "step": 50 }, { "epoch": 0.08602150537634409, "grad_norm": 0.10240382701158524, "learning_rate": 1.993540527265239e-05, "loss": 0.1715969681739807, "step": 60 }, { "epoch": 0.1003584229390681, "grad_norm": 0.08893006294965744, "learning_rate": 1.9870502626379127e-05, "loss": 0.16313509941101073, "step": 70 }, { "epoch": 0.11469534050179211, "grad_norm": 0.12808184325695038, "learning_rate": 1.9783441973084023e-05, "loss": 0.16091221570968628, "step": 80 }, { "epoch": 0.12903225806451613, "grad_norm": 0.08048602193593979, "learning_rate": 1.9674418752719835e-05, "loss": 0.15647590160369873, "step": 90 }, { "epoch": 0.14336917562724014, "grad_norm": 0.08210249245166779, "learning_rate": 1.9543677708373496e-05, "loss": 0.16005860567092894, "step": 100 }, { "epoch": 0.15770609318996415, "grad_norm": 0.0777139738202095, "learning_rate": 1.9391512336849406e-05, "loss": 0.15733616352081298, "step": 110 }, { "epoch": 0.17204301075268819, "grad_norm": 0.07640817761421204, "learning_rate": 1.9218264229806917e-05, "loss": 0.15088759660720824, "step": 120 }, { "epoch": 0.1863799283154122, "grad_norm": 0.08728871494531631, "learning_rate": 1.9024322306931035e-05, "loss": 0.1508031129837036, "step": 130 }, { "epoch": 0.2007168458781362, "grad_norm": 0.0811760425567627, "learning_rate": 1.8810121942857848e-05, "loss": 0.15035929679870605, "step": 140 }, { "epoch": 0.21505376344086022, "grad_norm": 0.08090047538280487, "learning_rate": 1.8576143989814524e-05, "loss": 0.15261363983154297, "step": 150 }, { "epoch": 0.22939068100358423, "grad_norm": 0.09918594360351562, "learning_rate": 1.8322913698168014e-05, "loss": 0.15076379776000975, "step": 160 }, { "epoch": 0.24372759856630824, "grad_norm": 0.08928713947534561, "learning_rate": 1.8050999537305634e-05, "loss": 0.15194735527038575, "step": 170 }, { "epoch": 0.25806451612903225, "grad_norm": 0.07266855984926224, "learning_rate": 1.776101191949449e-05, "loss": 0.15317165851593018, "step": 180 }, { "epoch": 0.2724014336917563, "grad_norm": 0.07347071915864944, "learning_rate": 1.745360182958459e-05, "loss": 0.15253105163574218, "step": 190 }, { "epoch": 0.2867383512544803, "grad_norm": 0.0758705884218216, "learning_rate": 1.7129459363631692e-05, "loss": 0.14779815673828126, "step": 200 }, { "epoch": 0.2867383512544803, "eval_loss": 0.2187168002128601, "eval_runtime": 47.0917, "eval_samples_per_second": 124.247, "eval_steps_per_second": 0.255, "step": 200 }, { "epoch": 0.3010752688172043, "grad_norm": 0.07489030808210373, "learning_rate": 1.678931217972055e-05, "loss": 0.14687340259552, "step": 210 }, { "epoch": 0.3154121863799283, "grad_norm": 0.0788242295384407, "learning_rate": 1.6433923864466235e-05, "loss": 0.14286456108093262, "step": 220 }, { "epoch": 0.32974910394265233, "grad_norm": 0.8215664029121399, "learning_rate": 1.6064092218860553e-05, "loss": 0.15229986906051635, "step": 230 }, { "epoch": 0.34408602150537637, "grad_norm": 0.06975199282169342, "learning_rate": 1.568064746731156e-05, "loss": 0.14773318767547608, "step": 240 }, { "epoch": 0.35842293906810035, "grad_norm": 0.07249259203672409, "learning_rate": 1.5284450393896713e-05, "loss": 0.15218548774719237, "step": 250 }, { "epoch": 0.3727598566308244, "grad_norm": 0.07201112061738968, "learning_rate": 1.4876390410013498e-05, "loss": 0.14689412117004394, "step": 260 }, { "epoch": 0.3870967741935484, "grad_norm": 0.06810774654150009, "learning_rate": 1.4457383557765385e-05, "loss": 0.14651920795440673, "step": 270 }, { "epoch": 0.4014336917562724, "grad_norm": 0.06508651375770569, "learning_rate": 1.402837045356531e-05, "loss": 0.1467513084411621, "step": 280 }, { "epoch": 0.4157706093189964, "grad_norm": 0.06673789769411087, "learning_rate": 1.3590314176572989e-05, "loss": 0.13457820415496827, "step": 290 }, { "epoch": 0.43010752688172044, "grad_norm": 0.07056573778390884, "learning_rate": 1.314419810670624e-05, "loss": 0.14077857732772828, "step": 300 }, { "epoch": 0.4444444444444444, "grad_norm": 0.06851952522993088, "learning_rate": 1.2691023717079735e-05, "loss": 0.138556170463562, "step": 310 }, { "epoch": 0.45878136200716846, "grad_norm": 0.06733077019453049, "learning_rate": 1.2231808325826862e-05, "loss": 0.13921281099319457, "step": 320 }, { "epoch": 0.4731182795698925, "grad_norm": 0.07425093650817871, "learning_rate": 1.176758281235155e-05, "loss": 0.13010122776031494, "step": 330 }, { "epoch": 0.4874551971326165, "grad_norm": 0.06684776395559311, "learning_rate": 1.129938930313678e-05, "loss": 0.1314706802368164, "step": 340 }, { "epoch": 0.5017921146953405, "grad_norm": 0.06619785726070404, "learning_rate": 1.082827883230487e-05, "loss": 0.13385097980499266, "step": 350 }, { "epoch": 0.5161290322580645, "grad_norm": 0.07259789854288101, "learning_rate": 1.0355308982181254e-05, "loss": 0.13582653999328614, "step": 360 }, { "epoch": 0.5304659498207885, "grad_norm": 0.06673564016819, "learning_rate": 9.881541509158366e-06, "loss": 0.1326061010360718, "step": 370 }, { "epoch": 0.5448028673835126, "grad_norm": 0.0609956830739975, "learning_rate": 9.408039960189317e-06, "loss": 0.12904767990112304, "step": 380 }, { "epoch": 0.5591397849462365, "grad_norm": 0.07004307955503464, "learning_rate": 8.935867285261977e-06, "loss": 0.14127167463302612, "step": 390 }, { "epoch": 0.5734767025089605, "grad_norm": 0.06114516407251358, "learning_rate": 8.466083451213145e-06, "loss": 0.13098690509796143, "step": 400 }, { "epoch": 0.5734767025089605, "eval_loss": 0.1937306523323059, "eval_runtime": 47.434, "eval_samples_per_second": 123.35, "eval_steps_per_second": 0.253, "step": 400 }, { "epoch": 0.5878136200716846, "grad_norm": 0.07561196386814117, "learning_rate": 7.999743062239557e-06, "loss": 0.13492929935455322, "step": 410 }, { "epoch": 0.6021505376344086, "grad_norm": 0.06452950835227966, "learning_rate": 7.5378929924472735e-06, "loss": 0.13134829998016356, "step": 420 }, { "epoch": 0.6164874551971327, "grad_norm": 0.05738452449440956, "learning_rate": 7.081570035754189e-06, "loss": 0.1305772066116333, "step": 430 }, { "epoch": 0.6308243727598566, "grad_norm": 0.06140970438718796, "learning_rate": 6.631798578421195e-06, "loss": 0.12945042848587035, "step": 440 }, { "epoch": 0.6451612903225806, "grad_norm": 0.0673835426568985, "learning_rate": 6.189588299436997e-06, "loss": 0.133364737033844, "step": 450 }, { "epoch": 0.6594982078853047, "grad_norm": 0.06186283379793167, "learning_rate": 5.755931903918835e-06, "loss": 0.12029262781143188, "step": 460 }, { "epoch": 0.6738351254480287, "grad_norm": 0.061157312244176865, "learning_rate": 5.331802894617333e-06, "loss": 0.12551844120025635, "step": 470 }, { "epoch": 0.6881720430107527, "grad_norm": 0.06083202734589577, "learning_rate": 4.918153386528271e-06, "loss": 0.12006042003631592, "step": 480 }, { "epoch": 0.7025089605734767, "grad_norm": 0.05786828324198723, "learning_rate": 4.515911969516985e-06, "loss": 0.12329277992248536, "step": 490 }, { "epoch": 0.7168458781362007, "grad_norm": 0.07877270132303238, "learning_rate": 4.125981623753801e-06, "loss": 0.1233370542526245, "step": 500 }, { "epoch": 0.7311827956989247, "grad_norm": 0.06246776878833771, "learning_rate": 3.7492376926397966e-06, "loss": 0.12439545392990112, "step": 510 }, { "epoch": 0.7455197132616488, "grad_norm": 0.06481961905956268, "learning_rate": 3.3865259177736663e-06, "loss": 0.12364455461502075, "step": 520 }, { "epoch": 0.7598566308243727, "grad_norm": 0.0777650848031044, "learning_rate": 3.0386605403707347e-06, "loss": 0.1195528745651245, "step": 530 }, { "epoch": 0.7741935483870968, "grad_norm": 0.05968371778726578, "learning_rate": 2.7064224733963197e-06, "loss": 0.1176903247833252, "step": 540 }, { "epoch": 0.7885304659498208, "grad_norm": 0.059006717056035995, "learning_rate": 2.3905575485167098e-06, "loss": 0.12136919498443603, "step": 550 }, { "epoch": 0.8028673835125448, "grad_norm": 0.062085703015327454, "learning_rate": 2.0917748418031415e-06, "loss": 0.1185749888420105, "step": 560 }, { "epoch": 0.8172043010752689, "grad_norm": 0.05684107542037964, "learning_rate": 1.8107450819473505e-06, "loss": 0.12052475214004517, "step": 570 }, { "epoch": 0.8315412186379928, "grad_norm": 0.056005772203207016, "learning_rate": 1.5480991445620541e-06, "loss": 0.11635445356369019, "step": 580 }, { "epoch": 0.8458781362007168, "grad_norm": 0.08036984503269196, "learning_rate": 1.3044266359464542e-06, "loss": 0.12693744897842407, "step": 590 }, { "epoch": 0.8602150537634409, "grad_norm": 0.05893237143754959, "learning_rate": 1.080274569496057e-06, "loss": 0.11600825786590577, "step": 600 }, { "epoch": 0.8602150537634409, "eval_loss": 0.17753292620182037, "eval_runtime": 48.0542, "eval_samples_per_second": 121.758, "eval_steps_per_second": 0.25, "step": 600 }, { "epoch": 0.8745519713261649, "grad_norm": 0.06269249320030212, "learning_rate": 8.761461377280311e-07, "loss": 0.11844503879547119, "step": 610 }, { "epoch": 0.8888888888888888, "grad_norm": 0.06056367605924606, "learning_rate": 6.924995826788516e-07, "loss": 0.1225665807723999, "step": 620 }, { "epoch": 0.9032258064516129, "grad_norm": 0.05936103314161301, "learning_rate": 5.29747167209923e-07, "loss": 0.11874287128448487, "step": 630 }, { "epoch": 0.9175627240143369, "grad_norm": 0.06041805073618889, "learning_rate": 3.88254249530583e-07, "loss": 0.11610586643218994, "step": 640 }, { "epoch": 0.931899641577061, "grad_norm": 0.0608358308672905, "learning_rate": 2.6833846301596246e-07, "loss": 0.11917197704315186, "step": 650 }, { "epoch": 0.946236559139785, "grad_norm": 0.0607263408601284, "learning_rate": 1.7026900316098217e-07, "loss": 0.11865537166595459, "step": 660 }, { "epoch": 0.9605734767025089, "grad_norm": 0.05322834476828575, "learning_rate": 9.426602327113788e-08, "loss": 0.11138832569122314, "step": 670 }, { "epoch": 0.974910394265233, "grad_norm": 0.06341966986656189, "learning_rate": 4.050014024668425e-08, "loss": 0.11363672018051148, "step": 680 }, { "epoch": 0.989247311827957, "grad_norm": 0.06364062428474426, "learning_rate": 9.092051569674632e-09, "loss": 0.12197227478027343, "step": 690 }, { "epoch": 1.0, "eval_loss": 0.17584605515003204, "eval_runtime": 49.4161, "eval_samples_per_second": 118.403, "eval_steps_per_second": 0.243, "step": 698 } ], "logging_steps": 10, "max_steps": 698, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2148830443916493e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }