| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 200, |
| "global_step": 698, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0014336917562724014, |
| "grad_norm": 5.102053642272949, |
| "learning_rate": 0.0, |
| "loss": 1.1358120441436768, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.014336917562724014, |
| "grad_norm": 0.22924242913722992, |
| "learning_rate": 5.142857142857142e-06, |
| "loss": 0.5612610710991753, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02867383512544803, |
| "grad_norm": 0.12333694845438004, |
| "learning_rate": 1.0857142857142858e-05, |
| "loss": 0.18545818328857422, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.043010752688172046, |
| "grad_norm": 0.14276181161403656, |
| "learning_rate": 1.6571428571428574e-05, |
| "loss": 0.16717807054519654, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05734767025089606, |
| "grad_norm": 0.10264057666063309, |
| "learning_rate": 1.9998203820754213e-05, |
| "loss": 0.17055926322937012, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07168458781362007, |
| "grad_norm": 0.104465052485466, |
| "learning_rate": 1.9978004213822736e-05, |
| "loss": 0.1625239610671997, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08602150537634409, |
| "grad_norm": 0.10240382701158524, |
| "learning_rate": 1.993540527265239e-05, |
| "loss": 0.1715969681739807, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1003584229390681, |
| "grad_norm": 0.08893006294965744, |
| "learning_rate": 1.9870502626379127e-05, |
| "loss": 0.16313509941101073, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.11469534050179211, |
| "grad_norm": 0.12808184325695038, |
| "learning_rate": 1.9783441973084023e-05, |
| "loss": 0.16091221570968628, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12903225806451613, |
| "grad_norm": 0.08048602193593979, |
| "learning_rate": 1.9674418752719835e-05, |
| "loss": 0.15647590160369873, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14336917562724014, |
| "grad_norm": 0.08210249245166779, |
| "learning_rate": 1.9543677708373496e-05, |
| "loss": 0.16005860567092894, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.15770609318996415, |
| "grad_norm": 0.0777139738202095, |
| "learning_rate": 1.9391512336849406e-05, |
| "loss": 0.15733616352081298, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.17204301075268819, |
| "grad_norm": 0.07640817761421204, |
| "learning_rate": 1.9218264229806917e-05, |
| "loss": 0.15088759660720824, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1863799283154122, |
| "grad_norm": 0.08728871494531631, |
| "learning_rate": 1.9024322306931035e-05, |
| "loss": 0.1508031129837036, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2007168458781362, |
| "grad_norm": 0.0811760425567627, |
| "learning_rate": 1.8810121942857848e-05, |
| "loss": 0.15035929679870605, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "grad_norm": 0.08090047538280487, |
| "learning_rate": 1.8576143989814524e-05, |
| "loss": 0.15261363983154297, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.22939068100358423, |
| "grad_norm": 0.09918594360351562, |
| "learning_rate": 1.8322913698168014e-05, |
| "loss": 0.15076379776000975, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.24372759856630824, |
| "grad_norm": 0.08928713947534561, |
| "learning_rate": 1.8050999537305634e-05, |
| "loss": 0.15194735527038575, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.25806451612903225, |
| "grad_norm": 0.07266855984926224, |
| "learning_rate": 1.776101191949449e-05, |
| "loss": 0.15317165851593018, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2724014336917563, |
| "grad_norm": 0.07347071915864944, |
| "learning_rate": 1.745360182958459e-05, |
| "loss": 0.15253105163574218, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2867383512544803, |
| "grad_norm": 0.0758705884218216, |
| "learning_rate": 1.7129459363631692e-05, |
| "loss": 0.14779815673828126, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2867383512544803, |
| "eval_loss": 0.2187168002128601, |
| "eval_runtime": 47.0917, |
| "eval_samples_per_second": 124.247, |
| "eval_steps_per_second": 0.255, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3010752688172043, |
| "grad_norm": 0.07489030808210373, |
| "learning_rate": 1.678931217972055e-05, |
| "loss": 0.14687340259552, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3154121863799283, |
| "grad_norm": 0.0788242295384407, |
| "learning_rate": 1.6433923864466235e-05, |
| "loss": 0.14286456108093262, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.32974910394265233, |
| "grad_norm": 0.8215664029121399, |
| "learning_rate": 1.6064092218860553e-05, |
| "loss": 0.15229986906051635, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.34408602150537637, |
| "grad_norm": 0.06975199282169342, |
| "learning_rate": 1.568064746731156e-05, |
| "loss": 0.14773318767547608, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.35842293906810035, |
| "grad_norm": 0.07249259203672409, |
| "learning_rate": 1.5284450393896713e-05, |
| "loss": 0.15218548774719237, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3727598566308244, |
| "grad_norm": 0.07201112061738968, |
| "learning_rate": 1.4876390410013498e-05, |
| "loss": 0.14689412117004394, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3870967741935484, |
| "grad_norm": 0.06810774654150009, |
| "learning_rate": 1.4457383557765385e-05, |
| "loss": 0.14651920795440673, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4014336917562724, |
| "grad_norm": 0.06508651375770569, |
| "learning_rate": 1.402837045356531e-05, |
| "loss": 0.1467513084411621, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4157706093189964, |
| "grad_norm": 0.06673789769411087, |
| "learning_rate": 1.3590314176572989e-05, |
| "loss": 0.13457820415496827, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "grad_norm": 0.07056573778390884, |
| "learning_rate": 1.314419810670624e-05, |
| "loss": 0.14077857732772828, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 0.06851952522993088, |
| "learning_rate": 1.2691023717079735e-05, |
| "loss": 0.138556170463562, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.45878136200716846, |
| "grad_norm": 0.06733077019453049, |
| "learning_rate": 1.2231808325826862e-05, |
| "loss": 0.13921281099319457, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4731182795698925, |
| "grad_norm": 0.07425093650817871, |
| "learning_rate": 1.176758281235155e-05, |
| "loss": 0.13010122776031494, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4874551971326165, |
| "grad_norm": 0.06684776395559311, |
| "learning_rate": 1.129938930313678e-05, |
| "loss": 0.1314706802368164, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5017921146953405, |
| "grad_norm": 0.06619785726070404, |
| "learning_rate": 1.082827883230487e-05, |
| "loss": 0.13385097980499266, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5161290322580645, |
| "grad_norm": 0.07259789854288101, |
| "learning_rate": 1.0355308982181254e-05, |
| "loss": 0.13582653999328614, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5304659498207885, |
| "grad_norm": 0.06673564016819, |
| "learning_rate": 9.881541509158366e-06, |
| "loss": 0.1326061010360718, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5448028673835126, |
| "grad_norm": 0.0609956830739975, |
| "learning_rate": 9.408039960189317e-06, |
| "loss": 0.12904767990112304, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5591397849462365, |
| "grad_norm": 0.07004307955503464, |
| "learning_rate": 8.935867285261977e-06, |
| "loss": 0.14127167463302612, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5734767025089605, |
| "grad_norm": 0.06114516407251358, |
| "learning_rate": 8.466083451213145e-06, |
| "loss": 0.13098690509796143, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5734767025089605, |
| "eval_loss": 0.1937306523323059, |
| "eval_runtime": 47.434, |
| "eval_samples_per_second": 123.35, |
| "eval_steps_per_second": 0.253, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5878136200716846, |
| "grad_norm": 0.07561196386814117, |
| "learning_rate": 7.999743062239557e-06, |
| "loss": 0.13492929935455322, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6021505376344086, |
| "grad_norm": 0.06452950835227966, |
| "learning_rate": 7.5378929924472735e-06, |
| "loss": 0.13134829998016356, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6164874551971327, |
| "grad_norm": 0.05738452449440956, |
| "learning_rate": 7.081570035754189e-06, |
| "loss": 0.1305772066116333, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6308243727598566, |
| "grad_norm": 0.06140970438718796, |
| "learning_rate": 6.631798578421195e-06, |
| "loss": 0.12945042848587035, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.0673835426568985, |
| "learning_rate": 6.189588299436997e-06, |
| "loss": 0.133364737033844, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6594982078853047, |
| "grad_norm": 0.06186283379793167, |
| "learning_rate": 5.755931903918835e-06, |
| "loss": 0.12029262781143188, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6738351254480287, |
| "grad_norm": 0.061157312244176865, |
| "learning_rate": 5.331802894617333e-06, |
| "loss": 0.12551844120025635, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6881720430107527, |
| "grad_norm": 0.06083202734589577, |
| "learning_rate": 4.918153386528271e-06, |
| "loss": 0.12006042003631592, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7025089605734767, |
| "grad_norm": 0.05786828324198723, |
| "learning_rate": 4.515911969516985e-06, |
| "loss": 0.12329277992248536, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7168458781362007, |
| "grad_norm": 0.07877270132303238, |
| "learning_rate": 4.125981623753801e-06, |
| "loss": 0.1233370542526245, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7311827956989247, |
| "grad_norm": 0.06246776878833771, |
| "learning_rate": 3.7492376926397966e-06, |
| "loss": 0.12439545392990112, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7455197132616488, |
| "grad_norm": 0.06481961905956268, |
| "learning_rate": 3.3865259177736663e-06, |
| "loss": 0.12364455461502075, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7598566308243727, |
| "grad_norm": 0.0777650848031044, |
| "learning_rate": 3.0386605403707347e-06, |
| "loss": 0.1195528745651245, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7741935483870968, |
| "grad_norm": 0.05968371778726578, |
| "learning_rate": 2.7064224733963197e-06, |
| "loss": 0.1176903247833252, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7885304659498208, |
| "grad_norm": 0.059006717056035995, |
| "learning_rate": 2.3905575485167098e-06, |
| "loss": 0.12136919498443603, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8028673835125448, |
| "grad_norm": 0.062085703015327454, |
| "learning_rate": 2.0917748418031415e-06, |
| "loss": 0.1185749888420105, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8172043010752689, |
| "grad_norm": 0.05684107542037964, |
| "learning_rate": 1.8107450819473505e-06, |
| "loss": 0.12052475214004517, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8315412186379928, |
| "grad_norm": 0.056005772203207016, |
| "learning_rate": 1.5480991445620541e-06, |
| "loss": 0.11635445356369019, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8458781362007168, |
| "grad_norm": 0.08036984503269196, |
| "learning_rate": 1.3044266359464542e-06, |
| "loss": 0.12693744897842407, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "grad_norm": 0.05893237143754959, |
| "learning_rate": 1.080274569496057e-06, |
| "loss": 0.11600825786590577, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "eval_loss": 0.17753292620182037, |
| "eval_runtime": 48.0542, |
| "eval_samples_per_second": 121.758, |
| "eval_steps_per_second": 0.25, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8745519713261649, |
| "grad_norm": 0.06269249320030212, |
| "learning_rate": 8.761461377280311e-07, |
| "loss": 0.11844503879547119, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.06056367605924606, |
| "learning_rate": 6.924995826788516e-07, |
| "loss": 0.1225665807723999, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9032258064516129, |
| "grad_norm": 0.05936103314161301, |
| "learning_rate": 5.29747167209923e-07, |
| "loss": 0.11874287128448487, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9175627240143369, |
| "grad_norm": 0.06041805073618889, |
| "learning_rate": 3.88254249530583e-07, |
| "loss": 0.11610586643218994, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.931899641577061, |
| "grad_norm": 0.0608358308672905, |
| "learning_rate": 2.6833846301596246e-07, |
| "loss": 0.11917197704315186, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.946236559139785, |
| "grad_norm": 0.0607263408601284, |
| "learning_rate": 1.7026900316098217e-07, |
| "loss": 0.11865537166595459, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9605734767025089, |
| "grad_norm": 0.05322834476828575, |
| "learning_rate": 9.426602327113788e-08, |
| "loss": 0.11138832569122314, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.974910394265233, |
| "grad_norm": 0.06341966986656189, |
| "learning_rate": 4.050014024668425e-08, |
| "loss": 0.11363672018051148, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.989247311827957, |
| "grad_norm": 0.06364062428474426, |
| "learning_rate": 9.092051569674632e-09, |
| "loss": 0.12197227478027343, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.17584605515003204, |
| "eval_runtime": 49.4161, |
| "eval_samples_per_second": 118.403, |
| "eval_steps_per_second": 0.243, |
| "step": 698 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 698, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2148830443916493e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |