{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 657, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015225045199352935, "grad_norm": 0.8015036989610748, "learning_rate": 6.818181818181818e-07, "loss": 1.2434, "step": 10 }, { "epoch": 0.03045009039870587, "grad_norm": 0.5371339027245265, "learning_rate": 1.4393939393939396e-06, "loss": 1.3254, "step": 20 }, { "epoch": 0.04567513559805881, "grad_norm": 0.3873488090138409, "learning_rate": 2.196969696969697e-06, "loss": 1.2972, "step": 30 }, { "epoch": 0.06090018079741174, "grad_norm": 0.3521791179594526, "learning_rate": 2.954545454545455e-06, "loss": 1.3127, "step": 40 }, { "epoch": 0.07612522599676468, "grad_norm": 0.30676127981755036, "learning_rate": 3.7121212121212124e-06, "loss": 1.2638, "step": 50 }, { "epoch": 0.09135027119611762, "grad_norm": 0.4101414956476006, "learning_rate": 4.46969696969697e-06, "loss": 1.3305, "step": 60 }, { "epoch": 0.10657531639547055, "grad_norm": 0.32298711751246617, "learning_rate": 4.999682116415026e-06, "loss": 1.2712, "step": 70 }, { "epoch": 0.12180036159482348, "grad_norm": 0.317442988654654, "learning_rate": 4.9940331012821616e-06, "loss": 1.273, "step": 80 }, { "epoch": 0.1370254067941764, "grad_norm": 0.30534590571797454, "learning_rate": 4.981338376708957e-06, "loss": 1.2204, "step": 90 }, { "epoch": 0.15225045199352935, "grad_norm": 0.33409391048066833, "learning_rate": 4.961633805627912e-06, "loss": 1.2558, "step": 100 }, { "epoch": 0.1674754971928823, "grad_norm": 0.33936819417642766, "learning_rate": 4.934975053973217e-06, "loss": 1.247, "step": 110 }, { "epoch": 0.18270054239223524, "grad_norm": 0.3159856537332395, "learning_rate": 4.901437433423016e-06, "loss": 1.2884, "step": 120 }, { "epoch": 0.19792558759158815, "grad_norm": 0.321908331706447, "learning_rate": 4.861115688641921e-06, "loss": 1.2543, "step": 130 }, { "epoch": 0.2131506327909411, "grad_norm": 0.328440211023217, "learning_rate": 4.814123729624837e-06, "loss": 1.2735, "step": 140 }, { "epoch": 0.22837567799029404, "grad_norm": 0.35087717949475955, "learning_rate": 4.7605943098982075e-06, "loss": 1.2938, "step": 150 }, { "epoch": 0.24360072318964696, "grad_norm": 0.32756221423558446, "learning_rate": 4.7006786514878e-06, "loss": 1.1977, "step": 160 }, { "epoch": 0.2588257683889999, "grad_norm": 0.31899258273232245, "learning_rate": 4.6345460177124816e-06, "loss": 1.2292, "step": 170 }, { "epoch": 0.2740508135883528, "grad_norm": 0.3291697633439208, "learning_rate": 4.5623832350108674e-06, "loss": 1.2431, "step": 180 }, { "epoch": 0.2892758587877058, "grad_norm": 0.31915754548001085, "learning_rate": 4.4843941651517e-06, "loss": 1.2183, "step": 190 }, { "epoch": 0.3045009039870587, "grad_norm": 0.3104063063524443, "learning_rate": 4.400799129318952e-06, "loss": 1.2598, "step": 200 }, { "epoch": 0.3197259491864117, "grad_norm": 0.3277653309557523, "learning_rate": 4.31183428569867e-06, "loss": 1.2197, "step": 210 }, { "epoch": 0.3349509943857646, "grad_norm": 0.3565586762710126, "learning_rate": 4.217750962325845e-06, "loss": 1.2978, "step": 220 }, { "epoch": 0.3501760395851175, "grad_norm": 0.3124052373761214, "learning_rate": 4.11881494707608e-06, "loss": 1.2376, "step": 230 }, { "epoch": 0.3654010847844705, "grad_norm": 0.321888222544292, "learning_rate": 4.015305736807806e-06, "loss": 1.2338, "step": 240 }, { "epoch": 0.3806261299838234, "grad_norm": 0.36420812224810134, "learning_rate": 3.907515747776275e-06, "loss": 1.2556, "step": 250 }, { "epoch": 0.3958511751831763, "grad_norm": 0.2989183187303492, "learning_rate": 3.795749489549904e-06, "loss": 1.2527, "step": 260 }, { "epoch": 0.4110762203825293, "grad_norm": 0.30715884439307567, "learning_rate": 3.680322704762701e-06, "loss": 1.2467, "step": 270 }, { "epoch": 0.4263012655818822, "grad_norm": 0.41263512309014105, "learning_rate": 3.561561477132971e-06, "loss": 1.2592, "step": 280 }, { "epoch": 0.4415263107812351, "grad_norm": 0.2990764181830289, "learning_rate": 3.4398013102681956e-06, "loss": 1.2942, "step": 290 }, { "epoch": 0.4567513559805881, "grad_norm": 0.30817207269780955, "learning_rate": 3.3153861798584696e-06, "loss": 1.2278, "step": 300 }, { "epoch": 0.471976401179941, "grad_norm": 0.3072887178239842, "learning_rate": 3.1886675619360883e-06, "loss": 1.2753, "step": 310 }, { "epoch": 0.4872014463792939, "grad_norm": 0.38789533497763495, "learning_rate": 3.060003439946462e-06, "loss": 1.2951, "step": 320 }, { "epoch": 0.5024264915786468, "grad_norm": 0.28331258754735067, "learning_rate": 2.929757293435419e-06, "loss": 1.2851, "step": 330 }, { "epoch": 0.5176515367779998, "grad_norm": 0.2962487342346661, "learning_rate": 2.7982970712098795e-06, "loss": 1.2702, "step": 340 }, { "epoch": 0.5328765819773528, "grad_norm": 0.3039773975171447, "learning_rate": 2.665994151872755e-06, "loss": 1.2127, "step": 350 }, { "epoch": 0.5481016271767056, "grad_norm": 0.4131494842990153, "learning_rate": 2.5332222946685707e-06, "loss": 1.2551, "step": 360 }, { "epoch": 0.5633266723760586, "grad_norm": 0.3888599336298796, "learning_rate": 2.4003565836037245e-06, "loss": 1.2354, "step": 370 }, { "epoch": 0.5785517175754116, "grad_norm": 0.2793746233865594, "learning_rate": 2.267772367824249e-06, "loss": 1.2819, "step": 380 }, { "epoch": 0.5937767627747644, "grad_norm": 0.2899583112911672, "learning_rate": 2.135844201244556e-06, "loss": 1.2523, "step": 390 }, { "epoch": 0.6090018079741174, "grad_norm": 0.2923000214972851, "learning_rate": 2.0049447844227265e-06, "loss": 1.2759, "step": 400 }, { "epoch": 0.6242268531734704, "grad_norm": 0.2919228747330785, "learning_rate": 1.875443911671579e-06, "loss": 1.2044, "step": 410 }, { "epoch": 0.6394518983728233, "grad_norm": 0.5366405058825244, "learning_rate": 1.7477074263799632e-06, "loss": 1.2527, "step": 420 }, { "epoch": 0.6546769435721762, "grad_norm": 0.29907607930356067, "learning_rate": 1.6220961874955136e-06, "loss": 1.2442, "step": 430 }, { "epoch": 0.6699019887715292, "grad_norm": 0.30050211246658853, "learning_rate": 1.4989650500885838e-06, "loss": 1.2406, "step": 440 }, { "epoch": 0.6851270339708821, "grad_norm": 0.2834575399227386, "learning_rate": 1.3786618628772938e-06, "loss": 1.2197, "step": 450 }, { "epoch": 0.700352079170235, "grad_norm": 0.28730355579070144, "learning_rate": 1.2615264855457037e-06, "loss": 1.26, "step": 460 }, { "epoch": 0.715577124369588, "grad_norm": 0.5094507361581523, "learning_rate": 1.1478898286312231e-06, "loss": 1.3205, "step": 470 }, { "epoch": 0.730802169568941, "grad_norm": 0.2896864936273596, "learning_rate": 1.038072918693596e-06, "loss": 1.2423, "step": 480 }, { "epoch": 0.7460272147682938, "grad_norm": 0.29550104589134707, "learning_rate": 9.323859914063815e-07, "loss": 1.2507, "step": 490 }, { "epoch": 0.7612522599676468, "grad_norm": 0.2721570960991744, "learning_rate": 8.311276151329775e-07, "loss": 1.2683, "step": 500 }, { "epoch": 0.7764773051669998, "grad_norm": 0.2852630721545509, "learning_rate": 7.345838474630993e-07, "loss": 1.2815, "step": 510 }, { "epoch": 0.7917023503663526, "grad_norm": 0.2909871526763653, "learning_rate": 6.430274270925271e-07, "loss": 1.2503, "step": 520 }, { "epoch": 0.8069273955657056, "grad_norm": 0.30499754932838724, "learning_rate": 5.56717003329082e-07, "loss": 1.2364, "step": 530 }, { "epoch": 0.8221524407650586, "grad_norm": 0.2776051695513212, "learning_rate": 4.758964054014931e-07, "loss": 1.2012, "step": 540 }, { "epoch": 0.8373774859644114, "grad_norm": 0.2905227549359444, "learning_rate": 4.0079395363538056e-07, "loss": 1.3095, "step": 550 }, { "epoch": 0.8526025311637644, "grad_norm": 0.2810906800968861, "learning_rate": 3.3162181444230056e-07, "loss": 1.2799, "step": 560 }, { "epoch": 0.8678275763631174, "grad_norm": 0.2687773919781078, "learning_rate": 2.6857540094402365e-07, "loss": 1.2415, "step": 570 }, { "epoch": 0.8830526215624702, "grad_norm": 0.30078684187223376, "learning_rate": 2.1183282092530067e-07, "loss": 1.2361, "step": 580 }, { "epoch": 0.8982776667618232, "grad_norm": 0.27536926671011003, "learning_rate": 1.6155437367466277e-07, "loss": 1.2392, "step": 590 }, { "epoch": 0.9135027119611762, "grad_norm": 0.2753306105776222, "learning_rate": 1.1788209713469195e-07, "loss": 1.2716, "step": 600 }, { "epoch": 0.928727757160529, "grad_norm": 0.274286963333408, "learning_rate": 8.093936664108071e-08, "loss": 1.2109, "step": 610 }, { "epoch": 0.943952802359882, "grad_norm": 0.29590574898063726, "learning_rate": 5.083054638404722e-08, "loss": 1.2449, "step": 620 }, { "epoch": 0.959177847559235, "grad_norm": 0.27873264107936796, "learning_rate": 2.7640694576737125e-08, "loss": 1.2194, "step": 630 }, { "epoch": 0.9744028927585878, "grad_norm": 0.2757612055037619, "learning_rate": 1.1435323163525026e-08, "loss": 1.2431, "step": 640 }, { "epoch": 0.9896279379579408, "grad_norm": 0.2715956982953583, "learning_rate": 2.2602127470383593e-09, "loss": 1.2422, "step": 650 }, { "epoch": 1.0, "step": 657, "total_flos": 718053792808960.0, "train_loss": 1.2574238450559851, "train_runtime": 37309.1102, "train_samples_per_second": 9.013, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 657, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 718053792808960.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }