{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 766, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013061224489795919, "grad_norm": 0.013593924231827259, "learning_rate": 1.8461538461538465e-05, "loss": 1.0348053932189942, "step": 10 }, { "epoch": 0.026122448979591838, "grad_norm": 0.010592319071292877, "learning_rate": 3.8974358974358976e-05, "loss": 0.9912956237792969, "step": 20 }, { "epoch": 0.03918367346938775, "grad_norm": 0.008065517991781235, "learning_rate": 5.9487179487179495e-05, "loss": 0.9145261764526367, "step": 30 }, { "epoch": 0.052244897959183675, "grad_norm": 0.006928236689418554, "learning_rate": 8e-05, "loss": 0.8690940856933593, "step": 40 }, { "epoch": 0.0653061224489796, "grad_norm": 0.0065370709635317326, "learning_rate": 7.996265836446255e-05, "loss": 0.8447072982788086, "step": 50 }, { "epoch": 0.0783673469387755, "grad_norm": 0.005765452049672604, "learning_rate": 7.985070317773737e-05, "loss": 0.8226842880249023, "step": 60 }, { "epoch": 0.09142857142857143, "grad_norm": 0.004979082383215427, "learning_rate": 7.966434346931348e-05, "loss": 0.8047774314880372, "step": 70 }, { "epoch": 0.10448979591836735, "grad_norm": 0.00686669023707509, "learning_rate": 7.940392718800637e-05, "loss": 0.7929642200469971, "step": 80 }, { "epoch": 0.11755102040816326, "grad_norm": 0.006248346995562315, "learning_rate": 7.90699405523093e-05, "loss": 0.7915477275848388, "step": 90 }, { "epoch": 0.1306122448979592, "grad_norm": 0.007850440219044685, "learning_rate": 7.86630071425835e-05, "loss": 0.7851225376129151, "step": 100 }, { "epoch": 0.1436734693877551, "grad_norm": 0.007440537214279175, "learning_rate": 7.818388673678265e-05, "loss": 0.7773007869720459, "step": 110 }, { "epoch": 0.156734693877551, "grad_norm": 0.007219385821372271, "learning_rate": 7.763347389188538e-05, "loss": 0.7723363399505615, "step": 120 }, { "epoch": 0.16979591836734695, "grad_norm": 0.006354185286909342, "learning_rate": 7.701279627368412e-05, "loss": 0.7682206153869628, "step": 130 }, { "epoch": 0.18285714285714286, "grad_norm": 0.006955909077078104, "learning_rate": 7.632301273804914e-05, "loss": 0.7699796676635742, "step": 140 }, { "epoch": 0.19591836734693877, "grad_norm": 0.008277718909084797, "learning_rate": 7.556541116724981e-05, "loss": 0.764019775390625, "step": 150 }, { "epoch": 0.2089795918367347, "grad_norm": 0.007425008807331324, "learning_rate": 7.474140606537311e-05, "loss": 0.7628626823425293, "step": 160 }, { "epoch": 0.2220408163265306, "grad_norm": 0.008092896081507206, "learning_rate": 7.38525359173288e-05, "loss": 0.7607949256896973, "step": 170 }, { "epoch": 0.23510204081632652, "grad_norm": 0.011249990202486515, "learning_rate": 7.290046031637228e-05, "loss": 0.757351303100586, "step": 180 }, { "epoch": 0.24816326530612245, "grad_norm": 0.009156743064522743, "learning_rate": 7.188695686550836e-05, "loss": 0.7565219879150391, "step": 190 }, { "epoch": 0.2612244897959184, "grad_norm": 0.008516875095665455, "learning_rate": 7.081391785856087e-05, "loss": 0.7473669052124023, "step": 200 }, { "epoch": 0.2742857142857143, "grad_norm": 0.008919311687350273, "learning_rate": 6.96833467471056e-05, "loss": 0.7444419860839844, "step": 210 }, { "epoch": 0.2873469387755102, "grad_norm": 0.007335508707910776, "learning_rate": 6.84973543998622e-05, "loss": 0.7573845386505127, "step": 220 }, { "epoch": 0.3004081632653061, "grad_norm": 0.007334326393902302, "learning_rate": 6.725815516152973e-05, "loss": 0.7524682998657226, "step": 230 }, { "epoch": 0.313469387755102, "grad_norm": 0.006889250595122576, "learning_rate": 6.596806271842398e-05, "loss": 0.7463503837585449, "step": 240 }, { "epoch": 0.32653061224489793, "grad_norm": 0.007522549480199814, "learning_rate": 6.462948577863593e-05, "loss": 0.7468090057373047, "step": 250 }, { "epoch": 0.3395918367346939, "grad_norm": 0.00689704529941082, "learning_rate": 6.324492357477686e-05, "loss": 0.745818042755127, "step": 260 }, { "epoch": 0.3526530612244898, "grad_norm": 0.007402004674077034, "learning_rate": 6.18169611977065e-05, "loss": 0.737040901184082, "step": 270 }, { "epoch": 0.3657142857142857, "grad_norm": 0.006718257907778025, "learning_rate": 6.034826476995715e-05, "loss": 0.7412730693817139, "step": 280 }, { "epoch": 0.3787755102040816, "grad_norm": 0.006786980666220188, "learning_rate": 5.8841576467864825e-05, "loss": 0.7408377170562744, "step": 290 }, { "epoch": 0.39183673469387753, "grad_norm": 0.006618338171392679, "learning_rate": 5.7299709401701805e-05, "loss": 0.7430388927459717, "step": 300 }, { "epoch": 0.4048979591836735, "grad_norm": 0.007314841262996197, "learning_rate": 5.572554236336965e-05, "loss": 0.7401338577270508, "step": 310 }, { "epoch": 0.4179591836734694, "grad_norm": 0.0067168474197387695, "learning_rate": 5.4122014451459385e-05, "loss": 0.7423385143280029, "step": 320 }, { "epoch": 0.4310204081632653, "grad_norm": 0.007062443997710943, "learning_rate": 5.2492119583714064e-05, "loss": 0.7407833099365234, "step": 330 }, { "epoch": 0.4440816326530612, "grad_norm": 0.009489820338785648, "learning_rate": 5.083890090713949e-05, "loss": 0.7376296997070313, "step": 340 }, { "epoch": 0.45714285714285713, "grad_norm": 0.006291185040026903, "learning_rate": 4.916544511619984e-05, "loss": 0.7393476963043213, "step": 350 }, { "epoch": 0.47020408163265304, "grad_norm": 0.007340357638895512, "learning_rate": 4.747487668970681e-05, "loss": 0.7434526443481445, "step": 360 }, { "epoch": 0.483265306122449, "grad_norm": 0.007148618344217539, "learning_rate": 4.5770352057162046e-05, "loss": 0.7274169445037841, "step": 370 }, { "epoch": 0.4963265306122449, "grad_norm": 0.007421489339321852, "learning_rate": 4.405505370544521e-05, "loss": 0.7373303413391114, "step": 380 }, { "epoch": 0.5093877551020408, "grad_norm": 0.006447239778935909, "learning_rate": 4.233218423685071e-05, "loss": 0.7334803581237793, "step": 390 }, { "epoch": 0.5224489795918368, "grad_norm": 0.007035430055111647, "learning_rate": 4.060496038956728e-05, "loss": 0.7342597961425781, "step": 400 }, { "epoch": 0.5355102040816326, "grad_norm": 0.006574620492756367, "learning_rate": 3.887660703176474e-05, "loss": 0.7356997966766358, "step": 410 }, { "epoch": 0.5485714285714286, "grad_norm": 0.0066298553720116615, "learning_rate": 3.7150351140501457e-05, "loss": 0.7381177425384522, "step": 420 }, { "epoch": 0.5616326530612245, "grad_norm": 0.007492161355912685, "learning_rate": 3.542941577669424e-05, "loss": 0.7291494369506836, "step": 430 }, { "epoch": 0.5746938775510204, "grad_norm": 0.006217462942004204, "learning_rate": 3.3717014067400025e-05, "loss": 0.7233750343322753, "step": 440 }, { "epoch": 0.5877551020408164, "grad_norm": 0.006911132019013166, "learning_rate": 3.201634320664491e-05, "loss": 0.7269360542297363, "step": 450 }, { "epoch": 0.6008163265306122, "grad_norm": 0.00729888491332531, "learning_rate": 3.0330578486001478e-05, "loss": 0.7219826698303222, "step": 460 }, { "epoch": 0.6138775510204082, "grad_norm": 0.007913697510957718, "learning_rate": 2.8662867366059758e-05, "loss": 0.7285231590270996, "step": 470 }, { "epoch": 0.626938775510204, "grad_norm": 0.006929247174412012, "learning_rate": 2.7016323599860833e-05, "loss": 0.7285576820373535, "step": 480 }, { "epoch": 0.64, "grad_norm": 0.006759402342140675, "learning_rate": 2.5394021419265458e-05, "loss": 0.7239264965057373, "step": 490 }, { "epoch": 0.6530612244897959, "grad_norm": 0.006527756340801716, "learning_rate": 2.379898979511156e-05, "loss": 0.731016731262207, "step": 500 }, { "epoch": 0.6661224489795918, "grad_norm": 0.006792586762458086, "learning_rate": 2.2234206781878127e-05, "loss": 0.7236400604248047, "step": 510 }, { "epoch": 0.6791836734693878, "grad_norm": 0.006418135017156601, "learning_rate": 2.0702593957413973e-05, "loss": 0.7233975887298584, "step": 520 }, { "epoch": 0.6922448979591836, "grad_norm": 0.006609582342207432, "learning_rate": 1.9207010968112856e-05, "loss": 0.7252939224243165, "step": 530 }, { "epoch": 0.7053061224489796, "grad_norm": 0.006350552197545767, "learning_rate": 1.7750250189719885e-05, "loss": 0.7322314739227295, "step": 540 }, { "epoch": 0.7183673469387755, "grad_norm": 0.006282226648181677, "learning_rate": 1.633503151373769e-05, "loss": 0.718090009689331, "step": 550 }, { "epoch": 0.7314285714285714, "grad_norm": 0.006717463489621878, "learning_rate": 1.4963997269166472e-05, "loss": 0.7251626491546631, "step": 560 }, { "epoch": 0.7444897959183674, "grad_norm": 0.006243122275918722, "learning_rate": 1.363970728905975e-05, "loss": 0.7236129760742187, "step": 570 }, { "epoch": 0.7575510204081632, "grad_norm": 0.005929226521402597, "learning_rate": 1.2364634131106663e-05, "loss": 0.7275202751159668, "step": 580 }, { "epoch": 0.7706122448979592, "grad_norm": 0.006825726944953203, "learning_rate": 1.11411584611646e-05, "loss": 0.7266074657440186, "step": 590 }, { "epoch": 0.7836734693877551, "grad_norm": 0.006338095758110285, "learning_rate": 9.971564608361387e-06, "loss": 0.7227339744567871, "step": 600 }, { "epoch": 0.796734693877551, "grad_norm": 0.006189221516251564, "learning_rate": 8.858036300065912e-06, "loss": 0.7260101318359375, "step": 610 }, { "epoch": 0.809795918367347, "grad_norm": 0.006516415625810623, "learning_rate": 7.802652584690627e-06, "loss": 0.721678638458252, "step": 620 }, { "epoch": 0.8228571428571428, "grad_norm": 0.006069981027394533, "learning_rate": 6.807383949938131e-06, "loss": 0.7275302886962891, "step": 630 }, { "epoch": 0.8359183673469388, "grad_norm": 0.006245093885809183, "learning_rate": 5.874088643739453e-06, "loss": 0.7232262134552002, "step": 640 }, { "epoch": 0.8489795918367347, "grad_norm": 0.0058577232994139194, "learning_rate": 5.0045092047532385e-06, "loss": 0.7290368556976319, "step": 650 }, { "epoch": 0.8620408163265306, "grad_norm": 0.005565746687352657, "learning_rate": 4.200269208903569e-06, "loss": 0.7283772945404052, "step": 660 }, { "epoch": 0.8751020408163265, "grad_norm": 0.006886645220220089, "learning_rate": 3.4628702380309266e-06, "loss": 0.7242953300476074, "step": 670 }, { "epoch": 0.8881632653061224, "grad_norm": 0.005979357752948999, "learning_rate": 2.793689076316111e-06, "loss": 0.7234395503997803, "step": 680 }, { "epoch": 0.9012244897959184, "grad_norm": 0.005459806881844997, "learning_rate": 2.193975139711575e-06, "loss": 0.7297232151031494, "step": 690 }, { "epoch": 0.9142857142857143, "grad_norm": 0.00546460272744298, "learning_rate": 1.6648481431797137e-06, "loss": 0.7135615348815918, "step": 700 }, { "epoch": 0.9273469387755102, "grad_norm": 0.005548370536416769, "learning_rate": 1.2072960100933862e-06, "loss": 0.7257327079772949, "step": 710 }, { "epoch": 0.9404081632653061, "grad_norm": 0.005945554003119469, "learning_rate": 8.221730277022488e-07, "loss": 0.7289669036865234, "step": 720 }, { "epoch": 0.953469387755102, "grad_norm": 0.005690570455044508, "learning_rate": 5.101982521085847e-07, "loss": 0.7285196781158447, "step": 730 }, { "epoch": 0.966530612244898, "grad_norm": 0.005644120275974274, "learning_rate": 2.719541657307456e-07, "loss": 0.7271464347839356, "step": 740 }, { "epoch": 0.9795918367346939, "grad_norm": 0.005760515108704567, "learning_rate": 1.0788558976085662e-07, "loss": 0.726295280456543, "step": 750 }, { "epoch": 0.9926530612244898, "grad_norm": 0.006554395891726017, "learning_rate": 1.8298853647267245e-08, "loss": 0.7278533458709717, "step": 760 }, { "epoch": 1.0, "step": 766, "total_flos": 7.613569370341507e+18, "train_loss": 0.7527124293479223, "train_runtime": 22851.6673, "train_samples_per_second": 2.144, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 766, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.613569370341507e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }