{ "best_global_step": 1000, "best_metric": 0.7969963550567627, "best_model_checkpoint": "checkpoints/lora_tutor/checkpoint-1000", "epoch": 0.35634743875278396, "eval_steps": 200, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00035634743875278396, "grad_norm": 99.66374969482422, "learning_rate": 0.0, "loss": 4.1982, "step": 1 }, { "epoch": 0.0017817371937639199, "grad_norm": 57.29983139038086, "learning_rate": 3.3333333333333333e-06, "loss": 4.0, "step": 5 }, { "epoch": 0.0035634743875278397, "grad_norm": 27.13236427307129, "learning_rate": 7.5e-06, "loss": 2.4799, "step": 10 }, { "epoch": 0.005345211581291759, "grad_norm": 13.770986557006836, "learning_rate": 1.1666666666666668e-05, "loss": 1.4129, "step": 15 }, { "epoch": 0.0071269487750556795, "grad_norm": 9.720105171203613, "learning_rate": 1.5833333333333333e-05, "loss": 1.2207, "step": 20 }, { "epoch": 0.008908685968819599, "grad_norm": 10.442448616027832, "learning_rate": 2e-05, "loss": 1.2037, "step": 25 }, { "epoch": 0.010690423162583519, "grad_norm": 8.392529487609863, "learning_rate": 2.4166666666666667e-05, "loss": 1.1535, "step": 30 }, { "epoch": 0.012472160356347439, "grad_norm": 7.494682788848877, "learning_rate": 2.8333333333333335e-05, "loss": 1.1494, "step": 35 }, { "epoch": 0.014253897550111359, "grad_norm": 7.756562232971191, "learning_rate": 3.2500000000000004e-05, "loss": 1.1588, "step": 40 }, { "epoch": 0.016035634743875277, "grad_norm": 5.802969932556152, "learning_rate": 3.6666666666666666e-05, "loss": 1.1049, "step": 45 }, { "epoch": 0.017817371937639197, "grad_norm": 7.09335470199585, "learning_rate": 4.0833333333333334e-05, "loss": 1.0902, "step": 50 }, { "epoch": 0.019599109131403118, "grad_norm": 5.961513042449951, "learning_rate": 4.5e-05, "loss": 1.0484, "step": 55 }, { "epoch": 0.021380846325167038, "grad_norm": 4.003515720367432, "learning_rate": 4.9166666666666665e-05, "loss": 1.0418, "step": 60 }, { "epoch": 0.023162583518930958, "grad_norm": 4.197242259979248, "learning_rate": 5.333333333333333e-05, "loss": 1.0574, "step": 65 }, { "epoch": 0.024944320712694878, "grad_norm": 4.823288917541504, "learning_rate": 5.7499999999999995e-05, "loss": 1.0194, "step": 70 }, { "epoch": 0.026726057906458798, "grad_norm": 6.500214099884033, "learning_rate": 6.166666666666667e-05, "loss": 1.0146, "step": 75 }, { "epoch": 0.028507795100222718, "grad_norm": 6.800583362579346, "learning_rate": 6.583333333333334e-05, "loss": 1.0192, "step": 80 }, { "epoch": 0.030289532293986638, "grad_norm": 5.19222354888916, "learning_rate": 7e-05, "loss": 1.0455, "step": 85 }, { "epoch": 0.032071269487750555, "grad_norm": 6.357260704040527, "learning_rate": 7.416666666666668e-05, "loss": 1.029, "step": 90 }, { "epoch": 0.033853006681514475, "grad_norm": 5.543500900268555, "learning_rate": 7.833333333333333e-05, "loss": 1.0228, "step": 95 }, { "epoch": 0.035634743875278395, "grad_norm": 4.388900279998779, "learning_rate": 8.25e-05, "loss": 1.044, "step": 100 }, { "epoch": 0.037416481069042315, "grad_norm": 5.311753273010254, "learning_rate": 8.666666666666667e-05, "loss": 1.063, "step": 105 }, { "epoch": 0.039198218262806235, "grad_norm": 5.037621974945068, "learning_rate": 9.083333333333334e-05, "loss": 1.0138, "step": 110 }, { "epoch": 0.040979955456570155, "grad_norm": 5.125575542449951, "learning_rate": 9.5e-05, "loss": 1.0403, "step": 115 }, { "epoch": 0.042761692650334075, "grad_norm": 5.154388904571533, "learning_rate": 9.916666666666667e-05, "loss": 1.015, "step": 120 }, { "epoch": 0.044543429844097995, "grad_norm": 3.9509270191192627, "learning_rate": 9.999661540018812e-05, "loss": 1.0027, "step": 125 }, { "epoch": 0.046325167037861915, "grad_norm": 3.7814090251922607, "learning_rate": 9.998286624877786e-05, "loss": 0.9863, "step": 130 }, { "epoch": 0.048106904231625836, "grad_norm": 3.7585690021514893, "learning_rate": 9.995854391448606e-05, "loss": 1.0459, "step": 135 }, { "epoch": 0.049888641425389756, "grad_norm": 6.917703628540039, "learning_rate": 9.992365354236557e-05, "loss": 1.0719, "step": 140 }, { "epoch": 0.051670378619153676, "grad_norm": 3.885483503341675, "learning_rate": 9.987820251299122e-05, "loss": 1.0123, "step": 145 }, { "epoch": 0.053452115812917596, "grad_norm": 3.287639617919922, "learning_rate": 9.982220044089859e-05, "loss": 0.9903, "step": 150 }, { "epoch": 0.055233853006681516, "grad_norm": 3.95298171043396, "learning_rate": 9.975565917255016e-05, "loss": 0.9841, "step": 155 }, { "epoch": 0.057015590200445436, "grad_norm": 4.5531721115112305, "learning_rate": 9.967859278382938e-05, "loss": 0.9968, "step": 160 }, { "epoch": 0.058797327394209356, "grad_norm": 4.4977641105651855, "learning_rate": 9.959101757706308e-05, "loss": 1.006, "step": 165 }, { "epoch": 0.060579064587973276, "grad_norm": 3.260209798812866, "learning_rate": 9.949295207757299e-05, "loss": 0.9557, "step": 170 }, { "epoch": 0.062360801781737196, "grad_norm": 3.9708852767944336, "learning_rate": 9.938441702975689e-05, "loss": 0.9914, "step": 175 }, { "epoch": 0.06414253897550111, "grad_norm": 3.731992721557617, "learning_rate": 9.926543539270048e-05, "loss": 0.9794, "step": 180 }, { "epoch": 0.06592427616926504, "grad_norm": 3.153402805328369, "learning_rate": 9.913603233532067e-05, "loss": 0.9525, "step": 185 }, { "epoch": 0.06770601336302895, "grad_norm": 2.9249067306518555, "learning_rate": 9.899623523104149e-05, "loss": 0.9578, "step": 190 }, { "epoch": 0.06948775055679288, "grad_norm": 2.661738872528076, "learning_rate": 9.884607365200356e-05, "loss": 0.9711, "step": 195 }, { "epoch": 0.07126948775055679, "grad_norm": 3.0224714279174805, "learning_rate": 9.868557936280855e-05, "loss": 0.9693, "step": 200 }, { "epoch": 0.07126948775055679, "eval_loss": 0.9798622131347656, "eval_runtime": 249.2057, "eval_samples_per_second": 20.02, "eval_steps_per_second": 2.504, "step": 200 }, { "epoch": 0.07305122494432072, "grad_norm": 2.5287749767303467, "learning_rate": 9.851478631379982e-05, "loss": 0.9299, "step": 205 }, { "epoch": 0.07483296213808463, "grad_norm": 2.9961535930633545, "learning_rate": 9.83337306338807e-05, "loss": 0.9606, "step": 210 }, { "epoch": 0.07661469933184856, "grad_norm": 3.6630430221557617, "learning_rate": 9.814245062287189e-05, "loss": 0.9546, "step": 215 }, { "epoch": 0.07839643652561247, "grad_norm": 2.665858030319214, "learning_rate": 9.794098674340965e-05, "loss": 0.958, "step": 220 }, { "epoch": 0.0801781737193764, "grad_norm": 2.741337776184082, "learning_rate": 9.77293816123866e-05, "loss": 0.963, "step": 225 }, { "epoch": 0.08195991091314031, "grad_norm": 2.693640947341919, "learning_rate": 9.750767999193656e-05, "loss": 0.9677, "step": 230 }, { "epoch": 0.08374164810690424, "grad_norm": 2.718897581100464, "learning_rate": 9.727592877996585e-05, "loss": 0.9551, "step": 235 }, { "epoch": 0.08552338530066815, "grad_norm": 3.1531124114990234, "learning_rate": 9.70341770002326e-05, "loss": 0.9692, "step": 240 }, { "epoch": 0.08730512249443208, "grad_norm": 2.4551897048950195, "learning_rate": 9.678247579197657e-05, "loss": 0.9727, "step": 245 }, { "epoch": 0.08908685968819599, "grad_norm": 2.886244058609009, "learning_rate": 9.652087839910124e-05, "loss": 0.9537, "step": 250 }, { "epoch": 0.09086859688195992, "grad_norm": 2.8074824810028076, "learning_rate": 9.62494401589108e-05, "loss": 0.9327, "step": 255 }, { "epoch": 0.09265033407572383, "grad_norm": 2.750798463821411, "learning_rate": 9.596821849040447e-05, "loss": 0.9228, "step": 260 }, { "epoch": 0.09443207126948774, "grad_norm": 2.552215337753296, "learning_rate": 9.567727288213005e-05, "loss": 0.9423, "step": 265 }, { "epoch": 0.09621380846325167, "grad_norm": 2.3609156608581543, "learning_rate": 9.537666487960019e-05, "loss": 0.9676, "step": 270 }, { "epoch": 0.09799554565701558, "grad_norm": 2.8906874656677246, "learning_rate": 9.506645807227312e-05, "loss": 0.955, "step": 275 }, { "epoch": 0.09977728285077951, "grad_norm": 2.660022497177124, "learning_rate": 9.474671808010126e-05, "loss": 0.9695, "step": 280 }, { "epoch": 0.10155902004454342, "grad_norm": 3.261420726776123, "learning_rate": 9.441751253965021e-05, "loss": 0.9477, "step": 285 }, { "epoch": 0.10334075723830735, "grad_norm": 3.65535044670105, "learning_rate": 9.407891108979117e-05, "loss": 0.9724, "step": 290 }, { "epoch": 0.10512249443207126, "grad_norm": 6.143333911895752, "learning_rate": 9.373098535696979e-05, "loss": 0.9477, "step": 295 }, { "epoch": 0.10690423162583519, "grad_norm": 3.469689130783081, "learning_rate": 9.337380894005463e-05, "loss": 0.9286, "step": 300 }, { "epoch": 0.1086859688195991, "grad_norm": 2.4321353435516357, "learning_rate": 9.300745739476829e-05, "loss": 0.9681, "step": 305 }, { "epoch": 0.11046770601336303, "grad_norm": 2.3954951763153076, "learning_rate": 9.263200821770461e-05, "loss": 0.9223, "step": 310 }, { "epoch": 0.11224944320712694, "grad_norm": 3.206364154815674, "learning_rate": 9.224754082993552e-05, "loss": 0.9111, "step": 315 }, { "epoch": 0.11403118040089087, "grad_norm": 2.411461591720581, "learning_rate": 9.185413656021036e-05, "loss": 0.9254, "step": 320 }, { "epoch": 0.11581291759465479, "grad_norm": 3.2764694690704346, "learning_rate": 9.145187862775209e-05, "loss": 0.9388, "step": 325 }, { "epoch": 0.11759465478841871, "grad_norm": 2.724217653274536, "learning_rate": 9.104085212465336e-05, "loss": 0.9493, "step": 330 }, { "epoch": 0.11937639198218263, "grad_norm": 2.4242122173309326, "learning_rate": 9.062114399787647e-05, "loss": 0.9439, "step": 335 }, { "epoch": 0.12115812917594655, "grad_norm": 2.391575813293457, "learning_rate": 9.019284303086087e-05, "loss": 0.9253, "step": 340 }, { "epoch": 0.12293986636971047, "grad_norm": 2.7728800773620605, "learning_rate": 8.97560398247424e-05, "loss": 0.946, "step": 345 }, { "epoch": 0.12472160356347439, "grad_norm": 3.3350629806518555, "learning_rate": 8.931082677918771e-05, "loss": 0.9318, "step": 350 }, { "epoch": 0.12650334075723832, "grad_norm": 2.887850761413574, "learning_rate": 8.885729807284856e-05, "loss": 0.9407, "step": 355 }, { "epoch": 0.12828507795100222, "grad_norm": 2.461491107940674, "learning_rate": 8.839554964343943e-05, "loss": 0.9748, "step": 360 }, { "epoch": 0.13006681514476615, "grad_norm": 2.649059772491455, "learning_rate": 8.792567916744346e-05, "loss": 0.9569, "step": 365 }, { "epoch": 0.13184855233853007, "grad_norm": 2.505889415740967, "learning_rate": 8.744778603945011e-05, "loss": 0.9235, "step": 370 }, { "epoch": 0.133630289532294, "grad_norm": 3.084015369415283, "learning_rate": 8.69619713511298e-05, "loss": 0.9466, "step": 375 }, { "epoch": 0.1354120267260579, "grad_norm": 2.242276191711426, "learning_rate": 8.646833786984927e-05, "loss": 0.8958, "step": 380 }, { "epoch": 0.13719376391982183, "grad_norm": 2.439112424850464, "learning_rate": 8.596699001693255e-05, "loss": 0.9211, "step": 385 }, { "epoch": 0.13897550111358575, "grad_norm": 2.7526488304138184, "learning_rate": 8.545803384557219e-05, "loss": 0.9218, "step": 390 }, { "epoch": 0.14075723830734965, "grad_norm": 2.521644353866577, "learning_rate": 8.4941577018395e-05, "loss": 0.9365, "step": 395 }, { "epoch": 0.14253897550111358, "grad_norm": 2.8012807369232178, "learning_rate": 8.44177287846877e-05, "loss": 0.8991, "step": 400 }, { "epoch": 0.14253897550111358, "eval_loss": 0.9173732995986938, "eval_runtime": 250.7158, "eval_samples_per_second": 19.899, "eval_steps_per_second": 2.489, "step": 400 }, { "epoch": 0.1443207126948775, "grad_norm": 2.3261518478393555, "learning_rate": 8.388659995728661e-05, "loss": 0.8968, "step": 405 }, { "epoch": 0.14610244988864143, "grad_norm": 2.2134907245635986, "learning_rate": 8.334830288913682e-05, "loss": 0.91, "step": 410 }, { "epoch": 0.14788418708240533, "grad_norm": 3.5786261558532715, "learning_rate": 8.280295144952536e-05, "loss": 0.9175, "step": 415 }, { "epoch": 0.14966592427616926, "grad_norm": 2.7428812980651855, "learning_rate": 8.225066099999392e-05, "loss": 0.9345, "step": 420 }, { "epoch": 0.1514476614699332, "grad_norm": 2.246025800704956, "learning_rate": 8.169154836993551e-05, "loss": 0.9067, "step": 425 }, { "epoch": 0.15322939866369711, "grad_norm": 2.188469886779785, "learning_rate": 8.112573183188099e-05, "loss": 0.9537, "step": 430 }, { "epoch": 0.155011135857461, "grad_norm": 2.545259475708008, "learning_rate": 8.055333107647999e-05, "loss": 0.9159, "step": 435 }, { "epoch": 0.15679287305122494, "grad_norm": 2.421093463897705, "learning_rate": 7.99744671871822e-05, "loss": 0.9034, "step": 440 }, { "epoch": 0.15857461024498887, "grad_norm": 2.5586888790130615, "learning_rate": 7.938926261462366e-05, "loss": 0.9072, "step": 445 }, { "epoch": 0.1603563474387528, "grad_norm": 2.444941759109497, "learning_rate": 7.879784115072417e-05, "loss": 0.9101, "step": 450 }, { "epoch": 0.1621380846325167, "grad_norm": 2.3764047622680664, "learning_rate": 7.820032790250074e-05, "loss": 0.9065, "step": 455 }, { "epoch": 0.16391982182628062, "grad_norm": 2.34041428565979, "learning_rate": 7.75968492656029e-05, "loss": 0.8791, "step": 460 }, { "epoch": 0.16570155902004455, "grad_norm": 2.013155698776245, "learning_rate": 7.698753289757565e-05, "loss": 0.9058, "step": 465 }, { "epoch": 0.16748329621380847, "grad_norm": 2.3692591190338135, "learning_rate": 7.6372507690855e-05, "loss": 0.8898, "step": 470 }, { "epoch": 0.16926503340757237, "grad_norm": 2.4539620876312256, "learning_rate": 7.575190374550272e-05, "loss": 0.9201, "step": 475 }, { "epoch": 0.1710467706013363, "grad_norm": 2.6015443801879883, "learning_rate": 7.51258523416855e-05, "loss": 0.8823, "step": 480 }, { "epoch": 0.17282850779510023, "grad_norm": 2.413839101791382, "learning_rate": 7.449448591190435e-05, "loss": 0.9196, "step": 485 }, { "epoch": 0.17461024498886416, "grad_norm": 2.1962289810180664, "learning_rate": 7.385793801298042e-05, "loss": 0.8869, "step": 490 }, { "epoch": 0.17639198218262805, "grad_norm": 2.994487762451172, "learning_rate": 7.321634329780286e-05, "loss": 0.9103, "step": 495 }, { "epoch": 0.17817371937639198, "grad_norm": 2.9973297119140625, "learning_rate": 7.256983748684485e-05, "loss": 0.9083, "step": 500 }, { "epoch": 0.1799554565701559, "grad_norm": 2.6006710529327393, "learning_rate": 7.191855733945387e-05, "loss": 0.9131, "step": 505 }, { "epoch": 0.18173719376391984, "grad_norm": 2.4508118629455566, "learning_rate": 7.126264062492217e-05, "loss": 0.8762, "step": 510 }, { "epoch": 0.18351893095768373, "grad_norm": 2.8403897285461426, "learning_rate": 7.060222609334343e-05, "loss": 0.8673, "step": 515 }, { "epoch": 0.18530066815144766, "grad_norm": 2.5483813285827637, "learning_rate": 6.993745344626231e-05, "loss": 0.8812, "step": 520 }, { "epoch": 0.1870824053452116, "grad_norm": 1.927654504776001, "learning_rate": 6.926846330712242e-05, "loss": 0.9213, "step": 525 }, { "epoch": 0.1888641425389755, "grad_norm": 2.8513023853302, "learning_rate": 6.859539719151933e-05, "loss": 0.8911, "step": 530 }, { "epoch": 0.19064587973273942, "grad_norm": 2.6732981204986572, "learning_rate": 6.7918397477265e-05, "loss": 0.9018, "step": 535 }, { "epoch": 0.19242761692650334, "grad_norm": 2.3755311965942383, "learning_rate": 6.723760737426971e-05, "loss": 0.8803, "step": 540 }, { "epoch": 0.19420935412026727, "grad_norm": 2.5072877407073975, "learning_rate": 6.65531708942479e-05, "loss": 0.9066, "step": 545 }, { "epoch": 0.19599109131403117, "grad_norm": 2.3347630500793457, "learning_rate": 6.586523282025462e-05, "loss": 0.8999, "step": 550 }, { "epoch": 0.1977728285077951, "grad_norm": 2.4541633129119873, "learning_rate": 6.517393867605855e-05, "loss": 0.9024, "step": 555 }, { "epoch": 0.19955456570155902, "grad_norm": 2.89241361618042, "learning_rate": 6.447943469535856e-05, "loss": 0.8802, "step": 560 }, { "epoch": 0.20133630289532295, "grad_norm": 2.635859251022339, "learning_rate": 6.378186779084995e-05, "loss": 0.91, "step": 565 }, { "epoch": 0.20311804008908685, "grad_norm": 2.5360910892486572, "learning_rate": 6.308138552314718e-05, "loss": 0.883, "step": 570 }, { "epoch": 0.20489977728285078, "grad_norm": 2.0861408710479736, "learning_rate": 6.23781360695693e-05, "loss": 0.9051, "step": 575 }, { "epoch": 0.2066815144766147, "grad_norm": 1.938452959060669, "learning_rate": 6.167226819279528e-05, "loss": 0.8763, "step": 580 }, { "epoch": 0.20846325167037863, "grad_norm": 2.333118200302124, "learning_rate": 6.096393120939516e-05, "loss": 0.8939, "step": 585 }, { "epoch": 0.21024498886414253, "grad_norm": 2.2652223110198975, "learning_rate": 6.0253274958244386e-05, "loss": 0.8992, "step": 590 }, { "epoch": 0.21202672605790646, "grad_norm": 1.830731749534607, "learning_rate": 5.9540449768827246e-05, "loss": 0.8617, "step": 595 }, { "epoch": 0.21380846325167038, "grad_norm": 2.4237635135650635, "learning_rate": 5.882560642943696e-05, "loss": 0.9189, "step": 600 }, { "epoch": 0.21380846325167038, "eval_loss": 0.8756723999977112, "eval_runtime": 256.5875, "eval_samples_per_second": 19.444, "eval_steps_per_second": 2.432, "step": 600 }, { "epoch": 0.2155902004454343, "grad_norm": 2.6089930534362793, "learning_rate": 5.810889615527838e-05, "loss": 0.9052, "step": 605 }, { "epoch": 0.2173719376391982, "grad_norm": 2.457108974456787, "learning_rate": 5.7390470556480545e-05, "loss": 0.8959, "step": 610 }, { "epoch": 0.21915367483296214, "grad_norm": 2.3315470218658447, "learning_rate": 5.667048160602564e-05, "loss": 0.8772, "step": 615 }, { "epoch": 0.22093541202672606, "grad_norm": 2.0484960079193115, "learning_rate": 5.5949081607601274e-05, "loss": 0.8387, "step": 620 }, { "epoch": 0.22271714922049, "grad_norm": 2.341867208480835, "learning_rate": 5.522642316338268e-05, "loss": 0.8778, "step": 625 }, { "epoch": 0.2244988864142539, "grad_norm": 2.4177300930023193, "learning_rate": 5.450265914175187e-05, "loss": 0.8936, "step": 630 }, { "epoch": 0.22628062360801782, "grad_norm": 2.4489850997924805, "learning_rate": 5.377794264496041e-05, "loss": 0.8654, "step": 635 }, { "epoch": 0.22806236080178174, "grad_norm": 2.468477964401245, "learning_rate": 5.3052426976742855e-05, "loss": 0.8467, "step": 640 }, { "epoch": 0.22984409799554567, "grad_norm": 2.1568973064422607, "learning_rate": 5.232626560988735e-05, "loss": 0.8337, "step": 645 }, { "epoch": 0.23162583518930957, "grad_norm": 2.248286485671997, "learning_rate": 5.159961215377065e-05, "loss": 0.8626, "step": 650 }, { "epoch": 0.2334075723830735, "grad_norm": 2.197516918182373, "learning_rate": 5.0872620321864185e-05, "loss": 0.8857, "step": 655 }, { "epoch": 0.23518930957683742, "grad_norm": 2.0258774757385254, "learning_rate": 5.0145443899218105e-05, "loss": 0.8693, "step": 660 }, { "epoch": 0.23697104677060132, "grad_norm": 2.576545000076294, "learning_rate": 4.941823670993016e-05, "loss": 0.8585, "step": 665 }, { "epoch": 0.23875278396436525, "grad_norm": 2.1643807888031006, "learning_rate": 4.869115258460635e-05, "loss": 0.8844, "step": 670 }, { "epoch": 0.24053452115812918, "grad_norm": 1.8109593391418457, "learning_rate": 4.7964345327820217e-05, "loss": 0.8526, "step": 675 }, { "epoch": 0.2423162583518931, "grad_norm": 2.2996315956115723, "learning_rate": 4.723796868557758e-05, "loss": 0.8588, "step": 680 }, { "epoch": 0.244097995545657, "grad_norm": 2.109656810760498, "learning_rate": 4.6512176312793736e-05, "loss": 0.8657, "step": 685 }, { "epoch": 0.24587973273942093, "grad_norm": 2.0365986824035645, "learning_rate": 4.578712174078986e-05, "loss": 0.8722, "step": 690 }, { "epoch": 0.24766146993318486, "grad_norm": 2.396369695663452, "learning_rate": 4.506295834481561e-05, "loss": 0.8595, "step": 695 }, { "epoch": 0.24944320712694878, "grad_norm": 1.9721331596374512, "learning_rate": 4.433983931160467e-05, "loss": 0.845, "step": 700 }, { "epoch": 0.2512249443207127, "grad_norm": 2.6028833389282227, "learning_rate": 4.361791760697027e-05, "loss": 0.8756, "step": 705 }, { "epoch": 0.25300668151447664, "grad_norm": 2.5747413635253906, "learning_rate": 4.289734594344738e-05, "loss": 0.8553, "step": 710 }, { "epoch": 0.25478841870824054, "grad_norm": 2.2102746963500977, "learning_rate": 4.2178276747988446e-05, "loss": 0.8301, "step": 715 }, { "epoch": 0.25657015590200444, "grad_norm": 2.2053496837615967, "learning_rate": 4.146086212971967e-05, "loss": 0.8347, "step": 720 }, { "epoch": 0.2583518930957684, "grad_norm": 2.1658267974853516, "learning_rate": 4.074525384776428e-05, "loss": 0.8583, "step": 725 }, { "epoch": 0.2601336302895323, "grad_norm": 2.4658656120300293, "learning_rate": 4.003160327914015e-05, "loss": 0.8448, "step": 730 }, { "epoch": 0.2619153674832962, "grad_norm": 2.5138092041015625, "learning_rate": 3.932006138673801e-05, "loss": 0.7994, "step": 735 }, { "epoch": 0.26369710467706015, "grad_norm": 2.3678791522979736, "learning_rate": 3.861077868738733e-05, "loss": 0.8543, "step": 740 }, { "epoch": 0.26547884187082404, "grad_norm": 2.174612283706665, "learning_rate": 3.790390522001662e-05, "loss": 0.8255, "step": 745 }, { "epoch": 0.267260579064588, "grad_norm": 2.633901596069336, "learning_rate": 3.719959051391472e-05, "loss": 0.8574, "step": 750 }, { "epoch": 0.2690423162583519, "grad_norm": 2.3723981380462646, "learning_rate": 3.649798355709997e-05, "loss": 0.8313, "step": 755 }, { "epoch": 0.2708240534521158, "grad_norm": 2.452537775039673, "learning_rate": 3.579923276480387e-05, "loss": 0.8332, "step": 760 }, { "epoch": 0.27260579064587975, "grad_norm": 2.7250778675079346, "learning_rate": 3.51034859480759e-05, "loss": 0.8345, "step": 765 }, { "epoch": 0.27438752783964365, "grad_norm": 2.827697992324829, "learning_rate": 3.44108902825161e-05, "loss": 0.8547, "step": 770 }, { "epoch": 0.27616926503340755, "grad_norm": 2.2842516899108887, "learning_rate": 3.372159227714218e-05, "loss": 0.8245, "step": 775 }, { "epoch": 0.2779510022271715, "grad_norm": 2.4392411708831787, "learning_rate": 3.303573774339745e-05, "loss": 0.827, "step": 780 }, { "epoch": 0.2797327394209354, "grad_norm": 2.548760175704956, "learning_rate": 3.235347176430656e-05, "loss": 0.8085, "step": 785 }, { "epoch": 0.2815144766146993, "grad_norm": 2.289919376373291, "learning_rate": 3.167493866378514e-05, "loss": 0.8725, "step": 790 }, { "epoch": 0.28329621380846326, "grad_norm": 2.1732709407806396, "learning_rate": 3.100028197611006e-05, "loss": 0.8184, "step": 795 }, { "epoch": 0.28507795100222716, "grad_norm": 2.4083878993988037, "learning_rate": 3.0329644415556758e-05, "loss": 0.8186, "step": 800 }, { "epoch": 0.28507795100222716, "eval_loss": 0.8320774435997009, "eval_runtime": 261.5096, "eval_samples_per_second": 19.078, "eval_steps_per_second": 2.386, "step": 800 }, { "epoch": 0.2868596881959911, "grad_norm": 2.4117252826690674, "learning_rate": 2.9663167846209998e-05, "loss": 0.8061, "step": 805 }, { "epoch": 0.288641425389755, "grad_norm": 2.716094493865967, "learning_rate": 2.9000993251954527e-05, "loss": 0.8372, "step": 810 }, { "epoch": 0.2904231625835189, "grad_norm": 1.987546443939209, "learning_rate": 2.8343260706651864e-05, "loss": 0.8539, "step": 815 }, { "epoch": 0.29220489977728287, "grad_norm": 2.1564650535583496, "learning_rate": 2.7690109344509563e-05, "loss": 0.8398, "step": 820 }, { "epoch": 0.29398663697104677, "grad_norm": 2.394848108291626, "learning_rate": 2.7041677330649407e-05, "loss": 0.8257, "step": 825 }, { "epoch": 0.29576837416481067, "grad_norm": 2.211273670196533, "learning_rate": 2.639810183188045e-05, "loss": 0.8238, "step": 830 }, { "epoch": 0.2975501113585746, "grad_norm": 2.2479021549224854, "learning_rate": 2.575951898768315e-05, "loss": 0.8277, "step": 835 }, { "epoch": 0.2993318485523385, "grad_norm": 2.60609769821167, "learning_rate": 2.5126063881411188e-05, "loss": 0.8371, "step": 840 }, { "epoch": 0.3011135857461025, "grad_norm": 2.4049665927886963, "learning_rate": 2.4497870511716235e-05, "loss": 0.8077, "step": 845 }, { "epoch": 0.3028953229398664, "grad_norm": 2.140543222427368, "learning_rate": 2.3875071764202563e-05, "loss": 0.8288, "step": 850 }, { "epoch": 0.3046770601336303, "grad_norm": 2.6508686542510986, "learning_rate": 2.3257799383316798e-05, "loss": 0.848, "step": 855 }, { "epoch": 0.30645879732739423, "grad_norm": 2.6622097492218018, "learning_rate": 2.264618394447927e-05, "loss": 0.8133, "step": 860 }, { "epoch": 0.3082405345211581, "grad_norm": 2.2243332862854004, "learning_rate": 2.2040354826462668e-05, "loss": 0.8227, "step": 865 }, { "epoch": 0.310022271714922, "grad_norm": 2.4186229705810547, "learning_rate": 2.1440440184023564e-05, "loss": 0.7982, "step": 870 }, { "epoch": 0.311804008908686, "grad_norm": 2.1508822441101074, "learning_rate": 2.0846566920793266e-05, "loss": 0.8421, "step": 875 }, { "epoch": 0.3135857461024499, "grad_norm": 2.5740039348602295, "learning_rate": 2.0258860662432942e-05, "loss": 0.8337, "step": 880 }, { "epoch": 0.31536748329621384, "grad_norm": 2.060276985168457, "learning_rate": 1.967744573005934e-05, "loss": 0.8319, "step": 885 }, { "epoch": 0.31714922048997773, "grad_norm": 2.0549917221069336, "learning_rate": 1.9102445113946343e-05, "loss": 0.7851, "step": 890 }, { "epoch": 0.31893095768374163, "grad_norm": 2.7247533798217773, "learning_rate": 1.8533980447508137e-05, "loss": 0.8113, "step": 895 }, { "epoch": 0.3207126948775056, "grad_norm": 2.852099657058716, "learning_rate": 1.797217198156924e-05, "loss": 0.8502, "step": 900 }, { "epoch": 0.3224944320712695, "grad_norm": 2.2780370712280273, "learning_rate": 1.7417138558927244e-05, "loss": 0.8175, "step": 905 }, { "epoch": 0.3242761692650334, "grad_norm": 2.220999240875244, "learning_rate": 1.6868997589213136e-05, "loss": 0.8107, "step": 910 }, { "epoch": 0.32605790645879734, "grad_norm": 2.26967191696167, "learning_rate": 1.6327865024054984e-05, "loss": 0.815, "step": 915 }, { "epoch": 0.32783964365256124, "grad_norm": 3.1814401149749756, "learning_rate": 1.5793855332550005e-05, "loss": 0.8274, "step": 920 }, { "epoch": 0.32962138084632514, "grad_norm": 2.5263116359710693, "learning_rate": 1.526708147705013e-05, "loss": 0.8126, "step": 925 }, { "epoch": 0.3314031180400891, "grad_norm": 2.7154064178466797, "learning_rate": 1.4747654889266476e-05, "loss": 0.8147, "step": 930 }, { "epoch": 0.333184855233853, "grad_norm": 2.2681655883789062, "learning_rate": 1.4235685446697433e-05, "loss": 0.8247, "step": 935 }, { "epoch": 0.33496659242761695, "grad_norm": 1.97934889793396, "learning_rate": 1.373128144938563e-05, "loss": 0.7941, "step": 940 }, { "epoch": 0.33674832962138085, "grad_norm": 2.35060977935791, "learning_rate": 1.3234549597008571e-05, "loss": 0.8306, "step": 945 }, { "epoch": 0.33853006681514475, "grad_norm": 2.231822967529297, "learning_rate": 1.2745594966307823e-05, "loss": 0.8044, "step": 950 }, { "epoch": 0.3403118040089087, "grad_norm": 2.1318812370300293, "learning_rate": 1.22645209888614e-05, "loss": 0.7989, "step": 955 }, { "epoch": 0.3420935412026726, "grad_norm": 2.565772294998169, "learning_rate": 1.1791429429204342e-05, "loss": 0.7852, "step": 960 }, { "epoch": 0.3438752783964365, "grad_norm": 2.2323334217071533, "learning_rate": 1.132642036330181e-05, "loss": 0.798, "step": 965 }, { "epoch": 0.34565701559020046, "grad_norm": 2.159836769104004, "learning_rate": 1.0869592157379304e-05, "loss": 0.7913, "step": 970 }, { "epoch": 0.34743875278396436, "grad_norm": 2.292523145675659, "learning_rate": 1.0421041447114838e-05, "loss": 0.8303, "step": 975 }, { "epoch": 0.3492204899777283, "grad_norm": 2.540412187576294, "learning_rate": 9.980863117196815e-06, "loss": 0.8174, "step": 980 }, { "epoch": 0.3510022271714922, "grad_norm": 2.6382853984832764, "learning_rate": 9.549150281252633e-06, "loss": 0.7803, "step": 985 }, { "epoch": 0.3527839643652561, "grad_norm": 2.3101236820220947, "learning_rate": 9.125994262151682e-06, "loss": 0.8372, "step": 990 }, { "epoch": 0.35456570155902006, "grad_norm": 2.285560131072998, "learning_rate": 8.711484572687296e-06, "loss": 0.7965, "step": 995 }, { "epoch": 0.35634743875278396, "grad_norm": 1.8707315921783447, "learning_rate": 8.305708896641594e-06, "loss": 0.8255, "step": 1000 }, { "epoch": 0.35634743875278396, "eval_loss": 0.7969963550567627, "eval_runtime": 258.8113, "eval_samples_per_second": 19.277, "eval_steps_per_second": 2.411, "step": 1000 } ], "logging_steps": 5, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1495650375386112.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }