{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1467,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02047082906857728,
      "grad_norm": 2.7419285797467823,
      "learning_rate": 6.122448979591837e-07,
      "loss": 0.5714,
      "step": 10
    },
    {
      "epoch": 0.04094165813715456,
      "grad_norm": 1.3395217822161771,
      "learning_rate": 1.2925170068027212e-06,
      "loss": 0.5607,
      "step": 20
    },
    {
      "epoch": 0.06141248720573183,
      "grad_norm": 0.7131203009670463,
      "learning_rate": 1.9727891156462586e-06,
      "loss": 0.5058,
      "step": 30
    },
    {
      "epoch": 0.08188331627430911,
      "grad_norm": 0.5939213129269232,
      "learning_rate": 2.6530612244897964e-06,
      "loss": 0.4748,
      "step": 40
    },
    {
      "epoch": 0.1023541453428864,
      "grad_norm": 0.4371524595984187,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.4519,
      "step": 50
    },
    {
      "epoch": 0.12282497441146366,
      "grad_norm": 0.3502938420177603,
      "learning_rate": 4.013605442176871e-06,
      "loss": 0.4443,
      "step": 60
    },
    {
      "epoch": 0.14329580348004095,
      "grad_norm": 0.29761528774367,
      "learning_rate": 4.693877551020409e-06,
      "loss": 0.4251,
      "step": 70
    },
    {
      "epoch": 0.16376663254861823,
      "grad_norm": 0.2854822784474314,
      "learning_rate": 5.374149659863946e-06,
      "loss": 0.4159,
      "step": 80
    },
    {
      "epoch": 0.1842374616171955,
      "grad_norm": 0.3073495732358923,
      "learning_rate": 6.054421768707484e-06,
      "loss": 0.4206,
      "step": 90
    },
    {
      "epoch": 0.2047082906857728,
      "grad_norm": 0.3182797565439541,
      "learning_rate": 6.734693877551021e-06,
      "loss": 0.4105,
      "step": 100
    },
    {
      "epoch": 0.22517911975435004,
      "grad_norm": 0.2936005289527639,
      "learning_rate": 7.414965986394559e-06,
      "loss": 0.4066,
      "step": 110
    },
    {
      "epoch": 0.24564994882292732,
      "grad_norm": 0.3237498717947107,
      "learning_rate": 8.095238095238097e-06,
      "loss": 0.4016,
      "step": 120
    },
    {
      "epoch": 0.2661207778915046,
      "grad_norm": 0.29086892031699574,
      "learning_rate": 8.775510204081633e-06,
      "loss": 0.3988,
      "step": 130
    },
    {
      "epoch": 0.2865916069600819,
      "grad_norm": 0.32624665443263495,
      "learning_rate": 9.455782312925171e-06,
      "loss": 0.3926,
      "step": 140
    },
    {
      "epoch": 0.3070624360286592,
      "grad_norm": 0.41791983226234863,
      "learning_rate": 9.999943356371867e-06,
      "loss": 0.396,
      "step": 150
    },
    {
      "epoch": 0.32753326509723646,
      "grad_norm": 0.3103761740717932,
      "learning_rate": 9.997960964140946e-06,
      "loss": 0.3901,
      "step": 160
    },
    {
      "epoch": 0.34800409416581374,
      "grad_norm": 0.3251036078984643,
      "learning_rate": 9.993147673772869e-06,
      "loss": 0.3874,
      "step": 170
    },
    {
      "epoch": 0.368474923234391,
      "grad_norm": 0.29848102638048,
      "learning_rate": 9.985506211566388e-06,
      "loss": 0.3918,
      "step": 180
    },
    {
      "epoch": 0.3889457523029683,
      "grad_norm": 0.34239845549124515,
      "learning_rate": 9.975040905726799e-06,
      "loss": 0.3906,
      "step": 190
    },
    {
      "epoch": 0.4094165813715456,
      "grad_norm": 0.31464814584407047,
      "learning_rate": 9.961757683914406e-06,
      "loss": 0.3885,
      "step": 200
    },
    {
      "epoch": 0.42988741044012285,
      "grad_norm": 0.3151092550628075,
      "learning_rate": 9.945664069887028e-06,
      "loss": 0.3728,
      "step": 210
    },
    {
      "epoch": 0.4503582395087001,
      "grad_norm": 0.3176233859013783,
      "learning_rate": 9.926769179238467e-06,
      "loss": 0.3728,
      "step": 220
    },
    {
      "epoch": 0.47082906857727735,
      "grad_norm": 0.4612429659946746,
      "learning_rate": 9.905083714235326e-06,
      "loss": 0.3807,
      "step": 230
    },
    {
      "epoch": 0.49129989764585463,
      "grad_norm": 0.3058908611230555,
      "learning_rate": 9.880619957755151e-06,
      "loss": 0.3822,
      "step": 240
    },
    {
      "epoch": 0.5117707267144319,
      "grad_norm": 0.3091236211182382,
      "learning_rate": 9.853391766329264e-06,
      "loss": 0.3719,
      "step": 250
    },
    {
      "epoch": 0.5322415557830092,
      "grad_norm": 0.3303116552583245,
      "learning_rate": 9.82341456229428e-06,
      "loss": 0.3772,
      "step": 260
    },
    {
      "epoch": 0.5527123848515865,
      "grad_norm": 0.32625347302412766,
      "learning_rate": 9.790705325056735e-06,
      "loss": 0.3771,
      "step": 270
    },
    {
      "epoch": 0.5731832139201638,
      "grad_norm": 0.31108860222980084,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.3694,
      "step": 280
    },
    {
      "epoch": 0.593654042988741,
      "grad_norm": 0.35683232841035667,
      "learning_rate": 9.717166395369312e-06,
      "loss": 0.374,
      "step": 290
    },
    {
      "epoch": 0.6141248720573184,
      "grad_norm": 0.30708155807016757,
      "learning_rate": 9.676378356149733e-06,
      "loss": 0.3671,
      "step": 300
    },
    {
      "epoch": 0.6345957011258956,
      "grad_norm": 0.3512172588847162,
      "learning_rate": 9.632941566595357e-06,
      "loss": 0.3674,
      "step": 310
    },
    {
      "epoch": 0.6550665301944729,
      "grad_norm": 0.32150553015980243,
      "learning_rate": 9.586880629764817e-06,
      "loss": 0.3732,
      "step": 320
    },
    {
      "epoch": 0.6755373592630501,
      "grad_norm": 0.28994930563644283,
      "learning_rate": 9.538221635061611e-06,
      "loss": 0.3748,
      "step": 330
    },
    {
      "epoch": 0.6960081883316275,
      "grad_norm": 0.3317951897221946,
      "learning_rate": 9.486992143456792e-06,
      "loss": 0.3652,
      "step": 340
    },
    {
      "epoch": 0.7164790174002047,
      "grad_norm": 0.31325853168901024,
      "learning_rate": 9.433221171878144e-06,
      "loss": 0.3647,
      "step": 350
    },
    {
      "epoch": 0.736949846468782,
      "grad_norm": 0.3687088471057981,
      "learning_rate": 9.376939176774678e-06,
      "loss": 0.3624,
      "step": 360
    },
    {
      "epoch": 0.7574206755373593,
      "grad_norm": 0.3237568256415363,
      "learning_rate": 9.318178036865786e-06,
      "loss": 0.3595,
      "step": 370
    },
    {
      "epoch": 0.7778915046059366,
      "grad_norm": 0.29274287364928764,
      "learning_rate": 9.256971035084786e-06,
      "loss": 0.363,
      "step": 380
    },
    {
      "epoch": 0.7983623336745138,
      "grad_norm": 0.29836089389437453,
      "learning_rate": 9.193352839727122e-06,
      "loss": 0.3681,
      "step": 390
    },
    {
      "epoch": 0.8188331627430911,
      "grad_norm": 0.30001387212541714,
      "learning_rate": 9.12735948481387e-06,
      "loss": 0.3666,
      "step": 400
    },
    {
      "epoch": 0.8393039918116684,
      "grad_norm": 0.312461464326523,
      "learning_rate": 9.059028349681693e-06,
      "loss": 0.3672,
      "step": 410
    },
    {
      "epoch": 0.8597748208802457,
      "grad_norm": 0.2835800419598021,
      "learning_rate": 8.988398137810778e-06,
      "loss": 0.3512,
      "step": 420
    },
    {
      "epoch": 0.8802456499488229,
      "grad_norm": 0.3003334841564673,
      "learning_rate": 8.915508854902778e-06,
      "loss": 0.3649,
      "step": 430
    },
    {
      "epoch": 0.9007164790174002,
      "grad_norm": 0.32025481154259877,
      "learning_rate": 8.84040178622116e-06,
      "loss": 0.3596,
      "step": 440
    },
    {
      "epoch": 0.9211873080859775,
      "grad_norm": 0.3115117787294787,
      "learning_rate": 8.763119473206795e-06,
      "loss": 0.3608,
      "step": 450
    },
    {
      "epoch": 0.9416581371545547,
      "grad_norm": 0.3425972446675102,
      "learning_rate": 8.683705689382025e-06,
      "loss": 0.3637,
      "step": 460
    },
    {
      "epoch": 0.962128966223132,
      "grad_norm": 0.3663718766106508,
      "learning_rate": 8.602205415556889e-06,
      "loss": 0.3581,
      "step": 470
    },
    {
      "epoch": 0.9825997952917093,
      "grad_norm": 0.37122946179964184,
      "learning_rate": 8.518664814351502e-06,
      "loss": 0.363,
      "step": 480
    },
    {
      "epoch": 1.0020470829068577,
      "grad_norm": 0.35006617529270384,
      "learning_rate": 8.433131204049067e-06,
      "loss": 0.3605,
      "step": 490
    },
    {
      "epoch": 1.022517911975435,
      "grad_norm": 0.3740591080146805,
      "learning_rate": 8.345653031794292e-06,
      "loss": 0.3366,
      "step": 500
    },
    {
      "epoch": 1.0429887410440122,
      "grad_norm": 0.3162453200296232,
      "learning_rate": 8.25627984615241e-06,
      "loss": 0.3399,
      "step": 510
    },
    {
      "epoch": 1.0634595701125895,
      "grad_norm": 0.3459734066982941,
      "learning_rate": 8.165062269044353e-06,
      "loss": 0.3347,
      "step": 520
    },
    {
      "epoch": 1.0839303991811668,
      "grad_norm": 0.3754414242837869,
      "learning_rate": 8.072051967073955e-06,
      "loss": 0.3393,
      "step": 530
    },
    {
      "epoch": 1.1044012282497442,
      "grad_norm": 0.31310887226269396,
      "learning_rate": 7.97730162226344e-06,
      "loss": 0.3378,
      "step": 540
    },
    {
      "epoch": 1.1248720573183213,
      "grad_norm": 0.2743343327695542,
      "learning_rate": 7.880864902213765e-06,
      "loss": 0.3339,
      "step": 550
    },
    {
      "epoch": 1.1453428863868986,
      "grad_norm": 0.29488856378638867,
      "learning_rate": 7.782796429706721e-06,
      "loss": 0.3371,
      "step": 560
    },
    {
      "epoch": 1.165813715455476,
      "grad_norm": 0.31936132904356773,
      "learning_rate": 7.683151751766005e-06,
      "loss": 0.3353,
      "step": 570
    },
    {
      "epoch": 1.1862845445240533,
      "grad_norm": 0.2862668653895022,
      "learning_rate": 7.5819873081948105e-06,
      "loss": 0.3437,
      "step": 580
    },
    {
      "epoch": 1.2067553735926304,
      "grad_norm": 0.373142212943522,
      "learning_rate": 7.479360399607707e-06,
      "loss": 0.34,
      "step": 590
    },
    {
      "epoch": 1.2272262026612077,
      "grad_norm": 0.26627867428916774,
      "learning_rate": 7.3753291549749764e-06,
      "loss": 0.3359,
      "step": 600
    },
    {
      "epoch": 1.247697031729785,
      "grad_norm": 0.29276632404031816,
      "learning_rate": 7.269952498697734e-06,
      "loss": 0.3332,
      "step": 610
    },
    {
      "epoch": 1.2681678607983624,
      "grad_norm": 0.28433544101185126,
      "learning_rate": 7.163290117232542e-06,
      "loss": 0.3282,
      "step": 620
    },
    {
      "epoch": 1.2886386898669397,
      "grad_norm": 0.2953581704843155,
      "learning_rate": 7.055402425284346e-06,
      "loss": 0.3367,
      "step": 630
    },
    {
      "epoch": 1.3091095189355169,
      "grad_norm": 0.33230185564417497,
      "learning_rate": 6.946350531586959e-06,
      "loss": 0.33,
      "step": 640
    },
    {
      "epoch": 1.3295803480040942,
      "grad_norm": 0.2886636563529011,
      "learning_rate": 6.836196204290417e-06,
      "loss": 0.3396,
      "step": 650
    },
    {
      "epoch": 1.3500511770726713,
      "grad_norm": 0.2999611703532613,
      "learning_rate": 6.725001835974854e-06,
      "loss": 0.3393,
      "step": 660
    },
    {
      "epoch": 1.3705220061412486,
      "grad_norm": 0.2726538110659223,
      "learning_rate": 6.612830408310671e-06,
      "loss": 0.3312,
      "step": 670
    },
    {
      "epoch": 1.390992835209826,
      "grad_norm": 0.291476742075843,
      "learning_rate": 6.499745456385054e-06,
      "loss": 0.3351,
      "step": 680
    },
    {
      "epoch": 1.4114636642784033,
      "grad_norm": 0.31263083981840367,
      "learning_rate": 6.385811032715031e-06,
      "loss": 0.3288,
      "step": 690
    },
    {
      "epoch": 1.4319344933469806,
      "grad_norm": 0.2902918227753739,
      "learning_rate": 6.271091670967437e-06,
      "loss": 0.3352,
      "step": 700
    },
    {
      "epoch": 1.4524053224155578,
      "grad_norm": 0.28079102264241496,
      "learning_rate": 6.155652349406366e-06,
      "loss": 0.335,
      "step": 710
    },
    {
      "epoch": 1.472876151484135,
      "grad_norm": 0.29347480666251813,
      "learning_rate": 6.039558454088796e-06,
      "loss": 0.3363,
      "step": 720
    },
    {
      "epoch": 1.4933469805527124,
      "grad_norm": 0.29437779600755265,
      "learning_rate": 5.922875741829227e-06,
      "loss": 0.3341,
      "step": 730
    },
    {
      "epoch": 1.5138178096212895,
      "grad_norm": 0.32924496076891324,
      "learning_rate": 5.805670302954322e-06,
      "loss": 0.3376,
      "step": 740
    },
    {
      "epoch": 1.5342886386898669,
      "grad_norm": 0.28214414735311516,
      "learning_rate": 5.688008523868646e-06,
      "loss": 0.3316,
      "step": 750
    },
    {
      "epoch": 1.5547594677584442,
      "grad_norm": 0.27441915869668093,
      "learning_rate": 5.569957049452703e-06,
      "loss": 0.3334,
      "step": 760
    },
    {
      "epoch": 1.5752302968270215,
      "grad_norm": 0.2832467579497326,
      "learning_rate": 5.451582745314576e-06,
      "loss": 0.3338,
      "step": 770
    },
    {
      "epoch": 1.5957011258955989,
      "grad_norm": 0.2662158568853315,
      "learning_rate": 5.33295265991652e-06,
      "loss": 0.3394,
      "step": 780
    },
    {
      "epoch": 1.6161719549641762,
      "grad_norm": 0.28021626464260035,
      "learning_rate": 5.214133986598014e-06,
      "loss": 0.3401,
      "step": 790
    },
    {
      "epoch": 1.6366427840327533,
      "grad_norm": 0.27748535569979166,
      "learning_rate": 5.095194025516733e-06,
      "loss": 0.3389,
      "step": 800
    },
    {
      "epoch": 1.6571136131013307,
      "grad_norm": 0.27557614607505543,
      "learning_rate": 4.976200145529039e-06,
      "loss": 0.3297,
      "step": 810
    },
    {
      "epoch": 1.6775844421699078,
      "grad_norm": 0.29920349158775145,
      "learning_rate": 4.85721974603152e-06,
      "loss": 0.3332,
      "step": 820
    },
    {
      "epoch": 1.698055271238485,
      "grad_norm": 0.28020907502801123,
      "learning_rate": 4.738320218785281e-06,
      "loss": 0.3331,
      "step": 830
    },
    {
      "epoch": 1.7185261003070624,
      "grad_norm": 0.275720710241024,
      "learning_rate": 4.619568909744524e-06,
      "loss": 0.334,
      "step": 840
    },
    {
      "epoch": 1.7389969293756398,
      "grad_norm": 0.2981912122148231,
      "learning_rate": 4.501033080911087e-06,
      "loss": 0.3275,
      "step": 850
    },
    {
      "epoch": 1.759467758444217,
      "grad_norm": 0.2867885235447855,
      "learning_rate": 4.382779872236527e-06,
      "loss": 0.3279,
      "step": 860
    },
    {
      "epoch": 1.7799385875127944,
      "grad_norm": 0.3130758313387298,
      "learning_rate": 4.264876263593347e-06,
      "loss": 0.3339,
      "step": 870
    },
    {
      "epoch": 1.8004094165813715,
      "grad_norm": 0.2964510943375963,
      "learning_rate": 4.147389036836881e-06,
      "loss": 0.3279,
      "step": 880
    },
    {
      "epoch": 1.8208802456499487,
      "grad_norm": 0.2583942550882026,
      "learning_rate": 4.030384737979345e-06,
      "loss": 0.3286,
      "step": 890
    },
    {
      "epoch": 1.841351074718526,
      "grad_norm": 0.2922177868095114,
      "learning_rate": 3.913929639497462e-06,
      "loss": 0.3343,
      "step": 900
    },
    {
      "epoch": 1.8618219037871033,
      "grad_norm": 0.28649431693767013,
      "learning_rate": 3.798089702795038e-06,
      "loss": 0.337,
      "step": 910
    },
    {
      "epoch": 1.8822927328556807,
      "grad_norm": 0.2719777009560596,
      "learning_rate": 3.682930540841717e-06,
      "loss": 0.3255,
      "step": 920
    },
    {
      "epoch": 1.902763561924258,
      "grad_norm": 0.2592856290273692,
      "learning_rate": 3.568517381009099e-06,
      "loss": 0.3334,
      "step": 930
    },
    {
      "epoch": 1.9232343909928353,
      "grad_norm": 0.2669776100626122,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.3274,
      "step": 940
    },
    {
      "epoch": 1.9437052200614124,
      "grad_norm": 0.2644660634287941,
      "learning_rate": 3.3421878277686315e-06,
      "loss": 0.329,
      "step": 950
    },
    {
      "epoch": 1.9641760491299898,
      "grad_norm": 0.255388401567599,
      "learning_rate": 3.230399629821942e-06,
      "loss": 0.333,
      "step": 960
    },
    {
      "epoch": 1.9846468781985669,
      "grad_norm": 0.26076402769472123,
      "learning_rate": 3.119613752307002e-06,
      "loss": 0.3277,
      "step": 970
    },
    {
      "epoch": 2.0040941658137155,
      "grad_norm": 0.34035031287023115,
      "learning_rate": 3.0098929455206905e-06,
      "loss": 0.3239,
      "step": 980
    },
    {
      "epoch": 2.024564994882293,
      "grad_norm": 0.26481931821481197,
      "learning_rate": 2.901299356492516e-06,
      "loss": 0.3082,
      "step": 990
    },
    {
      "epoch": 2.04503582395087,
      "grad_norm": 0.27840797715546095,
      "learning_rate": 2.7938944937838924e-06,
      "loss": 0.3141,
      "step": 1000
    },
    {
      "epoch": 2.0655066530194475,
      "grad_norm": 0.264803380198392,
      "learning_rate": 2.687739192649026e-06,
      "loss": 0.309,
      "step": 1010
    },
    {
      "epoch": 2.0859774820880244,
      "grad_norm": 0.2503878058457451,
      "learning_rate": 2.5828935805771804e-06,
      "loss": 0.3055,
      "step": 1020
    },
    {
      "epoch": 2.1064483111566017,
      "grad_norm": 0.24288791515168753,
      "learning_rate": 2.4794170432358415e-06,
      "loss": 0.3101,
      "step": 1030
    },
    {
      "epoch": 2.126919140225179,
      "grad_norm": 0.3260568670065737,
      "learning_rate": 2.3773681908340284e-06,
      "loss": 0.3174,
      "step": 1040
    },
    {
      "epoch": 2.1473899692937564,
      "grad_norm": 0.2752973297448372,
      "learning_rate": 2.2768048249248648e-06,
      "loss": 0.3161,
      "step": 1050
    },
    {
      "epoch": 2.1678607983623337,
      "grad_norm": 0.2525589153397712,
      "learning_rate": 2.1777839056661555e-06,
      "loss": 0.3127,
      "step": 1060
    },
    {
      "epoch": 2.188331627430911,
      "grad_norm": 0.2721221125171894,
      "learning_rate": 2.080361519557548e-06,
      "loss": 0.3095,
      "step": 1070
    },
    {
      "epoch": 2.2088024564994884,
      "grad_norm": 0.256857141944826,
      "learning_rate": 1.9845928476725522e-06,
      "loss": 0.3117,
      "step": 1080
    },
    {
      "epoch": 2.2292732855680657,
      "grad_norm": 0.23983629928731875,
      "learning_rate": 1.89053213440339e-06,
      "loss": 0.3096,
      "step": 1090
    },
    {
      "epoch": 2.2497441146366426,
      "grad_norm": 0.2576559289401722,
      "learning_rate": 1.798232656736389e-06,
      "loss": 0.3167,
      "step": 1100
    },
    {
      "epoch": 2.27021494370522,
      "grad_norm": 0.25400768886970904,
      "learning_rate": 1.7077466940753446e-06,
      "loss": 0.3085,
      "step": 1110
    },
    {
      "epoch": 2.2906857727737973,
      "grad_norm": 0.2464137122590109,
      "learning_rate": 1.6191254986299044e-06,
      "loss": 0.309,
      "step": 1120
    },
    {
      "epoch": 2.3111566018423746,
      "grad_norm": 0.24550795789449772,
      "learning_rate": 1.5324192663857673e-06,
      "loss": 0.3159,
      "step": 1130
    },
    {
      "epoch": 2.331627430910952,
      "grad_norm": 0.28852640148423236,
      "learning_rate": 1.4476771086731567e-06,
      "loss": 0.3123,
      "step": 1140
    },
    {
      "epoch": 2.3520982599795293,
      "grad_norm": 0.37687841751171514,
      "learning_rate": 1.3649470243496327e-06,
      "loss": 0.3098,
      "step": 1150
    },
    {
      "epoch": 2.3725690890481066,
      "grad_norm": 0.25339405885205124,
      "learning_rate": 1.2842758726130283e-06,
      "loss": 0.308,
      "step": 1160
    },
    {
      "epoch": 2.393039918116684,
      "grad_norm": 0.2487646284184678,
      "learning_rate": 1.2057093464599156e-06,
      "loss": 0.3127,
      "step": 1170
    },
    {
      "epoch": 2.413510747185261,
      "grad_norm": 0.32463394959288167,
      "learning_rate": 1.1292919468045876e-06,
      "loss": 0.3139,
      "step": 1180
    },
    {
      "epoch": 2.433981576253838,
      "grad_norm": 0.255238902256777,
      "learning_rate": 1.0550669572732862e-06,
      "loss": 0.3111,
      "step": 1190
    },
    {
      "epoch": 2.4544524053224155,
      "grad_norm": 0.23908874761946466,
      "learning_rate": 9.830764196878872e-07,
      "loss": 0.3164,
      "step": 1200
    },
    {
      "epoch": 2.474923234390993,
      "grad_norm": 0.2649523347415388,
      "learning_rate": 9.133611102529655e-07,
      "loss": 0.3149,
      "step": 1210
    },
    {
      "epoch": 2.49539406345957,
      "grad_norm": 0.24534103849668815,
      "learning_rate": 8.459605164597268e-07,
      "loss": 0.3098,
      "step": 1220
    },
    {
      "epoch": 2.5158648925281475,
      "grad_norm": 0.2416421159869344,
      "learning_rate": 7.809128147198692e-07,
      "loss": 0.3091,
      "step": 1230
    },
    {
      "epoch": 2.536335721596725,
      "grad_norm": 0.23831181790073264,
      "learning_rate": 7.182548487420555e-07,
      "loss": 0.3118,
      "step": 1240
    },
    {
      "epoch": 2.5568065506653017,
      "grad_norm": 0.23798865253936233,
      "learning_rate": 6.580221086632516e-07,
      "loss": 0.3138,
      "step": 1250
    },
    {
      "epoch": 2.5772773797338795,
      "grad_norm": 0.2483650737490233,
      "learning_rate": 6.002487109467347e-07,
      "loss": 0.3044,
      "step": 1260
    },
    {
      "epoch": 2.5977482088024564,
      "grad_norm": 0.24574354382203337,
      "learning_rate": 5.449673790581611e-07,
      "loss": 0.3117,
      "step": 1270
    },
    {
      "epoch": 2.6182190378710337,
      "grad_norm": 0.25945540600991146,
      "learning_rate": 4.922094249306559e-07,
      "loss": 0.3077,
      "step": 1280
    },
    {
      "epoch": 2.638689866939611,
      "grad_norm": 0.2376843101597617,
      "learning_rate": 4.420047312293946e-07,
      "loss": 0.3099,
      "step": 1290
    },
    {
      "epoch": 2.6591606960081884,
      "grad_norm": 0.23445126717886705,
      "learning_rate": 3.9438173442575e-07,
      "loss": 0.3153,
      "step": 1300
    },
    {
      "epoch": 2.6796315250767657,
      "grad_norm": 0.23799144075624934,
      "learning_rate": 3.4936740869057075e-07,
      "loss": 0.3052,
      "step": 1310
    },
    {
      "epoch": 2.7001023541453426,
      "grad_norm": 0.23443772087547457,
      "learning_rate": 3.069872506157212e-07,
      "loss": 0.3077,
      "step": 1320
    },
    {
      "epoch": 2.7205731832139204,
      "grad_norm": 0.24404450847384368,
      "learning_rate": 2.6726526477254986e-07,
      "loss": 0.3111,
      "step": 1330
    },
    {
      "epoch": 2.7410440122824973,
      "grad_norm": 0.2537534925687698,
      "learning_rate": 2.3022395011543687e-07,
      "loss": 0.3127,
      "step": 1340
    },
    {
      "epoch": 2.7615148413510746,
      "grad_norm": 0.2531714733259407,
      "learning_rate": 1.9588428723814945e-07,
      "loss": 0.3126,
      "step": 1350
    },
    {
      "epoch": 2.781985670419652,
      "grad_norm": 0.2476065493877303,
      "learning_rate": 1.6426572649021477e-07,
      "loss": 0.3147,
      "step": 1360
    },
    {
      "epoch": 2.8024564994882293,
      "grad_norm": 0.22861950264891634,
      "learning_rate": 1.3538617696003066e-07,
      "loss": 0.3206,
      "step": 1370
    },
    {
      "epoch": 2.8229273285568066,
      "grad_norm": 0.23238655363179303,
      "learning_rate": 1.0926199633097156e-07,
      "loss": 0.3074,
      "step": 1380
    },
    {
      "epoch": 2.843398157625384,
      "grad_norm": 0.25027856224860606,
      "learning_rate": 8.590798161622227e-08,
      "loss": 0.3086,
      "step": 1390
    },
    {
      "epoch": 2.8638689866939613,
      "grad_norm": 0.45100936356107935,
      "learning_rate": 6.533736077758868e-08,
      "loss": 0.3089,
      "step": 1400
    },
    {
      "epoch": 2.884339815762538,
      "grad_norm": 0.23851515762515696,
      "learning_rate": 4.756178523304622e-08,
      "loss": 0.3082,
      "step": 1410
    },
    {
      "epoch": 2.9048106448311155,
      "grad_norm": 0.23121697809405467,
      "learning_rate": 3.25913232572489e-08,
      "loss": 0.3124,
      "step": 1420
    },
    {
      "epoch": 2.925281473899693,
      "grad_norm": 0.23935340813095157,
      "learning_rate": 2.0434454278752126e-08,
      "loss": 0.3122,
      "step": 1430
    },
    {
      "epoch": 2.94575230296827,
      "grad_norm": 0.24056630460754483,
      "learning_rate": 1.109806407717462e-08,
      "loss": 0.3094,
      "step": 1440
    },
    {
      "epoch": 2.9662231320368475,
      "grad_norm": 0.24275000924008874,
      "learning_rate": 4.587440883021543e-09,
      "loss": 0.3058,
      "step": 1450
    },
    {
      "epoch": 2.986693961105425,
      "grad_norm": 0.2672744704384668,
      "learning_rate": 9.062723823710651e-10,
      "loss": 0.3045,
      "step": 1460
    }
  ],
  "logging_steps": 10,
  "max_steps": 1467,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4881065357344768.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}