{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997824139255088, "eval_steps": 500, "global_step": 9765, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02559836170485089, "grad_norm": 0.7305641763872902, "learning_rate": 1.9897593445980545e-05, "loss": 0.24, "step": 50 }, { "epoch": 0.05119672340970178, "grad_norm": 0.05103642085416945, "learning_rate": 1.9795186891961087e-05, "loss": 0.0049, "step": 100 }, { "epoch": 0.07679508511455267, "grad_norm": 0.030274497499991383, "learning_rate": 1.969278033794163e-05, "loss": 0.0017, "step": 150 }, { "epoch": 0.10239344681940356, "grad_norm": 0.029756392410786685, "learning_rate": 1.9590373783922173e-05, "loss": 0.0011, "step": 200 }, { "epoch": 0.12799180852425446, "grad_norm": 0.02549118945457913, "learning_rate": 1.9487967229902716e-05, "loss": 0.0008, "step": 250 }, { "epoch": 0.15359017022910534, "grad_norm": 0.02438470537104193, "learning_rate": 1.9385560675883256e-05, "loss": 0.0008, "step": 300 }, { "epoch": 0.17918853193395623, "grad_norm": 0.016258758537083494, "learning_rate": 1.9283154121863802e-05, "loss": 0.0007, "step": 350 }, { "epoch": 0.20478689363880712, "grad_norm": 0.014976187170917211, "learning_rate": 1.9180747567844345e-05, "loss": 0.0005, "step": 400 }, { "epoch": 0.230385255343658, "grad_norm": 0.014196620668655414, "learning_rate": 1.9078341013824884e-05, "loss": 0.0005, "step": 450 }, { "epoch": 0.2559836170485089, "grad_norm": 0.01865887251990293, "learning_rate": 1.897593445980543e-05, "loss": 0.0004, "step": 500 }, { "epoch": 0.2815819787533598, "grad_norm": 0.016349380331586796, "learning_rate": 1.887352790578597e-05, "loss": 0.0004, "step": 550 }, { "epoch": 0.3071803404582107, "grad_norm": 0.01413169547355441, "learning_rate": 1.8771121351766516e-05, "loss": 0.0003, "step": 600 }, { "epoch": 0.33277870216306155, "grad_norm": 0.010152573483527069, "learning_rate": 1.866871479774706e-05, "loss": 0.0003, "step": 650 }, { "epoch": 0.35837706386791246, "grad_norm": 0.01017225834937972, "learning_rate": 1.85663082437276e-05, "loss": 0.0003, "step": 700 }, { "epoch": 0.3839754255727633, "grad_norm": 0.013000431764113899, "learning_rate": 1.8463901689708145e-05, "loss": 0.0003, "step": 750 }, { "epoch": 0.40957378727761423, "grad_norm": 0.013254090310897974, "learning_rate": 1.8361495135688684e-05, "loss": 0.0003, "step": 800 }, { "epoch": 0.43517214898246515, "grad_norm": 0.00966168025347855, "learning_rate": 1.8259088581669227e-05, "loss": 0.0002, "step": 850 }, { "epoch": 0.460770510687316, "grad_norm": 0.00712828374097157, "learning_rate": 1.815668202764977e-05, "loss": 0.0002, "step": 900 }, { "epoch": 0.4863688723921669, "grad_norm": 0.010211960398621855, "learning_rate": 1.8054275473630313e-05, "loss": 0.0002, "step": 950 }, { "epoch": 0.5119672340970178, "grad_norm": 0.01261082529443916, "learning_rate": 1.7951868919610856e-05, "loss": 0.0002, "step": 1000 }, { "epoch": 0.5375655958018687, "grad_norm": 0.011854166152885242, "learning_rate": 1.78494623655914e-05, "loss": 0.0002, "step": 1050 }, { "epoch": 0.5631639575067195, "grad_norm": 0.009916438252277934, "learning_rate": 1.7747055811571942e-05, "loss": 0.0003, "step": 1100 }, { "epoch": 0.5887623192115704, "grad_norm": 0.005025129187771019, "learning_rate": 1.7644649257552485e-05, "loss": 0.0002, "step": 1150 }, { "epoch": 0.6143606809164214, "grad_norm": 0.006788101186805052, "learning_rate": 1.7542242703533028e-05, "loss": 0.0002, "step": 1200 }, { "epoch": 0.6399590426212722, "grad_norm": 0.00978494920049853, "learning_rate": 1.743983614951357e-05, "loss": 0.0002, "step": 1250 }, { "epoch": 0.6655574043261231, "grad_norm": 0.009494329535464946, "learning_rate": 1.7337429595494113e-05, "loss": 0.0002, "step": 1300 }, { "epoch": 0.6911557660309741, "grad_norm": 0.010405998997878094, "learning_rate": 1.7235023041474656e-05, "loss": 0.0002, "step": 1350 }, { "epoch": 0.7167541277358249, "grad_norm": 0.01406008137282546, "learning_rate": 1.71326164874552e-05, "loss": 0.0001, "step": 1400 }, { "epoch": 0.7423524894406758, "grad_norm": 0.012511809648905668, "learning_rate": 1.7030209933435742e-05, "loss": 0.0002, "step": 1450 }, { "epoch": 0.7679508511455266, "grad_norm": 0.014402924650339832, "learning_rate": 1.6927803379416285e-05, "loss": 0.0002, "step": 1500 }, { "epoch": 0.7935492128503776, "grad_norm": 0.007985015692090758, "learning_rate": 1.6825396825396828e-05, "loss": 0.0002, "step": 1550 }, { "epoch": 0.8191475745552285, "grad_norm": 0.016922696684847503, "learning_rate": 1.6722990271377367e-05, "loss": 0.0002, "step": 1600 }, { "epoch": 0.8447459362600793, "grad_norm": 0.0058610303905484145, "learning_rate": 1.6620583717357914e-05, "loss": 0.0002, "step": 1650 }, { "epoch": 0.8703442979649303, "grad_norm": 0.005758710688055935, "learning_rate": 1.6518177163338457e-05, "loss": 0.0001, "step": 1700 }, { "epoch": 0.8959426596697811, "grad_norm": 0.010005518642531934, "learning_rate": 1.6415770609318996e-05, "loss": 0.0001, "step": 1750 }, { "epoch": 0.921541021374632, "grad_norm": 0.006740034442277339, "learning_rate": 1.6313364055299542e-05, "loss": 0.0001, "step": 1800 }, { "epoch": 0.9471393830794829, "grad_norm": 0.0062403985576606705, "learning_rate": 1.6210957501280082e-05, "loss": 0.0001, "step": 1850 }, { "epoch": 0.9727377447843338, "grad_norm": 0.006225396199669411, "learning_rate": 1.6108550947260625e-05, "loss": 0.0001, "step": 1900 }, { "epoch": 0.9983361064891847, "grad_norm": 0.006345423633281202, "learning_rate": 1.600614439324117e-05, "loss": 0.0001, "step": 1950 }, { "epoch": 1.0235504927684629, "grad_norm": 0.00782413794335222, "learning_rate": 1.590373783922171e-05, "loss": 0.0001, "step": 2000 }, { "epoch": 1.0491488544733136, "grad_norm": 0.004640264761208562, "learning_rate": 1.5801331285202253e-05, "loss": 0.0001, "step": 2050 }, { "epoch": 1.0747472161781646, "grad_norm": 0.004837968806765287, "learning_rate": 1.5698924731182796e-05, "loss": 0.0001, "step": 2100 }, { "epoch": 1.1003455778830156, "grad_norm": 0.003966873491178343, "learning_rate": 1.559651817716334e-05, "loss": 0.0001, "step": 2150 }, { "epoch": 1.1259439395878663, "grad_norm": 0.007280756408676898, "learning_rate": 1.5494111623143882e-05, "loss": 0.0001, "step": 2200 }, { "epoch": 1.1515423012927173, "grad_norm": 0.007676063330830094, "learning_rate": 1.5391705069124425e-05, "loss": 0.0001, "step": 2250 }, { "epoch": 1.177140662997568, "grad_norm": 0.00759666513908814, "learning_rate": 1.5289298515104968e-05, "loss": 0.0001, "step": 2300 }, { "epoch": 1.202739024702419, "grad_norm": 0.006094073180808279, "learning_rate": 1.518689196108551e-05, "loss": 0.0001, "step": 2350 }, { "epoch": 1.22833738640727, "grad_norm": 0.009992171934592443, "learning_rate": 1.5084485407066054e-05, "loss": 0.0001, "step": 2400 }, { "epoch": 1.253935748112121, "grad_norm": 0.004425939861516632, "learning_rate": 1.4982078853046595e-05, "loss": 0.0001, "step": 2450 }, { "epoch": 1.2795341098169717, "grad_norm": 0.00618221779159838, "learning_rate": 1.487967229902714e-05, "loss": 0.0001, "step": 2500 }, { "epoch": 1.3051324715218227, "grad_norm": 0.0037721408282199286, "learning_rate": 1.477726574500768e-05, "loss": 0.0001, "step": 2550 }, { "epoch": 1.3307308332266734, "grad_norm": 0.006148728469232912, "learning_rate": 1.4674859190988225e-05, "loss": 0.0001, "step": 2600 }, { "epoch": 1.3563291949315244, "grad_norm": 0.00661518038661282, "learning_rate": 1.4572452636968768e-05, "loss": 0.0001, "step": 2650 }, { "epoch": 1.3819275566363753, "grad_norm": 0.0036588312853667437, "learning_rate": 1.447004608294931e-05, "loss": 0.0001, "step": 2700 }, { "epoch": 1.407525918341226, "grad_norm": 0.005484459005497015, "learning_rate": 1.4367639528929854e-05, "loss": 0.0001, "step": 2750 }, { "epoch": 1.433124280046077, "grad_norm": 0.010402616539983395, "learning_rate": 1.4265232974910395e-05, "loss": 0.0001, "step": 2800 }, { "epoch": 1.4587226417509278, "grad_norm": 0.007335666061283071, "learning_rate": 1.4162826420890938e-05, "loss": 0.0001, "step": 2850 }, { "epoch": 1.4843210034557788, "grad_norm": 0.006550005824502188, "learning_rate": 1.4060419866871483e-05, "loss": 0.0001, "step": 2900 }, { "epoch": 1.5099193651606297, "grad_norm": 0.0027811575400146435, "learning_rate": 1.3958013312852024e-05, "loss": 0.0001, "step": 2950 }, { "epoch": 1.5355177268654807, "grad_norm": 0.006308963330505965, "learning_rate": 1.3855606758832567e-05, "loss": 0.0001, "step": 3000 }, { "epoch": 1.5611160885703315, "grad_norm": 0.006401332782035864, "learning_rate": 1.3753200204813108e-05, "loss": 0.0001, "step": 3050 }, { "epoch": 1.5867144502751824, "grad_norm": 0.004718742517696451, "learning_rate": 1.3650793650793652e-05, "loss": 0.0001, "step": 3100 }, { "epoch": 1.6123128119800332, "grad_norm": 0.003877950477268835, "learning_rate": 1.3548387096774194e-05, "loss": 0.0001, "step": 3150 }, { "epoch": 1.6379111736848841, "grad_norm": 0.0063083392896106745, "learning_rate": 1.3445980542754738e-05, "loss": 0.0001, "step": 3200 }, { "epoch": 1.6635095353897351, "grad_norm": 0.006412039922690925, "learning_rate": 1.3343573988735281e-05, "loss": 0.0001, "step": 3250 }, { "epoch": 1.689107897094586, "grad_norm": 0.0029627793040849877, "learning_rate": 1.3241167434715822e-05, "loss": 0.0001, "step": 3300 }, { "epoch": 1.7147062587994368, "grad_norm": 0.002164481803452725, "learning_rate": 1.3138760880696367e-05, "loss": 0.0001, "step": 3350 }, { "epoch": 1.7403046205042876, "grad_norm": 0.004111311446657877, "learning_rate": 1.3036354326676908e-05, "loss": 0.0001, "step": 3400 }, { "epoch": 1.7659029822091385, "grad_norm": 0.0024071410600000186, "learning_rate": 1.2933947772657451e-05, "loss": 0.0001, "step": 3450 }, { "epoch": 1.7915013439139895, "grad_norm": 0.00428027777175206, "learning_rate": 1.2831541218637992e-05, "loss": 0.0001, "step": 3500 }, { "epoch": 1.8170997056188405, "grad_norm": 0.0035937450146907115, "learning_rate": 1.2729134664618537e-05, "loss": 0.0001, "step": 3550 }, { "epoch": 1.8426980673236912, "grad_norm": 0.007360372295628917, "learning_rate": 1.262672811059908e-05, "loss": 0.0001, "step": 3600 }, { "epoch": 1.8682964290285422, "grad_norm": 0.004225210869508024, "learning_rate": 1.2524321556579622e-05, "loss": 0.0001, "step": 3650 }, { "epoch": 1.893894790733393, "grad_norm": 0.00344941681643163, "learning_rate": 1.2421915002560165e-05, "loss": 0.0001, "step": 3700 }, { "epoch": 1.919493152438244, "grad_norm": 0.0036839082828609084, "learning_rate": 1.2319508448540707e-05, "loss": 0.0001, "step": 3750 }, { "epoch": 1.945091514143095, "grad_norm": 0.009934710271474315, "learning_rate": 1.2217101894521251e-05, "loss": 0.0001, "step": 3800 }, { "epoch": 1.9706898758479459, "grad_norm": 0.0024663729558732648, "learning_rate": 1.2114695340501794e-05, "loss": 0.0001, "step": 3850 }, { "epoch": 1.9962882375527966, "grad_norm": 0.003817898440024657, "learning_rate": 1.2012288786482335e-05, "loss": 0.0001, "step": 3900 }, { "epoch": 2.021502623832075, "grad_norm": 0.0031639489328900696, "learning_rate": 1.190988223246288e-05, "loss": 0.0001, "step": 3950 }, { "epoch": 2.0471009855369258, "grad_norm": 0.002020596329737904, "learning_rate": 1.1807475678443421e-05, "loss": 0.0001, "step": 4000 }, { "epoch": 2.0726993472417767, "grad_norm": 0.0041297671592259375, "learning_rate": 1.1705069124423964e-05, "loss": 0.0001, "step": 4050 }, { "epoch": 2.0982977089466273, "grad_norm": 0.0030187753698489852, "learning_rate": 1.1602662570404507e-05, "loss": 0.0001, "step": 4100 }, { "epoch": 2.1238960706514782, "grad_norm": 0.006719688660763743, "learning_rate": 1.150025601638505e-05, "loss": 0.0001, "step": 4150 }, { "epoch": 2.149494432356329, "grad_norm": 0.007455082822481147, "learning_rate": 1.1397849462365593e-05, "loss": 0.0001, "step": 4200 }, { "epoch": 2.17509279406118, "grad_norm": 0.0020929393058777236, "learning_rate": 1.1295442908346135e-05, "loss": 0.0001, "step": 4250 }, { "epoch": 2.200691155766031, "grad_norm": 0.004647943941373522, "learning_rate": 1.1193036354326678e-05, "loss": 0.0001, "step": 4300 }, { "epoch": 2.2262895174708817, "grad_norm": 0.002919096778092067, "learning_rate": 1.109062980030722e-05, "loss": 0.0001, "step": 4350 }, { "epoch": 2.2518878791757326, "grad_norm": 0.0022980302252630044, "learning_rate": 1.0988223246287764e-05, "loss": 0.0001, "step": 4400 }, { "epoch": 2.2774862408805836, "grad_norm": 0.0007777129612344223, "learning_rate": 1.0885816692268305e-05, "loss": 0.0001, "step": 4450 }, { "epoch": 2.3030846025854346, "grad_norm": 0.001856334209823885, "learning_rate": 1.0783410138248848e-05, "loss": 0.0001, "step": 4500 }, { "epoch": 2.3286829642902855, "grad_norm": 0.009562277401713636, "learning_rate": 1.0681003584229393e-05, "loss": 0.0115, "step": 4550 }, { "epoch": 2.354281325995136, "grad_norm": 0.008162550932912013, "learning_rate": 1.0578597030209934e-05, "loss": 0.0001, "step": 4600 }, { "epoch": 2.379879687699987, "grad_norm": 0.006086517667163692, "learning_rate": 1.0476190476190477e-05, "loss": 0.0001, "step": 4650 }, { "epoch": 2.405478049404838, "grad_norm": 0.0027507057924501116, "learning_rate": 1.037378392217102e-05, "loss": 0.0001, "step": 4700 }, { "epoch": 2.431076411109689, "grad_norm": 0.005652923712553444, "learning_rate": 1.0271377368151563e-05, "loss": 0.0001, "step": 4750 }, { "epoch": 2.45667477281454, "grad_norm": 0.0022548738506609723, "learning_rate": 1.0168970814132104e-05, "loss": 0.0001, "step": 4800 }, { "epoch": 2.482273134519391, "grad_norm": 0.0034968078517645545, "learning_rate": 1.0066564260112648e-05, "loss": 0.0001, "step": 4850 }, { "epoch": 2.507871496224242, "grad_norm": 0.0029725026316189704, "learning_rate": 9.96415770609319e-06, "loss": 0.0001, "step": 4900 }, { "epoch": 2.5334698579290924, "grad_norm": 0.002417471051371214, "learning_rate": 9.861751152073733e-06, "loss": 0.0, "step": 4950 }, { "epoch": 2.5590682196339434, "grad_norm": 0.0037793013717612994, "learning_rate": 9.759344598054277e-06, "loss": 0.0, "step": 5000 }, { "epoch": 2.5846665813387943, "grad_norm": 0.0020282875993942345, "learning_rate": 9.65693804403482e-06, "loss": 0.0, "step": 5050 }, { "epoch": 2.6102649430436453, "grad_norm": 0.006653751475623009, "learning_rate": 9.554531490015361e-06, "loss": 0.0, "step": 5100 }, { "epoch": 2.6358633047484963, "grad_norm": 0.0006177303859045243, "learning_rate": 9.452124935995904e-06, "loss": 0.0, "step": 5150 }, { "epoch": 2.661461666453347, "grad_norm": 0.001574016672401847, "learning_rate": 9.349718381976447e-06, "loss": 0.0, "step": 5200 }, { "epoch": 2.6870600281581978, "grad_norm": 0.002722823477162341, "learning_rate": 9.24731182795699e-06, "loss": 0.0, "step": 5250 }, { "epoch": 2.7126583898630487, "grad_norm": 0.0031352467257030996, "learning_rate": 9.144905273937533e-06, "loss": 0.0, "step": 5300 }, { "epoch": 2.7382567515678997, "grad_norm": 0.002855241030593728, "learning_rate": 9.042498719918076e-06, "loss": 0.0, "step": 5350 }, { "epoch": 2.7638551132727507, "grad_norm": 0.001616961495391244, "learning_rate": 8.940092165898619e-06, "loss": 0.0, "step": 5400 }, { "epoch": 2.789453474977601, "grad_norm": 0.005071888691549564, "learning_rate": 8.837685611879161e-06, "loss": 0.0001, "step": 5450 }, { "epoch": 2.815051836682452, "grad_norm": 0.0029692529220848914, "learning_rate": 8.735279057859704e-06, "loss": 0.0, "step": 5500 }, { "epoch": 2.840650198387303, "grad_norm": 0.0004172545224757254, "learning_rate": 8.632872503840246e-06, "loss": 0.0, "step": 5550 }, { "epoch": 2.866248560092154, "grad_norm": 0.008642812223624062, "learning_rate": 8.530465949820788e-06, "loss": 0.0, "step": 5600 }, { "epoch": 2.891846921797005, "grad_norm": 0.0027205512248527812, "learning_rate": 8.428059395801333e-06, "loss": 0.0001, "step": 5650 }, { "epoch": 2.9174452835018556, "grad_norm": 0.0019007511180201269, "learning_rate": 8.325652841781874e-06, "loss": 0.0, "step": 5700 }, { "epoch": 2.943043645206707, "grad_norm": 0.0011932680335579203, "learning_rate": 8.223246287762417e-06, "loss": 0.0, "step": 5750 }, { "epoch": 2.9686420069115576, "grad_norm": 0.0017683528985724605, "learning_rate": 8.12083973374296e-06, "loss": 0.0, "step": 5800 }, { "epoch": 2.9942403686164085, "grad_norm": 0.0010172759283751088, "learning_rate": 8.018433179723503e-06, "loss": 0.0, "step": 5850 }, { "epoch": 3.0194547548956865, "grad_norm": 0.0035687260604544088, "learning_rate": 7.916026625704046e-06, "loss": 0.0, "step": 5900 }, { "epoch": 3.0450531166005375, "grad_norm": 0.0017674951945728509, "learning_rate": 7.813620071684589e-06, "loss": 0.0, "step": 5950 }, { "epoch": 3.0706514783053884, "grad_norm": 0.001417796923981359, "learning_rate": 7.711213517665132e-06, "loss": 0.0, "step": 6000 }, { "epoch": 3.0962498400102394, "grad_norm": 0.001392466393900218, "learning_rate": 7.6088069636456744e-06, "loss": 0.0, "step": 6050 }, { "epoch": 3.1218482017150904, "grad_norm": 0.0010317131953453697, "learning_rate": 7.5064004096262165e-06, "loss": 0.0, "step": 6100 }, { "epoch": 3.1474465634199413, "grad_norm": 0.002671567960001357, "learning_rate": 7.403993855606759e-06, "loss": 0.0, "step": 6150 }, { "epoch": 3.173044925124792, "grad_norm": 0.0010993308683270022, "learning_rate": 7.301587301587301e-06, "loss": 0.0, "step": 6200 }, { "epoch": 3.198643286829643, "grad_norm": 0.0011792521113742672, "learning_rate": 7.199180747567845e-06, "loss": 0.0, "step": 6250 }, { "epoch": 3.224241648534494, "grad_norm": 0.002377211855286283, "learning_rate": 7.096774193548388e-06, "loss": 0.0, "step": 6300 }, { "epoch": 3.2498400102393448, "grad_norm": 0.0013709631857059255, "learning_rate": 6.994367639528931e-06, "loss": 0.0, "step": 6350 }, { "epoch": 3.2754383719441957, "grad_norm": 0.0010686686441566805, "learning_rate": 6.891961085509473e-06, "loss": 0.0, "step": 6400 }, { "epoch": 3.3010367336490463, "grad_norm": 0.0013656971047177361, "learning_rate": 6.789554531490016e-06, "loss": 0.0, "step": 6450 }, { "epoch": 3.3266350953538972, "grad_norm": 0.0030691234388346175, "learning_rate": 6.687147977470559e-06, "loss": 0.0, "step": 6500 }, { "epoch": 3.352233457058748, "grad_norm": 0.0016958004183478155, "learning_rate": 6.584741423451101e-06, "loss": 0.0, "step": 6550 }, { "epoch": 3.377831818763599, "grad_norm": 0.002679660529985062, "learning_rate": 6.4823348694316445e-06, "loss": 0.0, "step": 6600 }, { "epoch": 3.40343018046845, "grad_norm": 0.0009123671464204819, "learning_rate": 6.379928315412187e-06, "loss": 0.0, "step": 6650 }, { "epoch": 3.4290285421733007, "grad_norm": 0.0011492363622442438, "learning_rate": 6.2775217613927295e-06, "loss": 0.0, "step": 6700 }, { "epoch": 3.4546269038781516, "grad_norm": 0.0008232117328172145, "learning_rate": 6.175115207373272e-06, "loss": 0.0, "step": 6750 }, { "epoch": 3.4802252655830026, "grad_norm": 0.0022449544565699437, "learning_rate": 6.072708653353815e-06, "loss": 0.0, "step": 6800 }, { "epoch": 3.5058236272878536, "grad_norm": 0.0015276485422571994, "learning_rate": 5.970302099334357e-06, "loss": 0.0, "step": 6850 }, { "epoch": 3.5314219889927045, "grad_norm": 0.0011827584209431721, "learning_rate": 5.867895545314901e-06, "loss": 0.0, "step": 6900 }, { "epoch": 3.5570203506975555, "grad_norm": 0.003917245208198937, "learning_rate": 5.765488991295444e-06, "loss": 0.0, "step": 6950 }, { "epoch": 3.5826187124024065, "grad_norm": 0.0016006551963278512, "learning_rate": 5.663082437275986e-06, "loss": 0.0, "step": 7000 }, { "epoch": 3.608217074107257, "grad_norm": 0.0006324168438582504, "learning_rate": 5.560675883256529e-06, "loss": 0.0, "step": 7050 }, { "epoch": 3.633815435812108, "grad_norm": 0.0019190937453439484, "learning_rate": 5.458269329237072e-06, "loss": 0.0, "step": 7100 }, { "epoch": 3.659413797516959, "grad_norm": 0.0014235404292782222, "learning_rate": 5.355862775217614e-06, "loss": 0.0, "step": 7150 }, { "epoch": 3.68501215922181, "grad_norm": 0.002036273934913596, "learning_rate": 5.253456221198157e-06, "loss": 0.0, "step": 7200 }, { "epoch": 3.710610520926661, "grad_norm": 0.001317406088761277, "learning_rate": 5.1510496671787e-06, "loss": 0.0, "step": 7250 }, { "epoch": 3.7362088826315114, "grad_norm": 0.0036103172590374257, "learning_rate": 5.0486431131592425e-06, "loss": 0.0, "step": 7300 }, { "epoch": 3.7618072443363624, "grad_norm": 0.004126819254015297, "learning_rate": 4.946236559139785e-06, "loss": 0.0, "step": 7350 }, { "epoch": 3.7874056060412133, "grad_norm": 0.0016332834938483983, "learning_rate": 4.843830005120328e-06, "loss": 0.0, "step": 7400 }, { "epoch": 3.8130039677460643, "grad_norm": 0.0004651327688046216, "learning_rate": 4.741423451100871e-06, "loss": 0.0, "step": 7450 }, { "epoch": 3.8386023294509153, "grad_norm": 0.0014442256648981874, "learning_rate": 4.639016897081414e-06, "loss": 0.0, "step": 7500 }, { "epoch": 3.864200691155766, "grad_norm": 0.0044845026985442255, "learning_rate": 4.536610343061956e-06, "loss": 0.0, "step": 7550 }, { "epoch": 3.889799052860617, "grad_norm": 0.0018809502401269417, "learning_rate": 4.434203789042499e-06, "loss": 0.0, "step": 7600 }, { "epoch": 3.9153974145654677, "grad_norm": 0.0038926669396500383, "learning_rate": 4.331797235023042e-06, "loss": 0.0, "step": 7650 }, { "epoch": 3.9409957762703187, "grad_norm": 0.004267638189231195, "learning_rate": 4.229390681003585e-06, "loss": 0.0, "step": 7700 }, { "epoch": 3.9665941379751697, "grad_norm": 0.00238145784225052, "learning_rate": 4.126984126984127e-06, "loss": 0.0, "step": 7750 }, { "epoch": 3.99219249968002, "grad_norm": 0.0014756463100693462, "learning_rate": 4.0245775729646705e-06, "loss": 0.0, "step": 7800 }, { "epoch": 4.017406885959298, "grad_norm": 0.0014856407081637794, "learning_rate": 3.9221710189452126e-06, "loss": 0.0, "step": 7850 }, { "epoch": 4.04300524766415, "grad_norm": 0.0023551139030403893, "learning_rate": 3.8197644649257554e-06, "loss": 0.0, "step": 7900 }, { "epoch": 4.068603609369, "grad_norm": 0.0011981597110419454, "learning_rate": 3.7173579109062983e-06, "loss": 0.0, "step": 7950 }, { "epoch": 4.0942019710738515, "grad_norm": 0.0028843508232214783, "learning_rate": 3.6149513568868412e-06, "loss": 0.0, "step": 8000 }, { "epoch": 4.119800332778702, "grad_norm": 0.0018843735220208588, "learning_rate": 3.5125448028673837e-06, "loss": 0.0, "step": 8050 }, { "epoch": 4.1453986944835535, "grad_norm": 0.0009664312361068302, "learning_rate": 3.4101382488479266e-06, "loss": 0.0, "step": 8100 }, { "epoch": 4.170997056188404, "grad_norm": 0.0007573250297130035, "learning_rate": 3.3077316948284695e-06, "loss": 0.0, "step": 8150 }, { "epoch": 4.1965954178932545, "grad_norm": 0.0011501400163836143, "learning_rate": 3.205325140809012e-06, "loss": 0.0, "step": 8200 }, { "epoch": 4.222193779598106, "grad_norm": 0.001612883833598221, "learning_rate": 3.1029185867895553e-06, "loss": 0.0, "step": 8250 }, { "epoch": 4.2477921413029565, "grad_norm": 0.0018641211713703483, "learning_rate": 3.0005120327700977e-06, "loss": 0.0, "step": 8300 }, { "epoch": 4.273390503007808, "grad_norm": 0.0009646184071954378, "learning_rate": 2.89810547875064e-06, "loss": 0.0, "step": 8350 }, { "epoch": 4.298988864712658, "grad_norm": 0.0009154804456547847, "learning_rate": 2.7956989247311827e-06, "loss": 0.0, "step": 8400 }, { "epoch": 4.324587226417509, "grad_norm": 0.0020826965736280532, "learning_rate": 2.693292370711726e-06, "loss": 0.0, "step": 8450 }, { "epoch": 4.35018558812236, "grad_norm": 0.0011199777755488004, "learning_rate": 2.5908858166922684e-06, "loss": 0.0, "step": 8500 }, { "epoch": 4.375783949827211, "grad_norm": 0.0008422274985046506, "learning_rate": 2.4884792626728113e-06, "loss": 0.0, "step": 8550 }, { "epoch": 4.401382311532062, "grad_norm": 0.0006334420803463363, "learning_rate": 2.386072708653354e-06, "loss": 0.0, "step": 8600 }, { "epoch": 4.426980673236913, "grad_norm": 0.001586741258322779, "learning_rate": 2.2836661546338967e-06, "loss": 0.0, "step": 8650 }, { "epoch": 4.452579034941763, "grad_norm": 0.0009727630299961603, "learning_rate": 2.1812596006144396e-06, "loss": 0.0, "step": 8700 }, { "epoch": 4.478177396646615, "grad_norm": 0.0018829318719699433, "learning_rate": 2.078853046594982e-06, "loss": 0.0, "step": 8750 }, { "epoch": 4.503775758351465, "grad_norm": 0.0010542211381998285, "learning_rate": 1.976446492575525e-06, "loss": 0.0, "step": 8800 }, { "epoch": 4.529374120056317, "grad_norm": 0.0008019247102530823, "learning_rate": 1.8740399385560678e-06, "loss": 0.0, "step": 8850 }, { "epoch": 4.554972481761167, "grad_norm": 0.0025552327229986455, "learning_rate": 1.7716333845366105e-06, "loss": 0.0, "step": 8900 }, { "epoch": 4.580570843466019, "grad_norm": 0.003395542465106546, "learning_rate": 1.6692268305171534e-06, "loss": 0.0, "step": 8950 }, { "epoch": 4.606169205170869, "grad_norm": 0.0013482230913774654, "learning_rate": 1.5668202764976959e-06, "loss": 0.0, "step": 9000 }, { "epoch": 4.63176756687572, "grad_norm": 0.0015382731152315148, "learning_rate": 1.4644137224782387e-06, "loss": 0.0, "step": 9050 }, { "epoch": 4.657365928580571, "grad_norm": 0.0019761534195212524, "learning_rate": 1.3620071684587816e-06, "loss": 0.0, "step": 9100 }, { "epoch": 4.682964290285422, "grad_norm": 0.00038991149426242233, "learning_rate": 1.259600614439324e-06, "loss": 0.0, "step": 9150 }, { "epoch": 4.708562651990272, "grad_norm": 0.002601115466918085, "learning_rate": 1.157194060419867e-06, "loss": 0.0, "step": 9200 }, { "epoch": 4.7341610136951235, "grad_norm": 0.002305779477709061, "learning_rate": 1.0547875064004097e-06, "loss": 0.0, "step": 9250 }, { "epoch": 4.759759375399974, "grad_norm": 0.0014673554584682805, "learning_rate": 9.523809523809525e-07, "loss": 0.0, "step": 9300 }, { "epoch": 4.7853577371048255, "grad_norm": 0.0032349538713087054, "learning_rate": 8.499743983614952e-07, "loss": 0.0, "step": 9350 }, { "epoch": 4.810956098809676, "grad_norm": 0.000820434912366508, "learning_rate": 7.475678443420379e-07, "loss": 0.0, "step": 9400 }, { "epoch": 4.836554460514527, "grad_norm": 0.0018281915833400296, "learning_rate": 6.451612903225807e-07, "loss": 0.0, "step": 9450 }, { "epoch": 4.862152822219378, "grad_norm": 0.001079014832848559, "learning_rate": 5.427547363031235e-07, "loss": 0.0, "step": 9500 }, { "epoch": 4.8877511839242285, "grad_norm": 0.0009643703560768136, "learning_rate": 4.4034818228366616e-07, "loss": 0.0, "step": 9550 }, { "epoch": 4.91334954562908, "grad_norm": 0.0012576968463295232, "learning_rate": 3.3794162826420895e-07, "loss": 0.0, "step": 9600 }, { "epoch": 4.93894790733393, "grad_norm": 0.0017081309137603447, "learning_rate": 2.355350742447517e-07, "loss": 0.0, "step": 9650 }, { "epoch": 4.964546269038782, "grad_norm": 0.0014382920472094229, "learning_rate": 1.3312852022529444e-07, "loss": 0.0, "step": 9700 }, { "epoch": 4.990144630743632, "grad_norm": 0.0014054516528993024, "learning_rate": 3.0721966205837177e-08, "loss": 0.0, "step": 9750 } ], "logging_steps": 50, "max_steps": 9765, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.471170805648589e+16, "train_batch_size": 6, "trial_name": null, "trial_params": null }