| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 4455, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011231222799382283, |
| "grad_norm": 7.254988670349121, |
| "learning_rate": 0.00010799999999999998, |
| "loss": 65.2664, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.022462445598764567, |
| "grad_norm": 10.046869277954102, |
| "learning_rate": 0.00022799999999999999, |
| "loss": 53.1308, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.033693668398146845, |
| "grad_norm": 6.69377326965332, |
| "learning_rate": 0.00034799999999999995, |
| "loss": 42.5968, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.04492489119752913, |
| "grad_norm": 2.674748182296753, |
| "learning_rate": 0.000468, |
| "loss": 36.9814, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.056156113996911415, |
| "grad_norm": 1.836259126663208, |
| "learning_rate": 0.000588, |
| "loss": 33.8187, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06738733679629369, |
| "grad_norm": 1.980311632156372, |
| "learning_rate": 0.0005987741203178206, |
| "loss": 31.7549, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.07861855959567599, |
| "grad_norm": 1.9080690145492554, |
| "learning_rate": 0.0005974120317820658, |
| "loss": 30.3116, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.08984978239505827, |
| "grad_norm": 2.006067991256714, |
| "learning_rate": 0.000596049943246311, |
| "loss": 29.1807, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.10108100519444055, |
| "grad_norm": 1.9598348140716553, |
| "learning_rate": 0.0005946878547105561, |
| "loss": 28.136, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.11231222799382283, |
| "grad_norm": 1.4707179069519043, |
| "learning_rate": 0.0005933257661748014, |
| "loss": 27.1126, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.12354345079320511, |
| "grad_norm": 1.9835516214370728, |
| "learning_rate": 0.0005919636776390465, |
| "loss": 26.144, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.13477467359258738, |
| "grad_norm": 1.9193854331970215, |
| "learning_rate": 0.0005906015891032917, |
| "loss": 25.4067, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.14600589639196968, |
| "grad_norm": 2.0830602645874023, |
| "learning_rate": 0.0005892395005675368, |
| "loss": 24.6646, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.15723711919135197, |
| "grad_norm": 2.2983345985412598, |
| "learning_rate": 0.0005878774120317821, |
| "loss": 23.997, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.16846834199073424, |
| "grad_norm": 1.7569265365600586, |
| "learning_rate": 0.0005865153234960272, |
| "loss": 23.4708, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.17969956479011653, |
| "grad_norm": 3.213463068008423, |
| "learning_rate": 0.0005851532349602724, |
| "loss": 22.9803, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.1909307875894988, |
| "grad_norm": 2.6588027477264404, |
| "learning_rate": 0.0005837911464245175, |
| "loss": 22.5322, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2021620103888811, |
| "grad_norm": 2.5820348262786865, |
| "learning_rate": 0.0005824290578887628, |
| "loss": 22.0044, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.21339323318826336, |
| "grad_norm": 2.5082194805145264, |
| "learning_rate": 0.0005810669693530079, |
| "loss": 21.6131, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.22462445598764566, |
| "grad_norm": 2.2405030727386475, |
| "learning_rate": 0.0005797048808172531, |
| "loss": 21.2934, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.23585567878702793, |
| "grad_norm": 2.5789482593536377, |
| "learning_rate": 0.0005783427922814982, |
| "loss": 20.8536, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.24708690158641022, |
| "grad_norm": 2.514025926589966, |
| "learning_rate": 0.0005769807037457435, |
| "loss": 20.531, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2583181243857925, |
| "grad_norm": 2.224000930786133, |
| "learning_rate": 0.0005756186152099886, |
| "loss": 20.1819, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.26954934718517476, |
| "grad_norm": 2.81223201751709, |
| "learning_rate": 0.0005742565266742338, |
| "loss": 19.9192, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2807805699845571, |
| "grad_norm": 2.691889524459839, |
| "learning_rate": 0.000572894438138479, |
| "loss": 19.629, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.29201179278393935, |
| "grad_norm": 2.8096940517425537, |
| "learning_rate": 0.0005715323496027242, |
| "loss": 19.3428, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3032430155833216, |
| "grad_norm": 2.537062406539917, |
| "learning_rate": 0.0005701702610669693, |
| "loss": 19.0967, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.31447423838270394, |
| "grad_norm": 2.8304593563079834, |
| "learning_rate": 0.0005688081725312145, |
| "loss": 18.7822, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3257054611820862, |
| "grad_norm": 3.3747401237487793, |
| "learning_rate": 0.0005674460839954597, |
| "loss": 18.6202, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3369366839814685, |
| "grad_norm": 3.1069352626800537, |
| "learning_rate": 0.0005660839954597049, |
| "loss": 18.3328, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.34816790678085074, |
| "grad_norm": 3.197521209716797, |
| "learning_rate": 0.00056472190692395, |
| "loss": 18.1676, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.35939912958023307, |
| "grad_norm": 2.2514944076538086, |
| "learning_rate": 0.0005633598183881951, |
| "loss": 17.9463, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.37063035237961534, |
| "grad_norm": 2.492508888244629, |
| "learning_rate": 0.0005619977298524404, |
| "loss": 17.7745, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.3818615751789976, |
| "grad_norm": 2.636312484741211, |
| "learning_rate": 0.0005606356413166855, |
| "loss": 17.5753, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.39309279797837987, |
| "grad_norm": 2.8358511924743652, |
| "learning_rate": 0.0005592735527809307, |
| "loss": 17.4287, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4043240207777622, |
| "grad_norm": 2.478410005569458, |
| "learning_rate": 0.0005579114642451758, |
| "loss": 17.2377, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.41555524357714446, |
| "grad_norm": 3.003167152404785, |
| "learning_rate": 0.0005565493757094211, |
| "loss": 17.0597, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.42678646637652673, |
| "grad_norm": 2.51339054107666, |
| "learning_rate": 0.0005551872871736662, |
| "loss": 16.8974, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.43801768917590905, |
| "grad_norm": 2.4287829399108887, |
| "learning_rate": 0.0005538251986379114, |
| "loss": 16.7779, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.4492489119752913, |
| "grad_norm": 2.6313817501068115, |
| "learning_rate": 0.0005524631101021566, |
| "loss": 16.6392, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4604801347746736, |
| "grad_norm": 2.333446502685547, |
| "learning_rate": 0.0005511010215664018, |
| "loss": 16.523, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.47171135757405586, |
| "grad_norm": 2.979137420654297, |
| "learning_rate": 0.0005497389330306469, |
| "loss": 16.4291, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.4829425803734382, |
| "grad_norm": 2.530829668045044, |
| "learning_rate": 0.0005483768444948921, |
| "loss": 16.2809, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.49417380317282045, |
| "grad_norm": 2.572939157485962, |
| "learning_rate": 0.0005470147559591373, |
| "loss": 16.1444, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5054050259722027, |
| "grad_norm": 2.281402349472046, |
| "learning_rate": 0.0005456526674233825, |
| "loss": 16.1148, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.516636248771585, |
| "grad_norm": 2.429608106613159, |
| "learning_rate": 0.0005442905788876276, |
| "loss": 15.9973, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.5278674715709673, |
| "grad_norm": 3.2124414443969727, |
| "learning_rate": 0.0005429284903518729, |
| "loss": 15.7978, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5390986943703495, |
| "grad_norm": 2.6288387775421143, |
| "learning_rate": 0.000541566401816118, |
| "loss": 15.7305, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5503299171697319, |
| "grad_norm": 2.923504114151001, |
| "learning_rate": 0.0005402043132803632, |
| "loss": 15.649, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5615611399691142, |
| "grad_norm": 2.313035249710083, |
| "learning_rate": 0.0005388422247446083, |
| "loss": 15.5597, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5727923627684964, |
| "grad_norm": 2.209913492202759, |
| "learning_rate": 0.0005374801362088536, |
| "loss": 15.4301, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.5840235855678787, |
| "grad_norm": 2.5013203620910645, |
| "learning_rate": 0.0005361180476730987, |
| "loss": 15.4357, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.595254808367261, |
| "grad_norm": 2.2074005603790283, |
| "learning_rate": 0.0005347559591373438, |
| "loss": 15.2997, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6064860311666432, |
| "grad_norm": 2.186513900756836, |
| "learning_rate": 0.000533393870601589, |
| "loss": 15.1875, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.6177172539660255, |
| "grad_norm": 2.2040984630584717, |
| "learning_rate": 0.0005320317820658342, |
| "loss": 15.1292, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6289484767654079, |
| "grad_norm": 2.560344696044922, |
| "learning_rate": 0.0005306696935300794, |
| "loss": 15.0857, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.6401796995647902, |
| "grad_norm": 2.844914436340332, |
| "learning_rate": 0.0005293076049943245, |
| "loss": 15.0146, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6514109223641724, |
| "grad_norm": 2.1173508167266846, |
| "learning_rate": 0.0005279455164585697, |
| "loss": 14.9086, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.6626421451635547, |
| "grad_norm": 2.6725828647613525, |
| "learning_rate": 0.0005265834279228149, |
| "loss": 14.8688, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.673873367962937, |
| "grad_norm": 2.2795088291168213, |
| "learning_rate": 0.0005252213393870601, |
| "loss": 14.7697, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6851045907623192, |
| "grad_norm": 2.5514473915100098, |
| "learning_rate": 0.0005238592508513052, |
| "loss": 14.7712, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.6963358135617015, |
| "grad_norm": 2.7515954971313477, |
| "learning_rate": 0.0005224971623155505, |
| "loss": 14.6388, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.7075670363610839, |
| "grad_norm": 2.690708637237549, |
| "learning_rate": 0.0005211350737797956, |
| "loss": 14.5995, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.7187982591604661, |
| "grad_norm": 2.461747407913208, |
| "learning_rate": 0.0005197729852440408, |
| "loss": 14.6307, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.7300294819598484, |
| "grad_norm": 2.346754550933838, |
| "learning_rate": 0.0005184108967082859, |
| "loss": 14.5505, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7412607047592307, |
| "grad_norm": 2.376952886581421, |
| "learning_rate": 0.0005170488081725312, |
| "loss": 14.4045, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.7524919275586129, |
| "grad_norm": 2.421809673309326, |
| "learning_rate": 0.0005156867196367763, |
| "loss": 14.4317, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7637231503579952, |
| "grad_norm": 2.1633081436157227, |
| "learning_rate": 0.0005143246311010215, |
| "loss": 14.39, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7749543731573775, |
| "grad_norm": 2.3786990642547607, |
| "learning_rate": 0.0005129625425652666, |
| "loss": 14.3502, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.7861855959567597, |
| "grad_norm": 2.113426685333252, |
| "learning_rate": 0.0005116004540295119, |
| "loss": 14.3406, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.7974168187561421, |
| "grad_norm": 2.3934221267700195, |
| "learning_rate": 0.000510238365493757, |
| "loss": 14.1971, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.8086480415555244, |
| "grad_norm": 2.2229974269866943, |
| "learning_rate": 0.0005088762769580022, |
| "loss": 14.1345, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.8198792643549067, |
| "grad_norm": 2.368398427963257, |
| "learning_rate": 0.0005075141884222473, |
| "loss": 14.1697, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.8311104871542889, |
| "grad_norm": 2.0480265617370605, |
| "learning_rate": 0.0005061520998864926, |
| "loss": 14.109, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.8423417099536712, |
| "grad_norm": 2.5201494693756104, |
| "learning_rate": 0.0005047900113507377, |
| "loss": 14.0595, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8535729327530535, |
| "grad_norm": 2.2030587196350098, |
| "learning_rate": 0.0005034279228149829, |
| "loss": 14.0706, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.8648041555524357, |
| "grad_norm": 2.3631434440612793, |
| "learning_rate": 0.0005020658342792281, |
| "loss": 13.9638, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.8760353783518181, |
| "grad_norm": 2.200186252593994, |
| "learning_rate": 0.0005007037457434733, |
| "loss": 13.9289, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.8872666011512004, |
| "grad_norm": 1.9684851169586182, |
| "learning_rate": 0.0004993416572077184, |
| "loss": 13.7841, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.8984978239505826, |
| "grad_norm": 2.311785936355591, |
| "learning_rate": 0.0004979795686719636, |
| "loss": 13.8482, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9097290467499649, |
| "grad_norm": 2.169306755065918, |
| "learning_rate": 0.0004966174801362088, |
| "loss": 13.7793, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.9209602695493472, |
| "grad_norm": 2.2637839317321777, |
| "learning_rate": 0.000495255391600454, |
| "loss": 13.8025, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.9321914923487294, |
| "grad_norm": 2.4545624256134033, |
| "learning_rate": 0.0004938933030646991, |
| "loss": 13.7582, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.9434227151481117, |
| "grad_norm": 1.94866144657135, |
| "learning_rate": 0.0004925312145289444, |
| "loss": 13.6945, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.954653937947494, |
| "grad_norm": 2.5071635246276855, |
| "learning_rate": 0.0004911691259931895, |
| "loss": 13.7015, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.9658851607468764, |
| "grad_norm": 2.1163992881774902, |
| "learning_rate": 0.0004898070374574347, |
| "loss": 13.6065, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.9771163835462586, |
| "grad_norm": 2.0031838417053223, |
| "learning_rate": 0.0004884449489216798, |
| "loss": 13.6221, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.9883476063456409, |
| "grad_norm": 1.9967576265335083, |
| "learning_rate": 0.00048708286038592506, |
| "loss": 13.5372, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.9995788291450232, |
| "grad_norm": 1.9578522443771362, |
| "learning_rate": 0.0004857207718501702, |
| "loss": 13.479, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.010108100519444, |
| "grad_norm": 2.088886022567749, |
| "learning_rate": 0.0004843586833144154, |
| "loss": 12.5586, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.0213393233188264, |
| "grad_norm": 1.9549593925476074, |
| "learning_rate": 0.00048299659477866054, |
| "loss": 13.3791, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.0325705461182086, |
| "grad_norm": 1.9387295246124268, |
| "learning_rate": 0.00048163450624290577, |
| "loss": 13.4606, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.043801768917591, |
| "grad_norm": 1.9079780578613281, |
| "learning_rate": 0.0004802724177071509, |
| "loss": 13.3116, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.0550329917169732, |
| "grad_norm": 2.125767230987549, |
| "learning_rate": 0.0004789103291713961, |
| "loss": 13.3492, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.0662642145163554, |
| "grad_norm": 1.894142508506775, |
| "learning_rate": 0.00047754824063564124, |
| "loss": 13.3045, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.0774954373157377, |
| "grad_norm": 2.1038496494293213, |
| "learning_rate": 0.0004761861520998865, |
| "loss": 13.2968, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.08872666011512, |
| "grad_norm": 2.0055999755859375, |
| "learning_rate": 0.0004748240635641316, |
| "loss": 13.2548, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.0999578829145022, |
| "grad_norm": 1.8585240840911865, |
| "learning_rate": 0.00047346197502837683, |
| "loss": 13.1873, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.1111891057138845, |
| "grad_norm": 1.9325159788131714, |
| "learning_rate": 0.00047209988649262195, |
| "loss": 13.2134, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.1224203285132668, |
| "grad_norm": 1.9729729890823364, |
| "learning_rate": 0.0004707377979568672, |
| "loss": 13.2012, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.1336515513126493, |
| "grad_norm": 1.8888603448867798, |
| "learning_rate": 0.0004693757094211123, |
| "loss": 13.1582, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.1448827741120315, |
| "grad_norm": 2.0408947467803955, |
| "learning_rate": 0.00046801362088535754, |
| "loss": 13.1612, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.1561139969114138, |
| "grad_norm": 1.85500168800354, |
| "learning_rate": 0.00046665153234960266, |
| "loss": 13.1639, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.167345219710796, |
| "grad_norm": 1.9741461277008057, |
| "learning_rate": 0.0004652894438138479, |
| "loss": 13.1511, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.1785764425101783, |
| "grad_norm": 1.9243193864822388, |
| "learning_rate": 0.000463927355278093, |
| "loss": 13.0421, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.1898076653095606, |
| "grad_norm": 1.907875657081604, |
| "learning_rate": 0.00046256526674233825, |
| "loss": 13.0895, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.2010388881089429, |
| "grad_norm": 2.068942070007324, |
| "learning_rate": 0.00046120317820658337, |
| "loss": 12.9636, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.2122701109083251, |
| "grad_norm": 1.7963610887527466, |
| "learning_rate": 0.0004598410896708286, |
| "loss": 13.0476, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.2235013337077074, |
| "grad_norm": 2.0973403453826904, |
| "learning_rate": 0.0004584790011350737, |
| "loss": 13.017, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.2347325565070897, |
| "grad_norm": 1.749234676361084, |
| "learning_rate": 0.00045711691259931895, |
| "loss": 12.9918, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.245963779306472, |
| "grad_norm": 2.021134853363037, |
| "learning_rate": 0.0004557548240635641, |
| "loss": 12.9339, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.2571950021058542, |
| "grad_norm": 1.840922236442566, |
| "learning_rate": 0.00045439273552780925, |
| "loss": 12.9677, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.2684262249052365, |
| "grad_norm": 1.932059645652771, |
| "learning_rate": 0.00045303064699205443, |
| "loss": 12.9547, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.279657447704619, |
| "grad_norm": 2.177213668823242, |
| "learning_rate": 0.0004516685584562996, |
| "loss": 12.8664, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.290888670504001, |
| "grad_norm": 2.070005178451538, |
| "learning_rate": 0.0004503064699205448, |
| "loss": 12.8585, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.3021198933033835, |
| "grad_norm": 1.832366943359375, |
| "learning_rate": 0.00044894438138478996, |
| "loss": 12.8719, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.3133511161027656, |
| "grad_norm": 1.9845136404037476, |
| "learning_rate": 0.00044758229284903514, |
| "loss": 12.8424, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.324582338902148, |
| "grad_norm": 1.8147252798080444, |
| "learning_rate": 0.0004462202043132803, |
| "loss": 12.83, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.3358135617015303, |
| "grad_norm": 2.107008695602417, |
| "learning_rate": 0.0004448581157775255, |
| "loss": 12.7765, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.3470447845009126, |
| "grad_norm": 1.7466599941253662, |
| "learning_rate": 0.00044349602724177067, |
| "loss": 12.7204, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.3582760073002949, |
| "grad_norm": 1.7873570919036865, |
| "learning_rate": 0.00044213393870601585, |
| "loss": 12.7782, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.3695072300996771, |
| "grad_norm": 2.079206943511963, |
| "learning_rate": 0.000440771850170261, |
| "loss": 12.7326, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.3807384528990594, |
| "grad_norm": 1.7911771535873413, |
| "learning_rate": 0.0004394097616345062, |
| "loss": 12.7422, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.3919696756984417, |
| "grad_norm": 1.901097297668457, |
| "learning_rate": 0.0004380476730987514, |
| "loss": 12.7185, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.403200898497824, |
| "grad_norm": 1.826768398284912, |
| "learning_rate": 0.00043668558456299656, |
| "loss": 12.7017, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.4144321212972062, |
| "grad_norm": 1.7345402240753174, |
| "learning_rate": 0.00043532349602724173, |
| "loss": 12.6704, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.4256633440965885, |
| "grad_norm": 1.9229341745376587, |
| "learning_rate": 0.0004339614074914869, |
| "loss": 12.683, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.4368945668959707, |
| "grad_norm": 1.6749757528305054, |
| "learning_rate": 0.0004325993189557321, |
| "loss": 12.6501, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.4481257896953532, |
| "grad_norm": 1.812778353691101, |
| "learning_rate": 0.00043123723041997727, |
| "loss": 12.6529, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.4593570124947353, |
| "grad_norm": 1.7584545612335205, |
| "learning_rate": 0.00042987514188422244, |
| "loss": 12.6065, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.4705882352941178, |
| "grad_norm": 1.7893540859222412, |
| "learning_rate": 0.0004285130533484676, |
| "loss": 12.6011, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.4818194580934998, |
| "grad_norm": 1.8022955656051636, |
| "learning_rate": 0.0004271509648127128, |
| "loss": 12.5883, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.4930506808928823, |
| "grad_norm": 2.1660964488983154, |
| "learning_rate": 0.0004257888762769579, |
| "loss": 12.578, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.5042819036922643, |
| "grad_norm": 2.0054919719696045, |
| "learning_rate": 0.00042442678774120315, |
| "loss": 12.4797, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.5155131264916468, |
| "grad_norm": 1.6194941997528076, |
| "learning_rate": 0.0004230646992054483, |
| "loss": 12.475, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.526744349291029, |
| "grad_norm": 1.7826400995254517, |
| "learning_rate": 0.0004217026106696935, |
| "loss": 12.4979, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.5379755720904114, |
| "grad_norm": 1.7084046602249146, |
| "learning_rate": 0.00042034052213393863, |
| "loss": 12.5252, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.5492067948897936, |
| "grad_norm": 1.7256839275360107, |
| "learning_rate": 0.00041897843359818386, |
| "loss": 12.4617, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.560438017689176, |
| "grad_norm": 1.8508225679397583, |
| "learning_rate": 0.000417616345062429, |
| "loss": 12.5004, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.5716692404885582, |
| "grad_norm": 1.6339643001556396, |
| "learning_rate": 0.0004162542565266742, |
| "loss": 12.4315, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.5829004632879404, |
| "grad_norm": 1.8081163167953491, |
| "learning_rate": 0.00041489216799091934, |
| "loss": 12.4165, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.594131686087323, |
| "grad_norm": 1.8224244117736816, |
| "learning_rate": 0.00041353007945516457, |
| "loss": 12.4136, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.605362908886705, |
| "grad_norm": 1.8123489618301392, |
| "learning_rate": 0.0004121679909194097, |
| "loss": 12.4247, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.6165941316860875, |
| "grad_norm": 2.0198097229003906, |
| "learning_rate": 0.0004108059023836549, |
| "loss": 12.4179, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.6278253544854695, |
| "grad_norm": 1.759125828742981, |
| "learning_rate": 0.00040944381384790005, |
| "loss": 12.393, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.639056577284852, |
| "grad_norm": 1.768356442451477, |
| "learning_rate": 0.0004080817253121453, |
| "loss": 12.3846, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.650287800084234, |
| "grad_norm": 1.7623116970062256, |
| "learning_rate": 0.0004067196367763904, |
| "loss": 12.3867, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.6615190228836165, |
| "grad_norm": 1.7670810222625732, |
| "learning_rate": 0.00040535754824063563, |
| "loss": 12.3861, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.6727502456829986, |
| "grad_norm": 1.7134902477264404, |
| "learning_rate": 0.00040399545970488075, |
| "loss": 12.3066, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.683981468482381, |
| "grad_norm": 1.7663081884384155, |
| "learning_rate": 0.000402633371169126, |
| "loss": 12.3453, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.6952126912817633, |
| "grad_norm": 2.0105090141296387, |
| "learning_rate": 0.0004012712826333711, |
| "loss": 12.3071, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.7064439140811456, |
| "grad_norm": 1.7803105115890503, |
| "learning_rate": 0.00039990919409761634, |
| "loss": 12.3537, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.7176751368805279, |
| "grad_norm": 1.676762342453003, |
| "learning_rate": 0.00039854710556186146, |
| "loss": 12.3329, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.7289063596799101, |
| "grad_norm": 1.678467035293579, |
| "learning_rate": 0.0003971850170261067, |
| "loss": 12.2744, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.7401375824792924, |
| "grad_norm": 1.5558372735977173, |
| "learning_rate": 0.0003958229284903518, |
| "loss": 12.2728, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.7513688052786747, |
| "grad_norm": 1.8932992219924927, |
| "learning_rate": 0.00039446083995459705, |
| "loss": 12.244, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.7626000280780572, |
| "grad_norm": 1.5952329635620117, |
| "learning_rate": 0.00039309875141884217, |
| "loss": 12.2526, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.7738312508774392, |
| "grad_norm": 1.7214975357055664, |
| "learning_rate": 0.0003917366628830874, |
| "loss": 12.2495, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.7850624736768217, |
| "grad_norm": 1.686092495918274, |
| "learning_rate": 0.0003903745743473325, |
| "loss": 12.2539, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.7962936964762037, |
| "grad_norm": 1.7045249938964844, |
| "learning_rate": 0.00038901248581157776, |
| "loss": 12.229, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.8075249192755862, |
| "grad_norm": 1.591776728630066, |
| "learning_rate": 0.0003876503972758229, |
| "loss": 12.2094, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.8187561420749683, |
| "grad_norm": 1.7864668369293213, |
| "learning_rate": 0.0003862883087400681, |
| "loss": 12.1677, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.8299873648743508, |
| "grad_norm": 1.6069233417510986, |
| "learning_rate": 0.00038492622020431323, |
| "loss": 12.2168, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.841218587673733, |
| "grad_norm": 1.5101128816604614, |
| "learning_rate": 0.00038356413166855846, |
| "loss": 12.2165, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.8524498104731153, |
| "grad_norm": 1.675972819328308, |
| "learning_rate": 0.0003822020431328036, |
| "loss": 12.1509, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.8636810332724976, |
| "grad_norm": 1.5653135776519775, |
| "learning_rate": 0.00038083995459704876, |
| "loss": 12.1905, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.8749122560718798, |
| "grad_norm": 1.6895561218261719, |
| "learning_rate": 0.00037947786606129394, |
| "loss": 12.1049, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.886143478871262, |
| "grad_norm": 1.6613564491271973, |
| "learning_rate": 0.0003781157775255391, |
| "loss": 12.1453, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.8973747016706444, |
| "grad_norm": 1.5011565685272217, |
| "learning_rate": 0.0003767536889897843, |
| "loss": 12.1299, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.9086059244700266, |
| "grad_norm": 1.6737213134765625, |
| "learning_rate": 0.00037539160045402947, |
| "loss": 12.1812, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.919837147269409, |
| "grad_norm": 1.6121188402175903, |
| "learning_rate": 0.00037402951191827465, |
| "loss": 12.1656, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.9310683700687914, |
| "grad_norm": 1.527930736541748, |
| "learning_rate": 0.0003726674233825198, |
| "loss": 12.0239, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.9422995928681734, |
| "grad_norm": 1.6483533382415771, |
| "learning_rate": 0.000371305334846765, |
| "loss": 12.1018, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.953530815667556, |
| "grad_norm": 1.6136051416397095, |
| "learning_rate": 0.0003699432463110102, |
| "loss": 12.0595, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.964762038466938, |
| "grad_norm": 1.7923524379730225, |
| "learning_rate": 0.00036858115777525536, |
| "loss": 12.0446, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.9759932612663205, |
| "grad_norm": 1.732296109199524, |
| "learning_rate": 0.00036721906923950054, |
| "loss": 12.1205, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.9872244840657025, |
| "grad_norm": 1.595475435256958, |
| "learning_rate": 0.0003658569807037457, |
| "loss": 12.0134, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.998455706865085, |
| "grad_norm": 1.620702862739563, |
| "learning_rate": 0.0003644948921679909, |
| "loss": 12.0375, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.0089849782395057, |
| "grad_norm": 1.9543509483337402, |
| "learning_rate": 0.00036313280363223607, |
| "loss": 11.248, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.020216201038888, |
| "grad_norm": 1.726154088973999, |
| "learning_rate": 0.00036177071509648124, |
| "loss": 11.8804, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.0314474238382703, |
| "grad_norm": 1.61319899559021, |
| "learning_rate": 0.0003604086265607264, |
| "loss": 11.8473, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.0426786466376528, |
| "grad_norm": 1.5679258108139038, |
| "learning_rate": 0.0003590465380249716, |
| "loss": 11.8905, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.053909869437035, |
| "grad_norm": 1.5252978801727295, |
| "learning_rate": 0.0003576844494892168, |
| "loss": 11.9025, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.0651410922364173, |
| "grad_norm": 1.866320252418518, |
| "learning_rate": 0.00035632236095346195, |
| "loss": 11.8789, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.0763723150357993, |
| "grad_norm": 1.656232476234436, |
| "learning_rate": 0.00035496027241770713, |
| "loss": 11.8793, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.087603537835182, |
| "grad_norm": 1.6428873538970947, |
| "learning_rate": 0.0003535981838819523, |
| "loss": 11.9116, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.0988347606345643, |
| "grad_norm": 1.5620014667510986, |
| "learning_rate": 0.00035223609534619743, |
| "loss": 11.7622, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.1100659834339464, |
| "grad_norm": 1.6203725337982178, |
| "learning_rate": 0.00035087400681044266, |
| "loss": 11.8528, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.121297206233329, |
| "grad_norm": 1.610343337059021, |
| "learning_rate": 0.0003495119182746878, |
| "loss": 11.8451, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.132528429032711, |
| "grad_norm": 1.7108615636825562, |
| "learning_rate": 0.000348149829738933, |
| "loss": 11.8686, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.1437596518320934, |
| "grad_norm": 1.6075197458267212, |
| "learning_rate": 0.00034678774120317814, |
| "loss": 11.8373, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.1549908746314754, |
| "grad_norm": 1.6820300817489624, |
| "learning_rate": 0.00034542565266742337, |
| "loss": 11.8662, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.166222097430858, |
| "grad_norm": 1.7435542345046997, |
| "learning_rate": 0.0003440635641316685, |
| "loss": 11.7815, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.17745332023024, |
| "grad_norm": 1.6679948568344116, |
| "learning_rate": 0.0003427014755959137, |
| "loss": 11.8364, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.1886845430296225, |
| "grad_norm": 1.6212760210037231, |
| "learning_rate": 0.00034133938706015885, |
| "loss": 11.7716, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.1999157658290045, |
| "grad_norm": 1.5176148414611816, |
| "learning_rate": 0.0003399772985244041, |
| "loss": 11.7938, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.211146988628387, |
| "grad_norm": 1.5583223104476929, |
| "learning_rate": 0.0003386152099886492, |
| "loss": 11.7711, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.222378211427769, |
| "grad_norm": 1.768452763557434, |
| "learning_rate": 0.00033725312145289443, |
| "loss": 11.816, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.2336094342271515, |
| "grad_norm": 1.5718501806259155, |
| "learning_rate": 0.00033589103291713955, |
| "loss": 11.8198, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.2448406570265336, |
| "grad_norm": 1.5841343402862549, |
| "learning_rate": 0.0003345289443813848, |
| "loss": 11.7875, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.256071879825916, |
| "grad_norm": 1.8200368881225586, |
| "learning_rate": 0.0003331668558456299, |
| "loss": 11.7819, |
| "step": 2010 |
| }, |
| { |
| "epoch": 2.2673031026252985, |
| "grad_norm": 1.549752116203308, |
| "learning_rate": 0.00033180476730987514, |
| "loss": 11.751, |
| "step": 2020 |
| }, |
| { |
| "epoch": 2.2785343254246806, |
| "grad_norm": 1.4725509881973267, |
| "learning_rate": 0.00033044267877412026, |
| "loss": 11.6703, |
| "step": 2030 |
| }, |
| { |
| "epoch": 2.289765548224063, |
| "grad_norm": 1.7251125574111938, |
| "learning_rate": 0.0003290805902383655, |
| "loss": 11.7971, |
| "step": 2040 |
| }, |
| { |
| "epoch": 2.300996771023445, |
| "grad_norm": 1.8287060260772705, |
| "learning_rate": 0.0003277185017026106, |
| "loss": 11.7656, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.3122279938228276, |
| "grad_norm": 1.6441352367401123, |
| "learning_rate": 0.00032635641316685585, |
| "loss": 11.7444, |
| "step": 2060 |
| }, |
| { |
| "epoch": 2.3234592166222097, |
| "grad_norm": 1.675087809562683, |
| "learning_rate": 0.00032499432463110097, |
| "loss": 11.7359, |
| "step": 2070 |
| }, |
| { |
| "epoch": 2.334690439421592, |
| "grad_norm": 1.5940701961517334, |
| "learning_rate": 0.0003236322360953462, |
| "loss": 11.7006, |
| "step": 2080 |
| }, |
| { |
| "epoch": 2.345921662220974, |
| "grad_norm": 1.616356372833252, |
| "learning_rate": 0.0003222701475595913, |
| "loss": 11.7267, |
| "step": 2090 |
| }, |
| { |
| "epoch": 2.3571528850203567, |
| "grad_norm": 1.6495846509933472, |
| "learning_rate": 0.00032090805902383656, |
| "loss": 11.6705, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.3683841078197387, |
| "grad_norm": 1.666407585144043, |
| "learning_rate": 0.0003195459704880817, |
| "loss": 11.7987, |
| "step": 2110 |
| }, |
| { |
| "epoch": 2.3796153306191212, |
| "grad_norm": 1.4373139142990112, |
| "learning_rate": 0.0003181838819523269, |
| "loss": 11.7374, |
| "step": 2120 |
| }, |
| { |
| "epoch": 2.3908465534185033, |
| "grad_norm": 1.5677517652511597, |
| "learning_rate": 0.00031682179341657203, |
| "loss": 11.7455, |
| "step": 2130 |
| }, |
| { |
| "epoch": 2.4020777762178858, |
| "grad_norm": 1.6688272953033447, |
| "learning_rate": 0.00031545970488081727, |
| "loss": 11.6604, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.413308999017268, |
| "grad_norm": 1.5364110469818115, |
| "learning_rate": 0.0003140976163450624, |
| "loss": 11.702, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.4245402218166503, |
| "grad_norm": 1.6387137174606323, |
| "learning_rate": 0.0003127355278093076, |
| "loss": 11.6858, |
| "step": 2160 |
| }, |
| { |
| "epoch": 2.435771444616033, |
| "grad_norm": 1.5507831573486328, |
| "learning_rate": 0.00031137343927355274, |
| "loss": 11.6422, |
| "step": 2170 |
| }, |
| { |
| "epoch": 2.447002667415415, |
| "grad_norm": 1.4561623334884644, |
| "learning_rate": 0.000310011350737798, |
| "loss": 11.6273, |
| "step": 2180 |
| }, |
| { |
| "epoch": 2.4582338902147973, |
| "grad_norm": 1.5597074031829834, |
| "learning_rate": 0.0003086492622020431, |
| "loss": 11.6417, |
| "step": 2190 |
| }, |
| { |
| "epoch": 2.4694651130141794, |
| "grad_norm": 1.5203100442886353, |
| "learning_rate": 0.0003072871736662883, |
| "loss": 11.6372, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.480696335813562, |
| "grad_norm": 1.50716233253479, |
| "learning_rate": 0.00030592508513053345, |
| "loss": 11.6546, |
| "step": 2210 |
| }, |
| { |
| "epoch": 2.491927558612944, |
| "grad_norm": 1.615830898284912, |
| "learning_rate": 0.00030456299659477863, |
| "loss": 11.6139, |
| "step": 2220 |
| }, |
| { |
| "epoch": 2.5031587814123264, |
| "grad_norm": 1.479765772819519, |
| "learning_rate": 0.0003032009080590238, |
| "loss": 11.6222, |
| "step": 2230 |
| }, |
| { |
| "epoch": 2.5143900042117084, |
| "grad_norm": 1.5284301042556763, |
| "learning_rate": 0.000301838819523269, |
| "loss": 11.6529, |
| "step": 2240 |
| }, |
| { |
| "epoch": 2.525621227011091, |
| "grad_norm": 1.5398808717727661, |
| "learning_rate": 0.00030047673098751416, |
| "loss": 11.651, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.536852449810473, |
| "grad_norm": 1.526878833770752, |
| "learning_rate": 0.00029911464245175934, |
| "loss": 11.5892, |
| "step": 2260 |
| }, |
| { |
| "epoch": 2.5480836726098555, |
| "grad_norm": 1.5637303590774536, |
| "learning_rate": 0.0002977525539160045, |
| "loss": 11.6498, |
| "step": 2270 |
| }, |
| { |
| "epoch": 2.559314895409238, |
| "grad_norm": 1.711832046508789, |
| "learning_rate": 0.0002963904653802497, |
| "loss": 11.551, |
| "step": 2280 |
| }, |
| { |
| "epoch": 2.57054611820862, |
| "grad_norm": 1.5033766031265259, |
| "learning_rate": 0.00029502837684449487, |
| "loss": 11.6127, |
| "step": 2290 |
| }, |
| { |
| "epoch": 2.581777341008002, |
| "grad_norm": 1.455689787864685, |
| "learning_rate": 0.00029366628830874004, |
| "loss": 11.5994, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.5930085638073845, |
| "grad_norm": 1.6426947116851807, |
| "learning_rate": 0.0002923041997729852, |
| "loss": 11.6214, |
| "step": 2310 |
| }, |
| { |
| "epoch": 2.604239786606767, |
| "grad_norm": 1.5379929542541504, |
| "learning_rate": 0.0002909421112372304, |
| "loss": 11.5727, |
| "step": 2320 |
| }, |
| { |
| "epoch": 2.615471009406149, |
| "grad_norm": 1.5243467092514038, |
| "learning_rate": 0.0002895800227014756, |
| "loss": 11.576, |
| "step": 2330 |
| }, |
| { |
| "epoch": 2.626702232205531, |
| "grad_norm": 1.562593936920166, |
| "learning_rate": 0.00028821793416572075, |
| "loss": 11.641, |
| "step": 2340 |
| }, |
| { |
| "epoch": 2.6379334550049136, |
| "grad_norm": 1.5288798809051514, |
| "learning_rate": 0.00028685584562996593, |
| "loss": 11.5637, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.649164677804296, |
| "grad_norm": 1.8018691539764404, |
| "learning_rate": 0.0002854937570942111, |
| "loss": 11.4962, |
| "step": 2360 |
| }, |
| { |
| "epoch": 2.660395900603678, |
| "grad_norm": 1.77366304397583, |
| "learning_rate": 0.0002841316685584563, |
| "loss": 11.5311, |
| "step": 2370 |
| }, |
| { |
| "epoch": 2.6716271234030606, |
| "grad_norm": 1.500279188156128, |
| "learning_rate": 0.00028276958002270146, |
| "loss": 11.5499, |
| "step": 2380 |
| }, |
| { |
| "epoch": 2.6828583462024427, |
| "grad_norm": 1.6834344863891602, |
| "learning_rate": 0.00028140749148694664, |
| "loss": 11.5187, |
| "step": 2390 |
| }, |
| { |
| "epoch": 2.694089569001825, |
| "grad_norm": 1.4962844848632812, |
| "learning_rate": 0.0002800454029511918, |
| "loss": 11.5505, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.705320791801207, |
| "grad_norm": 1.4639493227005005, |
| "learning_rate": 0.00027868331441543694, |
| "loss": 11.5381, |
| "step": 2410 |
| }, |
| { |
| "epoch": 2.7165520146005897, |
| "grad_norm": 1.5657707452774048, |
| "learning_rate": 0.0002773212258796821, |
| "loss": 11.5474, |
| "step": 2420 |
| }, |
| { |
| "epoch": 2.727783237399972, |
| "grad_norm": 1.5155985355377197, |
| "learning_rate": 0.0002759591373439273, |
| "loss": 11.5487, |
| "step": 2430 |
| }, |
| { |
| "epoch": 2.7390144601993542, |
| "grad_norm": 1.6336228847503662, |
| "learning_rate": 0.00027459704880817247, |
| "loss": 11.5191, |
| "step": 2440 |
| }, |
| { |
| "epoch": 2.7502456829987363, |
| "grad_norm": 1.4850879907608032, |
| "learning_rate": 0.00027323496027241765, |
| "loss": 11.4788, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.7614769057981188, |
| "grad_norm": 1.5942481756210327, |
| "learning_rate": 0.0002718728717366628, |
| "loss": 11.5141, |
| "step": 2460 |
| }, |
| { |
| "epoch": 2.7727081285975013, |
| "grad_norm": 1.5158549547195435, |
| "learning_rate": 0.000270510783200908, |
| "loss": 11.4485, |
| "step": 2470 |
| }, |
| { |
| "epoch": 2.7839393513968833, |
| "grad_norm": 1.6702396869659424, |
| "learning_rate": 0.0002691486946651532, |
| "loss": 11.5334, |
| "step": 2480 |
| }, |
| { |
| "epoch": 2.7951705741962654, |
| "grad_norm": 1.5319764614105225, |
| "learning_rate": 0.00026778660612939836, |
| "loss": 11.4425, |
| "step": 2490 |
| }, |
| { |
| "epoch": 2.806401796995648, |
| "grad_norm": 1.463173270225525, |
| "learning_rate": 0.00026642451759364353, |
| "loss": 11.4739, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.8176330197950303, |
| "grad_norm": 1.7062861919403076, |
| "learning_rate": 0.0002650624290578887, |
| "loss": 11.4848, |
| "step": 2510 |
| }, |
| { |
| "epoch": 2.8288642425944124, |
| "grad_norm": 1.5613621473312378, |
| "learning_rate": 0.0002637003405221339, |
| "loss": 11.5269, |
| "step": 2520 |
| }, |
| { |
| "epoch": 2.840095465393795, |
| "grad_norm": 1.6942760944366455, |
| "learning_rate": 0.00026233825198637906, |
| "loss": 11.5173, |
| "step": 2530 |
| }, |
| { |
| "epoch": 2.851326688193177, |
| "grad_norm": 1.4133695363998413, |
| "learning_rate": 0.00026097616345062424, |
| "loss": 11.4846, |
| "step": 2540 |
| }, |
| { |
| "epoch": 2.8625579109925594, |
| "grad_norm": 1.4533722400665283, |
| "learning_rate": 0.0002596140749148694, |
| "loss": 11.5119, |
| "step": 2550 |
| }, |
| { |
| "epoch": 2.8737891337919415, |
| "grad_norm": 1.407906174659729, |
| "learning_rate": 0.0002582519863791146, |
| "loss": 11.4717, |
| "step": 2560 |
| }, |
| { |
| "epoch": 2.885020356591324, |
| "grad_norm": 1.4731615781784058, |
| "learning_rate": 0.00025688989784335977, |
| "loss": 11.434, |
| "step": 2570 |
| }, |
| { |
| "epoch": 2.8962515793907064, |
| "grad_norm": 1.702810287475586, |
| "learning_rate": 0.00025552780930760495, |
| "loss": 11.4225, |
| "step": 2580 |
| }, |
| { |
| "epoch": 2.9074828021900885, |
| "grad_norm": 1.5021584033966064, |
| "learning_rate": 0.0002541657207718501, |
| "loss": 11.5014, |
| "step": 2590 |
| }, |
| { |
| "epoch": 2.9187140249894705, |
| "grad_norm": 1.52582848072052, |
| "learning_rate": 0.0002528036322360953, |
| "loss": 11.5105, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.929945247788853, |
| "grad_norm": 1.552182912826538, |
| "learning_rate": 0.0002514415437003405, |
| "loss": 11.3703, |
| "step": 2610 |
| }, |
| { |
| "epoch": 2.9411764705882355, |
| "grad_norm": 1.5947136878967285, |
| "learning_rate": 0.00025007945516458566, |
| "loss": 11.4672, |
| "step": 2620 |
| }, |
| { |
| "epoch": 2.9524076933876175, |
| "grad_norm": 1.451735496520996, |
| "learning_rate": 0.00024871736662883083, |
| "loss": 11.3825, |
| "step": 2630 |
| }, |
| { |
| "epoch": 2.9636389161869996, |
| "grad_norm": 1.5744386911392212, |
| "learning_rate": 0.000247355278093076, |
| "loss": 11.3662, |
| "step": 2640 |
| }, |
| { |
| "epoch": 2.974870138986382, |
| "grad_norm": 1.52872633934021, |
| "learning_rate": 0.0002459931895573212, |
| "loss": 11.3914, |
| "step": 2650 |
| }, |
| { |
| "epoch": 2.9861013617857646, |
| "grad_norm": 1.5340864658355713, |
| "learning_rate": 0.00024463110102156637, |
| "loss": 11.3522, |
| "step": 2660 |
| }, |
| { |
| "epoch": 2.9973325845851466, |
| "grad_norm": 1.5574575662612915, |
| "learning_rate": 0.00024326901248581154, |
| "loss": 11.3726, |
| "step": 2670 |
| }, |
| { |
| "epoch": 3.007861855959568, |
| "grad_norm": 1.4836827516555786, |
| "learning_rate": 0.00024190692395005672, |
| "loss": 10.6008, |
| "step": 2680 |
| }, |
| { |
| "epoch": 3.01909307875895, |
| "grad_norm": 1.4343681335449219, |
| "learning_rate": 0.0002405448354143019, |
| "loss": 11.2479, |
| "step": 2690 |
| }, |
| { |
| "epoch": 3.0303243015583323, |
| "grad_norm": 1.5032552480697632, |
| "learning_rate": 0.00023918274687854707, |
| "loss": 11.2719, |
| "step": 2700 |
| }, |
| { |
| "epoch": 3.0415555243577144, |
| "grad_norm": 1.6779489517211914, |
| "learning_rate": 0.00023782065834279225, |
| "loss": 11.2446, |
| "step": 2710 |
| }, |
| { |
| "epoch": 3.052786747157097, |
| "grad_norm": 1.5352118015289307, |
| "learning_rate": 0.00023645856980703743, |
| "loss": 11.2898, |
| "step": 2720 |
| }, |
| { |
| "epoch": 3.064017969956479, |
| "grad_norm": 1.4970052242279053, |
| "learning_rate": 0.0002350964812712826, |
| "loss": 11.2933, |
| "step": 2730 |
| }, |
| { |
| "epoch": 3.0752491927558614, |
| "grad_norm": 1.4887489080429077, |
| "learning_rate": 0.00023373439273552778, |
| "loss": 11.2404, |
| "step": 2740 |
| }, |
| { |
| "epoch": 3.0864804155552434, |
| "grad_norm": 1.4684367179870605, |
| "learning_rate": 0.00023237230419977296, |
| "loss": 11.251, |
| "step": 2750 |
| }, |
| { |
| "epoch": 3.097711638354626, |
| "grad_norm": 1.5003316402435303, |
| "learning_rate": 0.00023101021566401814, |
| "loss": 11.2742, |
| "step": 2760 |
| }, |
| { |
| "epoch": 3.108942861154008, |
| "grad_norm": 1.5525890588760376, |
| "learning_rate": 0.00022964812712826331, |
| "loss": 11.2605, |
| "step": 2770 |
| }, |
| { |
| "epoch": 3.1201740839533905, |
| "grad_norm": 1.5025476217269897, |
| "learning_rate": 0.0002282860385925085, |
| "loss": 11.2734, |
| "step": 2780 |
| }, |
| { |
| "epoch": 3.1314053067527725, |
| "grad_norm": 1.5809471607208252, |
| "learning_rate": 0.00022692395005675367, |
| "loss": 11.2549, |
| "step": 2790 |
| }, |
| { |
| "epoch": 3.142636529552155, |
| "grad_norm": 1.6025090217590332, |
| "learning_rate": 0.00022556186152099885, |
| "loss": 11.2199, |
| "step": 2800 |
| }, |
| { |
| "epoch": 3.1538677523515375, |
| "grad_norm": 1.5650960206985474, |
| "learning_rate": 0.000224199772985244, |
| "loss": 11.2323, |
| "step": 2810 |
| }, |
| { |
| "epoch": 3.1650989751509195, |
| "grad_norm": 1.5035734176635742, |
| "learning_rate": 0.00022283768444948917, |
| "loss": 11.2694, |
| "step": 2820 |
| }, |
| { |
| "epoch": 3.176330197950302, |
| "grad_norm": 1.5358186960220337, |
| "learning_rate": 0.00022147559591373435, |
| "loss": 11.229, |
| "step": 2830 |
| }, |
| { |
| "epoch": 3.187561420749684, |
| "grad_norm": 1.5240596532821655, |
| "learning_rate": 0.00022011350737797953, |
| "loss": 11.2474, |
| "step": 2840 |
| }, |
| { |
| "epoch": 3.1987926435490666, |
| "grad_norm": 1.4946931600570679, |
| "learning_rate": 0.0002187514188422247, |
| "loss": 11.3082, |
| "step": 2850 |
| }, |
| { |
| "epoch": 3.2100238663484486, |
| "grad_norm": 1.5998846292495728, |
| "learning_rate": 0.00021738933030646988, |
| "loss": 11.1555, |
| "step": 2860 |
| }, |
| { |
| "epoch": 3.221255089147831, |
| "grad_norm": 1.527463436126709, |
| "learning_rate": 0.00021602724177071506, |
| "loss": 11.2718, |
| "step": 2870 |
| }, |
| { |
| "epoch": 3.232486311947213, |
| "grad_norm": 1.4795818328857422, |
| "learning_rate": 0.00021466515323496024, |
| "loss": 11.1977, |
| "step": 2880 |
| }, |
| { |
| "epoch": 3.2437175347465956, |
| "grad_norm": 1.4611546993255615, |
| "learning_rate": 0.0002133030646992054, |
| "loss": 11.2381, |
| "step": 2890 |
| }, |
| { |
| "epoch": 3.2549487575459777, |
| "grad_norm": 1.5139747858047485, |
| "learning_rate": 0.0002119409761634506, |
| "loss": 11.197, |
| "step": 2900 |
| }, |
| { |
| "epoch": 3.26617998034536, |
| "grad_norm": 1.6371299028396606, |
| "learning_rate": 0.00021057888762769577, |
| "loss": 11.264, |
| "step": 2910 |
| }, |
| { |
| "epoch": 3.277411203144742, |
| "grad_norm": 1.4956345558166504, |
| "learning_rate": 0.00020921679909194094, |
| "loss": 11.2839, |
| "step": 2920 |
| }, |
| { |
| "epoch": 3.2886424259441247, |
| "grad_norm": 1.6143215894699097, |
| "learning_rate": 0.00020785471055618612, |
| "loss": 11.2053, |
| "step": 2930 |
| }, |
| { |
| "epoch": 3.2998736487435067, |
| "grad_norm": 1.6429616212844849, |
| "learning_rate": 0.0002064926220204313, |
| "loss": 11.1887, |
| "step": 2940 |
| }, |
| { |
| "epoch": 3.3111048715428892, |
| "grad_norm": 1.50111985206604, |
| "learning_rate": 0.00020513053348467647, |
| "loss": 11.2054, |
| "step": 2950 |
| }, |
| { |
| "epoch": 3.3223360943422717, |
| "grad_norm": 1.4351112842559814, |
| "learning_rate": 0.00020376844494892165, |
| "loss": 11.2221, |
| "step": 2960 |
| }, |
| { |
| "epoch": 3.3335673171416538, |
| "grad_norm": 1.4160490036010742, |
| "learning_rate": 0.00020240635641316683, |
| "loss": 11.2436, |
| "step": 2970 |
| }, |
| { |
| "epoch": 3.3447985399410363, |
| "grad_norm": 1.5771642923355103, |
| "learning_rate": 0.000201044267877412, |
| "loss": 11.1831, |
| "step": 2980 |
| }, |
| { |
| "epoch": 3.3560297627404183, |
| "grad_norm": 1.5144894123077393, |
| "learning_rate": 0.00019968217934165718, |
| "loss": 11.1581, |
| "step": 2990 |
| }, |
| { |
| "epoch": 3.367260985539801, |
| "grad_norm": 1.456102967262268, |
| "learning_rate": 0.00019832009080590236, |
| "loss": 11.1989, |
| "step": 3000 |
| }, |
| { |
| "epoch": 3.378492208339183, |
| "grad_norm": 1.446110486984253, |
| "learning_rate": 0.00019695800227014754, |
| "loss": 11.1712, |
| "step": 3010 |
| }, |
| { |
| "epoch": 3.3897234311385653, |
| "grad_norm": 1.4775264263153076, |
| "learning_rate": 0.00019559591373439271, |
| "loss": 11.1628, |
| "step": 3020 |
| }, |
| { |
| "epoch": 3.4009546539379474, |
| "grad_norm": 1.4535621404647827, |
| "learning_rate": 0.0001942338251986379, |
| "loss": 11.092, |
| "step": 3030 |
| }, |
| { |
| "epoch": 3.41218587673733, |
| "grad_norm": 1.5154165029525757, |
| "learning_rate": 0.00019287173666288307, |
| "loss": 11.1513, |
| "step": 3040 |
| }, |
| { |
| "epoch": 3.423417099536712, |
| "grad_norm": 1.476539134979248, |
| "learning_rate": 0.00019150964812712825, |
| "loss": 11.2126, |
| "step": 3050 |
| }, |
| { |
| "epoch": 3.4346483223360944, |
| "grad_norm": 1.4598060846328735, |
| "learning_rate": 0.00019014755959137342, |
| "loss": 11.1632, |
| "step": 3060 |
| }, |
| { |
| "epoch": 3.4458795451354765, |
| "grad_norm": 1.4783436059951782, |
| "learning_rate": 0.0001887854710556186, |
| "loss": 11.1238, |
| "step": 3070 |
| }, |
| { |
| "epoch": 3.457110767934859, |
| "grad_norm": 1.545137643814087, |
| "learning_rate": 0.00018742338251986375, |
| "loss": 11.1628, |
| "step": 3080 |
| }, |
| { |
| "epoch": 3.468341990734241, |
| "grad_norm": 1.4730881452560425, |
| "learning_rate": 0.00018606129398410893, |
| "loss": 11.1644, |
| "step": 3090 |
| }, |
| { |
| "epoch": 3.4795732135336235, |
| "grad_norm": 1.48605215549469, |
| "learning_rate": 0.0001846992054483541, |
| "loss": 11.1871, |
| "step": 3100 |
| }, |
| { |
| "epoch": 3.490804436333006, |
| "grad_norm": 1.4472525119781494, |
| "learning_rate": 0.00018333711691259928, |
| "loss": 11.207, |
| "step": 3110 |
| }, |
| { |
| "epoch": 3.502035659132388, |
| "grad_norm": 1.3672376871109009, |
| "learning_rate": 0.00018197502837684446, |
| "loss": 11.1861, |
| "step": 3120 |
| }, |
| { |
| "epoch": 3.51326688193177, |
| "grad_norm": 1.5072065591812134, |
| "learning_rate": 0.00018061293984108964, |
| "loss": 11.1347, |
| "step": 3130 |
| }, |
| { |
| "epoch": 3.5244981047311525, |
| "grad_norm": 1.5243936777114868, |
| "learning_rate": 0.0001792508513053348, |
| "loss": 11.1585, |
| "step": 3140 |
| }, |
| { |
| "epoch": 3.535729327530535, |
| "grad_norm": 1.5613657236099243, |
| "learning_rate": 0.00017788876276958, |
| "loss": 11.2059, |
| "step": 3150 |
| }, |
| { |
| "epoch": 3.546960550329917, |
| "grad_norm": 1.4457815885543823, |
| "learning_rate": 0.00017652667423382517, |
| "loss": 11.0973, |
| "step": 3160 |
| }, |
| { |
| "epoch": 3.5581917731292996, |
| "grad_norm": 1.3698965311050415, |
| "learning_rate": 0.00017516458569807034, |
| "loss": 11.0931, |
| "step": 3170 |
| }, |
| { |
| "epoch": 3.5694229959286816, |
| "grad_norm": 1.5461801290512085, |
| "learning_rate": 0.00017380249716231552, |
| "loss": 11.1319, |
| "step": 3180 |
| }, |
| { |
| "epoch": 3.580654218728064, |
| "grad_norm": 1.5740439891815186, |
| "learning_rate": 0.0001724404086265607, |
| "loss": 11.2028, |
| "step": 3190 |
| }, |
| { |
| "epoch": 3.591885441527446, |
| "grad_norm": 1.3753931522369385, |
| "learning_rate": 0.00017107832009080588, |
| "loss": 11.1389, |
| "step": 3200 |
| }, |
| { |
| "epoch": 3.6031166643268286, |
| "grad_norm": 1.5603852272033691, |
| "learning_rate": 0.00016971623155505105, |
| "loss": 11.1247, |
| "step": 3210 |
| }, |
| { |
| "epoch": 3.614347887126211, |
| "grad_norm": 1.5574864149093628, |
| "learning_rate": 0.00016835414301929623, |
| "loss": 11.1183, |
| "step": 3220 |
| }, |
| { |
| "epoch": 3.625579109925593, |
| "grad_norm": 1.51617431640625, |
| "learning_rate": 0.0001669920544835414, |
| "loss": 11.1271, |
| "step": 3230 |
| }, |
| { |
| "epoch": 3.6368103327249752, |
| "grad_norm": 1.517800211906433, |
| "learning_rate": 0.00016562996594778658, |
| "loss": 11.1425, |
| "step": 3240 |
| }, |
| { |
| "epoch": 3.6480415555243577, |
| "grad_norm": 1.5872033834457397, |
| "learning_rate": 0.00016426787741203176, |
| "loss": 11.1319, |
| "step": 3250 |
| }, |
| { |
| "epoch": 3.65927277832374, |
| "grad_norm": 1.4323076009750366, |
| "learning_rate": 0.00016290578887627694, |
| "loss": 11.0476, |
| "step": 3260 |
| }, |
| { |
| "epoch": 3.6705040011231223, |
| "grad_norm": 1.4947340488433838, |
| "learning_rate": 0.00016154370034052211, |
| "loss": 11.0651, |
| "step": 3270 |
| }, |
| { |
| "epoch": 3.6817352239225043, |
| "grad_norm": 1.4352633953094482, |
| "learning_rate": 0.0001601816118047673, |
| "loss": 11.0978, |
| "step": 3280 |
| }, |
| { |
| "epoch": 3.692966446721887, |
| "grad_norm": 1.6209876537322998, |
| "learning_rate": 0.00015881952326901247, |
| "loss": 11.0699, |
| "step": 3290 |
| }, |
| { |
| "epoch": 3.7041976695212693, |
| "grad_norm": 1.4265753030776978, |
| "learning_rate": 0.00015745743473325765, |
| "loss": 11.0495, |
| "step": 3300 |
| }, |
| { |
| "epoch": 3.7154288923206513, |
| "grad_norm": 1.5374298095703125, |
| "learning_rate": 0.00015609534619750282, |
| "loss": 11.0698, |
| "step": 3310 |
| }, |
| { |
| "epoch": 3.726660115120034, |
| "grad_norm": 1.5547951459884644, |
| "learning_rate": 0.000154733257661748, |
| "loss": 11.105, |
| "step": 3320 |
| }, |
| { |
| "epoch": 3.737891337919416, |
| "grad_norm": 1.4317471981048584, |
| "learning_rate": 0.00015337116912599318, |
| "loss": 11.0474, |
| "step": 3330 |
| }, |
| { |
| "epoch": 3.7491225607187983, |
| "grad_norm": 1.4616270065307617, |
| "learning_rate": 0.00015200908059023835, |
| "loss": 11.0472, |
| "step": 3340 |
| }, |
| { |
| "epoch": 3.7603537835181804, |
| "grad_norm": 1.4159753322601318, |
| "learning_rate": 0.0001506469920544835, |
| "loss": 11.0644, |
| "step": 3350 |
| }, |
| { |
| "epoch": 3.771585006317563, |
| "grad_norm": 1.5320748090744019, |
| "learning_rate": 0.0001492849035187287, |
| "loss": 11.092, |
| "step": 3360 |
| }, |
| { |
| "epoch": 3.7828162291169454, |
| "grad_norm": 1.4254796504974365, |
| "learning_rate": 0.00014792281498297389, |
| "loss": 11.0266, |
| "step": 3370 |
| }, |
| { |
| "epoch": 3.7940474519163274, |
| "grad_norm": 1.5224483013153076, |
| "learning_rate": 0.00014656072644721906, |
| "loss": 11.0699, |
| "step": 3380 |
| }, |
| { |
| "epoch": 3.8052786747157095, |
| "grad_norm": 1.3961817026138306, |
| "learning_rate": 0.00014519863791146424, |
| "loss": 11.0638, |
| "step": 3390 |
| }, |
| { |
| "epoch": 3.816509897515092, |
| "grad_norm": 1.4628331661224365, |
| "learning_rate": 0.00014383654937570942, |
| "loss": 11.0998, |
| "step": 3400 |
| }, |
| { |
| "epoch": 3.8277411203144744, |
| "grad_norm": 1.469596266746521, |
| "learning_rate": 0.0001424744608399546, |
| "loss": 11.0611, |
| "step": 3410 |
| }, |
| { |
| "epoch": 3.8389723431138565, |
| "grad_norm": 1.4373725652694702, |
| "learning_rate": 0.00014111237230419977, |
| "loss": 11.0567, |
| "step": 3420 |
| }, |
| { |
| "epoch": 3.8502035659132385, |
| "grad_norm": 1.4011608362197876, |
| "learning_rate": 0.00013975028376844495, |
| "loss": 11.0241, |
| "step": 3430 |
| }, |
| { |
| "epoch": 3.861434788712621, |
| "grad_norm": 1.5255424976348877, |
| "learning_rate": 0.00013838819523269013, |
| "loss": 11.023, |
| "step": 3440 |
| }, |
| { |
| "epoch": 3.8726660115120035, |
| "grad_norm": 1.4086155891418457, |
| "learning_rate": 0.0001370261066969353, |
| "loss": 11.0914, |
| "step": 3450 |
| }, |
| { |
| "epoch": 3.8838972343113856, |
| "grad_norm": 1.4729487895965576, |
| "learning_rate": 0.00013566401816118048, |
| "loss": 11.0527, |
| "step": 3460 |
| }, |
| { |
| "epoch": 3.895128457110768, |
| "grad_norm": 1.4167112112045288, |
| "learning_rate": 0.00013430192962542566, |
| "loss": 11.0646, |
| "step": 3470 |
| }, |
| { |
| "epoch": 3.90635967991015, |
| "grad_norm": 1.4228618144989014, |
| "learning_rate": 0.00013293984108967083, |
| "loss": 11.0896, |
| "step": 3480 |
| }, |
| { |
| "epoch": 3.9175909027095326, |
| "grad_norm": 1.3969998359680176, |
| "learning_rate": 0.000131577752553916, |
| "loss": 11.0844, |
| "step": 3490 |
| }, |
| { |
| "epoch": 3.9288221255089146, |
| "grad_norm": 1.3993597030639648, |
| "learning_rate": 0.0001302156640181612, |
| "loss": 11.0194, |
| "step": 3500 |
| }, |
| { |
| "epoch": 3.940053348308297, |
| "grad_norm": 1.4945032596588135, |
| "learning_rate": 0.00012885357548240634, |
| "loss": 11.0404, |
| "step": 3510 |
| }, |
| { |
| "epoch": 3.9512845711076796, |
| "grad_norm": 1.4512568712234497, |
| "learning_rate": 0.00012749148694665152, |
| "loss": 11.0251, |
| "step": 3520 |
| }, |
| { |
| "epoch": 3.9625157939070617, |
| "grad_norm": 1.4356528520584106, |
| "learning_rate": 0.0001261293984108967, |
| "loss": 11.0682, |
| "step": 3530 |
| }, |
| { |
| "epoch": 3.9737470167064437, |
| "grad_norm": 1.4650399684906006, |
| "learning_rate": 0.00012476730987514187, |
| "loss": 11.042, |
| "step": 3540 |
| }, |
| { |
| "epoch": 3.984978239505826, |
| "grad_norm": 1.482191562652588, |
| "learning_rate": 0.00012340522133938705, |
| "loss": 11.0361, |
| "step": 3550 |
| }, |
| { |
| "epoch": 3.9962094623052087, |
| "grad_norm": 1.4656471014022827, |
| "learning_rate": 0.00012204313280363224, |
| "loss": 10.9494, |
| "step": 3560 |
| }, |
| { |
| "epoch": 4.006738733679629, |
| "grad_norm": 1.5250879526138306, |
| "learning_rate": 0.00012068104426787741, |
| "loss": 10.3207, |
| "step": 3570 |
| }, |
| { |
| "epoch": 4.0179699564790115, |
| "grad_norm": 1.4742364883422852, |
| "learning_rate": 0.00011931895573212258, |
| "loss": 10.9206, |
| "step": 3580 |
| }, |
| { |
| "epoch": 4.0292011792783935, |
| "grad_norm": 1.4390878677368164, |
| "learning_rate": 0.00011795686719636776, |
| "loss": 10.9298, |
| "step": 3590 |
| }, |
| { |
| "epoch": 4.040432402077776, |
| "grad_norm": 1.4271754026412964, |
| "learning_rate": 0.00011659477866061293, |
| "loss": 10.9401, |
| "step": 3600 |
| }, |
| { |
| "epoch": 4.0516636248771585, |
| "grad_norm": 1.4713624715805054, |
| "learning_rate": 0.00011523269012485811, |
| "loss": 10.9334, |
| "step": 3610 |
| }, |
| { |
| "epoch": 4.0628948476765405, |
| "grad_norm": 1.416982650756836, |
| "learning_rate": 0.00011387060158910329, |
| "loss": 10.9586, |
| "step": 3620 |
| }, |
| { |
| "epoch": 4.0741260704759235, |
| "grad_norm": 1.4600762128829956, |
| "learning_rate": 0.00011250851305334846, |
| "loss": 10.8739, |
| "step": 3630 |
| }, |
| { |
| "epoch": 4.0853572932753055, |
| "grad_norm": 1.3993488550186157, |
| "learning_rate": 0.00011114642451759364, |
| "loss": 10.9006, |
| "step": 3640 |
| }, |
| { |
| "epoch": 4.0965885160746875, |
| "grad_norm": 1.3790336847305298, |
| "learning_rate": 0.00010978433598183882, |
| "loss": 10.9069, |
| "step": 3650 |
| }, |
| { |
| "epoch": 4.10781973887407, |
| "grad_norm": 1.4517656564712524, |
| "learning_rate": 0.000108422247446084, |
| "loss": 10.8724, |
| "step": 3660 |
| }, |
| { |
| "epoch": 4.1190509616734525, |
| "grad_norm": 1.5568796396255493, |
| "learning_rate": 0.00010706015891032917, |
| "loss": 10.8934, |
| "step": 3670 |
| }, |
| { |
| "epoch": 4.130282184472835, |
| "grad_norm": 1.4685585498809814, |
| "learning_rate": 0.00010569807037457435, |
| "loss": 10.899, |
| "step": 3680 |
| }, |
| { |
| "epoch": 4.141513407272217, |
| "grad_norm": 1.4848984479904175, |
| "learning_rate": 0.00010433598183881953, |
| "loss": 10.8616, |
| "step": 3690 |
| }, |
| { |
| "epoch": 4.152744630071599, |
| "grad_norm": 1.4731574058532715, |
| "learning_rate": 0.0001029738933030647, |
| "loss": 10.8857, |
| "step": 3700 |
| }, |
| { |
| "epoch": 4.163975852870982, |
| "grad_norm": 1.447550892829895, |
| "learning_rate": 0.00010161180476730987, |
| "loss": 10.8534, |
| "step": 3710 |
| }, |
| { |
| "epoch": 4.175207075670364, |
| "grad_norm": 1.4320396184921265, |
| "learning_rate": 0.00010024971623155504, |
| "loss": 10.8808, |
| "step": 3720 |
| }, |
| { |
| "epoch": 4.186438298469746, |
| "grad_norm": 1.4548975229263306, |
| "learning_rate": 9.888762769580022e-05, |
| "loss": 10.8875, |
| "step": 3730 |
| }, |
| { |
| "epoch": 4.197669521269129, |
| "grad_norm": 1.405381441116333, |
| "learning_rate": 9.75255391600454e-05, |
| "loss": 10.8869, |
| "step": 3740 |
| }, |
| { |
| "epoch": 4.208900744068511, |
| "grad_norm": 1.394189476966858, |
| "learning_rate": 9.616345062429058e-05, |
| "loss": 10.8952, |
| "step": 3750 |
| }, |
| { |
| "epoch": 4.220131966867893, |
| "grad_norm": 1.3617000579833984, |
| "learning_rate": 9.480136208853575e-05, |
| "loss": 10.8854, |
| "step": 3760 |
| }, |
| { |
| "epoch": 4.231363189667275, |
| "grad_norm": 1.4552937746047974, |
| "learning_rate": 9.343927355278093e-05, |
| "loss": 10.9343, |
| "step": 3770 |
| }, |
| { |
| "epoch": 4.242594412466658, |
| "grad_norm": 1.3947829008102417, |
| "learning_rate": 9.20771850170261e-05, |
| "loss": 10.8487, |
| "step": 3780 |
| }, |
| { |
| "epoch": 4.25382563526604, |
| "grad_norm": 1.4310104846954346, |
| "learning_rate": 9.071509648127128e-05, |
| "loss": 10.8277, |
| "step": 3790 |
| }, |
| { |
| "epoch": 4.265056858065422, |
| "grad_norm": 1.4186252355575562, |
| "learning_rate": 8.935300794551646e-05, |
| "loss": 10.9158, |
| "step": 3800 |
| }, |
| { |
| "epoch": 4.276288080864804, |
| "grad_norm": 1.3875808715820312, |
| "learning_rate": 8.799091940976164e-05, |
| "loss": 10.9288, |
| "step": 3810 |
| }, |
| { |
| "epoch": 4.287519303664187, |
| "grad_norm": 1.3773916959762573, |
| "learning_rate": 8.662883087400681e-05, |
| "loss": 10.8744, |
| "step": 3820 |
| }, |
| { |
| "epoch": 4.298750526463569, |
| "grad_norm": 1.5134872198104858, |
| "learning_rate": 8.526674233825199e-05, |
| "loss": 10.8303, |
| "step": 3830 |
| }, |
| { |
| "epoch": 4.309981749262951, |
| "grad_norm": 1.525930643081665, |
| "learning_rate": 8.390465380249717e-05, |
| "loss": 10.8502, |
| "step": 3840 |
| }, |
| { |
| "epoch": 4.321212972062333, |
| "grad_norm": 1.4034548997879028, |
| "learning_rate": 8.254256526674233e-05, |
| "loss": 10.8712, |
| "step": 3850 |
| }, |
| { |
| "epoch": 4.332444194861716, |
| "grad_norm": 1.431321144104004, |
| "learning_rate": 8.118047673098751e-05, |
| "loss": 10.9037, |
| "step": 3860 |
| }, |
| { |
| "epoch": 4.343675417661098, |
| "grad_norm": 1.4377447366714478, |
| "learning_rate": 7.981838819523269e-05, |
| "loss": 10.9046, |
| "step": 3870 |
| }, |
| { |
| "epoch": 4.35490664046048, |
| "grad_norm": 1.4167426824569702, |
| "learning_rate": 7.845629965947786e-05, |
| "loss": 10.8576, |
| "step": 3880 |
| }, |
| { |
| "epoch": 4.366137863259862, |
| "grad_norm": 1.3977469205856323, |
| "learning_rate": 7.709421112372304e-05, |
| "loss": 10.8286, |
| "step": 3890 |
| }, |
| { |
| "epoch": 4.377369086059245, |
| "grad_norm": 1.4152209758758545, |
| "learning_rate": 7.573212258796822e-05, |
| "loss": 10.9304, |
| "step": 3900 |
| }, |
| { |
| "epoch": 4.388600308858627, |
| "grad_norm": 1.4083077907562256, |
| "learning_rate": 7.437003405221338e-05, |
| "loss": 10.8174, |
| "step": 3910 |
| }, |
| { |
| "epoch": 4.399831531658009, |
| "grad_norm": 1.3758295774459839, |
| "learning_rate": 7.300794551645856e-05, |
| "loss": 10.8231, |
| "step": 3920 |
| }, |
| { |
| "epoch": 4.411062754457392, |
| "grad_norm": 1.3821783065795898, |
| "learning_rate": 7.164585698070374e-05, |
| "loss": 10.8213, |
| "step": 3930 |
| }, |
| { |
| "epoch": 4.422293977256774, |
| "grad_norm": 1.4114271402359009, |
| "learning_rate": 7.028376844494891e-05, |
| "loss": 10.8755, |
| "step": 3940 |
| }, |
| { |
| "epoch": 4.433525200056156, |
| "grad_norm": 1.474861741065979, |
| "learning_rate": 6.892167990919409e-05, |
| "loss": 10.8719, |
| "step": 3950 |
| }, |
| { |
| "epoch": 4.444756422855538, |
| "grad_norm": 1.455072283744812, |
| "learning_rate": 6.755959137343927e-05, |
| "loss": 10.8429, |
| "step": 3960 |
| }, |
| { |
| "epoch": 4.455987645654921, |
| "grad_norm": 1.4050872325897217, |
| "learning_rate": 6.619750283768444e-05, |
| "loss": 10.8245, |
| "step": 3970 |
| }, |
| { |
| "epoch": 4.467218868454303, |
| "grad_norm": 1.415117621421814, |
| "learning_rate": 6.483541430192962e-05, |
| "loss": 10.8377, |
| "step": 3980 |
| }, |
| { |
| "epoch": 4.478450091253685, |
| "grad_norm": 1.4472249746322632, |
| "learning_rate": 6.34733257661748e-05, |
| "loss": 10.8068, |
| "step": 3990 |
| }, |
| { |
| "epoch": 4.489681314053067, |
| "grad_norm": 1.411483645439148, |
| "learning_rate": 6.211123723041998e-05, |
| "loss": 10.8779, |
| "step": 4000 |
| }, |
| { |
| "epoch": 4.50091253685245, |
| "grad_norm": 1.3772433996200562, |
| "learning_rate": 6.0749148694665146e-05, |
| "loss": 10.8594, |
| "step": 4010 |
| }, |
| { |
| "epoch": 4.512143759651832, |
| "grad_norm": 1.4007872343063354, |
| "learning_rate": 5.938706015891032e-05, |
| "loss": 10.8399, |
| "step": 4020 |
| }, |
| { |
| "epoch": 4.523374982451214, |
| "grad_norm": 1.3770571947097778, |
| "learning_rate": 5.80249716231555e-05, |
| "loss": 10.7955, |
| "step": 4030 |
| }, |
| { |
| "epoch": 4.534606205250597, |
| "grad_norm": 1.4038739204406738, |
| "learning_rate": 5.666288308740068e-05, |
| "loss": 10.8186, |
| "step": 4040 |
| }, |
| { |
| "epoch": 4.545837428049979, |
| "grad_norm": 1.3469901084899902, |
| "learning_rate": 5.5300794551645854e-05, |
| "loss": 10.7957, |
| "step": 4050 |
| }, |
| { |
| "epoch": 4.557068650849361, |
| "grad_norm": 1.358163833618164, |
| "learning_rate": 5.393870601589103e-05, |
| "loss": 10.8953, |
| "step": 4060 |
| }, |
| { |
| "epoch": 4.568299873648743, |
| "grad_norm": 1.360987901687622, |
| "learning_rate": 5.25766174801362e-05, |
| "loss": 10.8231, |
| "step": 4070 |
| }, |
| { |
| "epoch": 4.579531096448126, |
| "grad_norm": 1.3678861856460571, |
| "learning_rate": 5.121452894438138e-05, |
| "loss": 10.8246, |
| "step": 4080 |
| }, |
| { |
| "epoch": 4.590762319247508, |
| "grad_norm": 1.3450872898101807, |
| "learning_rate": 4.9852440408626556e-05, |
| "loss": 10.8162, |
| "step": 4090 |
| }, |
| { |
| "epoch": 4.60199354204689, |
| "grad_norm": 1.371072769165039, |
| "learning_rate": 4.849035187287173e-05, |
| "loss": 10.7912, |
| "step": 4100 |
| }, |
| { |
| "epoch": 4.613224764846272, |
| "grad_norm": 1.3714163303375244, |
| "learning_rate": 4.712826333711691e-05, |
| "loss": 10.7708, |
| "step": 4110 |
| }, |
| { |
| "epoch": 4.624455987645655, |
| "grad_norm": 1.371570348739624, |
| "learning_rate": 4.576617480136209e-05, |
| "loss": 10.7985, |
| "step": 4120 |
| }, |
| { |
| "epoch": 4.635687210445037, |
| "grad_norm": 1.4231752157211304, |
| "learning_rate": 4.440408626560726e-05, |
| "loss": 10.827, |
| "step": 4130 |
| }, |
| { |
| "epoch": 4.646918433244419, |
| "grad_norm": 1.3889615535736084, |
| "learning_rate": 4.3041997729852435e-05, |
| "loss": 10.837, |
| "step": 4140 |
| }, |
| { |
| "epoch": 4.658149656043802, |
| "grad_norm": 1.3703919649124146, |
| "learning_rate": 4.167990919409761e-05, |
| "loss": 10.8179, |
| "step": 4150 |
| }, |
| { |
| "epoch": 4.669380878843184, |
| "grad_norm": 1.3489488363265991, |
| "learning_rate": 4.031782065834279e-05, |
| "loss": 10.8593, |
| "step": 4160 |
| }, |
| { |
| "epoch": 4.680612101642566, |
| "grad_norm": 1.3783739805221558, |
| "learning_rate": 3.8955732122587966e-05, |
| "loss": 10.7853, |
| "step": 4170 |
| }, |
| { |
| "epoch": 4.691843324441948, |
| "grad_norm": 1.3344446420669556, |
| "learning_rate": 3.759364358683314e-05, |
| "loss": 10.8021, |
| "step": 4180 |
| }, |
| { |
| "epoch": 4.7030745472413304, |
| "grad_norm": 1.3575279712677002, |
| "learning_rate": 3.623155505107832e-05, |
| "loss": 10.7695, |
| "step": 4190 |
| }, |
| { |
| "epoch": 4.714305770040713, |
| "grad_norm": 1.3288753032684326, |
| "learning_rate": 3.486946651532349e-05, |
| "loss": 10.8142, |
| "step": 4200 |
| }, |
| { |
| "epoch": 4.725536992840095, |
| "grad_norm": 1.3467031717300415, |
| "learning_rate": 3.350737797956867e-05, |
| "loss": 10.809, |
| "step": 4210 |
| }, |
| { |
| "epoch": 4.7367682156394775, |
| "grad_norm": 1.3722003698349, |
| "learning_rate": 3.2145289443813845e-05, |
| "loss": 10.7404, |
| "step": 4220 |
| }, |
| { |
| "epoch": 4.74799943843886, |
| "grad_norm": 1.3655105829238892, |
| "learning_rate": 3.078320090805902e-05, |
| "loss": 10.8864, |
| "step": 4230 |
| }, |
| { |
| "epoch": 4.7592306612382425, |
| "grad_norm": 1.3268227577209473, |
| "learning_rate": 2.94211123723042e-05, |
| "loss": 10.7649, |
| "step": 4240 |
| }, |
| { |
| "epoch": 4.7704618840376245, |
| "grad_norm": 1.3199375867843628, |
| "learning_rate": 2.8059023836549373e-05, |
| "loss": 10.7571, |
| "step": 4250 |
| }, |
| { |
| "epoch": 4.7816931068370065, |
| "grad_norm": 1.3348361253738403, |
| "learning_rate": 2.669693530079455e-05, |
| "loss": 10.7514, |
| "step": 4260 |
| }, |
| { |
| "epoch": 4.7929243296363895, |
| "grad_norm": 1.3305360078811646, |
| "learning_rate": 2.5334846765039727e-05, |
| "loss": 10.7446, |
| "step": 4270 |
| }, |
| { |
| "epoch": 4.8041555524357715, |
| "grad_norm": 1.322024941444397, |
| "learning_rate": 2.3972758229284904e-05, |
| "loss": 10.816, |
| "step": 4280 |
| }, |
| { |
| "epoch": 4.815386775235154, |
| "grad_norm": 1.3302651643753052, |
| "learning_rate": 2.2610669693530078e-05, |
| "loss": 10.7709, |
| "step": 4290 |
| }, |
| { |
| "epoch": 4.826617998034536, |
| "grad_norm": 1.3159739971160889, |
| "learning_rate": 2.1248581157775255e-05, |
| "loss": 10.7594, |
| "step": 4300 |
| }, |
| { |
| "epoch": 4.8378492208339186, |
| "grad_norm": 1.3260287046432495, |
| "learning_rate": 1.9886492622020432e-05, |
| "loss": 10.8124, |
| "step": 4310 |
| }, |
| { |
| "epoch": 4.849080443633301, |
| "grad_norm": 1.3304709196090698, |
| "learning_rate": 1.8524404086265606e-05, |
| "loss": 10.8415, |
| "step": 4320 |
| }, |
| { |
| "epoch": 4.860311666432683, |
| "grad_norm": 1.342634916305542, |
| "learning_rate": 1.716231555051078e-05, |
| "loss": 10.7881, |
| "step": 4330 |
| }, |
| { |
| "epoch": 4.871542889232066, |
| "grad_norm": 1.333287000656128, |
| "learning_rate": 1.5800227014755956e-05, |
| "loss": 10.757, |
| "step": 4340 |
| }, |
| { |
| "epoch": 4.882774112031448, |
| "grad_norm": 1.3149112462997437, |
| "learning_rate": 1.4438138479001134e-05, |
| "loss": 10.773, |
| "step": 4350 |
| }, |
| { |
| "epoch": 4.89400533483083, |
| "grad_norm": 1.306639552116394, |
| "learning_rate": 1.3076049943246309e-05, |
| "loss": 10.781, |
| "step": 4360 |
| }, |
| { |
| "epoch": 4.905236557630212, |
| "grad_norm": 1.3133597373962402, |
| "learning_rate": 1.1713961407491484e-05, |
| "loss": 10.7741, |
| "step": 4370 |
| }, |
| { |
| "epoch": 4.916467780429595, |
| "grad_norm": 1.3073337078094482, |
| "learning_rate": 1.0351872871736661e-05, |
| "loss": 10.8127, |
| "step": 4380 |
| }, |
| { |
| "epoch": 4.927699003228977, |
| "grad_norm": 1.3013139963150024, |
| "learning_rate": 8.989784335981839e-06, |
| "loss": 10.7386, |
| "step": 4390 |
| }, |
| { |
| "epoch": 4.938930226028359, |
| "grad_norm": 1.2934496402740479, |
| "learning_rate": 7.627695800227015e-06, |
| "loss": 10.7845, |
| "step": 4400 |
| }, |
| { |
| "epoch": 4.950161448827741, |
| "grad_norm": 1.3016905784606934, |
| "learning_rate": 6.26560726447219e-06, |
| "loss": 10.7222, |
| "step": 4410 |
| }, |
| { |
| "epoch": 4.961392671627124, |
| "grad_norm": 1.2927377223968506, |
| "learning_rate": 4.9035187287173665e-06, |
| "loss": 10.7998, |
| "step": 4420 |
| }, |
| { |
| "epoch": 4.972623894426506, |
| "grad_norm": 1.277617335319519, |
| "learning_rate": 3.5414301929625423e-06, |
| "loss": 10.8386, |
| "step": 4430 |
| }, |
| { |
| "epoch": 4.983855117225888, |
| "grad_norm": 1.2841224670410156, |
| "learning_rate": 2.179341657207718e-06, |
| "loss": 10.7737, |
| "step": 4440 |
| }, |
| { |
| "epoch": 4.995086340025271, |
| "grad_norm": 1.2789523601531982, |
| "learning_rate": 8.172531214528943e-07, |
| "loss": 10.7885, |
| "step": 4450 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 4455, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.8215997838065664e+17, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|