| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9990416866315286, | |
| "eval_steps": 200, | |
| "global_step": 3336, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011978917105893628, | |
| "grad_norm": 476.6510925292969, | |
| "learning_rate": 8e-07, | |
| "loss": 11.6475, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.023957834211787255, | |
| "grad_norm": 74.57940673828125, | |
| "learning_rate": 1.9999928625229307e-06, | |
| "loss": 2.3869, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.035936751317680884, | |
| "grad_norm": 125.54178619384766, | |
| "learning_rate": 1.999912567076008e-06, | |
| "loss": 7.1899, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04791566842357451, | |
| "grad_norm": 14.804201126098633, | |
| "learning_rate": 1.999743061523497e-06, | |
| "loss": 5.0722, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.059894585529468136, | |
| "grad_norm": 9.310958862304688, | |
| "learning_rate": 1.999484360988329e-06, | |
| "loss": 2.9189, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07187350263536177, | |
| "grad_norm": 306.2783508300781, | |
| "learning_rate": 1.999136488551224e-06, | |
| "loss": 2.8403, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08385241974125539, | |
| "grad_norm": 134.76495361328125, | |
| "learning_rate": 1.9986994752486316e-06, | |
| "loss": 4.2047, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09583133684714902, | |
| "grad_norm": 78.88103485107422, | |
| "learning_rate": 1.998173360069964e-06, | |
| "loss": 5.1269, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.10781025395304264, | |
| "grad_norm": 60.88028335571289, | |
| "learning_rate": 1.997558189954117e-06, | |
| "loss": 4.8787, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.11978917105893627, | |
| "grad_norm": 48.85028076171875, | |
| "learning_rate": 1.9968540197852784e-06, | |
| "loss": 2.6971, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1317680881648299, | |
| "grad_norm": 82.52726745605469, | |
| "learning_rate": 1.9960609123880376e-06, | |
| "loss": 6.6349, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.14374700527072354, | |
| "grad_norm": 18.934968948364258, | |
| "learning_rate": 1.9951789385217753e-06, | |
| "loss": 3.6926, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.15572592237661714, | |
| "grad_norm": 95.94075012207031, | |
| "learning_rate": 1.9942081768743535e-06, | |
| "loss": 5.221, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16770483948251078, | |
| "grad_norm": 69.58992767333984, | |
| "learning_rate": 1.9931487140550935e-06, | |
| "loss": 5.8621, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.1796837565884044, | |
| "grad_norm": 18.388065338134766, | |
| "learning_rate": 1.9920006445870497e-06, | |
| "loss": 5.2103, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.19166267369429804, | |
| "grad_norm": 117.37078857421875, | |
| "learning_rate": 1.9907640708985766e-06, | |
| "loss": 5.8106, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.20364159080019167, | |
| "grad_norm": 118.88228607177734, | |
| "learning_rate": 1.9894391033141887e-06, | |
| "loss": 4.0891, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.21562050790608528, | |
| "grad_norm": 17.98489761352539, | |
| "learning_rate": 1.9880258600447204e-06, | |
| "loss": 5.7061, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2275994250119789, | |
| "grad_norm": 170.75030517578125, | |
| "learning_rate": 1.986524467176777e-06, | |
| "loss": 4.2787, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.23957834211787254, | |
| "grad_norm": 74.85968017578125, | |
| "learning_rate": 1.9849350586614863e-06, | |
| "loss": 7.8201, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.23957834211787254, | |
| "eval_loss": 1.2861307859420776, | |
| "eval_runtime": 238.5238, | |
| "eval_samples_per_second": 6.247, | |
| "eval_steps_per_second": 3.123, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.25155725922376615, | |
| "grad_norm": 254.58087158203125, | |
| "learning_rate": 1.983257776302548e-06, | |
| "loss": 5.8449, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2635361763296598, | |
| "grad_norm": 12.087563514709473, | |
| "learning_rate": 1.9814927697435826e-06, | |
| "loss": 5.7451, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2755150934355534, | |
| "grad_norm": 11.869245529174805, | |
| "learning_rate": 1.9796401964547794e-06, | |
| "loss": 6.4206, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2874940105414471, | |
| "grad_norm": 18.04955291748047, | |
| "learning_rate": 1.977700221718848e-06, | |
| "loss": 3.3466, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2994729276473407, | |
| "grad_norm": 40.88437271118164, | |
| "learning_rate": 1.975673018616273e-06, | |
| "loss": 4.5986, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3114518447532343, | |
| "grad_norm": 17.213830947875977, | |
| "learning_rate": 1.97355876800987e-06, | |
| "loss": 3.6809, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.32343076185912795, | |
| "grad_norm": 31.727243423461914, | |
| "learning_rate": 1.9713576585286513e-06, | |
| "loss": 4.692, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.33540967896502155, | |
| "grad_norm": 95.36638641357422, | |
| "learning_rate": 1.9690698865509964e-06, | |
| "loss": 6.1814, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3473885960709152, | |
| "grad_norm": 25.670534133911133, | |
| "learning_rate": 1.966695656187131e-06, | |
| "loss": 2.8556, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3593675131768088, | |
| "grad_norm": 137.57431030273438, | |
| "learning_rate": 1.9642351792609162e-06, | |
| "loss": 3.3607, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3713464302827024, | |
| "grad_norm": 46.084354400634766, | |
| "learning_rate": 1.9616886752909523e-06, | |
| "loss": 6.1352, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3833253473885961, | |
| "grad_norm": 110.49552154541016, | |
| "learning_rate": 1.9590563714709916e-06, | |
| "loss": 5.8323, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3953042644944897, | |
| "grad_norm": 77.74449157714844, | |
| "learning_rate": 1.9563385026496687e-06, | |
| "loss": 5.7407, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.40728318160038335, | |
| "grad_norm": 26.705305099487305, | |
| "learning_rate": 1.9535353113095493e-06, | |
| "loss": 5.3508, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.41926209870627695, | |
| "grad_norm": 146.0390167236328, | |
| "learning_rate": 1.9506470475454957e-06, | |
| "loss": 2.9407, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.43124101581217056, | |
| "grad_norm": 25.178991317749023, | |
| "learning_rate": 1.947673969042353e-06, | |
| "loss": 3.0089, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4432199329180642, | |
| "grad_norm": 8.458102226257324, | |
| "learning_rate": 1.9446163410519603e-06, | |
| "loss": 2.885, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4551988500239578, | |
| "grad_norm": 94.5208740234375, | |
| "learning_rate": 1.9414744363694842e-06, | |
| "loss": 3.8878, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4671777671298515, | |
| "grad_norm": 138.86561584472656, | |
| "learning_rate": 1.938248535309083e-06, | |
| "loss": 5.5948, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4791566842357451, | |
| "grad_norm": 10.55215072631836, | |
| "learning_rate": 1.9349389256788943e-06, | |
| "loss": 2.9242, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.4791566842357451, | |
| "eval_loss": 1.1180063486099243, | |
| "eval_runtime": 237.218, | |
| "eval_samples_per_second": 6.281, | |
| "eval_steps_per_second": 3.141, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.4911356013416387, | |
| "grad_norm": 11.329914093017578, | |
| "learning_rate": 1.931545902755359e-06, | |
| "loss": 5.7209, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5031145184475323, | |
| "grad_norm": 121.2057113647461, | |
| "learning_rate": 1.928069769256879e-06, | |
| "loss": 4.2294, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.515093435553426, | |
| "grad_norm": 70.16046905517578, | |
| "learning_rate": 1.9245108353168055e-06, | |
| "loss": 5.1172, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5270723526593196, | |
| "grad_norm": 55.029964447021484, | |
| "learning_rate": 1.9208694184557735e-06, | |
| "loss": 3.8455, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5390512697652132, | |
| "grad_norm": 16.75533103942871, | |
| "learning_rate": 1.9171458435533706e-06, | |
| "loss": 2.1762, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5510301868711068, | |
| "grad_norm": 37.192169189453125, | |
| "learning_rate": 1.913340442819153e-06, | |
| "loss": 4.6994, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5630091039770004, | |
| "grad_norm": 190.2852020263672, | |
| "learning_rate": 1.9094535557630067e-06, | |
| "loss": 8.188, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5749880210828942, | |
| "grad_norm": 14.840240478515625, | |
| "learning_rate": 1.905485529164856e-06, | |
| "loss": 2.4346, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5869669381887878, | |
| "grad_norm": 17.85882568359375, | |
| "learning_rate": 1.9014367170437255e-06, | |
| "loss": 5.1088, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5989458552946814, | |
| "grad_norm": 98.2167739868164, | |
| "learning_rate": 1.8973074806261558e-06, | |
| "loss": 4.4192, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.610924772400575, | |
| "grad_norm": 72.538330078125, | |
| "learning_rate": 1.8930981883139734e-06, | |
| "loss": 4.2753, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6229036895064686, | |
| "grad_norm": 121.5967788696289, | |
| "learning_rate": 1.8888092156514252e-06, | |
| "loss": 5.0462, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6348826066123623, | |
| "grad_norm": 161.8177947998047, | |
| "learning_rate": 1.8844409452916719e-06, | |
| "loss": 3.2489, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.6468615237182559, | |
| "grad_norm": 149.68197631835938, | |
| "learning_rate": 1.8799937669626481e-06, | |
| "loss": 4.8399, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6588404408241495, | |
| "grad_norm": 87.29440307617188, | |
| "learning_rate": 1.8754680774322934e-06, | |
| "loss": 5.3579, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6708193579300431, | |
| "grad_norm": 70.05744171142578, | |
| "learning_rate": 1.8708642804731513e-06, | |
| "loss": 1.967, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6827982750359367, | |
| "grad_norm": 49.84896469116211, | |
| "learning_rate": 1.866182786826347e-06, | |
| "loss": 4.1978, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6947771921418304, | |
| "grad_norm": 29.61354637145996, | |
| "learning_rate": 1.861424014164941e-06, | |
| "loss": 4.025, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.706756109247724, | |
| "grad_norm": 99.13072204589844, | |
| "learning_rate": 1.8565883870566666e-06, | |
| "loss": 4.1162, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7187350263536176, | |
| "grad_norm": 130.23606872558594, | |
| "learning_rate": 1.8516763369260492e-06, | |
| "loss": 3.0065, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7187350263536176, | |
| "eval_loss": 1.104053258895874, | |
| "eval_runtime": 238.4518, | |
| "eval_samples_per_second": 6.249, | |
| "eval_steps_per_second": 3.124, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7307139434595112, | |
| "grad_norm": 50.47702407836914, | |
| "learning_rate": 1.8466883020159161e-06, | |
| "loss": 4.3503, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7426928605654048, | |
| "grad_norm": 17.21928596496582, | |
| "learning_rate": 1.8416247273482988e-06, | |
| "loss": 4.4346, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7546717776712986, | |
| "grad_norm": 49.47705841064453, | |
| "learning_rate": 1.8364860646847262e-06, | |
| "loss": 3.9906, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7666506947771922, | |
| "grad_norm": 14.143331527709961, | |
| "learning_rate": 1.831272772485922e-06, | |
| "loss": 3.3026, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.7786296118830858, | |
| "grad_norm": 17.33100128173828, | |
| "learning_rate": 1.8259853158708997e-06, | |
| "loss": 6.0244, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7906085289889794, | |
| "grad_norm": 10.411093711853027, | |
| "learning_rate": 1.8206241665754687e-06, | |
| "loss": 2.8721, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.802587446094873, | |
| "grad_norm": 181.8240966796875, | |
| "learning_rate": 1.815189802910143e-06, | |
| "loss": 5.1721, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8145663632007667, | |
| "grad_norm": 39.83287048339844, | |
| "learning_rate": 1.80968270971747e-06, | |
| "loss": 4.9115, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8265452803066603, | |
| "grad_norm": 39.86928176879883, | |
| "learning_rate": 1.8041033783287737e-06, | |
| "loss": 3.8957, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8385241974125539, | |
| "grad_norm": 57.7371711730957, | |
| "learning_rate": 1.7984523065203188e-06, | |
| "loss": 3.1863, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8505031145184475, | |
| "grad_norm": 14.148628234863281, | |
| "learning_rate": 1.792729998468899e-06, | |
| "loss": 4.26, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8624820316243411, | |
| "grad_norm": 71.03279113769531, | |
| "learning_rate": 1.7869369647068577e-06, | |
| "loss": 4.9559, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8744609487302348, | |
| "grad_norm": 17.670730590820312, | |
| "learning_rate": 1.7810737220765372e-06, | |
| "loss": 3.9867, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.8864398658361284, | |
| "grad_norm": 14.698404312133789, | |
| "learning_rate": 1.7751407936841684e-06, | |
| "loss": 2.7134, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.898418782942022, | |
| "grad_norm": 66.42393493652344, | |
| "learning_rate": 1.7691387088532001e-06, | |
| "loss": 3.2121, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9103977000479156, | |
| "grad_norm": 76.34748077392578, | |
| "learning_rate": 1.7630680030770732e-06, | |
| "loss": 4.7613, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9223766171538093, | |
| "grad_norm": 46.27962112426758, | |
| "learning_rate": 1.7569292179714465e-06, | |
| "loss": 3.2976, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.934355534259703, | |
| "grad_norm": 14.20971965789795, | |
| "learning_rate": 1.750722901225873e-06, | |
| "loss": 1.9176, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9463344513655966, | |
| "grad_norm": 56.962379455566406, | |
| "learning_rate": 1.7444496065549384e-06, | |
| "loss": 1.9859, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9583133684714902, | |
| "grad_norm": 32.20167541503906, | |
| "learning_rate": 1.7381098936488574e-06, | |
| "loss": 6.9549, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.9583133684714902, | |
| "eval_loss": 1.0978227853775024, | |
| "eval_runtime": 238.5511, | |
| "eval_samples_per_second": 6.246, | |
| "eval_steps_per_second": 3.123, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.9702922855773838, | |
| "grad_norm": 82.94165802001953, | |
| "learning_rate": 1.7317043281235418e-06, | |
| "loss": 4.1317, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.9822712026832774, | |
| "grad_norm": 110.44422912597656, | |
| "learning_rate": 1.725233481470135e-06, | |
| "loss": 3.2924, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.9942501197891711, | |
| "grad_norm": 88.77394104003906, | |
| "learning_rate": 1.7186979310040268e-06, | |
| "loss": 5.5422, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.0071873502635362, | |
| "grad_norm": 167.41412353515625, | |
| "learning_rate": 1.7120982598133456e-06, | |
| "loss": 3.5133, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.0191662673694297, | |
| "grad_norm": 16.926607131958008, | |
| "learning_rate": 1.7054350567069364e-06, | |
| "loss": 4.2376, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.0311451844753234, | |
| "grad_norm": 96.12760925292969, | |
| "learning_rate": 1.698708916161829e-06, | |
| "loss": 3.6823, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.0431241015812172, | |
| "grad_norm": 103.28370666503906, | |
| "learning_rate": 1.6919204382701987e-06, | |
| "loss": 2.5705, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.0551030186871106, | |
| "grad_norm": 166.0828857421875, | |
| "learning_rate": 1.6850702286858298e-06, | |
| "loss": 2.9061, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.0670819357930044, | |
| "grad_norm": 136.93392944335938, | |
| "learning_rate": 1.678158898570078e-06, | |
| "loss": 2.8635, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.0790608528988979, | |
| "grad_norm": 14.542271614074707, | |
| "learning_rate": 1.6711870645373449e-06, | |
| "loss": 4.2555, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.0910397700047916, | |
| "grad_norm": 53.567359924316406, | |
| "learning_rate": 1.6641553486000651e-06, | |
| "loss": 3.1885, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.1030186871106853, | |
| "grad_norm": 100.29656982421875, | |
| "learning_rate": 1.6570643781132118e-06, | |
| "loss": 4.953, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.1149976042165788, | |
| "grad_norm": 89.06425476074219, | |
| "learning_rate": 1.649914785718324e-06, | |
| "loss": 4.9896, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.1269765213224725, | |
| "grad_norm": 17.858898162841797, | |
| "learning_rate": 1.6427072092870651e-06, | |
| "loss": 1.5295, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.138955438428366, | |
| "grad_norm": 15.720714569091797, | |
| "learning_rate": 1.6354422918643133e-06, | |
| "loss": 3.0117, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.1509343555342597, | |
| "grad_norm": 13.404827117919922, | |
| "learning_rate": 1.628120681610789e-06, | |
| "loss": 2.1361, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.1629132726401532, | |
| "grad_norm": 130.946044921875, | |
| "learning_rate": 1.6207430317452297e-06, | |
| "loss": 3.941, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.174892189746047, | |
| "grad_norm": 14.730375289916992, | |
| "learning_rate": 1.613310000486108e-06, | |
| "loss": 3.2318, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.1868711068519406, | |
| "grad_norm": 82.65552520751953, | |
| "learning_rate": 1.6058222509929096e-06, | |
| "loss": 3.9045, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.1988500239578341, | |
| "grad_norm": null, | |
| "learning_rate": 1.5982804513069664e-06, | |
| "loss": 5.5404, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.1988500239578341, | |
| "eval_loss": 1.1037527322769165, | |
| "eval_runtime": 238.5531, | |
| "eval_samples_per_second": 6.246, | |
| "eval_steps_per_second": 3.123, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.2108289410637278, | |
| "grad_norm": 67.75409698486328, | |
| "learning_rate": 1.5914471746978935e-06, | |
| "loss": 2.6392, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.2228078581696216, | |
| "grad_norm": 49.21822738647461, | |
| "learning_rate": 1.5838045373221053e-06, | |
| "loss": 4.0259, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.234786775275515, | |
| "grad_norm": 228.187744140625, | |
| "learning_rate": 1.5761098141278849e-06, | |
| "loss": 5.8343, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.2467656923814088, | |
| "grad_norm": 33.68708419799805, | |
| "learning_rate": 1.5683636916223236e-06, | |
| "loss": 3.9807, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.2587446094873023, | |
| "grad_norm": 140.39187622070312, | |
| "learning_rate": 1.5605668608982526e-06, | |
| "loss": 3.9716, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.270723526593196, | |
| "grad_norm": 11.902155876159668, | |
| "learning_rate": 1.5527200175725842e-06, | |
| "loss": 3.2315, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.2827024436990895, | |
| "grad_norm": 144.3499298095703, | |
| "learning_rate": 1.5448238617242488e-06, | |
| "loss": 2.6336, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.2946813608049832, | |
| "grad_norm": 59.747928619384766, | |
| "learning_rate": 1.5368790978317395e-06, | |
| "loss": 3.206, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.306660277910877, | |
| "grad_norm": 53.534950256347656, | |
| "learning_rate": 1.5288864347102545e-06, | |
| "loss": 4.3036, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.3186391950167704, | |
| "grad_norm": 48.62434387207031, | |
| "learning_rate": 1.520846585448463e-06, | |
| "loss": 2.4486, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.3306181121226641, | |
| "grad_norm": 18.836971282958984, | |
| "learning_rate": 1.512760267344882e-06, | |
| "loss": 4.0121, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.3425970292285578, | |
| "grad_norm": 224.24917602539062, | |
| "learning_rate": 1.5046282018438814e-06, | |
| "loss": 2.8545, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.3545759463344513, | |
| "grad_norm": 18.120187759399414, | |
| "learning_rate": 1.4964511144713174e-06, | |
| "loss": 3.1619, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.366554863440345, | |
| "grad_norm": 39.76359939575195, | |
| "learning_rate": 1.4882297347698048e-06, | |
| "loss": 3.0413, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.3785337805462385, | |
| "grad_norm": 64.61255645751953, | |
| "learning_rate": 1.4799647962336255e-06, | |
| "loss": 3.8001, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.3905126976521323, | |
| "grad_norm": 12.653026580810547, | |
| "learning_rate": 1.471657036243291e-06, | |
| "loss": 5.532, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.4024916147580258, | |
| "grad_norm": 53.37699508666992, | |
| "learning_rate": 1.4633071959997525e-06, | |
| "loss": 3.4156, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.4144705318639195, | |
| "grad_norm": 37.08938217163086, | |
| "learning_rate": 1.4549160204582731e-06, | |
| "loss": 2.5073, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.4264494489698132, | |
| "grad_norm": 139.479736328125, | |
| "learning_rate": 1.4464842582619652e-06, | |
| "loss": 3.36, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.4384283660757067, | |
| "grad_norm": 88.59599304199219, | |
| "learning_rate": 1.4380126616749975e-06, | |
| "loss": 5.2213, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.4384283660757067, | |
| "eval_loss": 1.1036500930786133, | |
| "eval_runtime": 238.313, | |
| "eval_samples_per_second": 6.252, | |
| "eval_steps_per_second": 3.126, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.4504072831816004, | |
| "grad_norm": 129.61988830566406, | |
| "learning_rate": 1.4295019865154785e-06, | |
| "loss": 7.1682, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.462386200287494, | |
| "grad_norm": 19.823545455932617, | |
| "learning_rate": 1.4209529920880272e-06, | |
| "loss": 4.6843, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.4743651173933876, | |
| "grad_norm": 107.53630065917969, | |
| "learning_rate": 1.4123664411160252e-06, | |
| "loss": 2.4525, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.4863440344992813, | |
| "grad_norm": 54.906280517578125, | |
| "learning_rate": 1.4037430996735722e-06, | |
| "loss": 5.9388, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.4983229516051748, | |
| "grad_norm": 77.29488372802734, | |
| "learning_rate": 1.3950837371171355e-06, | |
| "loss": 5.3705, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.5103018687110685, | |
| "grad_norm": 115.9428482055664, | |
| "learning_rate": 1.3863891260169114e-06, | |
| "loss": 4.0317, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.522280785816962, | |
| "grad_norm": 19.207189559936523, | |
| "learning_rate": 1.3776600420878973e-06, | |
| "loss": 3.8767, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.5342597029228557, | |
| "grad_norm": 83.13814544677734, | |
| "learning_rate": 1.3688972641206837e-06, | |
| "loss": 4.5835, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.5462386200287495, | |
| "grad_norm": 222.7005157470703, | |
| "learning_rate": 1.3601015739119733e-06, | |
| "loss": 3.3379, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.558217537134643, | |
| "grad_norm": 51.51054382324219, | |
| "learning_rate": 1.35127375619483e-06, | |
| "loss": 5.4397, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.5701964542405367, | |
| "grad_norm": 109.09092712402344, | |
| "learning_rate": 1.3424145985686662e-06, | |
| "loss": 3.1896, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.5821753713464304, | |
| "grad_norm": 11.890337944030762, | |
| "learning_rate": 1.333524891428976e-06, | |
| "loss": 4.4828, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.5941542884523239, | |
| "grad_norm": 18.87173843383789, | |
| "learning_rate": 1.324605427896817e-06, | |
| "loss": 2.4719, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.6061332055582176, | |
| "grad_norm": 110.26778411865234, | |
| "learning_rate": 1.3156570037480497e-06, | |
| "loss": 3.4721, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.6181121226641113, | |
| "grad_norm": 19.194913864135742, | |
| "learning_rate": 1.3066804173423397e-06, | |
| "loss": 4.3532, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.6300910397700048, | |
| "grad_norm": 89.65567016601562, | |
| "learning_rate": 1.297676469551931e-06, | |
| "loss": 4.1742, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.6420699568758983, | |
| "grad_norm": 15.179718017578125, | |
| "learning_rate": 1.2886459636901927e-06, | |
| "loss": 4.2612, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.654048873981792, | |
| "grad_norm": 14.557687759399414, | |
| "learning_rate": 1.2795897054399498e-06, | |
| "loss": 5.2594, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.6660277910876857, | |
| "grad_norm": 20.148624420166016, | |
| "learning_rate": 1.2705085027816008e-06, | |
| "loss": 3.3919, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.6780067081935792, | |
| "grad_norm": 9.0452241897583, | |
| "learning_rate": 1.261403165921032e-06, | |
| "loss": 4.3208, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.6780067081935792, | |
| "eval_loss": 1.0974289178848267, | |
| "eval_runtime": 238.4129, | |
| "eval_samples_per_second": 6.25, | |
| "eval_steps_per_second": 3.125, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.689985625299473, | |
| "grad_norm": 18.7681941986084, | |
| "learning_rate": 1.2522745072173336e-06, | |
| "loss": 2.5784, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.7019645424053667, | |
| "grad_norm": 112.4189224243164, | |
| "learning_rate": 1.243123341110321e-06, | |
| "loss": 4.0173, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.7139434595112601, | |
| "grad_norm": 13.051095008850098, | |
| "learning_rate": 1.2339504840478738e-06, | |
| "loss": 3.1098, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.7259223766171539, | |
| "grad_norm": 16.918392181396484, | |
| "learning_rate": 1.224756754413092e-06, | |
| "loss": 3.1983, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.7379012937230476, | |
| "grad_norm": 167.8545379638672, | |
| "learning_rate": 1.2155429724512838e-06, | |
| "loss": 4.8368, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.749880210828941, | |
| "grad_norm": 19.237834930419922, | |
| "learning_rate": 1.206309960196784e-06, | |
| "loss": 3.1809, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.7618591279348346, | |
| "grad_norm": 95.0063247680664, | |
| "learning_rate": 1.1970585413996132e-06, | |
| "loss": 3.9006, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.7738380450407283, | |
| "grad_norm": 55.70530319213867, | |
| "learning_rate": 1.1877895414519858e-06, | |
| "loss": 3.3394, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.785816962146622, | |
| "grad_norm": 23.173871994018555, | |
| "learning_rate": 1.1785037873146697e-06, | |
| "loss": 2.4079, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.7977958792525155, | |
| "grad_norm": 120.7456283569336, | |
| "learning_rate": 1.1692021074432054e-06, | |
| "loss": 4.2111, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.8097747963584092, | |
| "grad_norm": 89.36892700195312, | |
| "learning_rate": 1.1598853317139958e-06, | |
| "loss": 1.8205, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.821753713464303, | |
| "grad_norm": 66.96819305419922, | |
| "learning_rate": 1.150554291350263e-06, | |
| "loss": 4.6707, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.8337326305701964, | |
| "grad_norm": 52.6048583984375, | |
| "learning_rate": 1.1412098188478914e-06, | |
| "loss": 2.2611, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.8457115476760901, | |
| "grad_norm": 100.39757537841797, | |
| "learning_rate": 1.1318527479011513e-06, | |
| "loss": 3.3554, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.8576904647819839, | |
| "grad_norm": 12.364606857299805, | |
| "learning_rate": 1.1224839133283208e-06, | |
| "loss": 2.7868, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.8696693818878773, | |
| "grad_norm": 93.31403350830078, | |
| "learning_rate": 1.1131041509972032e-06, | |
| "loss": 3.7607, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.8816482989937708, | |
| "grad_norm": 77.30158233642578, | |
| "learning_rate": 1.1037142977505548e-06, | |
| "loss": 3.28, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.8936272160996646, | |
| "grad_norm": 13.82420825958252, | |
| "learning_rate": 1.0943151913314211e-06, | |
| "loss": 3.3544, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.9056061332055583, | |
| "grad_norm": 16.398128509521484, | |
| "learning_rate": 1.084907670308397e-06, | |
| "loss": 2.7871, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.9175850503114518, | |
| "grad_norm": 19.750925064086914, | |
| "learning_rate": 1.0754925740008098e-06, | |
| "loss": 4.1985, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.9175850503114518, | |
| "eval_loss": 1.1042989492416382, | |
| "eval_runtime": 238.4482, | |
| "eval_samples_per_second": 6.249, | |
| "eval_steps_per_second": 3.124, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.9295639674173455, | |
| "grad_norm": 16.436159133911133, | |
| "learning_rate": 1.066070742403839e-06, | |
| "loss": 3.9566, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.9415428845232392, | |
| "grad_norm": 67.2239761352539, | |
| "learning_rate": 1.056643016113572e-06, | |
| "loss": 4.0604, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.9535218016291327, | |
| "grad_norm": 52.419456481933594, | |
| "learning_rate": 1.047210236252008e-06, | |
| "loss": 4.4566, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.9655007187350264, | |
| "grad_norm": 102.50648498535156, | |
| "learning_rate": 1.0377732443920155e-06, | |
| "loss": 2.5929, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.9774796358409201, | |
| "grad_norm": 85.2761459350586, | |
| "learning_rate": 1.0283328824822498e-06, | |
| "loss": 3.278, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.9894585529468136, | |
| "grad_norm": 111.09942626953125, | |
| "learning_rate": 1.0188899927720324e-06, | |
| "loss": 2.1727, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.0023957834211785, | |
| "grad_norm": 88.2173080444336, | |
| "learning_rate": 1.009445417736213e-06, | |
| "loss": 4.7098, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.0143747005270725, | |
| "grad_norm": 15.37248420715332, | |
| "learning_rate": 1e-06, | |
| "loss": 3.3523, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.026353617632966, | |
| "grad_norm": 428.4352111816406, | |
| "learning_rate": 9.905545822637871e-07, | |
| "loss": 4.4776, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.0383325347388594, | |
| "grad_norm": 16.143062591552734, | |
| "learning_rate": 9.811100072279673e-07, | |
| "loss": 3.2249, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.0503114518447534, | |
| "grad_norm": 142.73899841308594, | |
| "learning_rate": 9.716671175177506e-07, | |
| "loss": 3.6488, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.062290368950647, | |
| "grad_norm": 86.692626953125, | |
| "learning_rate": 9.622267556079844e-07, | |
| "loss": 2.3491, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.0742692860565404, | |
| "grad_norm": 127.98802947998047, | |
| "learning_rate": 9.527897637479921e-07, | |
| "loss": 4.828, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.0862482031624343, | |
| "grad_norm": 14.027091026306152, | |
| "learning_rate": 9.433569838864282e-07, | |
| "loss": 4.176, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.098227120268328, | |
| "grad_norm": 16.68279266357422, | |
| "learning_rate": 9.33929257596161e-07, | |
| "loss": 2.8037, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.1102060373742213, | |
| "grad_norm": 68.17903900146484, | |
| "learning_rate": 9.245074259991904e-07, | |
| "loss": 3.448, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.122184954480115, | |
| "grad_norm": 90.32327270507812, | |
| "learning_rate": 9.150923296916032e-07, | |
| "loss": 3.3154, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.1341638715860087, | |
| "grad_norm": 218.0877227783203, | |
| "learning_rate": 9.056848086685789e-07, | |
| "loss": 3.8818, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.1461427886919022, | |
| "grad_norm": 117.9521484375, | |
| "learning_rate": 8.96285702249445e-07, | |
| "loss": 3.7345, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.1581217057977957, | |
| "grad_norm": 14.993733406066895, | |
| "learning_rate": 8.868958490027966e-07, | |
| "loss": 3.3709, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.1581217057977957, | |
| "eval_loss": 1.1172066926956177, | |
| "eval_runtime": 238.6675, | |
| "eval_samples_per_second": 6.243, | |
| "eval_steps_per_second": 3.121, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.1701006229036897, | |
| "grad_norm": 273.74365234375, | |
| "learning_rate": 8.775160866716791e-07, | |
| "loss": 4.06, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.182079540009583, | |
| "grad_norm": 11.733575820922852, | |
| "learning_rate": 8.681472520988488e-07, | |
| "loss": 2.4028, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.1940584571154766, | |
| "grad_norm": 270.543212890625, | |
| "learning_rate": 8.587901811521087e-07, | |
| "loss": 5.1853, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.2060373742213706, | |
| "grad_norm": 18.38782501220703, | |
| "learning_rate": 8.494457086497368e-07, | |
| "loss": 1.9458, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.218016291327264, | |
| "grad_norm": 127.68826293945312, | |
| "learning_rate": 8.401146682860041e-07, | |
| "loss": 3.2338, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.2299952084331576, | |
| "grad_norm": 186.4503173828125, | |
| "learning_rate": 8.307978925567945e-07, | |
| "loss": 2.5094, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.241974125539051, | |
| "grad_norm": 15.038443565368652, | |
| "learning_rate": 8.214962126853307e-07, | |
| "loss": 2.6388, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.253953042644945, | |
| "grad_norm": 74.26438903808594, | |
| "learning_rate": 8.122104585480143e-07, | |
| "loss": 2.24, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.2659319597508385, | |
| "grad_norm": 57.38971710205078, | |
| "learning_rate": 8.029414586003866e-07, | |
| "loss": 4.3915, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.277910876856732, | |
| "grad_norm": 221.86866760253906, | |
| "learning_rate": 7.93690039803216e-07, | |
| "loss": 3.0979, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.289889793962626, | |
| "grad_norm": 66.43515014648438, | |
| "learning_rate": 7.844570275487159e-07, | |
| "loss": 2.0459, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.3018687110685194, | |
| "grad_norm": 174.6913604736328, | |
| "learning_rate": 7.752432455869081e-07, | |
| "loss": 4.3113, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.313847628174413, | |
| "grad_norm": 12.269379615783691, | |
| "learning_rate": 7.660495159521264e-07, | |
| "loss": 2.6802, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.3258265452803064, | |
| "grad_norm": 14.982316970825195, | |
| "learning_rate": 7.56876658889679e-07, | |
| "loss": 3.2325, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.3378054623862004, | |
| "grad_norm": 75.86939239501953, | |
| "learning_rate": 7.477254927826664e-07, | |
| "loss": 1.0064, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.349784379492094, | |
| "grad_norm": 12.59216594696045, | |
| "learning_rate": 7.38596834078968e-07, | |
| "loss": 2.3464, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.361763296597988, | |
| "grad_norm": 44.80192184448242, | |
| "learning_rate": 7.294914972183992e-07, | |
| "loss": 3.9336, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.3737422137038813, | |
| "grad_norm": 227.7502899169922, | |
| "learning_rate": 7.204102945600502e-07, | |
| "loss": 3.3652, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.3857211308097748, | |
| "grad_norm": 202.1083526611328, | |
| "learning_rate": 7.113540363098072e-07, | |
| "loss": 3.0293, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.3977000479156683, | |
| "grad_norm": 148.12112426757812, | |
| "learning_rate": 7.02323530448069e-07, | |
| "loss": 3.7548, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.3977000479156683, | |
| "eval_loss": 1.1212018728256226, | |
| "eval_runtime": 237.7587, | |
| "eval_samples_per_second": 6.267, | |
| "eval_steps_per_second": 3.133, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.409678965021562, | |
| "grad_norm": 107.33174133300781, | |
| "learning_rate": 6.933195826576603e-07, | |
| "loss": 3.9499, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.4216578821274557, | |
| "grad_norm": 15.232194900512695, | |
| "learning_rate": 6.843429962519504e-07, | |
| "loss": 4.3203, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.433636799233349, | |
| "grad_norm": 14.64511489868164, | |
| "learning_rate": 6.75394572103183e-07, | |
| "loss": 4.5243, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.445615716339243, | |
| "grad_norm": 16.1613712310791, | |
| "learning_rate": 6.664751085710239e-07, | |
| "loss": 3.5644, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.4575946334451366, | |
| "grad_norm": 61.8858642578125, | |
| "learning_rate": 6.575854014313338e-07, | |
| "loss": 3.7972, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.46957355055103, | |
| "grad_norm": 52.300514221191406, | |
| "learning_rate": 6.487262438051701e-07, | |
| "loss": 3.6956, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.4815524676569236, | |
| "grad_norm": 208.3380126953125, | |
| "learning_rate": 6.398984260880266e-07, | |
| "loss": 3.6895, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.4935313847628175, | |
| "grad_norm": 18.773216247558594, | |
| "learning_rate": 6.311027358793166e-07, | |
| "loss": 3.0383, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.505510301868711, | |
| "grad_norm": 19.703941345214844, | |
| "learning_rate": 6.223399579121029e-07, | |
| "loss": 2.5712, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.5174892189746045, | |
| "grad_norm": 63.63347625732422, | |
| "learning_rate": 6.136108739830886e-07, | |
| "loss": 2.2939, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.5294681360804985, | |
| "grad_norm": 23.118276596069336, | |
| "learning_rate": 6.049162628828644e-07, | |
| "loss": 3.329, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.541447053186392, | |
| "grad_norm": 16.502140045166016, | |
| "learning_rate": 5.962569003264276e-07, | |
| "loss": 4.0458, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.5534259702922855, | |
| "grad_norm": 11.94331169128418, | |
| "learning_rate": 5.876335588839746e-07, | |
| "loss": 3.7107, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.565404887398179, | |
| "grad_norm": 20.692808151245117, | |
| "learning_rate": 5.79047007911973e-07, | |
| "loss": 2.3799, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.577383804504073, | |
| "grad_norm": 170.5503692626953, | |
| "learning_rate": 5.704980134845213e-07, | |
| "loss": 2.808, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.5893627216099664, | |
| "grad_norm": 17.668119430541992, | |
| "learning_rate": 5.619873383250029e-07, | |
| "loss": 2.4657, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.6013416387158603, | |
| "grad_norm": 12.804652214050293, | |
| "learning_rate": 5.535157417380346e-07, | |
| "loss": 4.2857, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.613320555821754, | |
| "grad_norm": 231.0024871826172, | |
| "learning_rate": 5.450839795417266e-07, | |
| "loss": 5.443, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.6252994729276473, | |
| "grad_norm": 156.8734893798828, | |
| "learning_rate": 5.366928040002476e-07, | |
| "loss": 4.17, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.637278390033541, | |
| "grad_norm": 49.93342590332031, | |
| "learning_rate": 5.283429637567091e-07, | |
| "loss": 3.2694, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.637278390033541, | |
| "eval_loss": 1.1175537109375, | |
| "eval_runtime": 238.0084, | |
| "eval_samples_per_second": 6.26, | |
| "eval_steps_per_second": 3.13, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.6492573071394347, | |
| "grad_norm": 11.659183502197266, | |
| "learning_rate": 5.200352037663745e-07, | |
| "loss": 1.2186, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.6612362242453282, | |
| "grad_norm": 16.97669219970703, | |
| "learning_rate": 5.117702652301952e-07, | |
| "loss": 3.9984, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.6732151413512217, | |
| "grad_norm": 49.165199279785156, | |
| "learning_rate": 5.035488855286823e-07, | |
| "loss": 2.916, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.6851940584571157, | |
| "grad_norm": 14.636496543884277, | |
| "learning_rate": 4.953717981561186e-07, | |
| "loss": 3.1995, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.697172975563009, | |
| "grad_norm": 13.993462562561035, | |
| "learning_rate": 4.872397326551179e-07, | |
| "loss": 3.1531, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.7091518926689027, | |
| "grad_norm": 14.866923332214355, | |
| "learning_rate": 4.791534145515368e-07, | |
| "loss": 2.7951, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.721130809774796, | |
| "grad_norm": 12.874122619628906, | |
| "learning_rate": 4.711135652897452e-07, | |
| "loss": 4.0197, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.73310972688069, | |
| "grad_norm": 166.74929809570312, | |
| "learning_rate": 4.6312090216826074e-07, | |
| "loss": 4.324, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.7450886439865836, | |
| "grad_norm": 169.29539489746094, | |
| "learning_rate": 4.551761382757513e-07, | |
| "loss": 4.1737, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.757067561092477, | |
| "grad_norm": 12.698741912841797, | |
| "learning_rate": 4.4727998242741627e-07, | |
| "loss": 3.2982, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.769046478198371, | |
| "grad_norm": 97.41565704345703, | |
| "learning_rate": 4.394331391017474e-07, | |
| "loss": 3.0522, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.7810253953042645, | |
| "grad_norm": 13.397387504577637, | |
| "learning_rate": 4.316363083776766e-07, | |
| "loss": 5.0599, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.793004312410158, | |
| "grad_norm": 14.772340774536133, | |
| "learning_rate": 4.2389018587211524e-07, | |
| "loss": 2.43, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.8049832295160515, | |
| "grad_norm": 218.20303344726562, | |
| "learning_rate": 4.1619546267789453e-07, | |
| "loss": 5.5137, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.8169621466219454, | |
| "grad_norm": 312.639892578125, | |
| "learning_rate": 4.0855282530210676e-07, | |
| "loss": 4.4751, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.828941063727839, | |
| "grad_norm": 293.28387451171875, | |
| "learning_rate": 4.0096295560485547e-07, | |
| "loss": 4.1398, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.840919980833733, | |
| "grad_norm": 84.95050811767578, | |
| "learning_rate": 3.934265307384239e-07, | |
| "loss": 3.7418, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.8528988979396264, | |
| "grad_norm": 108.16950988769531, | |
| "learning_rate": 3.8594422308685793e-07, | |
| "loss": 3.392, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.86487781504552, | |
| "grad_norm": 258.593505859375, | |
| "learning_rate": 3.785167002059799e-07, | |
| "loss": 4.247, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.8768567321514134, | |
| "grad_norm": 247.61813354492188, | |
| "learning_rate": 3.7114462476382966e-07, | |
| "loss": 3.9058, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.8768567321514134, | |
| "eval_loss": 1.125927209854126, | |
| "eval_runtime": 237.8939, | |
| "eval_samples_per_second": 6.263, | |
| "eval_steps_per_second": 3.132, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.8888356492573073, | |
| "grad_norm": 82.83909606933594, | |
| "learning_rate": 3.6382865448154187e-07, | |
| "loss": 3.9744, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.900814566363201, | |
| "grad_norm": 289.5834655761719, | |
| "learning_rate": 3.5656944207466633e-07, | |
| "loss": 4.8423, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.9127934834690943, | |
| "grad_norm": 11.12870979309082, | |
| "learning_rate": 3.4936763519493495e-07, | |
| "loss": 1.8868, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.924772400574988, | |
| "grad_norm": 259.54205322265625, | |
| "learning_rate": 3.4222387637247806e-07, | |
| "loss": 5.073, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.9367513176808817, | |
| "grad_norm": 260.9941711425781, | |
| "learning_rate": 3.351388029585007e-07, | |
| "loss": 3.9769, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.948730234786775, | |
| "grad_norm": 138.75767517089844, | |
| "learning_rate": 3.281130470684166e-07, | |
| "loss": 1.8905, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.9607091518926687, | |
| "grad_norm": 212.35589599609375, | |
| "learning_rate": 3.2114723552545606e-07, | |
| "loss": 5.6336, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.9726880689985626, | |
| "grad_norm": 77.375, | |
| "learning_rate": 3.142419898047399e-07, | |
| "loss": 2.5325, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.984666986104456, | |
| "grad_norm": 301.5721740722656, | |
| "learning_rate": 3.073979259778332e-07, | |
| "loss": 2.8451, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.9966459032103496, | |
| "grad_norm": 23.281728744506836, | |
| "learning_rate": 3.006156546577796e-07, | |
| "loss": 4.271, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.009583133684715, | |
| "grad_norm": 60.59037399291992, | |
| "learning_rate": 2.9389578094462607e-07, | |
| "loss": 4.768, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 3.0215620507906085, | |
| "grad_norm": 12.599560737609863, | |
| "learning_rate": 2.872389043714343e-07, | |
| "loss": 3.4648, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 3.033540967896502, | |
| "grad_norm": 60.7338981628418, | |
| "learning_rate": 2.806456188507943e-07, | |
| "loss": 4.1664, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 3.045519885002396, | |
| "grad_norm": 124.4323959350586, | |
| "learning_rate": 2.7411651262183465e-07, | |
| "loss": 3.2584, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 3.0574988021082894, | |
| "grad_norm": 10.696157455444336, | |
| "learning_rate": 2.676521681977425e-07, | |
| "loss": 1.6209, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 3.069477719214183, | |
| "grad_norm": 78.30728149414062, | |
| "learning_rate": 2.612531623137922e-07, | |
| "loss": 2.9967, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 3.081456636320077, | |
| "grad_norm": 12.110499382019043, | |
| "learning_rate": 2.5492006587589033e-07, | |
| "loss": 1.8501, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 3.0934355534259703, | |
| "grad_norm": 142.67459106445312, | |
| "learning_rate": 2.4865344390964016e-07, | |
| "loss": 6.7426, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 3.105414470531864, | |
| "grad_norm": 84.46195220947266, | |
| "learning_rate": 2.424538555099326e-07, | |
| "loss": 3.011, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 3.1173933876377578, | |
| "grad_norm": 287.4181213378906, | |
| "learning_rate": 2.3632185379106383e-07, | |
| "loss": 4.2906, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.1173933876377578, | |
| "eval_loss": 1.1396493911743164, | |
| "eval_runtime": 240.643, | |
| "eval_samples_per_second": 6.192, | |
| "eval_steps_per_second": 3.096, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.1293723047436512, | |
| "grad_norm": 17.018083572387695, | |
| "learning_rate": 2.302579858373881e-07, | |
| "loss": 2.1286, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 3.1413512218495447, | |
| "grad_norm": 13.265679359436035, | |
| "learning_rate": 2.2426279265450708e-07, | |
| "loss": 2.9965, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 3.1533301389554382, | |
| "grad_norm": 156.8545684814453, | |
| "learning_rate": 2.183368091210037e-07, | |
| "loss": 3.7899, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 3.165309056061332, | |
| "grad_norm": 17.818618774414062, | |
| "learning_rate": 2.1248056394072078e-07, | |
| "loss": 4.1165, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 3.1772879731672257, | |
| "grad_norm": 15.270909309387207, | |
| "learning_rate": 2.0669457959559177e-07, | |
| "loss": 3.1192, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 3.189266890273119, | |
| "grad_norm": 78.96019744873047, | |
| "learning_rate": 2.0097937229902485e-07, | |
| "loss": 5.3403, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 3.201245807379013, | |
| "grad_norm": 376.2461242675781, | |
| "learning_rate": 1.9533545194984791e-07, | |
| "loss": 3.9551, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 3.2132247244849066, | |
| "grad_norm": 16.849010467529297, | |
| "learning_rate": 1.8976332208681744e-07, | |
| "loss": 5.6715, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 3.2252036415908, | |
| "grad_norm": 171.6584930419922, | |
| "learning_rate": 1.8426347984369273e-07, | |
| "loss": 5.2323, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 3.237182558696694, | |
| "grad_norm": 120.8387680053711, | |
| "learning_rate": 1.788364159048833e-07, | |
| "loss": 4.2853, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.2491614758025875, | |
| "grad_norm": 13.419800758361816, | |
| "learning_rate": 1.734826144616698e-07, | |
| "loss": 2.9811, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 3.261140392908481, | |
| "grad_norm": 196.2430877685547, | |
| "learning_rate": 1.6820255316900756e-07, | |
| "loss": 4.3565, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 3.2731193100143745, | |
| "grad_norm": 15.63242244720459, | |
| "learning_rate": 1.6299670310290915e-07, | |
| "loss": 2.4933, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 3.2850982271202684, | |
| "grad_norm": 60.21677017211914, | |
| "learning_rate": 1.5786552871841774e-07, | |
| "loss": 2.6513, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 3.297077144226162, | |
| "grad_norm": 223.22097778320312, | |
| "learning_rate": 1.528094878081677e-07, | |
| "loss": 3.3477, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 3.3090560613320554, | |
| "grad_norm": 95.59901428222656, | |
| "learning_rate": 1.478290314615427e-07, | |
| "loss": 3.0678, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 3.3210349784379494, | |
| "grad_norm": 36.64326095581055, | |
| "learning_rate": 1.4292460402442995e-07, | |
| "loss": 3.4483, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 3.333013895543843, | |
| "grad_norm": 233.20370483398438, | |
| "learning_rate": 1.3809664305957625e-07, | |
| "loss": 2.9447, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 3.3449928126497364, | |
| "grad_norm": 14.824277877807617, | |
| "learning_rate": 1.3334557930754963e-07, | |
| "loss": 2.547, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 3.35697172975563, | |
| "grad_norm": 208.10154724121094, | |
| "learning_rate": 1.2867183664831038e-07, | |
| "loss": 6.0572, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.35697172975563, | |
| "eval_loss": 1.147083044052124, | |
| "eval_runtime": 239.357, | |
| "eval_samples_per_second": 6.225, | |
| "eval_steps_per_second": 3.113, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.368950646861524, | |
| "grad_norm": 167.9786834716797, | |
| "learning_rate": 1.2407583206339256e-07, | |
| "loss": 2.6004, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 3.3809295639674173, | |
| "grad_norm": 236.07484436035156, | |
| "learning_rate": 1.195579755987024e-07, | |
| "loss": 2.3534, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 3.3929084810733112, | |
| "grad_norm": 38.18111801147461, | |
| "learning_rate": 1.1511867032793321e-07, | |
| "loss": 2.5498, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 3.4048873981792047, | |
| "grad_norm": 136.2451934814453, | |
| "learning_rate": 1.107583123166066e-07, | |
| "loss": 5.3208, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 3.416866315285098, | |
| "grad_norm": 316.2095947265625, | |
| "learning_rate": 1.0647729058673427e-07, | |
| "loss": 4.3772, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 3.4288452323909917, | |
| "grad_norm": 14.228015899658203, | |
| "learning_rate": 1.0227598708211172e-07, | |
| "loss": 2.2948, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 3.4408241494968856, | |
| "grad_norm": 52.19302749633789, | |
| "learning_rate": 9.81547766342401e-08, | |
| "loss": 2.223, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 3.452803066602779, | |
| "grad_norm": 18.04366683959961, | |
| "learning_rate": 9.411402692888715e-08, | |
| "loss": 3.6671, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 3.4647819837086726, | |
| "grad_norm": 12.00094985961914, | |
| "learning_rate": 9.015409847328037e-08, | |
| "loss": 2.3488, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 3.4767609008145666, | |
| "grad_norm": 88.41423797607422, | |
| "learning_rate": 8.62753445639457e-08, | |
| "loss": 3.2758, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.48873981792046, | |
| "grad_norm": 56.944828033447266, | |
| "learning_rate": 8.247811125518489e-08, | |
| "loss": 1.576, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 3.5007187350263536, | |
| "grad_norm": 323.6489562988281, | |
| "learning_rate": 7.876273732820327e-08, | |
| "loss": 4.9528, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 3.512697652132247, | |
| "grad_norm": 10.932809829711914, | |
| "learning_rate": 7.51295542608834e-08, | |
| "loss": 3.6918, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 3.524676569238141, | |
| "grad_norm": 12.945392608642578, | |
| "learning_rate": 7.157888619821106e-08, | |
| "loss": 1.7161, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 3.5366554863440345, | |
| "grad_norm": 358.42498779296875, | |
| "learning_rate": 6.811104992335648e-08, | |
| "loss": 4.4565, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 3.548634403449928, | |
| "grad_norm": 231.78411865234375, | |
| "learning_rate": 6.47263548294108e-08, | |
| "loss": 3.5586, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 3.560613320555822, | |
| "grad_norm": 231.84341430664062, | |
| "learning_rate": 6.142510289178337e-08, | |
| "loss": 3.4724, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 3.5725922376617154, | |
| "grad_norm": 198.78810119628906, | |
| "learning_rate": 5.8207588641260185e-08, | |
| "loss": 2.5415, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 3.584571154767609, | |
| "grad_norm": 76.49454498291016, | |
| "learning_rate": 5.507409913772543e-08, | |
| "loss": 3.7494, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 3.5965500718735024, | |
| "grad_norm": 262.359130859375, | |
| "learning_rate": 5.202491394455155e-08, | |
| "loss": 4.0544, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.5965500718735024, | |
| "eval_loss": 1.1526151895523071, | |
| "eval_runtime": 238.4822, | |
| "eval_samples_per_second": 6.248, | |
| "eval_steps_per_second": 3.124, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.6085289889793963, | |
| "grad_norm": 14.773038864135742, | |
| "learning_rate": 4.9060305103657e-08, | |
| "loss": 3.8126, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 3.62050790608529, | |
| "grad_norm": 288.3175354003906, | |
| "learning_rate": 4.61805371112356e-08, | |
| "loss": 2.2371, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 3.6324868231911838, | |
| "grad_norm": 15.71839427947998, | |
| "learning_rate": 4.3661497350331423e-08, | |
| "loss": 2.8255, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 3.6444657402970773, | |
| "grad_norm": 47.34138870239258, | |
| "learning_rate": 4.094362852900846e-08, | |
| "loss": 2.4564, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 3.6564446574029708, | |
| "grad_norm": 133.03443908691406, | |
| "learning_rate": 3.8311324709047524e-08, | |
| "loss": 3.9076, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 3.6684235745088642, | |
| "grad_norm": 85.79124450683594, | |
| "learning_rate": 3.57648207390836e-08, | |
| "loss": 1.6264, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 3.680402491614758, | |
| "grad_norm": 11.73284912109375, | |
| "learning_rate": 3.3304343812869175e-08, | |
| "loss": 2.2377, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 3.6923814087206517, | |
| "grad_norm": 103.52379608154297, | |
| "learning_rate": 3.0930113449003536e-08, | |
| "loss": 2.5226, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 3.704360325826545, | |
| "grad_norm": 121.46673583984375, | |
| "learning_rate": 2.8642341471348585e-08, | |
| "loss": 5.8129, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 3.716339242932439, | |
| "grad_norm": 39.363887786865234, | |
| "learning_rate": 2.644123199013004e-08, | |
| "loss": 1.8684, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.7283181600383326, | |
| "grad_norm": 22.17119598388672, | |
| "learning_rate": 2.432698138372713e-08, | |
| "loss": 4.5753, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 3.740297077144226, | |
| "grad_norm": 123.43873596191406, | |
| "learning_rate": 2.2299778281151927e-08, | |
| "loss": 3.4706, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 3.7522759942501196, | |
| "grad_norm": 62.01347351074219, | |
| "learning_rate": 2.03598035452206e-08, | |
| "loss": 1.0962, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 3.7642549113560135, | |
| "grad_norm": 106.05194091796875, | |
| "learning_rate": 1.8507230256417316e-08, | |
| "loss": 4.0847, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 3.776233828461907, | |
| "grad_norm": 14.505044937133789, | |
| "learning_rate": 1.674222369745182e-08, | |
| "loss": 2.7005, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 3.7882127455678005, | |
| "grad_norm": 74.14771270751953, | |
| "learning_rate": 1.5064941338513548e-08, | |
| "loss": 4.5833, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 3.8001916626736945, | |
| "grad_norm": 109.61576080322266, | |
| "learning_rate": 1.3475532823222779e-08, | |
| "loss": 3.3511, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 3.812170579779588, | |
| "grad_norm": 142.658447265625, | |
| "learning_rate": 1.1974139955279294e-08, | |
| "loss": 4.0569, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 3.8241494968854814, | |
| "grad_norm": 374.00701904296875, | |
| "learning_rate": 1.0560896685811061e-08, | |
| "loss": 3.657, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 3.836128413991375, | |
| "grad_norm": 191.76698303222656, | |
| "learning_rate": 9.235929101423457e-09, | |
| "loss": 3.2204, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.836128413991375, | |
| "eval_loss": 1.1512279510498047, | |
| "eval_runtime": 239.5597, | |
| "eval_samples_per_second": 6.22, | |
| "eval_steps_per_second": 3.11, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.848107331097269, | |
| "grad_norm": 89.6385726928711, | |
| "learning_rate": 7.99935541295016e-09, | |
| "loss": 4.2172, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 3.8600862482031624, | |
| "grad_norm": 52.579227447509766, | |
| "learning_rate": 6.8512859449064705e-09, | |
| "loss": 3.1792, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 3.8720651653090563, | |
| "grad_norm": 159.96087646484375, | |
| "learning_rate": 5.791823125646522e-09, | |
| "loss": 4.6078, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 3.88404408241495, | |
| "grad_norm": 248.06581115722656, | |
| "learning_rate": 4.8210614782245866e-09, | |
| "loss": 4.6227, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 3.8960229995208433, | |
| "grad_norm": 10.945176124572754, | |
| "learning_rate": 3.939087611962377e-09, | |
| "loss": 1.8884, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 3.908001916626737, | |
| "grad_norm": 279.2950439453125, | |
| "learning_rate": 3.1459802147214554e-09, | |
| "loss": 3.5358, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 3.9199808337326307, | |
| "grad_norm": 278.5760192871094, | |
| "learning_rate": 2.441810045883175e-09, | |
| "loss": 4.0377, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 3.9319597508385242, | |
| "grad_norm": 20.70584487915039, | |
| "learning_rate": 1.8266399300355118e-09, | |
| "loss": 4.9587, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 3.9439386679444177, | |
| "grad_norm": 238.9534912109375, | |
| "learning_rate": 1.300524751368326e-09, | |
| "loss": 3.8602, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 3.9559175850503117, | |
| "grad_norm": 16.01015281677246, | |
| "learning_rate": 8.635114487760553e-10, | |
| "loss": 1.4575, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.967896502156205, | |
| "grad_norm": 14.110264778137207, | |
| "learning_rate": 5.156390116707321e-10, | |
| "loss": 1.1673, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 3.9798754192620986, | |
| "grad_norm": 43.93592834472656, | |
| "learning_rate": 2.56938476502655e-10, | |
| "loss": 3.8769, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 3.991854336367992, | |
| "grad_norm": 9.19588851928711, | |
| "learning_rate": 8.743292399204793e-11, | |
| "loss": 2.4475, | |
| "step": 3330 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3336, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.982695446856335e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |