{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.070711128967457,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021427614838623278,
"grad_norm": 64.89540762075667,
"learning_rate": 5e-06,
"loss": 3.2563,
"num_input_tokens_seen": 1048576,
"step": 1
},
{
"epoch": 0.0042855229677246556,
"grad_norm": 64.53587667772443,
"learning_rate": 1e-05,
"loss": 3.2442,
"num_input_tokens_seen": 2097152,
"step": 2
},
{
"epoch": 0.0064282844515869825,
"grad_norm": 45.91705534890451,
"learning_rate": 1.5e-05,
"loss": 2.7435,
"num_input_tokens_seen": 3145728,
"step": 3
},
{
"epoch": 0.008571045935449311,
"grad_norm": 9.616577532098649,
"learning_rate": 2e-05,
"loss": 2.0932,
"num_input_tokens_seen": 4194304,
"step": 4
},
{
"epoch": 0.010713807419311638,
"grad_norm": 22.677650894260427,
"learning_rate": 2.5e-05,
"loss": 2.1313,
"num_input_tokens_seen": 5242880,
"step": 5
},
{
"epoch": 0.012856568903173965,
"grad_norm": 16.200400277863025,
"learning_rate": 3e-05,
"loss": 2.1563,
"num_input_tokens_seen": 6291456,
"step": 6
},
{
"epoch": 0.014999330387036294,
"grad_norm": 7.7723602177379725,
"learning_rate": 3.5e-05,
"loss": 1.9378,
"num_input_tokens_seen": 7340032,
"step": 7
},
{
"epoch": 0.017142091870898622,
"grad_norm": 8.349008010722175,
"learning_rate": 4e-05,
"loss": 1.8095,
"num_input_tokens_seen": 8388608,
"step": 8
},
{
"epoch": 0.019284853354760947,
"grad_norm": 4.24057283338546,
"learning_rate": 4.5e-05,
"loss": 1.6948,
"num_input_tokens_seen": 9437184,
"step": 9
},
{
"epoch": 0.021427614838623276,
"grad_norm": 9.738414333035731,
"learning_rate": 5e-05,
"loss": 1.7145,
"num_input_tokens_seen": 10485760,
"step": 10
},
{
"epoch": 0.023570376322485605,
"grad_norm": 9.427464720180852,
"learning_rate": 4.999999429436697e-05,
"loss": 1.7124,
"num_input_tokens_seen": 11534336,
"step": 11
},
{
"epoch": 0.02571313780634793,
"grad_norm": 3.2252035671130743,
"learning_rate": 4.9999977177470465e-05,
"loss": 1.6181,
"num_input_tokens_seen": 12582912,
"step": 12
},
{
"epoch": 0.02785589929021026,
"grad_norm": 5.389002593456943,
"learning_rate": 4.999994864931831e-05,
"loss": 1.5381,
"num_input_tokens_seen": 13631488,
"step": 13
},
{
"epoch": 0.029998660774072587,
"grad_norm": 2.93969979997987,
"learning_rate": 4.999990870992352e-05,
"loss": 1.532,
"num_input_tokens_seen": 14680064,
"step": 14
},
{
"epoch": 0.03214142225793491,
"grad_norm": 3.0591292630760933,
"learning_rate": 4.999985735930432e-05,
"loss": 1.4952,
"num_input_tokens_seen": 15728640,
"step": 15
},
{
"epoch": 0.034284183741797244,
"grad_norm": 3.324378449482722,
"learning_rate": 4.9999794597484165e-05,
"loss": 1.4567,
"num_input_tokens_seen": 16777216,
"step": 16
},
{
"epoch": 0.03642694522565957,
"grad_norm": 4.561130115369689,
"learning_rate": 4.999972042449169e-05,
"loss": 1.4686,
"num_input_tokens_seen": 17825792,
"step": 17
},
{
"epoch": 0.038569706709521895,
"grad_norm": 1.780527582253664,
"learning_rate": 4.9999634840360755e-05,
"loss": 1.4052,
"num_input_tokens_seen": 18874368,
"step": 18
},
{
"epoch": 0.04071246819338423,
"grad_norm": 3.117995934114996,
"learning_rate": 4.9999537845130426e-05,
"loss": 1.4083,
"num_input_tokens_seen": 19922944,
"step": 19
},
{
"epoch": 0.04285522967724655,
"grad_norm": 2.848287146164459,
"learning_rate": 4.999942943884498e-05,
"loss": 1.3887,
"num_input_tokens_seen": 20971520,
"step": 20
},
{
"epoch": 0.04499799116110888,
"grad_norm": 1.69625375895056,
"learning_rate": 4.9999309621553894e-05,
"loss": 1.349,
"num_input_tokens_seen": 22020096,
"step": 21
},
{
"epoch": 0.04714075264497121,
"grad_norm": 2.567244377686529,
"learning_rate": 4.9999178393311855e-05,
"loss": 1.3423,
"num_input_tokens_seen": 23068672,
"step": 22
},
{
"epoch": 0.049283514128833535,
"grad_norm": 1.7526016889237623,
"learning_rate": 4.999903575417877e-05,
"loss": 1.3301,
"num_input_tokens_seen": 24117248,
"step": 23
},
{
"epoch": 0.05142627561269586,
"grad_norm": 2.1556250824756282,
"learning_rate": 4.9998881704219745e-05,
"loss": 1.3152,
"num_input_tokens_seen": 25165824,
"step": 24
},
{
"epoch": 0.05356903709655819,
"grad_norm": 1.8871936642830933,
"learning_rate": 4.9998716243505096e-05,
"loss": 1.304,
"num_input_tokens_seen": 26214400,
"step": 25
},
{
"epoch": 0.05571179858042052,
"grad_norm": 1.674338621481819,
"learning_rate": 4.999853937211034e-05,
"loss": 1.2796,
"num_input_tokens_seen": 27262976,
"step": 26
},
{
"epoch": 0.05785456006428284,
"grad_norm": 1.761831320598704,
"learning_rate": 4.9998351090116226e-05,
"loss": 1.2732,
"num_input_tokens_seen": 28311552,
"step": 27
},
{
"epoch": 0.059997321548145174,
"grad_norm": 1.7061574034058262,
"learning_rate": 4.9998151397608674e-05,
"loss": 1.2686,
"num_input_tokens_seen": 29360128,
"step": 28
},
{
"epoch": 0.0621400830320075,
"grad_norm": 1.5863747354870246,
"learning_rate": 4.999794029467886e-05,
"loss": 1.2613,
"num_input_tokens_seen": 30408704,
"step": 29
},
{
"epoch": 0.06428284451586982,
"grad_norm": 1.7274454226000222,
"learning_rate": 4.9997717781423114e-05,
"loss": 1.2526,
"num_input_tokens_seen": 31457280,
"step": 30
},
{
"epoch": 0.06642560599973216,
"grad_norm": 1.4317285831126387,
"learning_rate": 4.999748385794302e-05,
"loss": 1.2329,
"num_input_tokens_seen": 32505856,
"step": 31
},
{
"epoch": 0.06856836748359449,
"grad_norm": 1.8999621450491984,
"learning_rate": 4.999723852434535e-05,
"loss": 1.2436,
"num_input_tokens_seen": 33554432,
"step": 32
},
{
"epoch": 0.07071112896745681,
"grad_norm": 1.4448128803947724,
"learning_rate": 4.999698178074209e-05,
"loss": 1.2355,
"num_input_tokens_seen": 34603008,
"step": 33
},
{
"epoch": 0.07285389045131914,
"grad_norm": 2.144552654239913,
"learning_rate": 4.9996713627250426e-05,
"loss": 1.2217,
"num_input_tokens_seen": 35651584,
"step": 34
},
{
"epoch": 0.07499665193518147,
"grad_norm": 1.1224127832608906,
"learning_rate": 4.999643406399275e-05,
"loss": 1.2163,
"num_input_tokens_seen": 36700160,
"step": 35
},
{
"epoch": 0.07713941341904379,
"grad_norm": 2.0366823883396057,
"learning_rate": 4.9996143091096684e-05,
"loss": 1.2142,
"num_input_tokens_seen": 37748736,
"step": 36
},
{
"epoch": 0.07928217490290612,
"grad_norm": 1.296430607752612,
"learning_rate": 4.999584070869502e-05,
"loss": 1.2073,
"num_input_tokens_seen": 38797312,
"step": 37
},
{
"epoch": 0.08142493638676845,
"grad_norm": 1.4801029998241608,
"learning_rate": 4.999552691692581e-05,
"loss": 1.2124,
"num_input_tokens_seen": 39845888,
"step": 38
},
{
"epoch": 0.08356769787063077,
"grad_norm": 1.4660757543282248,
"learning_rate": 4.999520171593226e-05,
"loss": 1.1989,
"num_input_tokens_seen": 40894464,
"step": 39
},
{
"epoch": 0.0857104593544931,
"grad_norm": 1.7036809143512879,
"learning_rate": 4.999486510586282e-05,
"loss": 1.1902,
"num_input_tokens_seen": 41943040,
"step": 40
},
{
"epoch": 0.08785322083835544,
"grad_norm": 1.5061981122893944,
"learning_rate": 4.999451708687114e-05,
"loss": 1.1964,
"num_input_tokens_seen": 42991616,
"step": 41
},
{
"epoch": 0.08999598232221775,
"grad_norm": 1.050371458696268,
"learning_rate": 4.999415765911606e-05,
"loss": 1.1799,
"num_input_tokens_seen": 44040192,
"step": 42
},
{
"epoch": 0.09213874380608009,
"grad_norm": 1.6332624514974972,
"learning_rate": 4.9993786822761656e-05,
"loss": 1.1769,
"num_input_tokens_seen": 45088768,
"step": 43
},
{
"epoch": 0.09428150528994242,
"grad_norm": 1.351155620545513,
"learning_rate": 4.999340457797718e-05,
"loss": 1.1779,
"num_input_tokens_seen": 46137344,
"step": 44
},
{
"epoch": 0.09642426677380474,
"grad_norm": 1.2370952346467414,
"learning_rate": 4.999301092493712e-05,
"loss": 1.183,
"num_input_tokens_seen": 47185920,
"step": 45
},
{
"epoch": 0.09856702825766707,
"grad_norm": 1.4038096900765242,
"learning_rate": 4.999260586382116e-05,
"loss": 1.1645,
"num_input_tokens_seen": 48234496,
"step": 46
},
{
"epoch": 0.1007097897415294,
"grad_norm": 1.1452882430899725,
"learning_rate": 4.999218939481418e-05,
"loss": 1.1727,
"num_input_tokens_seen": 49283072,
"step": 47
},
{
"epoch": 0.10285255122539172,
"grad_norm": 1.3160375257186312,
"learning_rate": 4.999176151810629e-05,
"loss": 1.1574,
"num_input_tokens_seen": 50331648,
"step": 48
},
{
"epoch": 0.10499531270925405,
"grad_norm": 1.1507076301290393,
"learning_rate": 4.9991322233892784e-05,
"loss": 1.1581,
"num_input_tokens_seen": 51380224,
"step": 49
},
{
"epoch": 0.10713807419311638,
"grad_norm": 1.6090478698286774,
"learning_rate": 4.999087154237418e-05,
"loss": 1.1568,
"num_input_tokens_seen": 52428800,
"step": 50
},
{
"epoch": 0.1092808356769787,
"grad_norm": 1.2451517727795873,
"learning_rate": 4.999040944375619e-05,
"loss": 1.1469,
"num_input_tokens_seen": 53477376,
"step": 51
},
{
"epoch": 0.11142359716084103,
"grad_norm": 1.3185344813227535,
"learning_rate": 4.998993593824975e-05,
"loss": 1.1446,
"num_input_tokens_seen": 54525952,
"step": 52
},
{
"epoch": 0.11356635864470337,
"grad_norm": 1.3295965754074688,
"learning_rate": 4.9989451026070975e-05,
"loss": 1.1575,
"num_input_tokens_seen": 55574528,
"step": 53
},
{
"epoch": 0.11570912012856568,
"grad_norm": 1.3620844847038756,
"learning_rate": 4.9988954707441226e-05,
"loss": 1.137,
"num_input_tokens_seen": 56623104,
"step": 54
},
{
"epoch": 0.11785188161242802,
"grad_norm": 1.14332778163853,
"learning_rate": 4.9988446982587035e-05,
"loss": 1.1377,
"num_input_tokens_seen": 57671680,
"step": 55
},
{
"epoch": 0.11999464309629035,
"grad_norm": 1.6078355451981008,
"learning_rate": 4.998792785174014e-05,
"loss": 1.1424,
"num_input_tokens_seen": 58720256,
"step": 56
},
{
"epoch": 0.12213740458015267,
"grad_norm": 1.092036338689917,
"learning_rate": 4.998739731513753e-05,
"loss": 1.1428,
"num_input_tokens_seen": 59768832,
"step": 57
},
{
"epoch": 0.124280166064015,
"grad_norm": 1.2636480648876944,
"learning_rate": 4.998685537302135e-05,
"loss": 1.1343,
"num_input_tokens_seen": 60817408,
"step": 58
},
{
"epoch": 0.12642292754787732,
"grad_norm": 1.107538929643645,
"learning_rate": 4.998630202563896e-05,
"loss": 1.1321,
"num_input_tokens_seen": 61865984,
"step": 59
},
{
"epoch": 0.12856568903173965,
"grad_norm": 1.4992869675310534,
"learning_rate": 4.998573727324295e-05,
"loss": 1.1337,
"num_input_tokens_seen": 62914560,
"step": 60
},
{
"epoch": 0.13070845051560198,
"grad_norm": 1.2531055693275621,
"learning_rate": 4.998516111609111e-05,
"loss": 1.127,
"num_input_tokens_seen": 63963136,
"step": 61
},
{
"epoch": 0.1328512119994643,
"grad_norm": 1.2048006235711455,
"learning_rate": 4.9984573554446404e-05,
"loss": 1.1165,
"num_input_tokens_seen": 65011712,
"step": 62
},
{
"epoch": 0.13499397348332665,
"grad_norm": 1.4888034799511158,
"learning_rate": 4.998397458857704e-05,
"loss": 1.1274,
"num_input_tokens_seen": 66060288,
"step": 63
},
{
"epoch": 0.13713673496718898,
"grad_norm": 0.8078512840634009,
"learning_rate": 4.998336421875641e-05,
"loss": 1.1263,
"num_input_tokens_seen": 67108864,
"step": 64
},
{
"epoch": 0.13927949645105128,
"grad_norm": 1.0550151301129909,
"learning_rate": 4.998274244526313e-05,
"loss": 1.1194,
"num_input_tokens_seen": 68157440,
"step": 65
},
{
"epoch": 0.14142225793491361,
"grad_norm": 1.7338596462962559,
"learning_rate": 4.9982109268380995e-05,
"loss": 1.13,
"num_input_tokens_seen": 69206016,
"step": 66
},
{
"epoch": 0.14356501941877595,
"grad_norm": 0.9679831352642563,
"learning_rate": 4.998146468839903e-05,
"loss": 1.1263,
"num_input_tokens_seen": 70254592,
"step": 67
},
{
"epoch": 0.14570778090263828,
"grad_norm": 1.4660673328866483,
"learning_rate": 4.9980808705611435e-05,
"loss": 1.1121,
"num_input_tokens_seen": 71303168,
"step": 68
},
{
"epoch": 0.1478505423865006,
"grad_norm": 1.2072777943721469,
"learning_rate": 4.998014132031766e-05,
"loss": 1.1041,
"num_input_tokens_seen": 72351744,
"step": 69
},
{
"epoch": 0.14999330387036294,
"grad_norm": 1.572431062612095,
"learning_rate": 4.997946253282231e-05,
"loss": 1.1131,
"num_input_tokens_seen": 73400320,
"step": 70
},
{
"epoch": 0.15213606535422525,
"grad_norm": 1.0201292692682673,
"learning_rate": 4.9978772343435234e-05,
"loss": 1.1053,
"num_input_tokens_seen": 74448896,
"step": 71
},
{
"epoch": 0.15427882683808758,
"grad_norm": 1.8688137247551255,
"learning_rate": 4.997807075247146e-05,
"loss": 1.1142,
"num_input_tokens_seen": 75497472,
"step": 72
},
{
"epoch": 0.1564215883219499,
"grad_norm": 1.121453660135061,
"learning_rate": 4.997735776025124e-05,
"loss": 1.1163,
"num_input_tokens_seen": 76546048,
"step": 73
},
{
"epoch": 0.15856434980581224,
"grad_norm": 1.3729465860631151,
"learning_rate": 4.99766333671e-05,
"loss": 1.1063,
"num_input_tokens_seen": 77594624,
"step": 74
},
{
"epoch": 0.16070711128967458,
"grad_norm": 1.1291336752784253,
"learning_rate": 4.997589757334842e-05,
"loss": 1.1002,
"num_input_tokens_seen": 78643200,
"step": 75
},
{
"epoch": 0.1628498727735369,
"grad_norm": 1.0444442505925857,
"learning_rate": 4.997515037933232e-05,
"loss": 1.1045,
"num_input_tokens_seen": 79691776,
"step": 76
},
{
"epoch": 0.1649926342573992,
"grad_norm": 1.1429743430825556,
"learning_rate": 4.997439178539278e-05,
"loss": 1.0939,
"num_input_tokens_seen": 80740352,
"step": 77
},
{
"epoch": 0.16713539574126154,
"grad_norm": 1.3216220734620914,
"learning_rate": 4.9973621791876055e-05,
"loss": 1.1102,
"num_input_tokens_seen": 81788928,
"step": 78
},
{
"epoch": 0.16927815722512388,
"grad_norm": 0.9852096617413347,
"learning_rate": 4.99728403991336e-05,
"loss": 1.0977,
"num_input_tokens_seen": 82837504,
"step": 79
},
{
"epoch": 0.1714209187089862,
"grad_norm": 1.2499776417522643,
"learning_rate": 4.99720476075221e-05,
"loss": 1.0953,
"num_input_tokens_seen": 83886080,
"step": 80
},
{
"epoch": 0.17356368019284854,
"grad_norm": 1.0988616217785656,
"learning_rate": 4.9971243417403414e-05,
"loss": 1.0947,
"num_input_tokens_seen": 84934656,
"step": 81
},
{
"epoch": 0.17570644167671087,
"grad_norm": 1.0199454529298497,
"learning_rate": 4.997042782914462e-05,
"loss": 1.0728,
"num_input_tokens_seen": 85983232,
"step": 82
},
{
"epoch": 0.17784920316057318,
"grad_norm": 0.8316506162348385,
"learning_rate": 4.996960084311798e-05,
"loss": 1.0929,
"num_input_tokens_seen": 87031808,
"step": 83
},
{
"epoch": 0.1799919646444355,
"grad_norm": 1.284554753733248,
"learning_rate": 4.9968762459700994e-05,
"loss": 1.0885,
"num_input_tokens_seen": 88080384,
"step": 84
},
{
"epoch": 0.18213472612829784,
"grad_norm": 1.2390646541659713,
"learning_rate": 4.9967912679276316e-05,
"loss": 1.0849,
"num_input_tokens_seen": 89128960,
"step": 85
},
{
"epoch": 0.18427748761216017,
"grad_norm": 1.123613767329084,
"learning_rate": 4.996705150223186e-05,
"loss": 1.0875,
"num_input_tokens_seen": 90177536,
"step": 86
},
{
"epoch": 0.1864202490960225,
"grad_norm": 1.0399206213607557,
"learning_rate": 4.996617892896069e-05,
"loss": 1.0828,
"num_input_tokens_seen": 91226112,
"step": 87
},
{
"epoch": 0.18856301057988484,
"grad_norm": 1.3129154484684602,
"learning_rate": 4.9965294959861095e-05,
"loss": 1.0904,
"num_input_tokens_seen": 92274688,
"step": 88
},
{
"epoch": 0.19070577206374714,
"grad_norm": 1.1647296416332669,
"learning_rate": 4.996439959533656e-05,
"loss": 1.0906,
"num_input_tokens_seen": 93323264,
"step": 89
},
{
"epoch": 0.19284853354760947,
"grad_norm": 1.3475745928522973,
"learning_rate": 4.9963492835795797e-05,
"loss": 1.0856,
"num_input_tokens_seen": 94371840,
"step": 90
},
{
"epoch": 0.1949912950314718,
"grad_norm": 1.1041626171352936,
"learning_rate": 4.9962574681652675e-05,
"loss": 1.0827,
"num_input_tokens_seen": 95420416,
"step": 91
},
{
"epoch": 0.19713405651533414,
"grad_norm": 0.924276147277848,
"learning_rate": 4.996164513332628e-05,
"loss": 1.0815,
"num_input_tokens_seen": 96468992,
"step": 92
},
{
"epoch": 0.19927681799919647,
"grad_norm": 1.3177961658200323,
"learning_rate": 4.9960704191240926e-05,
"loss": 1.0792,
"num_input_tokens_seen": 97517568,
"step": 93
},
{
"epoch": 0.2014195794830588,
"grad_norm": 0.9334752844114854,
"learning_rate": 4.99597518558261e-05,
"loss": 1.0729,
"num_input_tokens_seen": 98566144,
"step": 94
},
{
"epoch": 0.2035623409669211,
"grad_norm": 1.1860646160155437,
"learning_rate": 4.995878812751649e-05,
"loss": 1.0659,
"num_input_tokens_seen": 99614720,
"step": 95
},
{
"epoch": 0.20570510245078344,
"grad_norm": 1.1073970834673053,
"learning_rate": 4.995781300675199e-05,
"loss": 1.0738,
"num_input_tokens_seen": 100663296,
"step": 96
},
{
"epoch": 0.20784786393464577,
"grad_norm": 1.0410882369872347,
"learning_rate": 4.99568264939777e-05,
"loss": 1.0654,
"num_input_tokens_seen": 101711872,
"step": 97
},
{
"epoch": 0.2099906254185081,
"grad_norm": 0.9473960462530998,
"learning_rate": 4.995582858964392e-05,
"loss": 1.0739,
"num_input_tokens_seen": 102760448,
"step": 98
},
{
"epoch": 0.21213338690237044,
"grad_norm": 1.0886939381352059,
"learning_rate": 4.9954819294206124e-05,
"loss": 1.0662,
"num_input_tokens_seen": 103809024,
"step": 99
},
{
"epoch": 0.21427614838623277,
"grad_norm": 1.383048793026184,
"learning_rate": 4.9953798608125025e-05,
"loss": 1.078,
"num_input_tokens_seen": 104857600,
"step": 100
},
{
"epoch": 0.21641890987009507,
"grad_norm": 0.8614390068736321,
"learning_rate": 4.995276653186651e-05,
"loss": 1.0661,
"num_input_tokens_seen": 105906176,
"step": 101
},
{
"epoch": 0.2185616713539574,
"grad_norm": 1.1754449020307258,
"learning_rate": 4.9951723065901665e-05,
"loss": 1.0797,
"num_input_tokens_seen": 106954752,
"step": 102
},
{
"epoch": 0.22070443283781974,
"grad_norm": 1.1156873692946436,
"learning_rate": 4.995066821070679e-05,
"loss": 1.0656,
"num_input_tokens_seen": 108003328,
"step": 103
},
{
"epoch": 0.22284719432168207,
"grad_norm": 0.9815049988878293,
"learning_rate": 4.994960196676337e-05,
"loss": 1.0615,
"num_input_tokens_seen": 109051904,
"step": 104
},
{
"epoch": 0.2249899558055444,
"grad_norm": 1.1594613245286667,
"learning_rate": 4.994852433455809e-05,
"loss": 1.0727,
"num_input_tokens_seen": 110100480,
"step": 105
},
{
"epoch": 0.22713271728940673,
"grad_norm": 1.1219733488329482,
"learning_rate": 4.9947435314582844e-05,
"loss": 1.0661,
"num_input_tokens_seen": 111149056,
"step": 106
},
{
"epoch": 0.22927547877326904,
"grad_norm": 1.2406645521219986,
"learning_rate": 4.99463349073347e-05,
"loss": 1.0665,
"num_input_tokens_seen": 112197632,
"step": 107
},
{
"epoch": 0.23141824025713137,
"grad_norm": 0.9176808882852722,
"learning_rate": 4.9945223113315966e-05,
"loss": 1.0696,
"num_input_tokens_seen": 113246208,
"step": 108
},
{
"epoch": 0.2335610017409937,
"grad_norm": 0.9318195936282073,
"learning_rate": 4.994409993303409e-05,
"loss": 1.0605,
"num_input_tokens_seen": 114294784,
"step": 109
},
{
"epoch": 0.23570376322485603,
"grad_norm": 1.0593738678656253,
"learning_rate": 4.994296536700177e-05,
"loss": 1.0649,
"num_input_tokens_seen": 115343360,
"step": 110
},
{
"epoch": 0.23784652470871837,
"grad_norm": 1.2171400848983855,
"learning_rate": 4.994181941573687e-05,
"loss": 1.0502,
"num_input_tokens_seen": 116391936,
"step": 111
},
{
"epoch": 0.2399892861925807,
"grad_norm": 1.0177436664305382,
"learning_rate": 4.994066207976247e-05,
"loss": 1.052,
"num_input_tokens_seen": 117440512,
"step": 112
},
{
"epoch": 0.24213204767644303,
"grad_norm": 1.0732635557180488,
"learning_rate": 4.993949335960683e-05,
"loss": 1.0566,
"num_input_tokens_seen": 118489088,
"step": 113
},
{
"epoch": 0.24427480916030533,
"grad_norm": 1.229545019453543,
"learning_rate": 4.9938313255803406e-05,
"loss": 1.0538,
"num_input_tokens_seen": 119537664,
"step": 114
},
{
"epoch": 0.24641757064416767,
"grad_norm": 0.8920833241411221,
"learning_rate": 4.993712176889086e-05,
"loss": 1.0422,
"num_input_tokens_seen": 120586240,
"step": 115
},
{
"epoch": 0.24856033212803,
"grad_norm": 0.8904929780704225,
"learning_rate": 4.993591889941306e-05,
"loss": 1.0576,
"num_input_tokens_seen": 121634816,
"step": 116
},
{
"epoch": 0.2507030936118923,
"grad_norm": 1.1237317297804248,
"learning_rate": 4.993470464791904e-05,
"loss": 1.0465,
"num_input_tokens_seen": 122683392,
"step": 117
},
{
"epoch": 0.25284585509575463,
"grad_norm": 0.8695137661037583,
"learning_rate": 4.9933479014963055e-05,
"loss": 1.0615,
"num_input_tokens_seen": 123731968,
"step": 118
},
{
"epoch": 0.25498861657961697,
"grad_norm": 0.8752675990810166,
"learning_rate": 4.9932242001104556e-05,
"loss": 1.0427,
"num_input_tokens_seen": 124780544,
"step": 119
},
{
"epoch": 0.2571313780634793,
"grad_norm": 1.1263946339650905,
"learning_rate": 4.9930993606908154e-05,
"loss": 1.043,
"num_input_tokens_seen": 125829120,
"step": 120
},
{
"epoch": 0.25927413954734163,
"grad_norm": 1.1587989669007857,
"learning_rate": 4.99297338329437e-05,
"loss": 1.0558,
"num_input_tokens_seen": 126877696,
"step": 121
},
{
"epoch": 0.26141690103120396,
"grad_norm": 1.100292756182761,
"learning_rate": 4.992846267978621e-05,
"loss": 1.0595,
"num_input_tokens_seen": 127926272,
"step": 122
},
{
"epoch": 0.2635596625150663,
"grad_norm": 0.9764047931891696,
"learning_rate": 4.99271801480159e-05,
"loss": 1.0567,
"num_input_tokens_seen": 128974848,
"step": 123
},
{
"epoch": 0.2657024239989286,
"grad_norm": 1.0337777172886875,
"learning_rate": 4.992588623821819e-05,
"loss": 1.0402,
"num_input_tokens_seen": 130023424,
"step": 124
},
{
"epoch": 0.26784518548279096,
"grad_norm": 1.0931510943833824,
"learning_rate": 4.992458095098368e-05,
"loss": 1.0518,
"num_input_tokens_seen": 131072000,
"step": 125
},
{
"epoch": 0.2699879469666533,
"grad_norm": 0.965385066043555,
"learning_rate": 4.9923264286908164e-05,
"loss": 1.0443,
"num_input_tokens_seen": 132120576,
"step": 126
},
{
"epoch": 0.2721307084505156,
"grad_norm": 0.9207891788376037,
"learning_rate": 4.9921936246592656e-05,
"loss": 1.0335,
"num_input_tokens_seen": 133169152,
"step": 127
},
{
"epoch": 0.27427346993437796,
"grad_norm": 0.9453747401932903,
"learning_rate": 4.992059683064332e-05,
"loss": 1.0345,
"num_input_tokens_seen": 134217728,
"step": 128
},
{
"epoch": 0.27641623141824023,
"grad_norm": 0.8736473057042122,
"learning_rate": 4.991924603967154e-05,
"loss": 1.036,
"num_input_tokens_seen": 135266304,
"step": 129
},
{
"epoch": 0.27855899290210256,
"grad_norm": 0.9112732853382438,
"learning_rate": 4.991788387429388e-05,
"loss": 1.0608,
"num_input_tokens_seen": 136314880,
"step": 130
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.9148465256720132,
"learning_rate": 4.991651033513212e-05,
"loss": 1.0245,
"num_input_tokens_seen": 137363456,
"step": 131
},
{
"epoch": 0.28284451586982723,
"grad_norm": 0.9883419785308301,
"learning_rate": 4.9915125422813187e-05,
"loss": 1.0629,
"num_input_tokens_seen": 138412032,
"step": 132
},
{
"epoch": 0.28498727735368956,
"grad_norm": 1.270307696820798,
"learning_rate": 4.991372913796924e-05,
"loss": 1.0389,
"num_input_tokens_seen": 139460608,
"step": 133
},
{
"epoch": 0.2871300388375519,
"grad_norm": 0.9097618800977756,
"learning_rate": 4.991232148123761e-05,
"loss": 1.0436,
"num_input_tokens_seen": 140509184,
"step": 134
},
{
"epoch": 0.2892728003214142,
"grad_norm": 1.0267713390462572,
"learning_rate": 4.9910902453260824e-05,
"loss": 1.0266,
"num_input_tokens_seen": 141557760,
"step": 135
},
{
"epoch": 0.29141556180527656,
"grad_norm": 1.005063306500544,
"learning_rate": 4.99094720546866e-05,
"loss": 1.0378,
"num_input_tokens_seen": 142606336,
"step": 136
},
{
"epoch": 0.2935583232891389,
"grad_norm": 1.1299519867382342,
"learning_rate": 4.990803028616785e-05,
"loss": 1.0403,
"num_input_tokens_seen": 143654912,
"step": 137
},
{
"epoch": 0.2957010847730012,
"grad_norm": 0.9592070371054032,
"learning_rate": 4.990657714836266e-05,
"loss": 1.0371,
"num_input_tokens_seen": 144703488,
"step": 138
},
{
"epoch": 0.29784384625686355,
"grad_norm": 1.141468525308474,
"learning_rate": 4.990511264193431e-05,
"loss": 1.0365,
"num_input_tokens_seen": 145752064,
"step": 139
},
{
"epoch": 0.2999866077407259,
"grad_norm": 1.0214002809180978,
"learning_rate": 4.9903636767551285e-05,
"loss": 1.0309,
"num_input_tokens_seen": 146800640,
"step": 140
},
{
"epoch": 0.30212936922458816,
"grad_norm": 0.9708959913413399,
"learning_rate": 4.9902149525887255e-05,
"loss": 1.0362,
"num_input_tokens_seen": 147849216,
"step": 141
},
{
"epoch": 0.3042721307084505,
"grad_norm": 0.9885069162667705,
"learning_rate": 4.990065091762106e-05,
"loss": 1.0336,
"num_input_tokens_seen": 148897792,
"step": 142
},
{
"epoch": 0.3064148921923128,
"grad_norm": 0.9783204712664121,
"learning_rate": 4.989914094343675e-05,
"loss": 1.0346,
"num_input_tokens_seen": 149946368,
"step": 143
},
{
"epoch": 0.30855765367617516,
"grad_norm": 1.1202679861078162,
"learning_rate": 4.9897619604023545e-05,
"loss": 1.0246,
"num_input_tokens_seen": 150994944,
"step": 144
},
{
"epoch": 0.3107004151600375,
"grad_norm": 0.9657646058287815,
"learning_rate": 4.9896086900075865e-05,
"loss": 1.0289,
"num_input_tokens_seen": 152043520,
"step": 145
},
{
"epoch": 0.3128431766438998,
"grad_norm": 0.7765273512748966,
"learning_rate": 4.989454283229331e-05,
"loss": 1.0316,
"num_input_tokens_seen": 153092096,
"step": 146
},
{
"epoch": 0.31498593812776216,
"grad_norm": 0.9990709966822854,
"learning_rate": 4.9892987401380686e-05,
"loss": 1.0403,
"num_input_tokens_seen": 154140672,
"step": 147
},
{
"epoch": 0.3171286996116245,
"grad_norm": 0.9112558770615148,
"learning_rate": 4.989142060804796e-05,
"loss": 1.0207,
"num_input_tokens_seen": 155189248,
"step": 148
},
{
"epoch": 0.3192714610954868,
"grad_norm": 0.8239604360975109,
"learning_rate": 4.988984245301028e-05,
"loss": 1.0335,
"num_input_tokens_seen": 156237824,
"step": 149
},
{
"epoch": 0.32141422257934915,
"grad_norm": 0.7965066882079385,
"learning_rate": 4.988825293698802e-05,
"loss": 1.0262,
"num_input_tokens_seen": 157286400,
"step": 150
},
{
"epoch": 0.3235569840632115,
"grad_norm": 0.806696091204924,
"learning_rate": 4.988665206070671e-05,
"loss": 1.0243,
"num_input_tokens_seen": 158334976,
"step": 151
},
{
"epoch": 0.3256997455470738,
"grad_norm": 0.7247103495924784,
"learning_rate": 4.988503982489707e-05,
"loss": 1.0182,
"num_input_tokens_seen": 159383552,
"step": 152
},
{
"epoch": 0.3278425070309361,
"grad_norm": 0.7394277153793506,
"learning_rate": 4.988341623029499e-05,
"loss": 1.0367,
"num_input_tokens_seen": 160432128,
"step": 153
},
{
"epoch": 0.3299852685147984,
"grad_norm": 0.7909085688130246,
"learning_rate": 4.9881781277641586e-05,
"loss": 1.0315,
"num_input_tokens_seen": 161480704,
"step": 154
},
{
"epoch": 0.33212802999866076,
"grad_norm": 0.9199353950108281,
"learning_rate": 4.9880134967683124e-05,
"loss": 1.0177,
"num_input_tokens_seen": 162529280,
"step": 155
},
{
"epoch": 0.3342707914825231,
"grad_norm": 1.0018645701488917,
"learning_rate": 4.987847730117106e-05,
"loss": 1.0339,
"num_input_tokens_seen": 163577856,
"step": 156
},
{
"epoch": 0.3364135529663854,
"grad_norm": 1.0554032523781822,
"learning_rate": 4.987680827886203e-05,
"loss": 1.0157,
"num_input_tokens_seen": 164626432,
"step": 157
},
{
"epoch": 0.33855631445024775,
"grad_norm": 0.9124413476479185,
"learning_rate": 4.987512790151787e-05,
"loss": 1.0247,
"num_input_tokens_seen": 165675008,
"step": 158
},
{
"epoch": 0.3406990759341101,
"grad_norm": 0.9524677697798024,
"learning_rate": 4.987343616990559e-05,
"loss": 1.0222,
"num_input_tokens_seen": 166723584,
"step": 159
},
{
"epoch": 0.3428418374179724,
"grad_norm": 1.0777046064102398,
"learning_rate": 4.987173308479738e-05,
"loss": 1.0243,
"num_input_tokens_seen": 167772160,
"step": 160
},
{
"epoch": 0.34498459890183475,
"grad_norm": 1.0500416155465637,
"learning_rate": 4.987001864697062e-05,
"loss": 1.0264,
"num_input_tokens_seen": 168820736,
"step": 161
},
{
"epoch": 0.3471273603856971,
"grad_norm": 1.0818693332363452,
"learning_rate": 4.986829285720785e-05,
"loss": 1.0247,
"num_input_tokens_seen": 169869312,
"step": 162
},
{
"epoch": 0.3492701218695594,
"grad_norm": 0.9559100827296363,
"learning_rate": 4.986655571629682e-05,
"loss": 1.0242,
"num_input_tokens_seen": 170917888,
"step": 163
},
{
"epoch": 0.35141288335342175,
"grad_norm": 0.9840195580062531,
"learning_rate": 4.9864807225030454e-05,
"loss": 1.0181,
"num_input_tokens_seen": 171966464,
"step": 164
},
{
"epoch": 0.353555644837284,
"grad_norm": 0.9455632571654006,
"learning_rate": 4.9863047384206835e-05,
"loss": 1.0107,
"num_input_tokens_seen": 173015040,
"step": 165
},
{
"epoch": 0.35569840632114635,
"grad_norm": 0.9705144544240498,
"learning_rate": 4.9861276194629256e-05,
"loss": 1.0256,
"num_input_tokens_seen": 174063616,
"step": 166
},
{
"epoch": 0.3578411678050087,
"grad_norm": 0.8866463242874902,
"learning_rate": 4.9859493657106185e-05,
"loss": 1.0141,
"num_input_tokens_seen": 175112192,
"step": 167
},
{
"epoch": 0.359983929288871,
"grad_norm": 0.6883759012573423,
"learning_rate": 4.985769977245124e-05,
"loss": 1.0207,
"num_input_tokens_seen": 176160768,
"step": 168
},
{
"epoch": 0.36212669077273335,
"grad_norm": 0.7831417666898026,
"learning_rate": 4.985589454148326e-05,
"loss": 1.0171,
"num_input_tokens_seen": 177209344,
"step": 169
},
{
"epoch": 0.3642694522565957,
"grad_norm": 0.8541836155121969,
"learning_rate": 4.9854077965026234e-05,
"loss": 1.0224,
"num_input_tokens_seen": 178257920,
"step": 170
},
{
"epoch": 0.366412213740458,
"grad_norm": 0.7420919461973196,
"learning_rate": 4.985225004390934e-05,
"loss": 1.0244,
"num_input_tokens_seen": 179306496,
"step": 171
},
{
"epoch": 0.36855497522432035,
"grad_norm": 0.7079902130346447,
"learning_rate": 4.985041077896695e-05,
"loss": 1.0208,
"num_input_tokens_seen": 180355072,
"step": 172
},
{
"epoch": 0.3706977367081827,
"grad_norm": 1.0041856975865897,
"learning_rate": 4.984856017103857e-05,
"loss": 1.0274,
"num_input_tokens_seen": 181403648,
"step": 173
},
{
"epoch": 0.372840498192045,
"grad_norm": 1.0259481061387932,
"learning_rate": 4.9846698220968934e-05,
"loss": 1.0056,
"num_input_tokens_seen": 182452224,
"step": 174
},
{
"epoch": 0.37498325967590734,
"grad_norm": 1.0380142050192422,
"learning_rate": 4.984482492960791e-05,
"loss": 1.0111,
"num_input_tokens_seen": 183500800,
"step": 175
},
{
"epoch": 0.3771260211597697,
"grad_norm": 0.8459728811497111,
"learning_rate": 4.984294029781059e-05,
"loss": 1.0112,
"num_input_tokens_seen": 184549376,
"step": 176
},
{
"epoch": 0.379268782643632,
"grad_norm": 0.7095378896130021,
"learning_rate": 4.9841044326437194e-05,
"loss": 1.0178,
"num_input_tokens_seen": 185597952,
"step": 177
},
{
"epoch": 0.3814115441274943,
"grad_norm": 0.8696432175517164,
"learning_rate": 4.9839137016353147e-05,
"loss": 1.0017,
"num_input_tokens_seen": 186646528,
"step": 178
},
{
"epoch": 0.3835543056113566,
"grad_norm": 1.0512720531438484,
"learning_rate": 4.983721836842903e-05,
"loss": 1.0235,
"num_input_tokens_seen": 187695104,
"step": 179
},
{
"epoch": 0.38569706709521895,
"grad_norm": 0.9561870070751214,
"learning_rate": 4.9835288383540626e-05,
"loss": 1.0073,
"num_input_tokens_seen": 188743680,
"step": 180
},
{
"epoch": 0.3878398285790813,
"grad_norm": 0.9055977049493014,
"learning_rate": 4.983334706256888e-05,
"loss": 1.0075,
"num_input_tokens_seen": 189792256,
"step": 181
},
{
"epoch": 0.3899825900629436,
"grad_norm": 0.9275644252118139,
"learning_rate": 4.98313944063999e-05,
"loss": 1.0127,
"num_input_tokens_seen": 190840832,
"step": 182
},
{
"epoch": 0.39212535154680594,
"grad_norm": 0.8560179313292782,
"learning_rate": 4.9829430415924974e-05,
"loss": 1.0125,
"num_input_tokens_seen": 191889408,
"step": 183
},
{
"epoch": 0.3942681130306683,
"grad_norm": 1.648732944069561,
"learning_rate": 4.982745509204058e-05,
"loss": 1.0099,
"num_input_tokens_seen": 192937984,
"step": 184
},
{
"epoch": 0.3964108745145306,
"grad_norm": 0.9197658476204091,
"learning_rate": 4.982546843564834e-05,
"loss": 1.0086,
"num_input_tokens_seen": 193986560,
"step": 185
},
{
"epoch": 0.39855363599839294,
"grad_norm": 1.2107999479278293,
"learning_rate": 4.982347044765508e-05,
"loss": 1.0284,
"num_input_tokens_seen": 195035136,
"step": 186
},
{
"epoch": 0.4006963974822553,
"grad_norm": 1.181894489618923,
"learning_rate": 4.982146112897277e-05,
"loss": 1.019,
"num_input_tokens_seen": 196083712,
"step": 187
},
{
"epoch": 0.4028391589661176,
"grad_norm": 1.1976480443902457,
"learning_rate": 4.9819440480518574e-05,
"loss": 1.0188,
"num_input_tokens_seen": 197132288,
"step": 188
},
{
"epoch": 0.40498192044997994,
"grad_norm": 0.8228565634507706,
"learning_rate": 4.981740850321481e-05,
"loss": 1.0066,
"num_input_tokens_seen": 198180864,
"step": 189
},
{
"epoch": 0.4071246819338422,
"grad_norm": 1.3694193286801866,
"learning_rate": 4.9815365197988986e-05,
"loss": 1.0199,
"num_input_tokens_seen": 199229440,
"step": 190
},
{
"epoch": 0.40926744341770455,
"grad_norm": 0.9081276834937746,
"learning_rate": 4.981331056577376e-05,
"loss": 1.0128,
"num_input_tokens_seen": 200278016,
"step": 191
},
{
"epoch": 0.4114102049015669,
"grad_norm": 0.8864947540491567,
"learning_rate": 4.981124460750698e-05,
"loss": 1.0082,
"num_input_tokens_seen": 201326592,
"step": 192
},
{
"epoch": 0.4135529663854292,
"grad_norm": 0.8828235461765532,
"learning_rate": 4.9809167324131645e-05,
"loss": 1.0016,
"num_input_tokens_seen": 202375168,
"step": 193
},
{
"epoch": 0.41569572786929154,
"grad_norm": 0.9051114336975766,
"learning_rate": 4.980707871659593e-05,
"loss": 1.0042,
"num_input_tokens_seen": 203423744,
"step": 194
},
{
"epoch": 0.4178384893531539,
"grad_norm": 0.9036581277126695,
"learning_rate": 4.9804978785853196e-05,
"loss": 0.9922,
"num_input_tokens_seen": 204472320,
"step": 195
},
{
"epoch": 0.4199812508370162,
"grad_norm": 0.8304193637983237,
"learning_rate": 4.980286753286195e-05,
"loss": 1.0079,
"num_input_tokens_seen": 205520896,
"step": 196
},
{
"epoch": 0.42212401232087854,
"grad_norm": 0.9472951756101595,
"learning_rate": 4.9800744958585864e-05,
"loss": 1.001,
"num_input_tokens_seen": 206569472,
"step": 197
},
{
"epoch": 0.42426677380474087,
"grad_norm": 1.1322195801321533,
"learning_rate": 4.9798611063993805e-05,
"loss": 1.0036,
"num_input_tokens_seen": 207618048,
"step": 198
},
{
"epoch": 0.4264095352886032,
"grad_norm": 0.8872065766562148,
"learning_rate": 4.979646585005978e-05,
"loss": 0.9966,
"num_input_tokens_seen": 208666624,
"step": 199
},
{
"epoch": 0.42855229677246554,
"grad_norm": 1.4965045521917073,
"learning_rate": 4.979430931776298e-05,
"loss": 1.0088,
"num_input_tokens_seen": 209715200,
"step": 200
},
{
"epoch": 0.43069505825632787,
"grad_norm": 0.8377677047446743,
"learning_rate": 4.9792141468087746e-05,
"loss": 1.0005,
"num_input_tokens_seen": 210763776,
"step": 201
},
{
"epoch": 0.43283781974019014,
"grad_norm": 1.2081647491986152,
"learning_rate": 4.97899623020236e-05,
"loss": 0.9931,
"num_input_tokens_seen": 211812352,
"step": 202
},
{
"epoch": 0.4349805812240525,
"grad_norm": 1.096772412526795,
"learning_rate": 4.978777182056523e-05,
"loss": 1.0001,
"num_input_tokens_seen": 212860928,
"step": 203
},
{
"epoch": 0.4371233427079148,
"grad_norm": 0.7671831408885996,
"learning_rate": 4.9785570024712475e-05,
"loss": 1.0049,
"num_input_tokens_seen": 213909504,
"step": 204
},
{
"epoch": 0.43926610419177714,
"grad_norm": 1.2438496225084703,
"learning_rate": 4.9783356915470344e-05,
"loss": 1.0171,
"num_input_tokens_seen": 214958080,
"step": 205
},
{
"epoch": 0.4414088656756395,
"grad_norm": 0.9608971236273456,
"learning_rate": 4.9781132493849025e-05,
"loss": 0.9959,
"num_input_tokens_seen": 216006656,
"step": 206
},
{
"epoch": 0.4435516271595018,
"grad_norm": 0.9297760035172808,
"learning_rate": 4.977889676086383e-05,
"loss": 0.991,
"num_input_tokens_seen": 217055232,
"step": 207
},
{
"epoch": 0.44569438864336414,
"grad_norm": 0.9414486663179142,
"learning_rate": 4.97766497175353e-05,
"loss": 0.9944,
"num_input_tokens_seen": 218103808,
"step": 208
},
{
"epoch": 0.44783715012722647,
"grad_norm": 0.8000456781814055,
"learning_rate": 4.977439136488907e-05,
"loss": 1.0104,
"num_input_tokens_seen": 219152384,
"step": 209
},
{
"epoch": 0.4499799116110888,
"grad_norm": 0.6756812079262554,
"learning_rate": 4.977212170395598e-05,
"loss": 0.9981,
"num_input_tokens_seen": 220200960,
"step": 210
},
{
"epoch": 0.45212267309495113,
"grad_norm": 0.6807847019333788,
"learning_rate": 4.9769840735772e-05,
"loss": 1.0012,
"num_input_tokens_seen": 221249536,
"step": 211
},
{
"epoch": 0.45426543457881347,
"grad_norm": 0.8214478039872608,
"learning_rate": 4.9767548461378296e-05,
"loss": 1.0019,
"num_input_tokens_seen": 222298112,
"step": 212
},
{
"epoch": 0.4564081960626758,
"grad_norm": 0.9135429305296144,
"learning_rate": 4.976524488182118e-05,
"loss": 0.9853,
"num_input_tokens_seen": 223346688,
"step": 213
},
{
"epoch": 0.4585509575465381,
"grad_norm": 0.6387595131943387,
"learning_rate": 4.976292999815211e-05,
"loss": 0.9887,
"num_input_tokens_seen": 224395264,
"step": 214
},
{
"epoch": 0.4606937190304004,
"grad_norm": 0.7940521466204665,
"learning_rate": 4.976060381142773e-05,
"loss": 0.993,
"num_input_tokens_seen": 225443840,
"step": 215
},
{
"epoch": 0.46283648051426274,
"grad_norm": 0.9880642635060383,
"learning_rate": 4.975826632270982e-05,
"loss": 0.9938,
"num_input_tokens_seen": 226492416,
"step": 216
},
{
"epoch": 0.46497924199812507,
"grad_norm": 0.8686661622915615,
"learning_rate": 4.975591753306533e-05,
"loss": 0.9997,
"num_input_tokens_seen": 227540992,
"step": 217
},
{
"epoch": 0.4671220034819874,
"grad_norm": 0.8041033834848365,
"learning_rate": 4.975355744356637e-05,
"loss": 0.9894,
"num_input_tokens_seen": 228589568,
"step": 218
},
{
"epoch": 0.46926476496584973,
"grad_norm": 0.8132377305765438,
"learning_rate": 4.975118605529019e-05,
"loss": 0.9915,
"num_input_tokens_seen": 229638144,
"step": 219
},
{
"epoch": 0.47140752644971207,
"grad_norm": 0.9285629824414583,
"learning_rate": 4.974880336931923e-05,
"loss": 0.9985,
"num_input_tokens_seen": 230686720,
"step": 220
},
{
"epoch": 0.4735502879335744,
"grad_norm": 1.0134617586666492,
"learning_rate": 4.974640938674107e-05,
"loss": 1.0019,
"num_input_tokens_seen": 231735296,
"step": 221
},
{
"epoch": 0.47569304941743673,
"grad_norm": 0.6308767616473976,
"learning_rate": 4.974400410864842e-05,
"loss": 0.9842,
"num_input_tokens_seen": 232783872,
"step": 222
},
{
"epoch": 0.47783581090129906,
"grad_norm": 0.793272468348317,
"learning_rate": 4.9741587536139204e-05,
"loss": 0.9973,
"num_input_tokens_seen": 233832448,
"step": 223
},
{
"epoch": 0.4799785723851614,
"grad_norm": 0.8278454064136296,
"learning_rate": 4.973915967031644e-05,
"loss": 0.993,
"num_input_tokens_seen": 234881024,
"step": 224
},
{
"epoch": 0.4821213338690237,
"grad_norm": 0.7549982264523979,
"learning_rate": 4.9736720512288334e-05,
"loss": 0.9956,
"num_input_tokens_seen": 235929600,
"step": 225
},
{
"epoch": 0.48426409535288606,
"grad_norm": 0.9478859353787976,
"learning_rate": 4.973427006316826e-05,
"loss": 0.9834,
"num_input_tokens_seen": 236978176,
"step": 226
},
{
"epoch": 0.48640685683674834,
"grad_norm": 1.2636972768723895,
"learning_rate": 4.9731808324074717e-05,
"loss": 1.0092,
"num_input_tokens_seen": 238026752,
"step": 227
},
{
"epoch": 0.48854961832061067,
"grad_norm": 0.7293159014596692,
"learning_rate": 4.972933529613135e-05,
"loss": 0.9904,
"num_input_tokens_seen": 239075328,
"step": 228
},
{
"epoch": 0.490692379804473,
"grad_norm": 7.0328945417058595,
"learning_rate": 4.9726850980467e-05,
"loss": 1.0093,
"num_input_tokens_seen": 240123904,
"step": 229
},
{
"epoch": 0.49283514128833533,
"grad_norm": 1.6482931792205036,
"learning_rate": 4.972435537821562e-05,
"loss": 1.015,
"num_input_tokens_seen": 241172480,
"step": 230
},
{
"epoch": 0.49497790277219766,
"grad_norm": 0.866041440885184,
"learning_rate": 4.972184849051633e-05,
"loss": 0.9884,
"num_input_tokens_seen": 242221056,
"step": 231
},
{
"epoch": 0.49712066425606,
"grad_norm": 1.1433705805979748,
"learning_rate": 4.971933031851341e-05,
"loss": 0.9992,
"num_input_tokens_seen": 243269632,
"step": 232
},
{
"epoch": 0.49926342573992233,
"grad_norm": 1.1749688400490832,
"learning_rate": 4.971680086335627e-05,
"loss": 1.0002,
"num_input_tokens_seen": 244318208,
"step": 233
},
{
"epoch": 0.5014061872237846,
"grad_norm": 1.1383165304481435,
"learning_rate": 4.971426012619949e-05,
"loss": 1.0272,
"num_input_tokens_seen": 245366784,
"step": 234
},
{
"epoch": 0.503548948707647,
"grad_norm": 0.9272257838508966,
"learning_rate": 4.971170810820279e-05,
"loss": 0.9856,
"num_input_tokens_seen": 246415360,
"step": 235
},
{
"epoch": 0.5056917101915093,
"grad_norm": 1.049285697170834,
"learning_rate": 4.9709144810531026e-05,
"loss": 1.0075,
"num_input_tokens_seen": 247463936,
"step": 236
},
{
"epoch": 0.5078344716753717,
"grad_norm": 1.1031939359682563,
"learning_rate": 4.970657023435424e-05,
"loss": 0.9938,
"num_input_tokens_seen": 248512512,
"step": 237
},
{
"epoch": 0.5099772331592339,
"grad_norm": 0.8559721116198098,
"learning_rate": 4.970398438084758e-05,
"loss": 1.0073,
"num_input_tokens_seen": 249561088,
"step": 238
},
{
"epoch": 0.5121199946430963,
"grad_norm": 0.8693796569575785,
"learning_rate": 4.9701387251191364e-05,
"loss": 0.9939,
"num_input_tokens_seen": 250609664,
"step": 239
},
{
"epoch": 0.5142627561269586,
"grad_norm": 0.7154096347287712,
"learning_rate": 4.969877884657107e-05,
"loss": 0.9923,
"num_input_tokens_seen": 251658240,
"step": 240
},
{
"epoch": 0.516405517610821,
"grad_norm": 0.7434887839314398,
"learning_rate": 4.969615916817728e-05,
"loss": 0.9953,
"num_input_tokens_seen": 252706816,
"step": 241
},
{
"epoch": 0.5185482790946833,
"grad_norm": 0.8259812014838418,
"learning_rate": 4.969352821720577e-05,
"loss": 0.9751,
"num_input_tokens_seen": 253755392,
"step": 242
},
{
"epoch": 0.5206910405785456,
"grad_norm": 0.8349207689087657,
"learning_rate": 4.969088599485743e-05,
"loss": 0.9772,
"num_input_tokens_seen": 254803968,
"step": 243
},
{
"epoch": 0.5228338020624079,
"grad_norm": 0.9703418596197566,
"learning_rate": 4.96882325023383e-05,
"loss": 0.9855,
"num_input_tokens_seen": 255852544,
"step": 244
},
{
"epoch": 0.5249765635462702,
"grad_norm": 0.780048511806156,
"learning_rate": 4.968556774085957e-05,
"loss": 0.9938,
"num_input_tokens_seen": 256901120,
"step": 245
},
{
"epoch": 0.5271193250301326,
"grad_norm": 0.7459697569119867,
"learning_rate": 4.968289171163758e-05,
"loss": 0.9774,
"num_input_tokens_seen": 257949696,
"step": 246
},
{
"epoch": 0.5292620865139949,
"grad_norm": 0.6952628328824583,
"learning_rate": 4.9680204415893804e-05,
"loss": 0.9858,
"num_input_tokens_seen": 258998272,
"step": 247
},
{
"epoch": 0.5314048479978573,
"grad_norm": 0.6326440824353008,
"learning_rate": 4.967750585485484e-05,
"loss": 0.9878,
"num_input_tokens_seen": 260046848,
"step": 248
},
{
"epoch": 0.5335476094817195,
"grad_norm": 0.6886573402560195,
"learning_rate": 4.967479602975248e-05,
"loss": 0.9858,
"num_input_tokens_seen": 261095424,
"step": 249
},
{
"epoch": 0.5356903709655819,
"grad_norm": 0.7358477832059849,
"learning_rate": 4.967207494182361e-05,
"loss": 0.968,
"num_input_tokens_seen": 262144000,
"step": 250
},
{
"epoch": 0.5378331324494442,
"grad_norm": 0.719311517365177,
"learning_rate": 4.966934259231026e-05,
"loss": 0.9732,
"num_input_tokens_seen": 263192576,
"step": 251
},
{
"epoch": 0.5399758939333066,
"grad_norm": 0.6469259741219804,
"learning_rate": 4.9666598982459635e-05,
"loss": 0.9804,
"num_input_tokens_seen": 264241152,
"step": 252
},
{
"epoch": 0.5421186554171689,
"grad_norm": 0.6295830105068848,
"learning_rate": 4.9663844113524035e-05,
"loss": 0.9849,
"num_input_tokens_seen": 265289728,
"step": 253
},
{
"epoch": 0.5442614169010312,
"grad_norm": 0.6470841192271415,
"learning_rate": 4.966107798676095e-05,
"loss": 0.998,
"num_input_tokens_seen": 266338304,
"step": 254
},
{
"epoch": 0.5464041783848935,
"grad_norm": 0.745769037672019,
"learning_rate": 4.965830060343295e-05,
"loss": 0.9786,
"num_input_tokens_seen": 267386880,
"step": 255
},
{
"epoch": 0.5485469398687559,
"grad_norm": 0.8367677663996089,
"learning_rate": 4.9655511964807785e-05,
"loss": 0.966,
"num_input_tokens_seen": 268435456,
"step": 256
},
{
"epoch": 0.5506897013526182,
"grad_norm": 0.7767484325908519,
"learning_rate": 4.965271207215835e-05,
"loss": 0.9812,
"num_input_tokens_seen": 269484032,
"step": 257
},
{
"epoch": 0.5528324628364805,
"grad_norm": 0.8001991506895153,
"learning_rate": 4.964990092676263e-05,
"loss": 0.978,
"num_input_tokens_seen": 270532608,
"step": 258
},
{
"epoch": 0.5549752243203429,
"grad_norm": 0.8240750872880366,
"learning_rate": 4.964707852990378e-05,
"loss": 0.9774,
"num_input_tokens_seen": 271581184,
"step": 259
},
{
"epoch": 0.5571179858042051,
"grad_norm": 0.8910873941590336,
"learning_rate": 4.964424488287009e-05,
"loss": 0.9748,
"num_input_tokens_seen": 272629760,
"step": 260
},
{
"epoch": 0.5592607472880675,
"grad_norm": 0.8594739011298812,
"learning_rate": 4.9641399986955e-05,
"loss": 0.9774,
"num_input_tokens_seen": 273678336,
"step": 261
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.7932194322462572,
"learning_rate": 4.963854384345702e-05,
"loss": 0.977,
"num_input_tokens_seen": 274726912,
"step": 262
},
{
"epoch": 0.5635462702557922,
"grad_norm": 0.7206472566566292,
"learning_rate": 4.963567645367988e-05,
"loss": 0.9787,
"num_input_tokens_seen": 275775488,
"step": 263
},
{
"epoch": 0.5656890317396545,
"grad_norm": 0.67771883626031,
"learning_rate": 4.9632797818932374e-05,
"loss": 0.974,
"num_input_tokens_seen": 276824064,
"step": 264
},
{
"epoch": 0.5678317932235168,
"grad_norm": 0.6254390758836766,
"learning_rate": 4.962990794052847e-05,
"loss": 0.982,
"num_input_tokens_seen": 277872640,
"step": 265
},
{
"epoch": 0.5699745547073791,
"grad_norm": 0.6719281453617003,
"learning_rate": 4.962700681978725e-05,
"loss": 0.9784,
"num_input_tokens_seen": 278921216,
"step": 266
},
{
"epoch": 0.5721173161912415,
"grad_norm": 0.7065800364114421,
"learning_rate": 4.9624094458032946e-05,
"loss": 0.9645,
"num_input_tokens_seen": 279969792,
"step": 267
},
{
"epoch": 0.5742600776751038,
"grad_norm": 0.8137810359656845,
"learning_rate": 4.962117085659489e-05,
"loss": 0.976,
"num_input_tokens_seen": 281018368,
"step": 268
},
{
"epoch": 0.5764028391589661,
"grad_norm": 0.8384457143591761,
"learning_rate": 4.9618236016807564e-05,
"loss": 0.9745,
"num_input_tokens_seen": 282066944,
"step": 269
},
{
"epoch": 0.5785456006428285,
"grad_norm": 0.8204715252519866,
"learning_rate": 4.9615289940010584e-05,
"loss": 0.9593,
"num_input_tokens_seen": 283115520,
"step": 270
},
{
"epoch": 0.5806883621266907,
"grad_norm": 0.7458887228441663,
"learning_rate": 4.9612332627548686e-05,
"loss": 0.9629,
"num_input_tokens_seen": 284164096,
"step": 271
},
{
"epoch": 0.5828311236105531,
"grad_norm": 0.7023067986639293,
"learning_rate": 4.9609364080771735e-05,
"loss": 0.9601,
"num_input_tokens_seen": 285212672,
"step": 272
},
{
"epoch": 0.5849738850944154,
"grad_norm": 0.6929007805184092,
"learning_rate": 4.960638430103473e-05,
"loss": 0.9699,
"num_input_tokens_seen": 286261248,
"step": 273
},
{
"epoch": 0.5871166465782778,
"grad_norm": 0.6061285658814772,
"learning_rate": 4.96033932896978e-05,
"loss": 0.9748,
"num_input_tokens_seen": 287309824,
"step": 274
},
{
"epoch": 0.58925940806214,
"grad_norm": 0.6819628224136882,
"learning_rate": 4.960039104812618e-05,
"loss": 0.967,
"num_input_tokens_seen": 288358400,
"step": 275
},
{
"epoch": 0.5914021695460024,
"grad_norm": 0.7578327655133753,
"learning_rate": 4.959737757769025e-05,
"loss": 0.9697,
"num_input_tokens_seen": 289406976,
"step": 276
},
{
"epoch": 0.5935449310298647,
"grad_norm": 0.7930136215053085,
"learning_rate": 4.959435287976551e-05,
"loss": 0.9798,
"num_input_tokens_seen": 290455552,
"step": 277
},
{
"epoch": 0.5956876925137271,
"grad_norm": 0.6902790672018838,
"learning_rate": 4.9591316955732595e-05,
"loss": 0.9683,
"num_input_tokens_seen": 291504128,
"step": 278
},
{
"epoch": 0.5978304539975894,
"grad_norm": 0.7366948695256316,
"learning_rate": 4.9588269806977236e-05,
"loss": 0.981,
"num_input_tokens_seen": 292552704,
"step": 279
},
{
"epoch": 0.5999732154814518,
"grad_norm": 0.7412565614434039,
"learning_rate": 4.958521143489032e-05,
"loss": 0.9655,
"num_input_tokens_seen": 293601280,
"step": 280
},
{
"epoch": 0.602115976965314,
"grad_norm": 0.7638328455546902,
"learning_rate": 4.9582141840867835e-05,
"loss": 0.9768,
"num_input_tokens_seen": 294649856,
"step": 281
},
{
"epoch": 0.6042587384491763,
"grad_norm": 0.7859880911062314,
"learning_rate": 4.957906102631091e-05,
"loss": 0.9655,
"num_input_tokens_seen": 295698432,
"step": 282
},
{
"epoch": 0.6064014999330387,
"grad_norm": 0.7021334869310707,
"learning_rate": 4.9575968992625775e-05,
"loss": 0.9714,
"num_input_tokens_seen": 296747008,
"step": 283
},
{
"epoch": 0.608544261416901,
"grad_norm": 0.7151227917175634,
"learning_rate": 4.957286574122379e-05,
"loss": 0.9805,
"num_input_tokens_seen": 297795584,
"step": 284
},
{
"epoch": 0.6106870229007634,
"grad_norm": 0.8089921958905767,
"learning_rate": 4.9569751273521454e-05,
"loss": 0.9749,
"num_input_tokens_seen": 298844160,
"step": 285
},
{
"epoch": 0.6128297843846257,
"grad_norm": 0.8163799273123393,
"learning_rate": 4.956662559094034e-05,
"loss": 0.9628,
"num_input_tokens_seen": 299892736,
"step": 286
},
{
"epoch": 0.614972545868488,
"grad_norm": 0.7583938434718084,
"learning_rate": 4.9563488694907186e-05,
"loss": 0.9855,
"num_input_tokens_seen": 300941312,
"step": 287
},
{
"epoch": 0.6171153073523503,
"grad_norm": 0.7961064465249695,
"learning_rate": 4.9560340586853825e-05,
"loss": 0.9812,
"num_input_tokens_seen": 301989888,
"step": 288
},
{
"epoch": 0.6192580688362127,
"grad_norm": 0.8537459633314501,
"learning_rate": 4.9557181268217227e-05,
"loss": 0.9788,
"num_input_tokens_seen": 303038464,
"step": 289
},
{
"epoch": 0.621400830320075,
"grad_norm": 0.8608551087009711,
"learning_rate": 4.9554010740439435e-05,
"loss": 0.9649,
"num_input_tokens_seen": 304087040,
"step": 290
},
{
"epoch": 0.6235435918039374,
"grad_norm": 0.8426716239141546,
"learning_rate": 4.955082900496766e-05,
"loss": 0.9652,
"num_input_tokens_seen": 305135616,
"step": 291
},
{
"epoch": 0.6256863532877996,
"grad_norm": 0.8378716840701593,
"learning_rate": 4.9547636063254196e-05,
"loss": 0.9772,
"num_input_tokens_seen": 306184192,
"step": 292
},
{
"epoch": 0.627829114771662,
"grad_norm": 0.7383256729006417,
"learning_rate": 4.954443191675648e-05,
"loss": 0.968,
"num_input_tokens_seen": 307232768,
"step": 293
},
{
"epoch": 0.6299718762555243,
"grad_norm": 0.8321449404267852,
"learning_rate": 4.954121656693703e-05,
"loss": 0.9608,
"num_input_tokens_seen": 308281344,
"step": 294
},
{
"epoch": 0.6321146377393866,
"grad_norm": 0.9058689828945011,
"learning_rate": 4.9537990015263505e-05,
"loss": 0.9624,
"num_input_tokens_seen": 309329920,
"step": 295
},
{
"epoch": 0.634257399223249,
"grad_norm": 0.8558551476463713,
"learning_rate": 4.953475226320866e-05,
"loss": 0.9759,
"num_input_tokens_seen": 310378496,
"step": 296
},
{
"epoch": 0.6364001607071113,
"grad_norm": 0.8994060904273229,
"learning_rate": 4.9531503312250375e-05,
"loss": 0.9698,
"num_input_tokens_seen": 311427072,
"step": 297
},
{
"epoch": 0.6385429221909736,
"grad_norm": 0.9092829036891147,
"learning_rate": 4.952824316387163e-05,
"loss": 0.9588,
"num_input_tokens_seen": 312475648,
"step": 298
},
{
"epoch": 0.6406856836748359,
"grad_norm": 0.7764855569244933,
"learning_rate": 4.952497181956053e-05,
"loss": 0.9622,
"num_input_tokens_seen": 313524224,
"step": 299
},
{
"epoch": 0.6428284451586983,
"grad_norm": 0.7428489302275753,
"learning_rate": 4.952168928081027e-05,
"loss": 0.9663,
"num_input_tokens_seen": 314572800,
"step": 300
},
{
"epoch": 0.6449712066425606,
"grad_norm": 0.6931852532792968,
"learning_rate": 4.951839554911917e-05,
"loss": 0.9599,
"num_input_tokens_seen": 315621376,
"step": 301
},
{
"epoch": 0.647113968126423,
"grad_norm": 0.6380122510191174,
"learning_rate": 4.951509062599066e-05,
"loss": 0.9696,
"num_input_tokens_seen": 316669952,
"step": 302
},
{
"epoch": 0.6492567296102852,
"grad_norm": 0.627484942520017,
"learning_rate": 4.951177451293328e-05,
"loss": 0.9649,
"num_input_tokens_seen": 317718528,
"step": 303
},
{
"epoch": 0.6513994910941476,
"grad_norm": 0.669276814072384,
"learning_rate": 4.950844721146066e-05,
"loss": 0.9617,
"num_input_tokens_seen": 318767104,
"step": 304
},
{
"epoch": 0.6535422525780099,
"grad_norm": 0.6472064020866986,
"learning_rate": 4.950510872309155e-05,
"loss": 0.9593,
"num_input_tokens_seen": 319815680,
"step": 305
},
{
"epoch": 0.6556850140618722,
"grad_norm": 0.6282593277453888,
"learning_rate": 4.950175904934982e-05,
"loss": 0.9682,
"num_input_tokens_seen": 320864256,
"step": 306
},
{
"epoch": 0.6578277755457346,
"grad_norm": 0.6233384774128375,
"learning_rate": 4.949839819176442e-05,
"loss": 0.9672,
"num_input_tokens_seen": 321912832,
"step": 307
},
{
"epoch": 0.6599705370295968,
"grad_norm": 0.6406545710871328,
"learning_rate": 4.949502615186941e-05,
"loss": 0.9667,
"num_input_tokens_seen": 322961408,
"step": 308
},
{
"epoch": 0.6621132985134592,
"grad_norm": 0.66860629438086,
"learning_rate": 4.949164293120397e-05,
"loss": 0.9672,
"num_input_tokens_seen": 324009984,
"step": 309
},
{
"epoch": 0.6642560599973215,
"grad_norm": 0.7221412776198233,
"learning_rate": 4.948824853131236e-05,
"loss": 0.9768,
"num_input_tokens_seen": 325058560,
"step": 310
},
{
"epoch": 0.6663988214811839,
"grad_norm": 0.7520232960695742,
"learning_rate": 4.948484295374397e-05,
"loss": 0.9636,
"num_input_tokens_seen": 326107136,
"step": 311
},
{
"epoch": 0.6685415829650462,
"grad_norm": 0.6404508643255505,
"learning_rate": 4.948142620005328e-05,
"loss": 0.956,
"num_input_tokens_seen": 327155712,
"step": 312
},
{
"epoch": 0.6706843444489086,
"grad_norm": 0.7309479445003378,
"learning_rate": 4.947799827179986e-05,
"loss": 0.9562,
"num_input_tokens_seen": 328204288,
"step": 313
},
{
"epoch": 0.6728271059327708,
"grad_norm": 0.6802683062309518,
"learning_rate": 4.9474559170548387e-05,
"loss": 0.9746,
"num_input_tokens_seen": 329252864,
"step": 314
},
{
"epoch": 0.6749698674166332,
"grad_norm": 0.6778007963541558,
"learning_rate": 4.947110889786864e-05,
"loss": 0.9578,
"num_input_tokens_seen": 330301440,
"step": 315
},
{
"epoch": 0.6771126289004955,
"grad_norm": 0.7269233162497253,
"learning_rate": 4.946764745533552e-05,
"loss": 0.955,
"num_input_tokens_seen": 331350016,
"step": 316
},
{
"epoch": 0.6792553903843579,
"grad_norm": 0.8004146317091602,
"learning_rate": 4.9464174844528984e-05,
"loss": 0.9601,
"num_input_tokens_seen": 332398592,
"step": 317
},
{
"epoch": 0.6813981518682202,
"grad_norm": 0.7753178744802244,
"learning_rate": 4.946069106703411e-05,
"loss": 0.9594,
"num_input_tokens_seen": 333447168,
"step": 318
},
{
"epoch": 0.6835409133520824,
"grad_norm": 0.6815178840652059,
"learning_rate": 4.9457196124441073e-05,
"loss": 0.9578,
"num_input_tokens_seen": 334495744,
"step": 319
},
{
"epoch": 0.6856836748359448,
"grad_norm": 0.686119514860132,
"learning_rate": 4.9453690018345144e-05,
"loss": 0.9605,
"num_input_tokens_seen": 335544320,
"step": 320
},
{
"epoch": 0.6878264363198071,
"grad_norm": 0.6182467278075696,
"learning_rate": 4.9450172750346684e-05,
"loss": 0.9531,
"num_input_tokens_seen": 336592896,
"step": 321
},
{
"epoch": 0.6899691978036695,
"grad_norm": 0.7082354365899891,
"learning_rate": 4.944664432205115e-05,
"loss": 0.9652,
"num_input_tokens_seen": 337641472,
"step": 322
},
{
"epoch": 0.6921119592875318,
"grad_norm": 0.6792138640035731,
"learning_rate": 4.944310473506911e-05,
"loss": 0.9535,
"num_input_tokens_seen": 338690048,
"step": 323
},
{
"epoch": 0.6942547207713942,
"grad_norm": 0.6443309953592322,
"learning_rate": 4.9439553991016187e-05,
"loss": 0.9659,
"num_input_tokens_seen": 339738624,
"step": 324
},
{
"epoch": 0.6963974822552564,
"grad_norm": 0.6884219897088883,
"learning_rate": 4.943599209151314e-05,
"loss": 0.9626,
"num_input_tokens_seen": 340787200,
"step": 325
},
{
"epoch": 0.6985402437391188,
"grad_norm": 0.7474123707956007,
"learning_rate": 4.9432419038185794e-05,
"loss": 0.9579,
"num_input_tokens_seen": 341835776,
"step": 326
},
{
"epoch": 0.7006830052229811,
"grad_norm": 0.8001439442798239,
"learning_rate": 4.942883483266507e-05,
"loss": 0.9585,
"num_input_tokens_seen": 342884352,
"step": 327
},
{
"epoch": 0.7028257667068435,
"grad_norm": 0.6776134193455201,
"learning_rate": 4.942523947658698e-05,
"loss": 0.9584,
"num_input_tokens_seen": 343932928,
"step": 328
},
{
"epoch": 0.7049685281907058,
"grad_norm": 0.6444901731086969,
"learning_rate": 4.942163297159263e-05,
"loss": 0.9438,
"num_input_tokens_seen": 344981504,
"step": 329
},
{
"epoch": 0.707111289674568,
"grad_norm": 0.6635883359485919,
"learning_rate": 4.9418015319328204e-05,
"loss": 0.9524,
"num_input_tokens_seen": 346030080,
"step": 330
},
{
"epoch": 0.7092540511584304,
"grad_norm": 0.6682935448210751,
"learning_rate": 4.9414386521445e-05,
"loss": 0.956,
"num_input_tokens_seen": 347078656,
"step": 331
},
{
"epoch": 0.7113968126422927,
"grad_norm": 0.5451374703215448,
"learning_rate": 4.941074657959937e-05,
"loss": 0.9568,
"num_input_tokens_seen": 348127232,
"step": 332
},
{
"epoch": 0.7135395741261551,
"grad_norm": 0.6617882607114642,
"learning_rate": 4.940709549545276e-05,
"loss": 0.9788,
"num_input_tokens_seen": 349175808,
"step": 333
},
{
"epoch": 0.7156823356100174,
"grad_norm": 0.7663700039234058,
"learning_rate": 4.940343327067172e-05,
"loss": 0.9611,
"num_input_tokens_seen": 350224384,
"step": 334
},
{
"epoch": 0.7178250970938798,
"grad_norm": 0.7106293208476724,
"learning_rate": 4.939975990692789e-05,
"loss": 0.9433,
"num_input_tokens_seen": 351272960,
"step": 335
},
{
"epoch": 0.719967858577742,
"grad_norm": 0.8812609616643794,
"learning_rate": 4.939607540589795e-05,
"loss": 0.9522,
"num_input_tokens_seen": 352321536,
"step": 336
},
{
"epoch": 0.7221106200616044,
"grad_norm": 1.180776889607567,
"learning_rate": 4.9392379769263716e-05,
"loss": 0.9644,
"num_input_tokens_seen": 353370112,
"step": 337
},
{
"epoch": 0.7242533815454667,
"grad_norm": 0.9180392680374633,
"learning_rate": 4.9388672998712046e-05,
"loss": 0.9498,
"num_input_tokens_seen": 354418688,
"step": 338
},
{
"epoch": 0.7263961430293291,
"grad_norm": 0.686049375541203,
"learning_rate": 4.938495509593492e-05,
"loss": 0.9603,
"num_input_tokens_seen": 355467264,
"step": 339
},
{
"epoch": 0.7285389045131914,
"grad_norm": 0.6408315854104981,
"learning_rate": 4.938122606262936e-05,
"loss": 0.951,
"num_input_tokens_seen": 356515840,
"step": 340
},
{
"epoch": 0.7306816659970538,
"grad_norm": 0.7454673929992747,
"learning_rate": 4.9377485900497476e-05,
"loss": 0.946,
"num_input_tokens_seen": 357564416,
"step": 341
},
{
"epoch": 0.732824427480916,
"grad_norm": 0.807025821794171,
"learning_rate": 4.937373461124649e-05,
"loss": 0.9694,
"num_input_tokens_seen": 358612992,
"step": 342
},
{
"epoch": 0.7349671889647783,
"grad_norm": 1.0392775338900357,
"learning_rate": 4.9369972196588676e-05,
"loss": 0.9606,
"num_input_tokens_seen": 359661568,
"step": 343
},
{
"epoch": 0.7371099504486407,
"grad_norm": 0.8056437211626823,
"learning_rate": 4.936619865824138e-05,
"loss": 0.9494,
"num_input_tokens_seen": 360710144,
"step": 344
},
{
"epoch": 0.739252711932503,
"grad_norm": 0.6692576799616317,
"learning_rate": 4.936241399792705e-05,
"loss": 0.9471,
"num_input_tokens_seen": 361758720,
"step": 345
},
{
"epoch": 0.7413954734163654,
"grad_norm": 0.6439662667669066,
"learning_rate": 4.935861821737318e-05,
"loss": 0.9518,
"num_input_tokens_seen": 362807296,
"step": 346
},
{
"epoch": 0.7435382349002276,
"grad_norm": 0.6971274641706622,
"learning_rate": 4.9354811318312367e-05,
"loss": 0.958,
"num_input_tokens_seen": 363855872,
"step": 347
},
{
"epoch": 0.74568099638409,
"grad_norm": 0.8246209023489939,
"learning_rate": 4.935099330248227e-05,
"loss": 0.9575,
"num_input_tokens_seen": 364904448,
"step": 348
},
{
"epoch": 0.7478237578679523,
"grad_norm": 0.6454804197784619,
"learning_rate": 4.934716417162563e-05,
"loss": 0.9527,
"num_input_tokens_seen": 365953024,
"step": 349
},
{
"epoch": 0.7499665193518147,
"grad_norm": 0.7907007746191874,
"learning_rate": 4.934332392749025e-05,
"loss": 0.9534,
"num_input_tokens_seen": 367001600,
"step": 350
},
{
"epoch": 0.752109280835677,
"grad_norm": 0.8537019071397235,
"learning_rate": 4.933947257182901e-05,
"loss": 0.9618,
"num_input_tokens_seen": 368050176,
"step": 351
},
{
"epoch": 0.7542520423195394,
"grad_norm": 0.8452849854896484,
"learning_rate": 4.9335610106399864e-05,
"loss": 0.9563,
"num_input_tokens_seen": 369098752,
"step": 352
},
{
"epoch": 0.7563948038034016,
"grad_norm": 0.8365818755601035,
"learning_rate": 4.933173653296585e-05,
"loss": 0.9433,
"num_input_tokens_seen": 370147328,
"step": 353
},
{
"epoch": 0.758537565287264,
"grad_norm": 0.8843871389931269,
"learning_rate": 4.932785185329505e-05,
"loss": 0.9634,
"num_input_tokens_seen": 371195904,
"step": 354
},
{
"epoch": 0.7606803267711263,
"grad_norm": 0.7011261167297714,
"learning_rate": 4.932395606916062e-05,
"loss": 0.9546,
"num_input_tokens_seen": 372244480,
"step": 355
},
{
"epoch": 0.7628230882549886,
"grad_norm": 0.7236664874887737,
"learning_rate": 4.932004918234082e-05,
"loss": 0.9405,
"num_input_tokens_seen": 373293056,
"step": 356
},
{
"epoch": 0.764965849738851,
"grad_norm": 0.6365713819464895,
"learning_rate": 4.931613119461893e-05,
"loss": 0.9456,
"num_input_tokens_seen": 374341632,
"step": 357
},
{
"epoch": 0.7671086112227132,
"grad_norm": 0.7016893089903377,
"learning_rate": 4.931220210778332e-05,
"loss": 0.9578,
"num_input_tokens_seen": 375390208,
"step": 358
},
{
"epoch": 0.7692513727065756,
"grad_norm": 0.604362477542579,
"learning_rate": 4.930826192362744e-05,
"loss": 0.9397,
"num_input_tokens_seen": 376438784,
"step": 359
},
{
"epoch": 0.7713941341904379,
"grad_norm": 0.5848016990696451,
"learning_rate": 4.930431064394977e-05,
"loss": 0.9595,
"num_input_tokens_seen": 377487360,
"step": 360
},
{
"epoch": 0.7735368956743003,
"grad_norm": 0.5429284750696663,
"learning_rate": 4.930034827055388e-05,
"loss": 0.9411,
"num_input_tokens_seen": 378535936,
"step": 361
},
{
"epoch": 0.7756796571581626,
"grad_norm": 0.7012823730956075,
"learning_rate": 4.92963748052484e-05,
"loss": 0.946,
"num_input_tokens_seen": 379584512,
"step": 362
},
{
"epoch": 0.777822418642025,
"grad_norm": 0.8647844578965358,
"learning_rate": 4.929239024984702e-05,
"loss": 0.9537,
"num_input_tokens_seen": 380633088,
"step": 363
},
{
"epoch": 0.7799651801258872,
"grad_norm": 0.9492005933550962,
"learning_rate": 4.9288394606168494e-05,
"loss": 0.9538,
"num_input_tokens_seen": 381681664,
"step": 364
},
{
"epoch": 0.7821079416097496,
"grad_norm": 0.9833441664810633,
"learning_rate": 4.928438787603664e-05,
"loss": 0.9551,
"num_input_tokens_seen": 382730240,
"step": 365
},
{
"epoch": 0.7842507030936119,
"grad_norm": 0.874577604103094,
"learning_rate": 4.928037006128032e-05,
"loss": 0.9536,
"num_input_tokens_seen": 383778816,
"step": 366
},
{
"epoch": 0.7863934645774742,
"grad_norm": 0.6387335904967489,
"learning_rate": 4.927634116373349e-05,
"loss": 0.9408,
"num_input_tokens_seen": 384827392,
"step": 367
},
{
"epoch": 0.7885362260613366,
"grad_norm": 0.6243565041547219,
"learning_rate": 4.9272301185235116e-05,
"loss": 0.9435,
"num_input_tokens_seen": 385875968,
"step": 368
},
{
"epoch": 0.7906789875451988,
"grad_norm": 0.669927371235254,
"learning_rate": 4.9268250127629265e-05,
"loss": 0.95,
"num_input_tokens_seen": 386924544,
"step": 369
},
{
"epoch": 0.7928217490290612,
"grad_norm": 0.6159925821864788,
"learning_rate": 4.926418799276504e-05,
"loss": 0.9403,
"num_input_tokens_seen": 387973120,
"step": 370
},
{
"epoch": 0.7949645105129235,
"grad_norm": 0.5805674830639145,
"learning_rate": 4.926011478249661e-05,
"loss": 0.9489,
"num_input_tokens_seen": 389021696,
"step": 371
},
{
"epoch": 0.7971072719967859,
"grad_norm": 0.5990935108596377,
"learning_rate": 4.925603049868319e-05,
"loss": 0.9333,
"num_input_tokens_seen": 390070272,
"step": 372
},
{
"epoch": 0.7992500334806482,
"grad_norm": 0.56780569716724,
"learning_rate": 4.925193514318906e-05,
"loss": 0.9524,
"num_input_tokens_seen": 391118848,
"step": 373
},
{
"epoch": 0.8013927949645105,
"grad_norm": 0.5555146256374626,
"learning_rate": 4.924782871788354e-05,
"loss": 0.9455,
"num_input_tokens_seen": 392167424,
"step": 374
},
{
"epoch": 0.8035355564483728,
"grad_norm": 0.6025921472581821,
"learning_rate": 4.924371122464101e-05,
"loss": 0.9502,
"num_input_tokens_seen": 393216000,
"step": 375
},
{
"epoch": 0.8056783179322352,
"grad_norm": 0.6412503310742468,
"learning_rate": 4.923958266534091e-05,
"loss": 0.9553,
"num_input_tokens_seen": 394264576,
"step": 376
},
{
"epoch": 0.8078210794160975,
"grad_norm": 0.6035262643666147,
"learning_rate": 4.923544304186771e-05,
"loss": 0.9462,
"num_input_tokens_seen": 395313152,
"step": 377
},
{
"epoch": 0.8099638408999599,
"grad_norm": 0.6773878542789623,
"learning_rate": 4.923129235611096e-05,
"loss": 0.9484,
"num_input_tokens_seen": 396361728,
"step": 378
},
{
"epoch": 0.8121066023838222,
"grad_norm": 0.7181038641332558,
"learning_rate": 4.922713060996524e-05,
"loss": 0.9452,
"num_input_tokens_seen": 397410304,
"step": 379
},
{
"epoch": 0.8142493638676844,
"grad_norm": 0.719797180992769,
"learning_rate": 4.922295780533017e-05,
"loss": 0.9433,
"num_input_tokens_seen": 398458880,
"step": 380
},
{
"epoch": 0.8163921253515468,
"grad_norm": 0.6697588054636289,
"learning_rate": 4.921877394411045e-05,
"loss": 0.9538,
"num_input_tokens_seen": 399507456,
"step": 381
},
{
"epoch": 0.8185348868354091,
"grad_norm": 0.6623009304845271,
"learning_rate": 4.9214579028215776e-05,
"loss": 0.9482,
"num_input_tokens_seen": 400556032,
"step": 382
},
{
"epoch": 0.8206776483192715,
"grad_norm": 0.6039497437674582,
"learning_rate": 4.921037305956095e-05,
"loss": 0.9536,
"num_input_tokens_seen": 401604608,
"step": 383
},
{
"epoch": 0.8228204098031338,
"grad_norm": 0.7653003468169798,
"learning_rate": 4.920615604006578e-05,
"loss": 0.9423,
"num_input_tokens_seen": 402653184,
"step": 384
},
{
"epoch": 0.8249631712869961,
"grad_norm": 0.7904738063345222,
"learning_rate": 4.920192797165511e-05,
"loss": 0.9347,
"num_input_tokens_seen": 403701760,
"step": 385
},
{
"epoch": 0.8271059327708584,
"grad_norm": 0.7924189341019402,
"learning_rate": 4.919768885625887e-05,
"loss": 0.9454,
"num_input_tokens_seen": 404750336,
"step": 386
},
{
"epoch": 0.8292486942547208,
"grad_norm": 0.8312139658612682,
"learning_rate": 4.9193438695811985e-05,
"loss": 0.9386,
"num_input_tokens_seen": 405798912,
"step": 387
},
{
"epoch": 0.8313914557385831,
"grad_norm": 0.8646683775446922,
"learning_rate": 4.9189177492254455e-05,
"loss": 0.9392,
"num_input_tokens_seen": 406847488,
"step": 388
},
{
"epoch": 0.8335342172224455,
"grad_norm": 1.0294254039782134,
"learning_rate": 4.9184905247531316e-05,
"loss": 0.9483,
"num_input_tokens_seen": 407896064,
"step": 389
},
{
"epoch": 0.8356769787063077,
"grad_norm": 0.8744877993208896,
"learning_rate": 4.918062196359263e-05,
"loss": 0.945,
"num_input_tokens_seen": 408944640,
"step": 390
},
{
"epoch": 0.8378197401901701,
"grad_norm": 0.7205016171684795,
"learning_rate": 4.917632764239349e-05,
"loss": 0.9406,
"num_input_tokens_seen": 409993216,
"step": 391
},
{
"epoch": 0.8399625016740324,
"grad_norm": 0.8646230649226543,
"learning_rate": 4.9172022285894074e-05,
"loss": 0.9425,
"num_input_tokens_seen": 411041792,
"step": 392
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.72954927450801,
"learning_rate": 4.9167705896059527e-05,
"loss": 0.9375,
"num_input_tokens_seen": 412090368,
"step": 393
},
{
"epoch": 0.8442480246417571,
"grad_norm": 0.6048658838544064,
"learning_rate": 4.91633784748601e-05,
"loss": 0.9411,
"num_input_tokens_seen": 413138944,
"step": 394
},
{
"epoch": 0.8463907861256194,
"grad_norm": 0.6584812604484274,
"learning_rate": 4.915904002427103e-05,
"loss": 0.9346,
"num_input_tokens_seen": 414187520,
"step": 395
},
{
"epoch": 0.8485335476094817,
"grad_norm": 0.771775334405825,
"learning_rate": 4.9154690546272606e-05,
"loss": 0.9435,
"num_input_tokens_seen": 415236096,
"step": 396
},
{
"epoch": 0.850676309093344,
"grad_norm": 0.7257743982464717,
"learning_rate": 4.9150330042850155e-05,
"loss": 0.9411,
"num_input_tokens_seen": 416284672,
"step": 397
},
{
"epoch": 0.8528190705772064,
"grad_norm": 0.6471296513959546,
"learning_rate": 4.9145958515994025e-05,
"loss": 0.9423,
"num_input_tokens_seen": 417333248,
"step": 398
},
{
"epoch": 0.8549618320610687,
"grad_norm": 0.6905884924419747,
"learning_rate": 4.914157596769962e-05,
"loss": 0.9478,
"num_input_tokens_seen": 418381824,
"step": 399
},
{
"epoch": 0.8571045935449311,
"grad_norm": 0.8067538886359611,
"learning_rate": 4.9137182399967343e-05,
"loss": 0.9418,
"num_input_tokens_seen": 419430400,
"step": 400
},
{
"epoch": 0.8592473550287933,
"grad_norm": 0.6249988394159846,
"learning_rate": 4.9132777814802634e-05,
"loss": 0.9351,
"num_input_tokens_seen": 420478976,
"step": 401
},
{
"epoch": 0.8613901165126557,
"grad_norm": 0.6847816284919354,
"learning_rate": 4.9128362214215986e-05,
"loss": 0.9385,
"num_input_tokens_seen": 421527552,
"step": 402
},
{
"epoch": 0.863532877996518,
"grad_norm": 0.832176639221433,
"learning_rate": 4.912393560022288e-05,
"loss": 0.9468,
"num_input_tokens_seen": 422576128,
"step": 403
},
{
"epoch": 0.8656756394803803,
"grad_norm": 0.6997745179543038,
"learning_rate": 4.911949797484388e-05,
"loss": 0.9327,
"num_input_tokens_seen": 423624704,
"step": 404
},
{
"epoch": 0.8678184009642427,
"grad_norm": 0.6321940710919781,
"learning_rate": 4.9115049340104505e-05,
"loss": 0.9351,
"num_input_tokens_seen": 424673280,
"step": 405
},
{
"epoch": 0.869961162448105,
"grad_norm": 0.7060643613594743,
"learning_rate": 4.911058969803536e-05,
"loss": 0.945,
"num_input_tokens_seen": 425721856,
"step": 406
},
{
"epoch": 0.8721039239319673,
"grad_norm": 0.6065745860189082,
"learning_rate": 4.910611905067205e-05,
"loss": 0.9407,
"num_input_tokens_seen": 426770432,
"step": 407
},
{
"epoch": 0.8742466854158296,
"grad_norm": 0.5015704205138852,
"learning_rate": 4.91016374000552e-05,
"loss": 0.9305,
"num_input_tokens_seen": 427819008,
"step": 408
},
{
"epoch": 0.876389446899692,
"grad_norm": 0.6567251162870986,
"learning_rate": 4.909714474823047e-05,
"loss": 0.951,
"num_input_tokens_seen": 428867584,
"step": 409
},
{
"epoch": 0.8785322083835543,
"grad_norm": 0.5920933662760173,
"learning_rate": 4.909264109724853e-05,
"loss": 0.9315,
"num_input_tokens_seen": 429916160,
"step": 410
},
{
"epoch": 0.8806749698674167,
"grad_norm": 0.660025661812582,
"learning_rate": 4.9088126449165065e-05,
"loss": 0.9308,
"num_input_tokens_seen": 430964736,
"step": 411
},
{
"epoch": 0.882817731351279,
"grad_norm": 0.7662454510330629,
"learning_rate": 4.90836008060408e-05,
"loss": 0.9428,
"num_input_tokens_seen": 432013312,
"step": 412
},
{
"epoch": 0.8849604928351413,
"grad_norm": 0.6707054047645358,
"learning_rate": 4.907906416994146e-05,
"loss": 0.9405,
"num_input_tokens_seen": 433061888,
"step": 413
},
{
"epoch": 0.8871032543190036,
"grad_norm": 0.7475341888198394,
"learning_rate": 4.9074516542937795e-05,
"loss": 0.9479,
"num_input_tokens_seen": 434110464,
"step": 414
},
{
"epoch": 0.889246015802866,
"grad_norm": 0.7247188513031333,
"learning_rate": 4.9069957927105586e-05,
"loss": 0.9348,
"num_input_tokens_seen": 435159040,
"step": 415
},
{
"epoch": 0.8913887772867283,
"grad_norm": 0.5567338151611357,
"learning_rate": 4.906538832452561e-05,
"loss": 0.9292,
"num_input_tokens_seen": 436207616,
"step": 416
},
{
"epoch": 0.8935315387705905,
"grad_norm": 0.6675238484825794,
"learning_rate": 4.9060807737283656e-05,
"loss": 0.9354,
"num_input_tokens_seen": 437256192,
"step": 417
},
{
"epoch": 0.8956743002544529,
"grad_norm": 0.6283836886644354,
"learning_rate": 4.905621616747054e-05,
"loss": 0.9351,
"num_input_tokens_seen": 438304768,
"step": 418
},
{
"epoch": 0.8978170617383152,
"grad_norm": 0.6058437294929521,
"learning_rate": 4.905161361718209e-05,
"loss": 0.9309,
"num_input_tokens_seen": 439353344,
"step": 419
},
{
"epoch": 0.8999598232221776,
"grad_norm": 0.6772602115603277,
"learning_rate": 4.9047000088519144e-05,
"loss": 0.9384,
"num_input_tokens_seen": 440401920,
"step": 420
},
{
"epoch": 0.9021025847060399,
"grad_norm": 0.7034846488508698,
"learning_rate": 4.9042375583587555e-05,
"loss": 0.9519,
"num_input_tokens_seen": 441450496,
"step": 421
},
{
"epoch": 0.9042453461899023,
"grad_norm": 0.6874699025267311,
"learning_rate": 4.9037740104498166e-05,
"loss": 0.9281,
"num_input_tokens_seen": 442499072,
"step": 422
},
{
"epoch": 0.9063881076737645,
"grad_norm": 0.6359007946203304,
"learning_rate": 4.903309365336686e-05,
"loss": 0.939,
"num_input_tokens_seen": 443547648,
"step": 423
},
{
"epoch": 0.9085308691576269,
"grad_norm": 0.7006661263512417,
"learning_rate": 4.90284362323145e-05,
"loss": 0.9345,
"num_input_tokens_seen": 444596224,
"step": 424
},
{
"epoch": 0.9106736306414892,
"grad_norm": 0.5187269941294566,
"learning_rate": 4.902376784346697e-05,
"loss": 0.9414,
"num_input_tokens_seen": 445644800,
"step": 425
},
{
"epoch": 0.9128163921253516,
"grad_norm": 0.6305715854234033,
"learning_rate": 4.901908848895517e-05,
"loss": 0.938,
"num_input_tokens_seen": 446693376,
"step": 426
},
{
"epoch": 0.9149591536092139,
"grad_norm": 0.657768269603032,
"learning_rate": 4.901439817091499e-05,
"loss": 0.9359,
"num_input_tokens_seen": 447741952,
"step": 427
},
{
"epoch": 0.9171019150930761,
"grad_norm": 0.7053589096005869,
"learning_rate": 4.9009696891487325e-05,
"loss": 0.9418,
"num_input_tokens_seen": 448790528,
"step": 428
},
{
"epoch": 0.9192446765769385,
"grad_norm": 0.7187736478194787,
"learning_rate": 4.9004984652818076e-05,
"loss": 0.9338,
"num_input_tokens_seen": 449839104,
"step": 429
},
{
"epoch": 0.9213874380608008,
"grad_norm": 0.5897812937476063,
"learning_rate": 4.900026145705815e-05,
"loss": 0.9425,
"num_input_tokens_seen": 450887680,
"step": 430
},
{
"epoch": 0.9235301995446632,
"grad_norm": 0.5467959396908199,
"learning_rate": 4.899552730636345e-05,
"loss": 0.9276,
"num_input_tokens_seen": 451936256,
"step": 431
},
{
"epoch": 0.9256729610285255,
"grad_norm": 0.6600294586377221,
"learning_rate": 4.899078220289489e-05,
"loss": 0.938,
"num_input_tokens_seen": 452984832,
"step": 432
},
{
"epoch": 0.9278157225123879,
"grad_norm": 0.7533489644860528,
"learning_rate": 4.898602614881836e-05,
"loss": 0.9408,
"num_input_tokens_seen": 454033408,
"step": 433
},
{
"epoch": 0.9299584839962501,
"grad_norm": 0.7688826889229265,
"learning_rate": 4.898125914630479e-05,
"loss": 0.9416,
"num_input_tokens_seen": 455081984,
"step": 434
},
{
"epoch": 0.9321012454801125,
"grad_norm": 0.8178732789471899,
"learning_rate": 4.897648119753006e-05,
"loss": 0.9349,
"num_input_tokens_seen": 456130560,
"step": 435
},
{
"epoch": 0.9342440069639748,
"grad_norm": 0.8805822989196023,
"learning_rate": 4.897169230467506e-05,
"loss": 0.9398,
"num_input_tokens_seen": 457179136,
"step": 436
},
{
"epoch": 0.9363867684478372,
"grad_norm": 0.7148664306953607,
"learning_rate": 4.896689246992572e-05,
"loss": 0.9288,
"num_input_tokens_seen": 458227712,
"step": 437
},
{
"epoch": 0.9385295299316995,
"grad_norm": 0.620777716602536,
"learning_rate": 4.8962081695472886e-05,
"loss": 0.937,
"num_input_tokens_seen": 459276288,
"step": 438
},
{
"epoch": 0.9406722914155619,
"grad_norm": 0.780179843380168,
"learning_rate": 4.895725998351246e-05,
"loss": 0.9282,
"num_input_tokens_seen": 460324864,
"step": 439
},
{
"epoch": 0.9428150528994241,
"grad_norm": 0.7098688144002365,
"learning_rate": 4.8952427336245324e-05,
"loss": 0.9205,
"num_input_tokens_seen": 461373440,
"step": 440
},
{
"epoch": 0.9449578143832864,
"grad_norm": 0.8388395028850127,
"learning_rate": 4.894758375587733e-05,
"loss": 0.9298,
"num_input_tokens_seen": 462422016,
"step": 441
},
{
"epoch": 0.9471005758671488,
"grad_norm": 1.0011263471423495,
"learning_rate": 4.894272924461932e-05,
"loss": 0.9339,
"num_input_tokens_seen": 463470592,
"step": 442
},
{
"epoch": 0.9492433373510111,
"grad_norm": 0.9116591107624027,
"learning_rate": 4.8937863804687165e-05,
"loss": 0.9286,
"num_input_tokens_seen": 464519168,
"step": 443
},
{
"epoch": 0.9513860988348735,
"grad_norm": 0.6734428440804721,
"learning_rate": 4.893298743830168e-05,
"loss": 0.945,
"num_input_tokens_seen": 465567744,
"step": 444
},
{
"epoch": 0.9535288603187357,
"grad_norm": 0.7235503104955481,
"learning_rate": 4.89281001476887e-05,
"loss": 0.9435,
"num_input_tokens_seen": 466616320,
"step": 445
},
{
"epoch": 0.9556716218025981,
"grad_norm": 0.6810287431117885,
"learning_rate": 4.892320193507902e-05,
"loss": 0.9329,
"num_input_tokens_seen": 467664896,
"step": 446
},
{
"epoch": 0.9578143832864604,
"grad_norm": 0.8494818139907088,
"learning_rate": 4.8918292802708445e-05,
"loss": 0.9434,
"num_input_tokens_seen": 468713472,
"step": 447
},
{
"epoch": 0.9599571447703228,
"grad_norm": 0.7794460661595467,
"learning_rate": 4.891337275281774e-05,
"loss": 0.9313,
"num_input_tokens_seen": 469762048,
"step": 448
},
{
"epoch": 0.9620999062541851,
"grad_norm": 0.6458068716390041,
"learning_rate": 4.890844178765267e-05,
"loss": 0.9339,
"num_input_tokens_seen": 470810624,
"step": 449
},
{
"epoch": 0.9642426677380475,
"grad_norm": 0.7530678772967354,
"learning_rate": 4.8903499909463966e-05,
"loss": 0.9381,
"num_input_tokens_seen": 471859200,
"step": 450
},
{
"epoch": 0.9663854292219097,
"grad_norm": 0.8522366384371276,
"learning_rate": 4.889854712050737e-05,
"loss": 0.9326,
"num_input_tokens_seen": 472907776,
"step": 451
},
{
"epoch": 0.9685281907057721,
"grad_norm": 0.6897177044990395,
"learning_rate": 4.8893583423043574e-05,
"loss": 0.939,
"num_input_tokens_seen": 473956352,
"step": 452
},
{
"epoch": 0.9706709521896344,
"grad_norm": 0.5452623417030903,
"learning_rate": 4.888860881933826e-05,
"loss": 0.932,
"num_input_tokens_seen": 475004928,
"step": 453
},
{
"epoch": 0.9728137136734967,
"grad_norm": 0.7408247505392889,
"learning_rate": 4.888362331166211e-05,
"loss": 0.9306,
"num_input_tokens_seen": 476053504,
"step": 454
},
{
"epoch": 0.9749564751573591,
"grad_norm": 0.8159380899572539,
"learning_rate": 4.887862690229073e-05,
"loss": 0.9338,
"num_input_tokens_seen": 477102080,
"step": 455
},
{
"epoch": 0.9770992366412213,
"grad_norm": 0.6775152425008051,
"learning_rate": 4.887361959350475e-05,
"loss": 0.9313,
"num_input_tokens_seen": 478150656,
"step": 456
},
{
"epoch": 0.9792419981250837,
"grad_norm": 0.7643871285334946,
"learning_rate": 4.8868601387589765e-05,
"loss": 0.9292,
"num_input_tokens_seen": 479199232,
"step": 457
},
{
"epoch": 0.981384759608946,
"grad_norm": 0.7359268386748409,
"learning_rate": 4.8863572286836324e-05,
"loss": 0.9371,
"num_input_tokens_seen": 480247808,
"step": 458
},
{
"epoch": 0.9835275210928084,
"grad_norm": 0.7367735699896256,
"learning_rate": 4.885853229353998e-05,
"loss": 0.9165,
"num_input_tokens_seen": 481296384,
"step": 459
},
{
"epoch": 0.9856702825766707,
"grad_norm": 0.7300756128487073,
"learning_rate": 4.885348141000122e-05,
"loss": 0.935,
"num_input_tokens_seen": 482344960,
"step": 460
},
{
"epoch": 0.987813044060533,
"grad_norm": 0.6381716292470745,
"learning_rate": 4.8848419638525545e-05,
"loss": 0.9207,
"num_input_tokens_seen": 483393536,
"step": 461
},
{
"epoch": 0.9899558055443953,
"grad_norm": 0.6121363602754881,
"learning_rate": 4.884334698142339e-05,
"loss": 0.9297,
"num_input_tokens_seen": 484442112,
"step": 462
},
{
"epoch": 0.9920985670282577,
"grad_norm": 0.5846344306782465,
"learning_rate": 4.8838263441010186e-05,
"loss": 0.9317,
"num_input_tokens_seen": 485490688,
"step": 463
},
{
"epoch": 0.99424132851212,
"grad_norm": 0.5734102323548795,
"learning_rate": 4.88331690196063e-05,
"loss": 0.9265,
"num_input_tokens_seen": 486539264,
"step": 464
},
{
"epoch": 0.9963840899959823,
"grad_norm": 0.6569092711755317,
"learning_rate": 4.88280637195371e-05,
"loss": 0.9315,
"num_input_tokens_seen": 487587840,
"step": 465
},
{
"epoch": 0.9985268514798447,
"grad_norm": 0.5544483498666563,
"learning_rate": 4.882294754313289e-05,
"loss": 0.9337,
"num_input_tokens_seen": 488636416,
"step": 466
},
{
"epoch": 1.0,
"grad_norm": 0.6728502195291717,
"learning_rate": 4.881782049272896e-05,
"loss": 0.9227,
"num_input_tokens_seen": 489357312,
"step": 467
},
{
"epoch": 1.0021427614838623,
"grad_norm": 1.1178061816772684,
"learning_rate": 4.8812682570665556e-05,
"loss": 0.7839,
"num_input_tokens_seen": 490405888,
"step": 468
},
{
"epoch": 1.0042855229677246,
"grad_norm": 1.0921873795233088,
"learning_rate": 4.880753377928788e-05,
"loss": 0.7834,
"num_input_tokens_seen": 491454464,
"step": 469
},
{
"epoch": 1.006428284451587,
"grad_norm": 1.1287773013748974,
"learning_rate": 4.880237412094611e-05,
"loss": 0.7761,
"num_input_tokens_seen": 492503040,
"step": 470
},
{
"epoch": 1.0085710459354493,
"grad_norm": 1.0631852319209654,
"learning_rate": 4.879720359799537e-05,
"loss": 0.7684,
"num_input_tokens_seen": 493551616,
"step": 471
},
{
"epoch": 1.0107138074193116,
"grad_norm": 1.0776720184821373,
"learning_rate": 4.879202221279575e-05,
"loss": 0.7701,
"num_input_tokens_seen": 494600192,
"step": 472
},
{
"epoch": 1.0128565689031739,
"grad_norm": 0.9227986460902488,
"learning_rate": 4.878682996771229e-05,
"loss": 0.7791,
"num_input_tokens_seen": 495648768,
"step": 473
},
{
"epoch": 1.0149993303870364,
"grad_norm": 1.075443769168512,
"learning_rate": 4.8781626865115005e-05,
"loss": 0.7674,
"num_input_tokens_seen": 496697344,
"step": 474
},
{
"epoch": 1.0171420918708987,
"grad_norm": 1.2527807821550248,
"learning_rate": 4.877641290737884e-05,
"loss": 0.773,
"num_input_tokens_seen": 497745920,
"step": 475
},
{
"epoch": 1.019284853354761,
"grad_norm": 0.8580407158397938,
"learning_rate": 4.877118809688372e-05,
"loss": 0.771,
"num_input_tokens_seen": 498794496,
"step": 476
},
{
"epoch": 1.0214276148386232,
"grad_norm": 0.6853196367256191,
"learning_rate": 4.8765952436014515e-05,
"loss": 0.7725,
"num_input_tokens_seen": 499843072,
"step": 477
},
{
"epoch": 1.0235703763224857,
"grad_norm": 0.8301110353120499,
"learning_rate": 4.876070592716105e-05,
"loss": 0.7783,
"num_input_tokens_seen": 500891648,
"step": 478
},
{
"epoch": 1.025713137806348,
"grad_norm": 0.8756175808406557,
"learning_rate": 4.875544857271808e-05,
"loss": 0.7487,
"num_input_tokens_seen": 501940224,
"step": 479
},
{
"epoch": 1.0278558992902103,
"grad_norm": 0.9757446146451294,
"learning_rate": 4.8750180375085344e-05,
"loss": 0.766,
"num_input_tokens_seen": 502988800,
"step": 480
},
{
"epoch": 1.0299986607740725,
"grad_norm": 0.8610400304752691,
"learning_rate": 4.874490133666749e-05,
"loss": 0.7663,
"num_input_tokens_seen": 504037376,
"step": 481
},
{
"epoch": 1.0321414222579348,
"grad_norm": 0.8464462121552816,
"learning_rate": 4.873961145987417e-05,
"loss": 0.7689,
"num_input_tokens_seen": 505085952,
"step": 482
},
{
"epoch": 1.0342841837417973,
"grad_norm": 0.7205290839211237,
"learning_rate": 4.8734310747119935e-05,
"loss": 0.7598,
"num_input_tokens_seen": 506134528,
"step": 483
},
{
"epoch": 1.0364269452256596,
"grad_norm": 0.7972996029812055,
"learning_rate": 4.87289992008243e-05,
"loss": 0.7677,
"num_input_tokens_seen": 507183104,
"step": 484
},
{
"epoch": 1.0385697067095219,
"grad_norm": 0.8060439581250051,
"learning_rate": 4.872367682341173e-05,
"loss": 0.7657,
"num_input_tokens_seen": 508231680,
"step": 485
},
{
"epoch": 1.0407124681933841,
"grad_norm": 0.625204599036251,
"learning_rate": 4.871834361731162e-05,
"loss": 0.7537,
"num_input_tokens_seen": 509280256,
"step": 486
},
{
"epoch": 1.0428552296772466,
"grad_norm": 0.7605314454169393,
"learning_rate": 4.8712999584958314e-05,
"loss": 0.7719,
"num_input_tokens_seen": 510328832,
"step": 487
},
{
"epoch": 1.044997991161109,
"grad_norm": 0.8346367886178117,
"learning_rate": 4.87076447287911e-05,
"loss": 0.7662,
"num_input_tokens_seen": 511377408,
"step": 488
},
{
"epoch": 1.0471407526449712,
"grad_norm": 0.6285971829943235,
"learning_rate": 4.870227905125422e-05,
"loss": 0.765,
"num_input_tokens_seen": 512425984,
"step": 489
},
{
"epoch": 1.0492835141288335,
"grad_norm": 0.6964181738913828,
"learning_rate": 4.869690255479682e-05,
"loss": 0.7506,
"num_input_tokens_seen": 513474560,
"step": 490
},
{
"epoch": 1.051426275612696,
"grad_norm": 0.7957806090570286,
"learning_rate": 4.8691515241873023e-05,
"loss": 0.7714,
"num_input_tokens_seen": 514523136,
"step": 491
},
{
"epoch": 1.0535690370965582,
"grad_norm": 0.8340425161918896,
"learning_rate": 4.868611711494186e-05,
"loss": 0.7606,
"num_input_tokens_seen": 515571712,
"step": 492
},
{
"epoch": 1.0557117985804205,
"grad_norm": 0.7847167960403033,
"learning_rate": 4.8680708176467305e-05,
"loss": 0.7512,
"num_input_tokens_seen": 516620288,
"step": 493
},
{
"epoch": 1.0578545600642828,
"grad_norm": 0.825125615426757,
"learning_rate": 4.867528842891828e-05,
"loss": 0.7711,
"num_input_tokens_seen": 517668864,
"step": 494
},
{
"epoch": 1.059997321548145,
"grad_norm": 0.7783907475532756,
"learning_rate": 4.866985787476863e-05,
"loss": 0.7734,
"num_input_tokens_seen": 518717440,
"step": 495
},
{
"epoch": 1.0621400830320076,
"grad_norm": 0.7189975833814624,
"learning_rate": 4.866441651649715e-05,
"loss": 0.7748,
"num_input_tokens_seen": 519766016,
"step": 496
},
{
"epoch": 1.0642828445158699,
"grad_norm": 0.6408978236973415,
"learning_rate": 4.865896435658752e-05,
"loss": 0.7632,
"num_input_tokens_seen": 520814592,
"step": 497
},
{
"epoch": 1.0664256059997321,
"grad_norm": 0.8641304391887605,
"learning_rate": 4.865350139752841e-05,
"loss": 0.7602,
"num_input_tokens_seen": 521863168,
"step": 498
},
{
"epoch": 1.0685683674835944,
"grad_norm": 0.7514805150967576,
"learning_rate": 4.8648027641813384e-05,
"loss": 0.7536,
"num_input_tokens_seen": 522911744,
"step": 499
},
{
"epoch": 1.070711128967457,
"grad_norm": 0.6149193256755058,
"learning_rate": 4.864254309194093e-05,
"loss": 0.764,
"num_input_tokens_seen": 523960320,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 4660,
"num_input_tokens_seen": 523960320,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 836890483752960.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}