CardProjector-24B-v1b-adapter / trainer_state.json
Pegasus YaY
Upload 12 files
97e6dcf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9976019184652278,
"eval_steps": 500,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0031974420463629096,
"grad_norm": 0.3446813225746155,
"learning_rate": 1.875e-05,
"loss": 2.181,
"step": 1
},
{
"epoch": 0.006394884092725819,
"grad_norm": 0.30540090799331665,
"learning_rate": 3.75e-05,
"loss": 1.9989,
"step": 2
},
{
"epoch": 0.009592326139088728,
"grad_norm": 0.33418309688568115,
"learning_rate": 5.625e-05,
"loss": 2.2171,
"step": 3
},
{
"epoch": 0.012789768185451638,
"grad_norm": 0.3458963632583618,
"learning_rate": 7.5e-05,
"loss": 2.1132,
"step": 4
},
{
"epoch": 0.01598721023181455,
"grad_norm": 0.3518303334712982,
"learning_rate": 9.374999999999999e-05,
"loss": 1.9712,
"step": 5
},
{
"epoch": 0.019184652278177457,
"grad_norm": 0.3810424506664276,
"learning_rate": 0.0001125,
"loss": 1.7775,
"step": 6
},
{
"epoch": 0.02238209432454037,
"grad_norm": 0.4941970109939575,
"learning_rate": 0.00013125,
"loss": 1.9399,
"step": 7
},
{
"epoch": 0.025579536370903277,
"grad_norm": 0.3915010094642639,
"learning_rate": 0.00015,
"loss": 1.8796,
"step": 8
},
{
"epoch": 0.02877697841726619,
"grad_norm": 0.3387204706668854,
"learning_rate": 0.00016874999999999998,
"loss": 1.8703,
"step": 9
},
{
"epoch": 0.0319744204636291,
"grad_norm": 0.3416554927825928,
"learning_rate": 0.00018749999999999998,
"loss": 1.8606,
"step": 10
},
{
"epoch": 0.035171862509992005,
"grad_norm": 0.36551880836486816,
"learning_rate": 0.00020624999999999997,
"loss": 1.5603,
"step": 11
},
{
"epoch": 0.03836930455635491,
"grad_norm": 0.3075932264328003,
"learning_rate": 0.000225,
"loss": 1.5475,
"step": 12
},
{
"epoch": 0.04156674660271783,
"grad_norm": 0.28144699335098267,
"learning_rate": 0.00024375,
"loss": 1.6917,
"step": 13
},
{
"epoch": 0.04476418864908074,
"grad_norm": 0.27931058406829834,
"learning_rate": 0.0002625,
"loss": 1.4981,
"step": 14
},
{
"epoch": 0.047961630695443645,
"grad_norm": 0.24638418853282928,
"learning_rate": 0.00028125,
"loss": 1.599,
"step": 15
},
{
"epoch": 0.051159072741806554,
"grad_norm": 0.49918419122695923,
"learning_rate": 0.0003,
"loss": 1.5411,
"step": 16
},
{
"epoch": 0.05435651478816946,
"grad_norm": 0.227300763130188,
"learning_rate": 0.00029999155161863667,
"loss": 1.4908,
"step": 17
},
{
"epoch": 0.05755395683453238,
"grad_norm": 0.24631308019161224,
"learning_rate": 0.0002999662074262154,
"loss": 1.5127,
"step": 18
},
{
"epoch": 0.060751398880895285,
"grad_norm": 0.20278117060661316,
"learning_rate": 0.00029992397027763483,
"loss": 1.5784,
"step": 19
},
{
"epoch": 0.0639488409272582,
"grad_norm": 0.20311613380908966,
"learning_rate": 0.00029986484493070223,
"loss": 1.5577,
"step": 20
},
{
"epoch": 0.0671462829736211,
"grad_norm": 0.22366106510162354,
"learning_rate": 0.00029978883804559716,
"loss": 1.6616,
"step": 21
},
{
"epoch": 0.07034372501998401,
"grad_norm": 0.22588945925235748,
"learning_rate": 0.00029969595818412183,
"loss": 1.7524,
"step": 22
},
{
"epoch": 0.07354116706634692,
"grad_norm": 0.20929686725139618,
"learning_rate": 0.000299586215808736,
"loss": 1.5186,
"step": 23
},
{
"epoch": 0.07673860911270983,
"grad_norm": 0.2444813847541809,
"learning_rate": 0.00029945962328137895,
"loss": 1.5135,
"step": 24
},
{
"epoch": 0.07993605115907274,
"grad_norm": 0.21571452915668488,
"learning_rate": 0.00029931619486207655,
"loss": 1.4799,
"step": 25
},
{
"epoch": 0.08313349320543566,
"grad_norm": 0.2103520780801773,
"learning_rate": 0.00029915594670733536,
"loss": 1.6818,
"step": 26
},
{
"epoch": 0.08633093525179857,
"grad_norm": 0.24929186701774597,
"learning_rate": 0.00029897889686832227,
"loss": 1.4392,
"step": 27
},
{
"epoch": 0.08952837729816147,
"grad_norm": 0.24320849776268005,
"learning_rate": 0.0002987850652888315,
"loss": 1.5211,
"step": 28
},
{
"epoch": 0.09272581934452438,
"grad_norm": 0.23468714952468872,
"learning_rate": 0.0002985744738030378,
"loss": 1.5468,
"step": 29
},
{
"epoch": 0.09592326139088729,
"grad_norm": 0.2079857587814331,
"learning_rate": 0.0002983471461330368,
"loss": 1.5166,
"step": 30
},
{
"epoch": 0.0991207034372502,
"grad_norm": 0.21627485752105713,
"learning_rate": 0.0002981031078861733,
"loss": 1.5507,
"step": 31
},
{
"epoch": 0.10231814548361311,
"grad_norm": 0.23451927304267883,
"learning_rate": 0.00029784238655215626,
"loss": 1.508,
"step": 32
},
{
"epoch": 0.10551558752997602,
"grad_norm": 0.2220710664987564,
"learning_rate": 0.0002975650114999625,
"loss": 1.5164,
"step": 33
},
{
"epoch": 0.10871302957633892,
"grad_norm": 0.22487205266952515,
"learning_rate": 0.00029727101397452834,
"loss": 1.4938,
"step": 34
},
{
"epoch": 0.11191047162270183,
"grad_norm": 0.2187684029340744,
"learning_rate": 0.00029696042709322995,
"loss": 1.3007,
"step": 35
},
{
"epoch": 0.11510791366906475,
"grad_norm": 0.2184438705444336,
"learning_rate": 0.00029663328584215293,
"loss": 1.5204,
"step": 36
},
{
"epoch": 0.11830535571542766,
"grad_norm": 0.21176907420158386,
"learning_rate": 0.00029628962707215124,
"loss": 1.5017,
"step": 37
},
{
"epoch": 0.12150279776179057,
"grad_norm": 0.2055819034576416,
"learning_rate": 0.00029592948949469614,
"loss": 1.2755,
"step": 38
},
{
"epoch": 0.12470023980815348,
"grad_norm": 0.220439150929451,
"learning_rate": 0.00029555291367751573,
"loss": 1.5057,
"step": 39
},
{
"epoch": 0.1278976818545164,
"grad_norm": 0.2324652075767517,
"learning_rate": 0.00029515994204002484,
"loss": 1.5839,
"step": 40
},
{
"epoch": 0.1310951239008793,
"grad_norm": 0.19285540282726288,
"learning_rate": 0.0002947506188485468,
"loss": 1.4434,
"step": 41
},
{
"epoch": 0.1342925659472422,
"grad_norm": 0.26798316836357117,
"learning_rate": 0.00029432499021132737,
"loss": 1.6137,
"step": 42
},
{
"epoch": 0.1374900079936051,
"grad_norm": 0.18407316505908966,
"learning_rate": 0.0002938831040733405,
"loss": 1.3876,
"step": 43
},
{
"epoch": 0.14068745003996802,
"grad_norm": 0.2084178477525711,
"learning_rate": 0.0002934250102108876,
"loss": 1.5409,
"step": 44
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.20955117046833038,
"learning_rate": 0.00029295076022599077,
"loss": 1.4635,
"step": 45
},
{
"epoch": 0.14708233413269384,
"grad_norm": 0.2144644558429718,
"learning_rate": 0.00029246040754057976,
"loss": 1.4585,
"step": 46
},
{
"epoch": 0.15027977617905675,
"grad_norm": 0.20352163910865784,
"learning_rate": 0.0002919540073904744,
"loss": 1.5338,
"step": 47
},
{
"epoch": 0.15347721822541965,
"grad_norm": 0.18800681829452515,
"learning_rate": 0.0002914316168191626,
"loss": 1.5031,
"step": 48
},
{
"epoch": 0.15667466027178256,
"grad_norm": 0.19407911598682404,
"learning_rate": 0.00029089329467137456,
"loss": 1.4457,
"step": 49
},
{
"epoch": 0.15987210231814547,
"grad_norm": 0.19459669291973114,
"learning_rate": 0.0002903391015864543,
"loss": 1.3383,
"step": 50
},
{
"epoch": 0.1630695443645084,
"grad_norm": 0.22761711478233337,
"learning_rate": 0.0002897690999915289,
"loss": 1.5057,
"step": 51
},
{
"epoch": 0.16626698641087131,
"grad_norm": 0.22577515244483948,
"learning_rate": 0.0002891833540944764,
"loss": 1.3057,
"step": 52
},
{
"epoch": 0.16946442845723422,
"grad_norm": 0.2257939875125885,
"learning_rate": 0.000288581929876693,
"loss": 1.4777,
"step": 53
},
{
"epoch": 0.17266187050359713,
"grad_norm": 0.20815476775169373,
"learning_rate": 0.0002879648950856608,
"loss": 1.4252,
"step": 54
},
{
"epoch": 0.17585931254996004,
"grad_norm": 0.20832973718643188,
"learning_rate": 0.0002873323192273162,
"loss": 1.5008,
"step": 55
},
{
"epoch": 0.17905675459632295,
"grad_norm": 0.2152206003665924,
"learning_rate": 0.00028668427355822034,
"loss": 1.6078,
"step": 56
},
{
"epoch": 0.18225419664268586,
"grad_norm": 0.18941529095172882,
"learning_rate": 0.0002860208310775327,
"loss": 1.4449,
"step": 57
},
{
"epoch": 0.18545163868904876,
"grad_norm": 0.23700568079948425,
"learning_rate": 0.00028534206651878777,
"loss": 1.5582,
"step": 58
},
{
"epoch": 0.18864908073541167,
"grad_norm": 0.2555181384086609,
"learning_rate": 0.0002846480563414768,
"loss": 1.5682,
"step": 59
},
{
"epoch": 0.19184652278177458,
"grad_norm": 0.18711774051189423,
"learning_rate": 0.0002839388787224353,
"loss": 1.5051,
"step": 60
},
{
"epoch": 0.1950439648281375,
"grad_norm": 0.2022084891796112,
"learning_rate": 0.00028321461354703604,
"loss": 1.4694,
"step": 61
},
{
"epoch": 0.1982414068745004,
"grad_norm": 0.1778743863105774,
"learning_rate": 0.0002824753424001914,
"loss": 1.3847,
"step": 62
},
{
"epoch": 0.2014388489208633,
"grad_norm": 0.1981406807899475,
"learning_rate": 0.0002817211485571623,
"loss": 1.3561,
"step": 63
},
{
"epoch": 0.20463629096722621,
"grad_norm": 0.19981688261032104,
"learning_rate": 0.0002809521169741782,
"loss": 1.4506,
"step": 64
},
{
"epoch": 0.20783373301358912,
"grad_norm": 0.20264656841754913,
"learning_rate": 0.0002801683342788671,
"loss": 1.5316,
"step": 65
},
{
"epoch": 0.21103117505995203,
"grad_norm": 0.18628135323524475,
"learning_rate": 0.000279369888760497,
"loss": 1.4879,
"step": 66
},
{
"epoch": 0.21422861710631494,
"grad_norm": 0.2130441665649414,
"learning_rate": 0.00027855687036003134,
"loss": 1.6192,
"step": 67
},
{
"epoch": 0.21742605915267785,
"grad_norm": 0.19949516654014587,
"learning_rate": 0.00027772937065999667,
"loss": 1.4773,
"step": 68
},
{
"epoch": 0.22062350119904076,
"grad_norm": 0.20962868630886078,
"learning_rate": 0.0002768874828741669,
"loss": 1.4617,
"step": 69
},
{
"epoch": 0.22382094324540366,
"grad_norm": 0.21659812331199646,
"learning_rate": 0.00027603130183706314,
"loss": 1.5065,
"step": 70
},
{
"epoch": 0.2270183852917666,
"grad_norm": 0.19917699694633484,
"learning_rate": 0.00027516092399327094,
"loss": 1.6265,
"step": 71
},
{
"epoch": 0.2302158273381295,
"grad_norm": 0.20580779016017914,
"learning_rate": 0.0002742764473865763,
"loss": 1.4508,
"step": 72
},
{
"epoch": 0.23341326938449242,
"grad_norm": 0.20578929781913757,
"learning_rate": 0.0002733779716489217,
"loss": 1.5362,
"step": 73
},
{
"epoch": 0.23661071143085532,
"grad_norm": 0.21730633080005646,
"learning_rate": 0.0002724655979891828,
"loss": 1.4373,
"step": 74
},
{
"epoch": 0.23980815347721823,
"grad_norm": 0.21635404229164124,
"learning_rate": 0.000271539429181768,
"loss": 1.3639,
"step": 75
},
{
"epoch": 0.24300559552358114,
"grad_norm": 0.24112968146800995,
"learning_rate": 0.0002705995695550411,
"loss": 1.5238,
"step": 76
},
{
"epoch": 0.24620303756994405,
"grad_norm": 0.20409514009952545,
"learning_rate": 0.00026964612497956946,
"loss": 1.4533,
"step": 77
},
{
"epoch": 0.24940047961630696,
"grad_norm": 0.21514864265918732,
"learning_rate": 0.0002686792028561983,
"loss": 1.4657,
"step": 78
},
{
"epoch": 0.25259792166266987,
"grad_norm": 0.20796911418437958,
"learning_rate": 0.00026769891210395207,
"loss": 1.4834,
"step": 79
},
{
"epoch": 0.2557953637090328,
"grad_norm": 0.20425471663475037,
"learning_rate": 0.00026670536314776593,
"loss": 1.4799,
"step": 80
},
{
"epoch": 0.2589928057553957,
"grad_norm": 0.1899542212486267,
"learning_rate": 0.0002656986679060462,
"loss": 1.4862,
"step": 81
},
{
"epoch": 0.2621902478017586,
"grad_norm": 0.20222659409046173,
"learning_rate": 0.00026467893977806387,
"loss": 1.4788,
"step": 82
},
{
"epoch": 0.2653876898481215,
"grad_norm": 0.1941121220588684,
"learning_rate": 0.0002636462936311804,
"loss": 1.4913,
"step": 83
},
{
"epoch": 0.2685851318944844,
"grad_norm": 0.21576811373233795,
"learning_rate": 0.0002626008457879086,
"loss": 1.5327,
"step": 84
},
{
"epoch": 0.2717825739408473,
"grad_norm": 0.1937507688999176,
"learning_rate": 0.00026154271401280957,
"loss": 1.4609,
"step": 85
},
{
"epoch": 0.2749800159872102,
"grad_norm": 0.18996623158454895,
"learning_rate": 0.0002604720174992268,
"loss": 1.4023,
"step": 86
},
{
"epoch": 0.27817745803357313,
"grad_norm": 0.20716165006160736,
"learning_rate": 0.00025938887685585994,
"loss": 1.5351,
"step": 87
},
{
"epoch": 0.28137490007993604,
"grad_norm": 0.20239269733428955,
"learning_rate": 0.0002582934140931786,
"loss": 1.4851,
"step": 88
},
{
"epoch": 0.28457234212629895,
"grad_norm": 0.20915232598781586,
"learning_rate": 0.0002571857526096788,
"loss": 1.3798,
"step": 89
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.20972570776939392,
"learning_rate": 0.00025606601717798207,
"loss": 1.4097,
"step": 90
},
{
"epoch": 0.29096722621902477,
"grad_norm": 0.20584455132484436,
"learning_rate": 0.0002549343339307813,
"loss": 1.5279,
"step": 91
},
{
"epoch": 0.2941646682653877,
"grad_norm": 0.1897670328617096,
"learning_rate": 0.00025379083034663194,
"loss": 1.603,
"step": 92
},
{
"epoch": 0.2973621103117506,
"grad_norm": 0.19150228798389435,
"learning_rate": 0.000252635635235592,
"loss": 1.3939,
"step": 93
},
{
"epoch": 0.3005595523581135,
"grad_norm": 0.1970176249742508,
"learning_rate": 0.00025146887872471303,
"loss": 1.468,
"step": 94
},
{
"epoch": 0.3037569944044764,
"grad_norm": 0.19097474217414856,
"learning_rate": 0.000250290692243381,
"loss": 1.4303,
"step": 95
},
{
"epoch": 0.3069544364508393,
"grad_norm": 0.21538837254047394,
"learning_rate": 0.00024910120850851216,
"loss": 1.5775,
"step": 96
},
{
"epoch": 0.3101518784972022,
"grad_norm": 0.1855296939611435,
"learning_rate": 0.0002479005615096028,
"loss": 1.413,
"step": 97
},
{
"epoch": 0.3133493205435651,
"grad_norm": 0.23258726298809052,
"learning_rate": 0.00024668888649363583,
"loss": 1.5517,
"step": 98
},
{
"epoch": 0.31654676258992803,
"grad_norm": 0.19402435421943665,
"learning_rate": 0.0002454663199498463,
"loss": 1.3835,
"step": 99
},
{
"epoch": 0.31974420463629094,
"grad_norm": 0.1976032257080078,
"learning_rate": 0.00024423299959434636,
"loss": 1.4637,
"step": 100
},
{
"epoch": 0.3229416466826539,
"grad_norm": 0.19951173663139343,
"learning_rate": 0.0002429890643546119,
"loss": 1.3731,
"step": 101
},
{
"epoch": 0.3261390887290168,
"grad_norm": 0.20681437849998474,
"learning_rate": 0.0002417346543538337,
"loss": 1.4865,
"step": 102
},
{
"epoch": 0.3293365307753797,
"grad_norm": 0.36958593130111694,
"learning_rate": 0.00024046991089513267,
"loss": 1.4612,
"step": 103
},
{
"epoch": 0.33253397282174263,
"grad_norm": 0.20621562004089355,
"learning_rate": 0.00023919497644564298,
"loss": 1.357,
"step": 104
},
{
"epoch": 0.33573141486810554,
"grad_norm": 0.18956023454666138,
"learning_rate": 0.00023790999462046394,
"loss": 1.6554,
"step": 105
},
{
"epoch": 0.33892885691446845,
"grad_norm": 0.2084682583808899,
"learning_rate": 0.0002366151101664822,
"loss": 1.4853,
"step": 106
},
{
"epoch": 0.34212629896083135,
"grad_norm": 0.17509467899799347,
"learning_rate": 0.00023531046894606703,
"loss": 1.4028,
"step": 107
},
{
"epoch": 0.34532374100719426,
"grad_norm": 0.19247236847877502,
"learning_rate": 0.00023399621792063928,
"loss": 1.4353,
"step": 108
},
{
"epoch": 0.34852118305355717,
"grad_norm": 0.19204045832157135,
"learning_rate": 0.00023267250513411733,
"loss": 1.3393,
"step": 109
},
{
"epoch": 0.3517186250999201,
"grad_norm": 0.20329782366752625,
"learning_rate": 0.00023133947969624028,
"loss": 1.6107,
"step": 110
},
{
"epoch": 0.354916067146283,
"grad_norm": 0.2169138640165329,
"learning_rate": 0.00022999729176577163,
"loss": 1.4617,
"step": 111
},
{
"epoch": 0.3581135091926459,
"grad_norm": 0.22543761134147644,
"learning_rate": 0.00022864609253358474,
"loss": 1.4731,
"step": 112
},
{
"epoch": 0.3613109512390088,
"grad_norm": 0.19519487023353577,
"learning_rate": 0.00022728603420563175,
"loss": 1.597,
"step": 113
},
{
"epoch": 0.3645083932853717,
"grad_norm": 0.20843897759914398,
"learning_rate": 0.00022591726998579843,
"loss": 1.4963,
"step": 114
},
{
"epoch": 0.3677058353317346,
"grad_norm": 0.2149285078048706,
"learning_rate": 0.00022453995405864638,
"loss": 1.5095,
"step": 115
},
{
"epoch": 0.37090327737809753,
"grad_norm": 0.19521689414978027,
"learning_rate": 0.00022315424157204518,
"loss": 1.5709,
"step": 116
},
{
"epoch": 0.37410071942446044,
"grad_norm": 0.19614940881729126,
"learning_rate": 0.00022176028861969535,
"loss": 1.4573,
"step": 117
},
{
"epoch": 0.37729816147082335,
"grad_norm": 0.1948356330394745,
"learning_rate": 0.00022035825222354552,
"loss": 1.309,
"step": 118
},
{
"epoch": 0.38049560351718625,
"grad_norm": 0.20020437240600586,
"learning_rate": 0.00021894829031610452,
"loss": 1.5289,
"step": 119
},
{
"epoch": 0.38369304556354916,
"grad_norm": 0.20084881782531738,
"learning_rate": 0.00021753056172265096,
"loss": 1.5456,
"step": 120
},
{
"epoch": 0.38689048760991207,
"grad_norm": 0.17715269327163696,
"learning_rate": 0.00021610522614334265,
"loss": 1.4322,
"step": 121
},
{
"epoch": 0.390087929656275,
"grad_norm": 0.2064034342765808,
"learning_rate": 0.00021467244413522673,
"loss": 1.5772,
"step": 122
},
{
"epoch": 0.3932853717026379,
"grad_norm": 0.19036740064620972,
"learning_rate": 0.00021323237709415413,
"loss": 1.5086,
"step": 123
},
{
"epoch": 0.3964828137490008,
"grad_norm": 0.19214606285095215,
"learning_rate": 0.0002117851872365989,
"loss": 1.5296,
"step": 124
},
{
"epoch": 0.3996802557953637,
"grad_norm": 0.20223727822303772,
"learning_rate": 0.00021033103758138529,
"loss": 1.5354,
"step": 125
},
{
"epoch": 0.4028776978417266,
"grad_norm": 0.18433460593223572,
"learning_rate": 0.00020887009193132456,
"loss": 1.532,
"step": 126
},
{
"epoch": 0.4060751398880895,
"grad_norm": 0.18365609645843506,
"learning_rate": 0.00020740251485476345,
"loss": 1.3326,
"step": 127
},
{
"epoch": 0.40927258193445243,
"grad_norm": 0.19547204673290253,
"learning_rate": 0.0002059284716670463,
"loss": 1.4566,
"step": 128
},
{
"epoch": 0.41247002398081534,
"grad_norm": 0.2268918752670288,
"learning_rate": 0.00020444812841189294,
"loss": 1.6165,
"step": 129
},
{
"epoch": 0.41566746602717825,
"grad_norm": 0.21848422288894653,
"learning_rate": 0.0002029616518426951,
"loss": 1.6039,
"step": 130
},
{
"epoch": 0.41886490807354115,
"grad_norm": 0.19918426871299744,
"learning_rate": 0.00020146920940373195,
"loss": 1.4602,
"step": 131
},
{
"epoch": 0.42206235011990406,
"grad_norm": 0.18590374290943146,
"learning_rate": 0.00019997096921130862,
"loss": 1.2925,
"step": 132
},
{
"epoch": 0.42525979216626697,
"grad_norm": 0.19987183809280396,
"learning_rate": 0.00019846710003481875,
"loss": 1.4157,
"step": 133
},
{
"epoch": 0.4284572342126299,
"grad_norm": 0.20987945795059204,
"learning_rate": 0.00019695777127773332,
"loss": 1.4424,
"step": 134
},
{
"epoch": 0.4316546762589928,
"grad_norm": 0.21076463162899017,
"learning_rate": 0.00019544315295851825,
"loss": 1.4946,
"step": 135
},
{
"epoch": 0.4348521183053557,
"grad_norm": 0.20848603546619415,
"learning_rate": 0.00019392341569148252,
"loss": 1.4393,
"step": 136
},
{
"epoch": 0.4380495603517186,
"grad_norm": 0.21943925321102142,
"learning_rate": 0.00019239873066755964,
"loss": 1.6161,
"step": 137
},
{
"epoch": 0.4412470023980815,
"grad_norm": 0.23087991774082184,
"learning_rate": 0.0001908692696350234,
"loss": 1.3502,
"step": 138
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.20302651822566986,
"learning_rate": 0.00018933520488014166,
"loss": 1.3896,
"step": 139
},
{
"epoch": 0.44764188649080733,
"grad_norm": 0.19597011804580688,
"learning_rate": 0.00018779670920776877,
"loss": 1.4437,
"step": 140
},
{
"epoch": 0.45083932853717024,
"grad_norm": 0.21784569323062897,
"learning_rate": 0.00018625395592188036,
"loss": 1.5956,
"step": 141
},
{
"epoch": 0.4540367705835332,
"grad_norm": 0.20360009372234344,
"learning_rate": 0.00018470711880605122,
"loss": 1.2507,
"step": 142
},
{
"epoch": 0.4572342126298961,
"grad_norm": 0.1850934773683548,
"learning_rate": 0.00018315637210387947,
"loss": 1.477,
"step": 143
},
{
"epoch": 0.460431654676259,
"grad_norm": 0.22538472712039948,
"learning_rate": 0.00018160189049935892,
"loss": 1.3688,
"step": 144
},
{
"epoch": 0.4636290967226219,
"grad_norm": 0.2093997299671173,
"learning_rate": 0.00018004384909720188,
"loss": 1.3953,
"step": 145
},
{
"epoch": 0.46682653876898483,
"grad_norm": 0.19743283092975616,
"learning_rate": 0.00017848242340311424,
"loss": 1.5111,
"step": 146
},
{
"epoch": 0.47002398081534774,
"grad_norm": 0.23592239618301392,
"learning_rate": 0.0001769177893040258,
"loss": 1.4628,
"step": 147
},
{
"epoch": 0.47322142286171065,
"grad_norm": 0.2107086479663849,
"learning_rate": 0.00017535012304827736,
"loss": 1.345,
"step": 148
},
{
"epoch": 0.47641886490807356,
"grad_norm": 0.212343230843544,
"learning_rate": 0.00017377960122576732,
"loss": 1.4294,
"step": 149
},
{
"epoch": 0.47961630695443647,
"grad_norm": 0.280923455953598,
"learning_rate": 0.0001722064007480597,
"loss": 1.6237,
"step": 150
},
{
"epoch": 0.4828137490007994,
"grad_norm": 0.19629351794719696,
"learning_rate": 0.00017063069882845575,
"loss": 1.439,
"step": 151
},
{
"epoch": 0.4860111910471623,
"grad_norm": 0.2047591209411621,
"learning_rate": 0.0001690526729620318,
"loss": 1.3626,
"step": 152
},
{
"epoch": 0.4892086330935252,
"grad_norm": 0.18259218335151672,
"learning_rate": 0.00016747250090564557,
"loss": 1.3234,
"step": 153
},
{
"epoch": 0.4924060751398881,
"grad_norm": 0.20569853484630585,
"learning_rate": 0.00016589036065791242,
"loss": 1.4376,
"step": 154
},
{
"epoch": 0.495603517186251,
"grad_norm": 0.18437625467777252,
"learning_rate": 0.0001643064304391547,
"loss": 1.4705,
"step": 155
},
{
"epoch": 0.4988009592326139,
"grad_norm": 0.22610221803188324,
"learning_rate": 0.00016272088867132637,
"loss": 1.3045,
"step": 156
},
{
"epoch": 0.5019984012789768,
"grad_norm": 0.197098046541214,
"learning_rate": 0.00016113391395791436,
"loss": 1.531,
"step": 157
},
{
"epoch": 0.5051958433253397,
"grad_norm": 0.2230396866798401,
"learning_rate": 0.00015954568506381994,
"loss": 1.5164,
"step": 158
},
{
"epoch": 0.5083932853717026,
"grad_norm": 0.19642704725265503,
"learning_rate": 0.0001579563808952216,
"loss": 1.4442,
"step": 159
},
{
"epoch": 0.5115907274180655,
"grad_norm": 0.21066069602966309,
"learning_rate": 0.00015636618047942222,
"loss": 1.4251,
"step": 160
},
{
"epoch": 0.5147881694644284,
"grad_norm": 0.18799303472042084,
"learning_rate": 0.0001547752629446827,
"loss": 1.3866,
"step": 161
},
{
"epoch": 0.5179856115107914,
"grad_norm": 0.20167718827724457,
"learning_rate": 0.00015318380750004352,
"loss": 1.471,
"step": 162
},
{
"epoch": 0.5211830535571543,
"grad_norm": 0.20787064731121063,
"learning_rate": 0.00015159199341513845,
"loss": 1.5312,
"step": 163
},
{
"epoch": 0.5243804956035172,
"grad_norm": 0.19502943754196167,
"learning_rate": 0.00015,
"loss": 1.5153,
"step": 164
},
{
"epoch": 0.5275779376498801,
"grad_norm": 0.18463830649852753,
"learning_rate": 0.00014840800658486158,
"loss": 1.62,
"step": 165
},
{
"epoch": 0.530775379696243,
"grad_norm": 0.20096978545188904,
"learning_rate": 0.00014681619249995646,
"loss": 1.3816,
"step": 166
},
{
"epoch": 0.533972821742606,
"grad_norm": 0.20995350182056427,
"learning_rate": 0.00014522473705531736,
"loss": 1.4321,
"step": 167
},
{
"epoch": 0.5371702637889688,
"grad_norm": 0.1865735948085785,
"learning_rate": 0.00014363381952057778,
"loss": 1.4262,
"step": 168
},
{
"epoch": 0.5403677058353318,
"grad_norm": 0.1792657971382141,
"learning_rate": 0.00014204361910477844,
"loss": 1.5558,
"step": 169
},
{
"epoch": 0.5435651478816946,
"grad_norm": 0.2027653157711029,
"learning_rate": 0.00014045431493618003,
"loss": 1.3377,
"step": 170
},
{
"epoch": 0.5467625899280576,
"grad_norm": 0.19514119625091553,
"learning_rate": 0.0001388660860420856,
"loss": 1.3874,
"step": 171
},
{
"epoch": 0.5499600319744204,
"grad_norm": 0.17817656695842743,
"learning_rate": 0.00013727911132867365,
"loss": 1.3716,
"step": 172
},
{
"epoch": 0.5531574740207834,
"grad_norm": 0.23043349385261536,
"learning_rate": 0.00013569356956084528,
"loss": 1.464,
"step": 173
},
{
"epoch": 0.5563549160671463,
"grad_norm": 0.19135528802871704,
"learning_rate": 0.00013410963934208759,
"loss": 1.3154,
"step": 174
},
{
"epoch": 0.5595523581135092,
"grad_norm": 0.20745159685611725,
"learning_rate": 0.0001325274990943544,
"loss": 1.4785,
"step": 175
},
{
"epoch": 0.5627498001598721,
"grad_norm": 0.20532263815402985,
"learning_rate": 0.00013094732703796818,
"loss": 1.5137,
"step": 176
},
{
"epoch": 0.565947242206235,
"grad_norm": 0.21446797251701355,
"learning_rate": 0.00012936930117154425,
"loss": 1.3701,
"step": 177
},
{
"epoch": 0.5691446842525979,
"grad_norm": 0.19260822236537933,
"learning_rate": 0.0001277935992519403,
"loss": 1.4443,
"step": 178
},
{
"epoch": 0.5723421262989609,
"grad_norm": 0.19996041059494019,
"learning_rate": 0.00012622039877423265,
"loss": 1.371,
"step": 179
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.19244007766246796,
"learning_rate": 0.00012464987695172264,
"loss": 1.3142,
"step": 180
},
{
"epoch": 0.5787370103916867,
"grad_norm": 0.19164302945137024,
"learning_rate": 0.00012308221069597418,
"loss": 1.4773,
"step": 181
},
{
"epoch": 0.5819344524380495,
"grad_norm": 0.20002460479736328,
"learning_rate": 0.00012151757659688571,
"loss": 1.4264,
"step": 182
},
{
"epoch": 0.5851318944844125,
"grad_norm": 0.21552026271820068,
"learning_rate": 0.00011995615090279813,
"loss": 1.4049,
"step": 183
},
{
"epoch": 0.5883293365307753,
"grad_norm": 0.19300565123558044,
"learning_rate": 0.00011839810950064109,
"loss": 1.3554,
"step": 184
},
{
"epoch": 0.5915267785771383,
"grad_norm": 0.19941386580467224,
"learning_rate": 0.00011684362789612053,
"loss": 1.5601,
"step": 185
},
{
"epoch": 0.5947242206235012,
"grad_norm": 0.18221646547317505,
"learning_rate": 0.00011529288119394878,
"loss": 1.4828,
"step": 186
},
{
"epoch": 0.5979216626698641,
"grad_norm": 0.1901618093252182,
"learning_rate": 0.00011374604407811962,
"loss": 1.5442,
"step": 187
},
{
"epoch": 0.601119104716227,
"grad_norm": 0.17420263588428497,
"learning_rate": 0.00011220329079223123,
"loss": 1.285,
"step": 188
},
{
"epoch": 0.60431654676259,
"grad_norm": 0.23658356070518494,
"learning_rate": 0.00011066479511985838,
"loss": 1.2485,
"step": 189
},
{
"epoch": 0.6075139888089528,
"grad_norm": 0.20968788862228394,
"learning_rate": 0.00010913073036497658,
"loss": 1.3972,
"step": 190
},
{
"epoch": 0.6107114308553158,
"grad_norm": 0.2030273675918579,
"learning_rate": 0.00010760126933244036,
"loss": 1.6353,
"step": 191
},
{
"epoch": 0.6139088729016786,
"grad_norm": 0.1902075558900833,
"learning_rate": 0.00010607658430851744,
"loss": 1.2809,
"step": 192
},
{
"epoch": 0.6171063149480416,
"grad_norm": 0.20934785902500153,
"learning_rate": 0.00010455684704148173,
"loss": 1.3585,
"step": 193
},
{
"epoch": 0.6203037569944044,
"grad_norm": 0.2173265963792801,
"learning_rate": 0.00010304222872226668,
"loss": 1.2973,
"step": 194
},
{
"epoch": 0.6235011990407674,
"grad_norm": 0.19533811509609222,
"learning_rate": 0.00010153289996518125,
"loss": 1.4299,
"step": 195
},
{
"epoch": 0.6266986410871302,
"grad_norm": 0.2015613615512848,
"learning_rate": 0.00010002903078869135,
"loss": 1.4279,
"step": 196
},
{
"epoch": 0.6298960831334932,
"grad_norm": 0.20218639075756073,
"learning_rate": 9.853079059626805e-05,
"loss": 1.3212,
"step": 197
},
{
"epoch": 0.6330935251798561,
"grad_norm": 0.1902882307767868,
"learning_rate": 9.703834815730487e-05,
"loss": 1.3939,
"step": 198
},
{
"epoch": 0.636290967226219,
"grad_norm": 0.18366214632987976,
"learning_rate": 9.555187158810702e-05,
"loss": 1.4403,
"step": 199
},
{
"epoch": 0.6394884092725819,
"grad_norm": 0.1821315586566925,
"learning_rate": 9.407152833295372e-05,
"loss": 1.372,
"step": 200
},
{
"epoch": 0.6426858513189448,
"grad_norm": 0.20973654091358185,
"learning_rate": 9.259748514523653e-05,
"loss": 1.4149,
"step": 201
},
{
"epoch": 0.6458832933653078,
"grad_norm": 0.18254290521144867,
"learning_rate": 9.112990806867543e-05,
"loss": 1.3052,
"step": 202
},
{
"epoch": 0.6490807354116707,
"grad_norm": 0.18717211484909058,
"learning_rate": 8.966896241861473e-05,
"loss": 1.4061,
"step": 203
},
{
"epoch": 0.6522781774580336,
"grad_norm": 0.17621521651744843,
"learning_rate": 8.821481276340112e-05,
"loss": 1.6093,
"step": 204
},
{
"epoch": 0.6554756195043965,
"grad_norm": 0.1912049949169159,
"learning_rate": 8.676762290584585e-05,
"loss": 1.353,
"step": 205
},
{
"epoch": 0.6586730615507594,
"grad_norm": 0.2157009094953537,
"learning_rate": 8.532755586477324e-05,
"loss": 1.4063,
"step": 206
},
{
"epoch": 0.6618705035971223,
"grad_norm": 0.18072722852230072,
"learning_rate": 8.389477385665732e-05,
"loss": 1.5591,
"step": 207
},
{
"epoch": 0.6650679456434853,
"grad_norm": 0.22034448385238647,
"learning_rate": 8.246943827734897e-05,
"loss": 1.4766,
"step": 208
},
{
"epoch": 0.6682653876898481,
"grad_norm": 0.21938645839691162,
"learning_rate": 8.105170968389552e-05,
"loss": 1.3791,
"step": 209
},
{
"epoch": 0.6714628297362111,
"grad_norm": 0.19702577590942383,
"learning_rate": 7.964174777645448e-05,
"loss": 1.5582,
"step": 210
},
{
"epoch": 0.6746602717825739,
"grad_norm": 0.20586428046226501,
"learning_rate": 7.823971138030466e-05,
"loss": 1.4005,
"step": 211
},
{
"epoch": 0.6778577138289369,
"grad_norm": 0.1924622356891632,
"learning_rate": 7.684575842795485e-05,
"loss": 1.4078,
"step": 212
},
{
"epoch": 0.6810551558752997,
"grad_norm": 0.1937723606824875,
"learning_rate": 7.546004594135356e-05,
"loss": 1.2821,
"step": 213
},
{
"epoch": 0.6842525979216627,
"grad_norm": 0.22969581186771393,
"learning_rate": 7.408273001420153e-05,
"loss": 1.2398,
"step": 214
},
{
"epoch": 0.6874500399680256,
"grad_norm": 0.19231727719306946,
"learning_rate": 7.271396579436825e-05,
"loss": 1.3752,
"step": 215
},
{
"epoch": 0.6906474820143885,
"grad_norm": 0.20469219982624054,
"learning_rate": 7.135390746641526e-05,
"loss": 1.352,
"step": 216
},
{
"epoch": 0.6938449240607514,
"grad_norm": 0.19728676974773407,
"learning_rate": 7.000270823422837e-05,
"loss": 1.5623,
"step": 217
},
{
"epoch": 0.6970423661071143,
"grad_norm": 0.22052626311779022,
"learning_rate": 6.866052030375974e-05,
"loss": 1.4183,
"step": 218
},
{
"epoch": 0.7002398081534772,
"grad_norm": 0.19779476523399353,
"learning_rate": 6.732749486588266e-05,
"loss": 1.4014,
"step": 219
},
{
"epoch": 0.7034372501998402,
"grad_norm": 0.1978594809770584,
"learning_rate": 6.600378207936069e-05,
"loss": 1.4317,
"step": 220
},
{
"epoch": 0.706634692246203,
"grad_norm": 0.2020850032567978,
"learning_rate": 6.468953105393297e-05,
"loss": 1.4208,
"step": 221
},
{
"epoch": 0.709832134292566,
"grad_norm": 0.18292494118213654,
"learning_rate": 6.338488983351777e-05,
"loss": 1.3283,
"step": 222
},
{
"epoch": 0.7130295763389288,
"grad_norm": 0.2223280966281891,
"learning_rate": 6.209000537953605e-05,
"loss": 1.4245,
"step": 223
},
{
"epoch": 0.7162270183852918,
"grad_norm": 0.22692078351974487,
"learning_rate": 6.080502355435701e-05,
"loss": 1.5982,
"step": 224
},
{
"epoch": 0.7194244604316546,
"grad_norm": 0.19702717661857605,
"learning_rate": 5.9530089104867386e-05,
"loss": 1.3909,
"step": 225
},
{
"epoch": 0.7226219024780176,
"grad_norm": 0.22220925986766815,
"learning_rate": 5.826534564616633e-05,
"loss": 1.4322,
"step": 226
},
{
"epoch": 0.7258193445243805,
"grad_norm": 0.20837551355361938,
"learning_rate": 5.701093564538806e-05,
"loss": 1.3919,
"step": 227
},
{
"epoch": 0.7290167865707434,
"grad_norm": 0.1905641108751297,
"learning_rate": 5.5767000405653636e-05,
"loss": 1.446,
"step": 228
},
{
"epoch": 0.7322142286171063,
"grad_norm": 0.20399922132492065,
"learning_rate": 5.453368005015363e-05,
"loss": 1.3922,
"step": 229
},
{
"epoch": 0.7354116706634692,
"grad_norm": 0.19176483154296875,
"learning_rate": 5.3311113506364116e-05,
"loss": 1.3255,
"step": 230
},
{
"epoch": 0.7386091127098321,
"grad_norm": 0.21297192573547363,
"learning_rate": 5.209943849039722e-05,
"loss": 1.3992,
"step": 231
},
{
"epoch": 0.7418065547561951,
"grad_norm": 0.20219087600708008,
"learning_rate": 5.089879149148781e-05,
"loss": 1.5462,
"step": 232
},
{
"epoch": 0.7450039968025579,
"grad_norm": 0.1977456510066986,
"learning_rate": 4.9709307756618985e-05,
"loss": 1.4046,
"step": 233
},
{
"epoch": 0.7482014388489209,
"grad_norm": 0.22329548001289368,
"learning_rate": 4.853112127528698e-05,
"loss": 1.5767,
"step": 234
},
{
"epoch": 0.7513988808952837,
"grad_norm": 0.20563232898712158,
"learning_rate": 4.736436476440791e-05,
"loss": 1.6348,
"step": 235
},
{
"epoch": 0.7545963229416467,
"grad_norm": 0.19388997554779053,
"learning_rate": 4.6209169653368086e-05,
"loss": 1.364,
"step": 236
},
{
"epoch": 0.7577937649880095,
"grad_norm": 0.2103840559720993,
"learning_rate": 4.506566606921864e-05,
"loss": 1.4538,
"step": 237
},
{
"epoch": 0.7609912070343725,
"grad_norm": 0.17306749522686005,
"learning_rate": 4.3933982822017876e-05,
"loss": 1.4435,
"step": 238
},
{
"epoch": 0.7641886490807354,
"grad_norm": 0.20918579399585724,
"learning_rate": 4.2814247390321215e-05,
"loss": 1.2357,
"step": 239
},
{
"epoch": 0.7673860911270983,
"grad_norm": 0.21173876523971558,
"learning_rate": 4.1706585906821334e-05,
"loss": 1.2602,
"step": 240
},
{
"epoch": 0.7705835331734612,
"grad_norm": 0.19886651635169983,
"learning_rate": 4.0611123144140075e-05,
"loss": 1.4166,
"step": 241
},
{
"epoch": 0.7737809752198241,
"grad_norm": 0.19375504553318024,
"learning_rate": 3.952798250077317e-05,
"loss": 1.3777,
"step": 242
},
{
"epoch": 0.7769784172661871,
"grad_norm": 0.20145930349826813,
"learning_rate": 3.84572859871904e-05,
"loss": 1.3258,
"step": 243
},
{
"epoch": 0.78017585931255,
"grad_norm": 0.2076532244682312,
"learning_rate": 3.739915421209133e-05,
"loss": 1.3921,
"step": 244
},
{
"epoch": 0.7833733013589129,
"grad_norm": 0.19265635311603546,
"learning_rate": 3.635370636881958e-05,
"loss": 1.4043,
"step": 245
},
{
"epoch": 0.7865707434052758,
"grad_norm": 0.19883492588996887,
"learning_rate": 3.532106022193615e-05,
"loss": 1.346,
"step": 246
},
{
"epoch": 0.7897681854516387,
"grad_norm": 0.18948738276958466,
"learning_rate": 3.4301332093953807e-05,
"loss": 1.4363,
"step": 247
},
{
"epoch": 0.7929656274980016,
"grad_norm": 0.18976429104804993,
"learning_rate": 3.3294636852234105e-05,
"loss": 1.4316,
"step": 248
},
{
"epoch": 0.7961630695443646,
"grad_norm": 0.202013298869133,
"learning_rate": 3.230108789604792e-05,
"loss": 1.4532,
"step": 249
},
{
"epoch": 0.7993605115907274,
"grad_norm": 0.2116522341966629,
"learning_rate": 3.132079714380171e-05,
"loss": 1.5129,
"step": 250
},
{
"epoch": 0.8025579536370904,
"grad_norm": 0.19418169558048248,
"learning_rate": 3.035387502043052e-05,
"loss": 1.3265,
"step": 251
},
{
"epoch": 0.8057553956834532,
"grad_norm": 0.21084119379520416,
"learning_rate": 2.9400430444958932e-05,
"loss": 1.3929,
"step": 252
},
{
"epoch": 0.8089528377298162,
"grad_norm": 0.23588140308856964,
"learning_rate": 2.846057081823201e-05,
"loss": 1.2077,
"step": 253
},
{
"epoch": 0.812150279776179,
"grad_norm": 0.21185244619846344,
"learning_rate": 2.7534402010817157e-05,
"loss": 1.2874,
"step": 254
},
{
"epoch": 0.815347721822542,
"grad_norm": 0.184846431016922,
"learning_rate": 2.6622028351078277e-05,
"loss": 1.4785,
"step": 255
},
{
"epoch": 0.8185451638689049,
"grad_norm": 0.1995445042848587,
"learning_rate": 2.5723552613423687e-05,
"loss": 1.4153,
"step": 256
},
{
"epoch": 0.8217426059152678,
"grad_norm": 0.20493745803833008,
"learning_rate": 2.4839076006729082e-05,
"loss": 1.448,
"step": 257
},
{
"epoch": 0.8249400479616307,
"grad_norm": 0.1989341676235199,
"learning_rate": 2.3968698162936854e-05,
"loss": 1.4733,
"step": 258
},
{
"epoch": 0.8281374900079936,
"grad_norm": 0.20579148828983307,
"learning_rate": 2.311251712583307e-05,
"loss": 1.4746,
"step": 259
},
{
"epoch": 0.8313349320543565,
"grad_norm": 0.2025279700756073,
"learning_rate": 2.2270629340003303e-05,
"loss": 1.6248,
"step": 260
},
{
"epoch": 0.8345323741007195,
"grad_norm": 0.17980627715587616,
"learning_rate": 2.1443129639968615e-05,
"loss": 1.3753,
"step": 261
},
{
"epoch": 0.8377298161470823,
"grad_norm": 0.21116185188293457,
"learning_rate": 2.063011123950295e-05,
"loss": 1.2975,
"step": 262
},
{
"epoch": 0.8409272581934453,
"grad_norm": 0.20071591436862946,
"learning_rate": 1.9831665721132954e-05,
"loss": 1.444,
"step": 263
},
{
"epoch": 0.8441247002398081,
"grad_norm": 0.19569140672683716,
"learning_rate": 1.9047883025821774e-05,
"loss": 1.5126,
"step": 264
},
{
"epoch": 0.8473221422861711,
"grad_norm": 0.19419822096824646,
"learning_rate": 1.827885144283769e-05,
"loss": 1.3867,
"step": 265
},
{
"epoch": 0.8505195843325339,
"grad_norm": 0.19556277990341187,
"learning_rate": 1.75246575998086e-05,
"loss": 1.3758,
"step": 266
},
{
"epoch": 0.8537170263788969,
"grad_norm": 0.20848549902439117,
"learning_rate": 1.678538645296391e-05,
"loss": 1.4835,
"step": 267
},
{
"epoch": 0.8569144684252598,
"grad_norm": 0.19634144008159637,
"learning_rate": 1.6061121277564743e-05,
"loss": 1.4624,
"step": 268
},
{
"epoch": 0.8601119104716227,
"grad_norm": 0.19600766897201538,
"learning_rate": 1.535194365852315e-05,
"loss": 1.2323,
"step": 269
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.21323877573013306,
"learning_rate": 1.4657933481212242e-05,
"loss": 1.5224,
"step": 270
},
{
"epoch": 0.8665067945643485,
"grad_norm": 0.18555647134780884,
"learning_rate": 1.3979168922467298e-05,
"loss": 1.3663,
"step": 271
},
{
"epoch": 0.8697042366107114,
"grad_norm": 0.19477520883083344,
"learning_rate": 1.3315726441779629e-05,
"loss": 1.4892,
"step": 272
},
{
"epoch": 0.8729016786570744,
"grad_norm": 0.19639001786708832,
"learning_rate": 1.2667680772683825e-05,
"loss": 1.2377,
"step": 273
},
{
"epoch": 0.8760991207034372,
"grad_norm": 0.21710480749607086,
"learning_rate": 1.2035104914339188e-05,
"loss": 1.3991,
"step": 274
},
{
"epoch": 0.8792965627498002,
"grad_norm": 0.21137666702270508,
"learning_rate": 1.1418070123306989e-05,
"loss": 1.5307,
"step": 275
},
{
"epoch": 0.882494004796163,
"grad_norm": 0.19870568811893463,
"learning_rate": 1.0816645905523597e-05,
"loss": 1.341,
"step": 276
},
{
"epoch": 0.885691446842526,
"grad_norm": 0.2340983897447586,
"learning_rate": 1.0230900008471072e-05,
"loss": 1.3578,
"step": 277
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.18889744579792023,
"learning_rate": 9.660898413545692e-06,
"loss": 1.4085,
"step": 278
},
{
"epoch": 0.8920863309352518,
"grad_norm": 0.213284432888031,
"learning_rate": 9.106705328625408e-06,
"loss": 1.3843,
"step": 279
},
{
"epoch": 0.8952837729816147,
"grad_norm": 0.2060411274433136,
"learning_rate": 8.568383180837368e-06,
"loss": 1.473,
"step": 280
},
{
"epoch": 0.8984812150279776,
"grad_norm": 0.18018406629562378,
"learning_rate": 8.04599260952557e-06,
"loss": 1.3782,
"step": 281
},
{
"epoch": 0.9016786570743405,
"grad_norm": 0.18678754568099976,
"learning_rate": 7.539592459420219e-06,
"loss": 1.4252,
"step": 282
},
{
"epoch": 0.9048760991207034,
"grad_norm": 0.2027515172958374,
"learning_rate": 7.049239774009213e-06,
"loss": 1.3717,
"step": 283
},
{
"epoch": 0.9080735411670664,
"grad_norm": 0.20960167050361633,
"learning_rate": 6.574989789112372e-06,
"loss": 1.2815,
"step": 284
},
{
"epoch": 0.9112709832134293,
"grad_norm": 0.19627049565315247,
"learning_rate": 6.11689592665951e-06,
"loss": 1.4348,
"step": 285
},
{
"epoch": 0.9144684252597922,
"grad_norm": 0.20119017362594604,
"learning_rate": 5.675009788672596e-06,
"loss": 1.3343,
"step": 286
},
{
"epoch": 0.9176658673061551,
"grad_norm": 0.18706481158733368,
"learning_rate": 5.2493811514531635e-06,
"loss": 1.3721,
"step": 287
},
{
"epoch": 0.920863309352518,
"grad_norm": 0.19794286787509918,
"learning_rate": 4.840057959975169e-06,
"loss": 1.3626,
"step": 288
},
{
"epoch": 0.9240607513988809,
"grad_norm": 0.1808895319700241,
"learning_rate": 4.44708632248425e-06,
"loss": 1.5342,
"step": 289
},
{
"epoch": 0.9272581934452439,
"grad_norm": 0.1820111721754074,
"learning_rate": 4.070510505303814e-06,
"loss": 1.4357,
"step": 290
},
{
"epoch": 0.9304556354916067,
"grad_norm": 0.1756613701581955,
"learning_rate": 3.710372927848776e-06,
"loss": 1.328,
"step": 291
},
{
"epoch": 0.9336530775379697,
"grad_norm": 0.19259536266326904,
"learning_rate": 3.366714157847078e-06,
"loss": 1.2882,
"step": 292
},
{
"epoch": 0.9368505195843325,
"grad_norm": 0.20220039784908295,
"learning_rate": 3.0395729067700324e-06,
"loss": 1.3903,
"step": 293
},
{
"epoch": 0.9400479616306955,
"grad_norm": 0.1991778463125229,
"learning_rate": 2.728986025471641e-06,
"loss": 1.3649,
"step": 294
},
{
"epoch": 0.9432454036770583,
"grad_norm": 0.20098921656608582,
"learning_rate": 2.4349885000374657e-06,
"loss": 1.4128,
"step": 295
},
{
"epoch": 0.9464428457234213,
"grad_norm": 0.18276216089725494,
"learning_rate": 2.1576134478437313e-06,
"loss": 1.3548,
"step": 296
},
{
"epoch": 0.9496402877697842,
"grad_norm": 0.21758389472961426,
"learning_rate": 1.8968921138267091e-06,
"loss": 1.4765,
"step": 297
},
{
"epoch": 0.9528377298161471,
"grad_norm": 0.18690507113933563,
"learning_rate": 1.6528538669631997e-06,
"loss": 1.5375,
"step": 298
},
{
"epoch": 0.95603517186251,
"grad_norm": 0.1706872582435608,
"learning_rate": 1.4255261969622456e-06,
"loss": 1.2775,
"step": 299
},
{
"epoch": 0.9592326139088729,
"grad_norm": 0.2452152669429779,
"learning_rate": 1.2149347111684749e-06,
"loss": 1.2828,
"step": 300
},
{
"epoch": 0.9624300559552358,
"grad_norm": 0.17894317209720612,
"learning_rate": 1.0211031316776919e-06,
"loss": 1.4131,
"step": 301
},
{
"epoch": 0.9656274980015987,
"grad_norm": 0.21982480585575104,
"learning_rate": 8.440532926646315e-07,
"loss": 1.3501,
"step": 302
},
{
"epoch": 0.9688249400479616,
"grad_norm": 0.19508808851242065,
"learning_rate": 6.838051379234099e-07,
"loss": 1.3474,
"step": 303
},
{
"epoch": 0.9720223820943246,
"grad_norm": 0.1852046549320221,
"learning_rate": 5.403767186210218e-07,
"loss": 1.3791,
"step": 304
},
{
"epoch": 0.9752198241406874,
"grad_norm": 0.18738119304180145,
"learning_rate": 4.137841912639328e-07,
"loss": 1.4893,
"step": 305
},
{
"epoch": 0.9784172661870504,
"grad_norm": 0.20034608244895935,
"learning_rate": 3.0404181587811994e-07,
"loss": 1.4388,
"step": 306
},
{
"epoch": 0.9816147082334132,
"grad_norm": 0.20295751094818115,
"learning_rate": 2.1116195440278872e-07,
"loss": 1.4804,
"step": 307
},
{
"epoch": 0.9848121502797762,
"grad_norm": 0.207365021109581,
"learning_rate": 1.3515506929778762e-07,
"loss": 1.4719,
"step": 308
},
{
"epoch": 0.988009592326139,
"grad_norm": 0.2223723828792572,
"learning_rate": 7.602972236513405e-08,
"loss": 1.3123,
"step": 309
},
{
"epoch": 0.991207034372502,
"grad_norm": 0.2046136111021042,
"learning_rate": 3.3792573784585665e-08,
"loss": 1.4272,
"step": 310
},
{
"epoch": 0.9944044764188649,
"grad_norm": 0.21449051797389984,
"learning_rate": 8.448381363307388e-09,
"loss": 1.3367,
"step": 311
},
{
"epoch": 0.9976019184652278,
"grad_norm": 0.21067871153354645,
"learning_rate": 0.0,
"loss": 1.4037,
"step": 312
}
],
"logging_steps": 1,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.138882997433958e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}