mnl-336 / trainer_state.json
iamPi's picture
Add files using upload-large-folder tool
75cbf5d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.06201171875,
"epoch": 0.005979073243647235,
"grad_norm": 9.5625,
"learning_rate": 0.0,
"loss": 1.38720703125,
"mean_token_accuracy": 0.6870120912790298,
"num_tokens": 589646.0,
"step": 1
},
{
"entropy": 1.0556640625,
"epoch": 0.01195814648729447,
"grad_norm": 8.6875,
"learning_rate": 1.818181818181818e-07,
"loss": 1.356689453125,
"mean_token_accuracy": 0.6927258595824242,
"num_tokens": 1179185.0,
"step": 2
},
{
"entropy": 1.0947265625,
"epoch": 0.017937219730941704,
"grad_norm": 8.875,
"learning_rate": 3.636363636363636e-07,
"loss": 1.38818359375,
"mean_token_accuracy": 0.6830958425998688,
"num_tokens": 1768801.0,
"step": 3
},
{
"entropy": 1.0830078125,
"epoch": 0.02391629297458894,
"grad_norm": 9.8125,
"learning_rate": 5.454545454545454e-07,
"loss": 1.408203125,
"mean_token_accuracy": 0.6821927055716515,
"num_tokens": 2355606.0,
"step": 4
},
{
"entropy": 1.0908203125,
"epoch": 0.029895366218236172,
"grad_norm": 9.375,
"learning_rate": 7.272727272727272e-07,
"loss": 1.382568359375,
"mean_token_accuracy": 0.6858013942837715,
"num_tokens": 2941851.0,
"step": 5
},
{
"entropy": 1.060546875,
"epoch": 0.03587443946188341,
"grad_norm": 8.25,
"learning_rate": 9.09090909090909e-07,
"loss": 1.3363037109375,
"mean_token_accuracy": 0.6952119246125221,
"num_tokens": 3531477.0,
"step": 6
},
{
"entropy": 1.060546875,
"epoch": 0.04185351270553064,
"grad_norm": 7.84375,
"learning_rate": 1.0909090909090908e-06,
"loss": 1.338134765625,
"mean_token_accuracy": 0.6950986832380295,
"num_tokens": 4121161.0,
"step": 7
},
{
"entropy": 1.06494140625,
"epoch": 0.04783258594917788,
"grad_norm": 8.9375,
"learning_rate": 1.2727272727272726e-06,
"loss": 1.378662109375,
"mean_token_accuracy": 0.6874435991048813,
"num_tokens": 4700699.0,
"step": 8
},
{
"entropy": 1.05224609375,
"epoch": 0.053811659192825115,
"grad_norm": 7.75,
"learning_rate": 1.4545454545454544e-06,
"loss": 1.3175048828125,
"mean_token_accuracy": 0.6969988569617271,
"num_tokens": 5284275.0,
"step": 9
},
{
"entropy": 1.07763671875,
"epoch": 0.059790732436472344,
"grad_norm": 7.0625,
"learning_rate": 1.6363636363636365e-06,
"loss": 1.3353271484375,
"mean_token_accuracy": 0.6939220502972603,
"num_tokens": 5873877.0,
"step": 10
},
{
"entropy": 1.087890625,
"epoch": 0.06576980568011959,
"grad_norm": 7.34375,
"learning_rate": 1.818181818181818e-06,
"loss": 1.3642578125,
"mean_token_accuracy": 0.6876638159155846,
"num_tokens": 6463496.0,
"step": 11
},
{
"entropy": 1.0673828125,
"epoch": 0.07174887892376682,
"grad_norm": 7.125,
"learning_rate": 2e-06,
"loss": 1.332275390625,
"mean_token_accuracy": 0.6941065043210983,
"num_tokens": 7053054.0,
"step": 12
},
{
"entropy": 1.04150390625,
"epoch": 0.07772795216741404,
"grad_norm": 8.6875,
"learning_rate": 1.999953280342959e-06,
"loss": 1.354248046875,
"mean_token_accuracy": 0.6947119757533073,
"num_tokens": 7633205.0,
"step": 13
},
{
"entropy": 1.0634765625,
"epoch": 0.08370702541106129,
"grad_norm": 6.75,
"learning_rate": 1.9998131257372875e-06,
"loss": 1.314208984375,
"mean_token_accuracy": 0.6971366181969643,
"num_tokens": 8219275.0,
"step": 14
},
{
"entropy": 1.1015625,
"epoch": 0.08968609865470852,
"grad_norm": 6.375,
"learning_rate": 1.9995795492789365e-06,
"loss": 1.34912109375,
"mean_token_accuracy": 0.6870761960744858,
"num_tokens": 8808882.0,
"step": 15
},
{
"entropy": 1.0693359375,
"epoch": 0.09566517189835576,
"grad_norm": 6.5625,
"learning_rate": 1.99925257279313e-06,
"loss": 1.318115234375,
"mean_token_accuracy": 0.6940489485859871,
"num_tokens": 9398391.0,
"step": 16
},
{
"entropy": 1.07421875,
"epoch": 0.10164424514200299,
"grad_norm": 6.5,
"learning_rate": 1.9988322268323264e-06,
"loss": 1.32470703125,
"mean_token_accuracy": 0.6955610886216164,
"num_tokens": 9988054.0,
"step": 17
},
{
"entropy": 1.0986328125,
"epoch": 0.10762331838565023,
"grad_norm": 6.1875,
"learning_rate": 1.998318550673364e-06,
"loss": 1.3236083984375,
"mean_token_accuracy": 0.691640131175518,
"num_tokens": 10577624.0,
"step": 18
},
{
"entropy": 1.06787109375,
"epoch": 0.11360239162929746,
"grad_norm": 6.34375,
"learning_rate": 1.997711592313791e-06,
"loss": 1.3114013671875,
"mean_token_accuracy": 0.6971054673194885,
"num_tokens": 11167248.0,
"step": 19
},
{
"entropy": 1.0439453125,
"epoch": 0.11958146487294469,
"grad_norm": 6.84375,
"learning_rate": 1.9970114084673796e-06,
"loss": 1.291748046875,
"mean_token_accuracy": 0.7017510756850243,
"num_tokens": 11748340.0,
"step": 20
},
{
"entropy": 1.087890625,
"epoch": 0.12556053811659193,
"grad_norm": 6.78125,
"learning_rate": 1.9962180645588286e-06,
"loss": 1.3157958984375,
"mean_token_accuracy": 0.692795068025589,
"num_tokens": 12331037.0,
"step": 21
},
{
"entropy": 1.08203125,
"epoch": 0.13153961136023917,
"grad_norm": 9.1875,
"learning_rate": 1.9953316347176486e-06,
"loss": 1.302001953125,
"mean_token_accuracy": 0.6975891441106796,
"num_tokens": 12918010.0,
"step": 22
},
{
"entropy": 1.068359375,
"epoch": 0.1375186846038864,
"grad_norm": 11.625,
"learning_rate": 1.994352201771236e-06,
"loss": 1.3125,
"mean_token_accuracy": 0.695681132376194,
"num_tokens": 13507561.0,
"step": 23
},
{
"entropy": 1.06640625,
"epoch": 0.14349775784753363,
"grad_norm": 12.8125,
"learning_rate": 1.993279857237133e-06,
"loss": 1.2779541015625,
"mean_token_accuracy": 0.6991168782114983,
"num_tokens": 14090266.0,
"step": 24
},
{
"entropy": 1.06640625,
"epoch": 0.14947683109118087,
"grad_norm": 13.9375,
"learning_rate": 1.9921147013144777e-06,
"loss": 1.283447265625,
"mean_token_accuracy": 0.6982817649841309,
"num_tokens": 14679851.0,
"step": 25
},
{
"entropy": 1.0849609375,
"epoch": 0.1554559043348281,
"grad_norm": 14.125,
"learning_rate": 1.9908568428746405e-06,
"loss": 1.28564453125,
"mean_token_accuracy": 0.6965354830026627,
"num_tokens": 15269433.0,
"step": 26
},
{
"entropy": 1.087890625,
"epoch": 0.16143497757847533,
"grad_norm": 16.375,
"learning_rate": 1.989506399451051e-06,
"loss": 1.3095703125,
"mean_token_accuracy": 0.6939198896288872,
"num_tokens": 15858975.0,
"step": 27
},
{
"entropy": 1.0615234375,
"epoch": 0.16741405082212257,
"grad_norm": 12.1875,
"learning_rate": 1.9880634972282166e-06,
"loss": 1.273681640625,
"mean_token_accuracy": 0.7005915492773056,
"num_tokens": 16448581.0,
"step": 28
},
{
"entropy": 1.1220703125,
"epoch": 0.17339312406576982,
"grad_norm": 17.5,
"learning_rate": 1.986528271029931e-06,
"loss": 1.302001953125,
"mean_token_accuracy": 0.6892580538988113,
"num_tokens": 17038162.0,
"step": 29
},
{
"entropy": 1.060546875,
"epoch": 0.17937219730941703,
"grad_norm": 12.5625,
"learning_rate": 1.984900864306677e-06,
"loss": 1.2484130859375,
"mean_token_accuracy": 0.7043808251619339,
"num_tokens": 17622841.0,
"step": 30
},
{
"entropy": 1.083984375,
"epoch": 0.18535127055306427,
"grad_norm": 16.125,
"learning_rate": 1.9831814291222233e-06,
"loss": 1.276611328125,
"mean_token_accuracy": 0.6971414536237717,
"num_tokens": 18212334.0,
"step": 31
},
{
"entropy": 1.0859375,
"epoch": 0.19133034379671152,
"grad_norm": 19.125,
"learning_rate": 1.981370126139413e-06,
"loss": 1.282470703125,
"mean_token_accuracy": 0.6962975636124611,
"num_tokens": 18794983.0,
"step": 32
},
{
"entropy": 1.072265625,
"epoch": 0.19730941704035873,
"grad_norm": 22.875,
"learning_rate": 1.979467124605156e-06,
"loss": 1.2430419921875,
"mean_token_accuracy": 0.7024854198098183,
"num_tokens": 19384646.0,
"step": 33
},
{
"entropy": 1.103515625,
"epoch": 0.20328849028400597,
"grad_norm": 29.375,
"learning_rate": 1.977472602334609e-06,
"loss": 1.293212890625,
"mean_token_accuracy": 0.691613681614399,
"num_tokens": 19971390.0,
"step": 34
},
{
"entropy": 1.0517578125,
"epoch": 0.20926756352765322,
"grad_norm": 34.75,
"learning_rate": 1.975386745694565e-06,
"loss": 1.2325439453125,
"mean_token_accuracy": 0.7074485868215561,
"num_tokens": 20552857.0,
"step": 35
},
{
"entropy": 1.0888671875,
"epoch": 0.21524663677130046,
"grad_norm": 46.25,
"learning_rate": 1.9732097495860385e-06,
"loss": 1.27880859375,
"mean_token_accuracy": 0.6955694854259491,
"num_tokens": 21142419.0,
"step": 36
},
{
"entropy": 1.0869140625,
"epoch": 0.22122571001494767,
"grad_norm": 52.0,
"learning_rate": 1.970941817426052e-06,
"loss": 1.2547607421875,
"mean_token_accuracy": 0.6988364160060883,
"num_tokens": 21725119.0,
"step": 37
},
{
"entropy": 1.0888671875,
"epoch": 0.22720478325859492,
"grad_norm": 50.25,
"learning_rate": 1.968583161128631e-06,
"loss": 1.2620849609375,
"mean_token_accuracy": 0.6952823475003242,
"num_tokens": 22314606.0,
"step": 38
},
{
"entropy": 1.111328125,
"epoch": 0.23318385650224216,
"grad_norm": 28.5,
"learning_rate": 1.9661340010850024e-06,
"loss": 1.2611083984375,
"mean_token_accuracy": 0.6952020153403282,
"num_tokens": 22897136.0,
"step": 39
},
{
"entropy": 1.0546875,
"epoch": 0.23916292974588937,
"grad_norm": 13.375,
"learning_rate": 1.9635945661430005e-06,
"loss": 1.2120361328125,
"mean_token_accuracy": 0.7072760388255119,
"num_tokens": 23470326.0,
"step": 40
},
{
"entropy": 1.05322265625,
"epoch": 0.24514200298953662,
"grad_norm": 11.0625,
"learning_rate": 1.960965093585684e-06,
"loss": 1.1966552734375,
"mean_token_accuracy": 0.710840716958046,
"num_tokens": 24059902.0,
"step": 41
},
{
"entropy": 1.0986328125,
"epoch": 0.25112107623318386,
"grad_norm": 10.1875,
"learning_rate": 1.9582458291091663e-06,
"loss": 1.2474365234375,
"mean_token_accuracy": 0.6974528953433037,
"num_tokens": 24641292.0,
"step": 42
},
{
"entropy": 1.08203125,
"epoch": 0.2571001494768311,
"grad_norm": 9.125,
"learning_rate": 1.9554370267996535e-06,
"loss": 1.2308349609375,
"mean_token_accuracy": 0.7031876817345619,
"num_tokens": 25230783.0,
"step": 43
},
{
"entropy": 1.05810546875,
"epoch": 0.26307922272047835,
"grad_norm": 9.125,
"learning_rate": 1.952538949109708e-06,
"loss": 1.195556640625,
"mean_token_accuracy": 0.7081611901521683,
"num_tokens": 25820348.0,
"step": 44
},
{
"entropy": 1.0888671875,
"epoch": 0.26905829596412556,
"grad_norm": 9.5625,
"learning_rate": 1.94955186683372e-06,
"loss": 1.242919921875,
"mean_token_accuracy": 0.6991148665547371,
"num_tokens": 26409842.0,
"step": 45
},
{
"entropy": 1.080078125,
"epoch": 0.2750373692077728,
"grad_norm": 9.4375,
"learning_rate": 1.94647605908261e-06,
"loss": 1.2176513671875,
"mean_token_accuracy": 0.7035784423351288,
"num_tokens": 26993040.0,
"step": 46
},
{
"entropy": 1.10009765625,
"epoch": 0.28101644245142005,
"grad_norm": 8.125,
"learning_rate": 1.943311813257743e-06,
"loss": 1.252685546875,
"mean_token_accuracy": 0.6970779970288277,
"num_tokens": 27582641.0,
"step": 47
},
{
"entropy": 1.11328125,
"epoch": 0.28699551569506726,
"grad_norm": 8.5,
"learning_rate": 1.9400594250240794e-06,
"loss": 1.260009765625,
"mean_token_accuracy": 0.6932175979018211,
"num_tokens": 28172234.0,
"step": 48
},
{
"entropy": 1.0869140625,
"epoch": 0.2929745889387145,
"grad_norm": 8.625,
"learning_rate": 1.9367191982825448e-06,
"loss": 1.208740234375,
"mean_token_accuracy": 0.7022047564387321,
"num_tokens": 28761815.0,
"step": 49
},
{
"entropy": 1.06982421875,
"epoch": 0.29895366218236175,
"grad_norm": 7.46875,
"learning_rate": 1.9332914451416345e-06,
"loss": 1.214599609375,
"mean_token_accuracy": 0.7050945162773132,
"num_tokens": 29351427.0,
"step": 50
},
{
"entropy": 1.078125,
"epoch": 0.30493273542600896,
"grad_norm": 6.78125,
"learning_rate": 1.929776485888251e-06,
"loss": 1.23046875,
"mean_token_accuracy": 0.7009832188487053,
"num_tokens": 29941039.0,
"step": 51
},
{
"entropy": 1.0703125,
"epoch": 0.3109118086696562,
"grad_norm": 11.4375,
"learning_rate": 1.9261746489577764e-06,
"loss": 1.2705078125,
"mean_token_accuracy": 0.696273148059845,
"num_tokens": 30517107.0,
"step": 52
},
{
"entropy": 1.091796875,
"epoch": 0.31689088191330345,
"grad_norm": 7.1875,
"learning_rate": 1.9224862709033824e-06,
"loss": 1.2236328125,
"mean_token_accuracy": 0.6999221071600914,
"num_tokens": 31106632.0,
"step": 53
},
{
"entropy": 1.03955078125,
"epoch": 0.32286995515695066,
"grad_norm": 6.25,
"learning_rate": 1.918711696364584e-06,
"loss": 1.180908203125,
"mean_token_accuracy": 0.710278332233429,
"num_tokens": 31688974.0,
"step": 54
},
{
"entropy": 1.04345703125,
"epoch": 0.32884902840059793,
"grad_norm": 5.5625,
"learning_rate": 1.914851278035038e-06,
"loss": 1.1917724609375,
"mean_token_accuracy": 0.7096548527479172,
"num_tokens": 32278575.0,
"step": 55
},
{
"entropy": 1.05859375,
"epoch": 0.33482810164424515,
"grad_norm": 5.9375,
"learning_rate": 1.910905376629585e-06,
"loss": 1.2235107421875,
"mean_token_accuracy": 0.7041697576642036,
"num_tokens": 32868159.0,
"step": 56
},
{
"entropy": 1.03369140625,
"epoch": 0.34080717488789236,
"grad_norm": 4.84375,
"learning_rate": 1.9068743608505452e-06,
"loss": 1.1871337890625,
"mean_token_accuracy": 0.7095241695642471,
"num_tokens": 33457746.0,
"step": 57
},
{
"entropy": 1.0615234375,
"epoch": 0.34678624813153963,
"grad_norm": 6.125,
"learning_rate": 1.902758607353269e-06,
"loss": 1.2313232421875,
"mean_token_accuracy": 0.7004028484225273,
"num_tokens": 34047328.0,
"step": 58
},
{
"entropy": 1.06982421875,
"epoch": 0.35276532137518685,
"grad_norm": 5.78125,
"learning_rate": 1.8985585007109388e-06,
"loss": 1.23828125,
"mean_token_accuracy": 0.7001003175973892,
"num_tokens": 34636812.0,
"step": 59
},
{
"entropy": 1.05078125,
"epoch": 0.35874439461883406,
"grad_norm": 5.03125,
"learning_rate": 1.8942744333786395e-06,
"loss": 1.184326171875,
"mean_token_accuracy": 0.7088666930794716,
"num_tokens": 35226406.0,
"step": 60
},
{
"entropy": 1.0556640625,
"epoch": 0.36472346786248133,
"grad_norm": 7.53125,
"learning_rate": 1.8899068056566838e-06,
"loss": 1.2060546875,
"mean_token_accuracy": 0.7032047733664513,
"num_tokens": 35809841.0,
"step": 61
},
{
"entropy": 1.0791015625,
"epoch": 0.37070254110612855,
"grad_norm": 10.375,
"learning_rate": 1.8854560256532098e-06,
"loss": 1.2000732421875,
"mean_token_accuracy": 0.7027083933353424,
"num_tokens": 36399387.0,
"step": 62
},
{
"entropy": 1.04931640625,
"epoch": 0.37668161434977576,
"grad_norm": 11.625,
"learning_rate": 1.8809225092460485e-06,
"loss": 1.2080078125,
"mean_token_accuracy": 0.7051479294896126,
"num_tokens": 36988937.0,
"step": 63
},
{
"entropy": 1.02294921875,
"epoch": 0.38266068759342303,
"grad_norm": 7.25,
"learning_rate": 1.8763066800438634e-06,
"loss": 1.1639404296875,
"mean_token_accuracy": 0.7147629410028458,
"num_tokens": 37569757.0,
"step": 64
},
{
"entropy": 1.02880859375,
"epoch": 0.38863976083707025,
"grad_norm": 6.75,
"learning_rate": 1.8716089693465693e-06,
"loss": 1.1640625,
"mean_token_accuracy": 0.7142753675580025,
"num_tokens": 38159331.0,
"step": 65
},
{
"entropy": 1.03564453125,
"epoch": 0.39461883408071746,
"grad_norm": 9.5,
"learning_rate": 1.8668298161050306e-06,
"loss": 1.199951171875,
"mean_token_accuracy": 0.7058519497513771,
"num_tokens": 38747516.0,
"step": 66
},
{
"entropy": 1.05126953125,
"epoch": 0.40059790732436473,
"grad_norm": 6.09375,
"learning_rate": 1.861969666880049e-06,
"loss": 1.179443359375,
"mean_token_accuracy": 0.7096298113465309,
"num_tokens": 39337113.0,
"step": 67
},
{
"entropy": 1.05126953125,
"epoch": 0.40657698056801195,
"grad_norm": 5.46875,
"learning_rate": 1.8570289758006343e-06,
"loss": 1.1827392578125,
"mean_token_accuracy": 0.7079932987689972,
"num_tokens": 39926721.0,
"step": 68
},
{
"entropy": 1.0615234375,
"epoch": 0.4125560538116592,
"grad_norm": 4.84375,
"learning_rate": 1.8520082045215717e-06,
"loss": 1.189453125,
"mean_token_accuracy": 0.7062863036990166,
"num_tokens": 40516290.0,
"step": 69
},
{
"entropy": 1.0439453125,
"epoch": 0.41853512705530643,
"grad_norm": 4.84375,
"learning_rate": 1.846907822180286e-06,
"loss": 1.16650390625,
"mean_token_accuracy": 0.711765356361866,
"num_tokens": 41105790.0,
"step": 70
},
{
"entropy": 1.0625,
"epoch": 0.42451420029895365,
"grad_norm": 6.0,
"learning_rate": 1.8417283053530043e-06,
"loss": 1.18603515625,
"mean_token_accuracy": 0.7049060165882111,
"num_tokens": 41695388.0,
"step": 71
},
{
"entropy": 1.0478515625,
"epoch": 0.4304932735426009,
"grad_norm": 6.875,
"learning_rate": 1.8364701380102264e-06,
"loss": 1.1793212890625,
"mean_token_accuracy": 0.7081038281321526,
"num_tokens": 42271847.0,
"step": 72
},
{
"entropy": 1.04833984375,
"epoch": 0.43647234678624813,
"grad_norm": 10.5625,
"learning_rate": 1.8311338114715027e-06,
"loss": 1.185791015625,
"mean_token_accuracy": 0.7098284065723419,
"num_tokens": 42855765.0,
"step": 73
},
{
"entropy": 1.05517578125,
"epoch": 0.44245142002989535,
"grad_norm": 6.84375,
"learning_rate": 1.825719824359524e-06,
"loss": 1.177734375,
"mean_token_accuracy": 0.7076143845915794,
"num_tokens": 43445385.0,
"step": 74
},
{
"entropy": 1.0751953125,
"epoch": 0.4484304932735426,
"grad_norm": 6.28125,
"learning_rate": 1.8202286825535329e-06,
"loss": 1.208251953125,
"mean_token_accuracy": 0.7024360001087189,
"num_tokens": 44033137.0,
"step": 75
},
{
"entropy": 1.0888671875,
"epoch": 0.45440956651718983,
"grad_norm": 5.375,
"learning_rate": 1.814660899142053e-06,
"loss": 1.202392578125,
"mean_token_accuracy": 0.7014130279421806,
"num_tokens": 44622745.0,
"step": 76
},
{
"entropy": 1.04248046875,
"epoch": 0.46038863976083705,
"grad_norm": 6.5,
"learning_rate": 1.8090169943749474e-06,
"loss": 1.18212890625,
"mean_token_accuracy": 0.7097717002034187,
"num_tokens": 45212356.0,
"step": 77
},
{
"entropy": 1.0576171875,
"epoch": 0.4663677130044843,
"grad_norm": 9.3125,
"learning_rate": 1.8032974956148062e-06,
"loss": 1.179443359375,
"mean_token_accuracy": 0.7062700912356377,
"num_tokens": 45798390.0,
"step": 78
},
{
"entropy": 1.03857421875,
"epoch": 0.47234678624813153,
"grad_norm": 9.125,
"learning_rate": 1.7975029372876705e-06,
"loss": 1.1568603515625,
"mean_token_accuracy": 0.7130975499749184,
"num_tokens": 46388008.0,
"step": 79
},
{
"entropy": 1.05322265625,
"epoch": 0.47832585949177875,
"grad_norm": 7.34375,
"learning_rate": 1.7916338608330956e-06,
"loss": 1.182861328125,
"mean_token_accuracy": 0.7081197574734688,
"num_tokens": 46974446.0,
"step": 80
},
{
"entropy": 1.0458984375,
"epoch": 0.484304932735426,
"grad_norm": 4.5625,
"learning_rate": 1.78569081465356e-06,
"loss": 1.1536865234375,
"mean_token_accuracy": 0.7095040455460548,
"num_tokens": 47564004.0,
"step": 81
},
{
"entropy": 1.03759765625,
"epoch": 0.49028400597907323,
"grad_norm": 3.875,
"learning_rate": 1.7796743540632221e-06,
"loss": 1.1531982421875,
"mean_token_accuracy": 0.7134627625346184,
"num_tokens": 48153580.0,
"step": 82
},
{
"entropy": 1.0625,
"epoch": 0.4962630792227205,
"grad_norm": 6.5,
"learning_rate": 1.7735850412360328e-06,
"loss": 1.177490234375,
"mean_token_accuracy": 0.7081628367304802,
"num_tokens": 48740404.0,
"step": 83
},
{
"entropy": 1.048828125,
"epoch": 0.5022421524663677,
"grad_norm": 8.8125,
"learning_rate": 1.7674234451532063e-06,
"loss": 1.1700439453125,
"mean_token_accuracy": 0.7087839841842651,
"num_tokens": 49329930.0,
"step": 84
},
{
"entropy": 1.0458984375,
"epoch": 0.5082212257100149,
"grad_norm": 7.25,
"learning_rate": 1.7611901415500533e-06,
"loss": 1.16259765625,
"mean_token_accuracy": 0.7115066945552826,
"num_tokens": 49914835.0,
"step": 85
},
{
"entropy": 1.0615234375,
"epoch": 0.5142002989536621,
"grad_norm": 8.0625,
"learning_rate": 1.7548857128621874e-06,
"loss": 1.18359375,
"mean_token_accuracy": 0.706175908446312,
"num_tokens": 50504393.0,
"step": 86
},
{
"entropy": 1.05322265625,
"epoch": 0.5201793721973094,
"grad_norm": 4.875,
"learning_rate": 1.748510748171101e-06,
"loss": 1.1778564453125,
"mean_token_accuracy": 0.7066154479980469,
"num_tokens": 51093995.0,
"step": 87
},
{
"entropy": 1.052734375,
"epoch": 0.5261584454409567,
"grad_norm": 4.65625,
"learning_rate": 1.7420658431491222e-06,
"loss": 1.167236328125,
"mean_token_accuracy": 0.7099850177764893,
"num_tokens": 51683545.0,
"step": 88
},
{
"entropy": 1.04638671875,
"epoch": 0.5321375186846039,
"grad_norm": 9.125,
"learning_rate": 1.735551600003755e-06,
"loss": 1.157470703125,
"mean_token_accuracy": 0.7102955356240273,
"num_tokens": 52272548.0,
"step": 89
},
{
"entropy": 1.0302734375,
"epoch": 0.5381165919282511,
"grad_norm": 10.25,
"learning_rate": 1.7289686274214115e-06,
"loss": 1.1446533203125,
"mean_token_accuracy": 0.7140024453401566,
"num_tokens": 52862195.0,
"step": 90
},
{
"entropy": 1.03369140625,
"epoch": 0.5440956651718983,
"grad_norm": 11.75,
"learning_rate": 1.722317540510534e-06,
"loss": 1.1495361328125,
"mean_token_accuracy": 0.7119031846523285,
"num_tokens": 53450055.0,
"step": 91
},
{
"entropy": 1.03564453125,
"epoch": 0.5500747384155455,
"grad_norm": 9.8125,
"learning_rate": 1.715598960744121e-06,
"loss": 1.149658203125,
"mean_token_accuracy": 0.7118451669812202,
"num_tokens": 54023463.0,
"step": 92
},
{
"entropy": 1.0634765625,
"epoch": 0.5560538116591929,
"grad_norm": 6.40625,
"learning_rate": 1.7088135159016582e-06,
"loss": 1.1729736328125,
"mean_token_accuracy": 0.7083615809679031,
"num_tokens": 54613056.0,
"step": 93
},
{
"entropy": 1.056640625,
"epoch": 0.5620328849028401,
"grad_norm": 4.1875,
"learning_rate": 1.7019618400104569e-06,
"loss": 1.158447265625,
"mean_token_accuracy": 0.7101547122001648,
"num_tokens": 55194480.0,
"step": 94
},
{
"entropy": 1.046875,
"epoch": 0.5680119581464873,
"grad_norm": 9.25,
"learning_rate": 1.6950445732864126e-06,
"loss": 1.162109375,
"mean_token_accuracy": 0.7099513560533524,
"num_tokens": 55784110.0,
"step": 95
},
{
"entropy": 1.0625,
"epoch": 0.5739910313901345,
"grad_norm": 6.65625,
"learning_rate": 1.688062362074184e-06,
"loss": 1.1649169921875,
"mean_token_accuracy": 0.7057990580797195,
"num_tokens": 56373674.0,
"step": 96
},
{
"entropy": 1.05078125,
"epoch": 0.5799701046337817,
"grad_norm": 15.125,
"learning_rate": 1.681015858786797e-06,
"loss": 1.166259765625,
"mean_token_accuracy": 0.7095335945487022,
"num_tokens": 56952628.0,
"step": 97
},
{
"entropy": 1.03369140625,
"epoch": 0.585949177877429,
"grad_norm": 8.3125,
"learning_rate": 1.6739057218446857e-06,
"loss": 1.156005859375,
"mean_token_accuracy": 0.7151156216859818,
"num_tokens": 57542149.0,
"step": 98
},
{
"entropy": 1.0380859375,
"epoch": 0.5919282511210763,
"grad_norm": 4.15625,
"learning_rate": 1.666732615614169e-06,
"loss": 1.143798828125,
"mean_token_accuracy": 0.7129637002944946,
"num_tokens": 58131730.0,
"step": 99
},
{
"entropy": 1.02392578125,
"epoch": 0.5979073243647235,
"grad_norm": 9.3125,
"learning_rate": 1.6594972103453724e-06,
"loss": 1.1343994140625,
"mean_token_accuracy": 0.7165936082601547,
"num_tokens": 58721339.0,
"step": 100
},
{
"entropy": 1.029296875,
"epoch": 0.6038863976083707,
"grad_norm": 7.5,
"learning_rate": 1.6522001821096019e-06,
"loss": 1.1375732421875,
"mean_token_accuracy": 0.7172424420714378,
"num_tokens": 59310867.0,
"step": 101
},
{
"entropy": 1.02978515625,
"epoch": 0.6098654708520179,
"grad_norm": 11.3125,
"learning_rate": 1.6448422127361705e-06,
"loss": 1.117919921875,
"mean_token_accuracy": 0.7172070667147636,
"num_tokens": 59894680.0,
"step": 102
},
{
"entropy": 1.0205078125,
"epoch": 0.6158445440956651,
"grad_norm": 11.5,
"learning_rate": 1.6374239897486897e-06,
"loss": 1.1181640625,
"mean_token_accuracy": 0.7184400483965874,
"num_tokens": 60484296.0,
"step": 103
},
{
"entropy": 1.05078125,
"epoch": 0.6218236173393124,
"grad_norm": 8.0,
"learning_rate": 1.6299462063008269e-06,
"loss": 1.143798828125,
"mean_token_accuracy": 0.7096443995833397,
"num_tokens": 61073911.0,
"step": 104
},
{
"entropy": 1.064453125,
"epoch": 0.6278026905829597,
"grad_norm": 7.75,
"learning_rate": 1.6224095611115383e-06,
"loss": 1.1650390625,
"mean_token_accuracy": 0.7072784155607224,
"num_tokens": 61663572.0,
"step": 105
},
{
"entropy": 1.0341796875,
"epoch": 0.6337817638266069,
"grad_norm": 6.9375,
"learning_rate": 1.614814758399781e-06,
"loss": 1.128662109375,
"mean_token_accuracy": 0.71539356559515,
"num_tokens": 62252067.0,
"step": 106
},
{
"entropy": 1.03662109375,
"epoch": 0.6397608370702541,
"grad_norm": 14.125,
"learning_rate": 1.6071625078187112e-06,
"loss": 1.146240234375,
"mean_token_accuracy": 0.7122742831707001,
"num_tokens": 62841656.0,
"step": 107
},
{
"entropy": 1.0400390625,
"epoch": 0.6457399103139013,
"grad_norm": 14.3125,
"learning_rate": 1.599453524389374e-06,
"loss": 1.146728515625,
"mean_token_accuracy": 0.7149165868759155,
"num_tokens": 63431221.0,
"step": 108
},
{
"entropy": 1.0556640625,
"epoch": 0.6517189835575485,
"grad_norm": 15.375,
"learning_rate": 1.5916885284338935e-06,
"loss": 1.155029296875,
"mean_token_accuracy": 0.7111315131187439,
"num_tokens": 64020069.0,
"step": 109
},
{
"entropy": 1.0380859375,
"epoch": 0.6576980568011959,
"grad_norm": 12.3125,
"learning_rate": 1.5838682455081657e-06,
"loss": 1.13671875,
"mean_token_accuracy": 0.7157010585069656,
"num_tokens": 64609550.0,
"step": 110
},
{
"entropy": 1.03076171875,
"epoch": 0.6636771300448431,
"grad_norm": 7.90625,
"learning_rate": 1.5759934063340624e-06,
"loss": 1.1343994140625,
"mean_token_accuracy": 0.7165603339672089,
"num_tokens": 65199109.0,
"step": 111
},
{
"entropy": 1.05615234375,
"epoch": 0.6696562032884903,
"grad_norm": 12.5,
"learning_rate": 1.5680647467311555e-06,
"loss": 1.1683349609375,
"mean_token_accuracy": 0.7091170027852058,
"num_tokens": 65788728.0,
"step": 112
},
{
"entropy": 1.03759765625,
"epoch": 0.6756352765321375,
"grad_norm": 8.9375,
"learning_rate": 1.56008300754796e-06,
"loss": 1.1407470703125,
"mean_token_accuracy": 0.7141014188528061,
"num_tokens": 66370343.0,
"step": 113
},
{
"entropy": 1.0478515625,
"epoch": 0.6816143497757847,
"grad_norm": 12.5,
"learning_rate": 1.5520489345927094e-06,
"loss": 1.1500244140625,
"mean_token_accuracy": 0.7122905552387238,
"num_tokens": 66955876.0,
"step": 114
},
{
"entropy": 1.02392578125,
"epoch": 0.6875934230194319,
"grad_norm": 5.65625,
"learning_rate": 1.5439632785636705e-06,
"loss": 1.135498046875,
"mean_token_accuracy": 0.7167380154132843,
"num_tokens": 67545426.0,
"step": 115
},
{
"entropy": 1.052734375,
"epoch": 0.6935724962630793,
"grad_norm": 5.0625,
"learning_rate": 1.5358267949789964e-06,
"loss": 1.1448974609375,
"mean_token_accuracy": 0.7125077843666077,
"num_tokens": 68134997.0,
"step": 116
},
{
"entropy": 1.041015625,
"epoch": 0.6995515695067265,
"grad_norm": 5.75,
"learning_rate": 1.5276402441061327e-06,
"loss": 1.125732421875,
"mean_token_accuracy": 0.7163447961211205,
"num_tokens": 68724591.0,
"step": 117
},
{
"entropy": 1.02587890625,
"epoch": 0.7055306427503737,
"grad_norm": 9.625,
"learning_rate": 1.5194043908907772e-06,
"loss": 1.131103515625,
"mean_token_accuracy": 0.716623105108738,
"num_tokens": 69314113.0,
"step": 118
},
{
"entropy": 1.005859375,
"epoch": 0.7115097159940209,
"grad_norm": 5.09375,
"learning_rate": 1.5111200048854054e-06,
"loss": 1.1011962890625,
"mean_token_accuracy": 0.7245713621377945,
"num_tokens": 69903640.0,
"step": 119
},
{
"entropy": 1.044921875,
"epoch": 0.7174887892376681,
"grad_norm": 6.59375,
"learning_rate": 1.5027878601773632e-06,
"loss": 1.1431884765625,
"mean_token_accuracy": 0.7117293328046799,
"num_tokens": 70493259.0,
"step": 120
},
{
"entropy": 1.0615234375,
"epoch": 0.7234678624813154,
"grad_norm": 8.1875,
"learning_rate": 1.494408735316537e-06,
"loss": 1.15380859375,
"mean_token_accuracy": 0.7094393074512482,
"num_tokens": 71082785.0,
"step": 121
},
{
"entropy": 1.07861328125,
"epoch": 0.7294469357249627,
"grad_norm": 16.625,
"learning_rate": 1.4859834132426058e-06,
"loss": 1.1781005859375,
"mean_token_accuracy": 0.7051993981003761,
"num_tokens": 71666506.0,
"step": 122
},
{
"entropy": 1.0166015625,
"epoch": 0.7354260089686099,
"grad_norm": 16.875,
"learning_rate": 1.4775126812118863e-06,
"loss": 1.1251220703125,
"mean_token_accuracy": 0.7166302278637886,
"num_tokens": 72250385.0,
"step": 123
},
{
"entropy": 1.05615234375,
"epoch": 0.7414050822122571,
"grad_norm": 7.4375,
"learning_rate": 1.4689973307237686e-06,
"loss": 1.15478515625,
"mean_token_accuracy": 0.709770917892456,
"num_tokens": 72829598.0,
"step": 124
},
{
"entropy": 1.029296875,
"epoch": 0.7473841554559043,
"grad_norm": 9.9375,
"learning_rate": 1.4604381574467614e-06,
"loss": 1.13037109375,
"mean_token_accuracy": 0.7166016399860382,
"num_tokens": 73419110.0,
"step": 125
},
{
"entropy": 1.013671875,
"epoch": 0.7533632286995515,
"grad_norm": 16.625,
"learning_rate": 1.451835961144145e-06,
"loss": 1.1103515625,
"mean_token_accuracy": 0.7211827859282494,
"num_tokens": 74008729.0,
"step": 126
},
{
"entropy": 1.0126953125,
"epoch": 0.7593423019431988,
"grad_norm": 13.0625,
"learning_rate": 1.4431915455992414e-06,
"loss": 1.1024169921875,
"mean_token_accuracy": 0.7223981395363808,
"num_tokens": 74598306.0,
"step": 127
},
{
"entropy": 1.056640625,
"epoch": 0.7653213751868461,
"grad_norm": 7.75,
"learning_rate": 1.4345057185403098e-06,
"loss": 1.15869140625,
"mean_token_accuracy": 0.7109938785433769,
"num_tokens": 75187853.0,
"step": 128
},
{
"entropy": 1.037109375,
"epoch": 0.7713004484304933,
"grad_norm": 12.875,
"learning_rate": 1.4257792915650725e-06,
"loss": 1.13232421875,
"mean_token_accuracy": 0.7139059007167816,
"num_tokens": 75777399.0,
"step": 129
},
{
"entropy": 1.0390625,
"epoch": 0.7772795216741405,
"grad_norm": 11.375,
"learning_rate": 1.4170130800648812e-06,
"loss": 1.1455078125,
"mean_token_accuracy": 0.7116378918290138,
"num_tokens": 76367001.0,
"step": 130
},
{
"entropy": 1.0419921875,
"epoch": 0.7832585949177877,
"grad_norm": 10.875,
"learning_rate": 1.408207903148525e-06,
"loss": 1.1370849609375,
"mean_token_accuracy": 0.7141571119427681,
"num_tokens": 76956562.0,
"step": 131
},
{
"entropy": 1.033203125,
"epoch": 0.7892376681614349,
"grad_norm": 17.625,
"learning_rate": 1.3993645835656952e-06,
"loss": 1.147705078125,
"mean_token_accuracy": 0.7140598297119141,
"num_tokens": 77544057.0,
"step": 132
},
{
"entropy": 0.99755859375,
"epoch": 0.7952167414050823,
"grad_norm": 18.875,
"learning_rate": 1.3904839476301088e-06,
"loss": 1.085693359375,
"mean_token_accuracy": 0.7245375439524651,
"num_tokens": 78133581.0,
"step": 133
},
{
"entropy": 1.0478515625,
"epoch": 0.8011958146487295,
"grad_norm": 19.125,
"learning_rate": 1.3815668251422953e-06,
"loss": 1.14013671875,
"mean_token_accuracy": 0.7118667960166931,
"num_tokens": 78723253.0,
"step": 134
},
{
"entropy": 1.03271484375,
"epoch": 0.8071748878923767,
"grad_norm": 24.125,
"learning_rate": 1.3726140493120637e-06,
"loss": 1.1357421875,
"mean_token_accuracy": 0.7158161103725433,
"num_tokens": 79306761.0,
"step": 135
},
{
"entropy": 1.02685546875,
"epoch": 0.8131539611360239,
"grad_norm": 28.0,
"learning_rate": 1.363626456680647e-06,
"loss": 1.125244140625,
"mean_token_accuracy": 0.7170991152524948,
"num_tokens": 79893309.0,
"step": 136
},
{
"entropy": 1.05224609375,
"epoch": 0.8191330343796711,
"grad_norm": 24.125,
"learning_rate": 1.3546048870425354e-06,
"loss": 1.148681640625,
"mean_token_accuracy": 0.7112774699926376,
"num_tokens": 80482935.0,
"step": 137
},
{
"entropy": 1.05615234375,
"epoch": 0.8251121076233184,
"grad_norm": 12.5625,
"learning_rate": 1.3455501833670087e-06,
"loss": 1.134033203125,
"mean_token_accuracy": 0.7125924825668335,
"num_tokens": 81072531.0,
"step": 138
},
{
"entropy": 1.03271484375,
"epoch": 0.8310911808669657,
"grad_norm": 19.375,
"learning_rate": 1.336463191719367e-06,
"loss": 1.12335205078125,
"mean_token_accuracy": 0.7159583121538162,
"num_tokens": 81654332.0,
"step": 139
},
{
"entropy": 1.052734375,
"epoch": 0.8370702541106129,
"grad_norm": 20.125,
"learning_rate": 1.3273447611818766e-06,
"loss": 1.1549072265625,
"mean_token_accuracy": 0.7113095596432686,
"num_tokens": 82243896.0,
"step": 140
},
{
"entropy": 1.0322265625,
"epoch": 0.8430493273542601,
"grad_norm": 14.6875,
"learning_rate": 1.3181957437744332e-06,
"loss": 1.128662109375,
"mean_token_accuracy": 0.7145743370056152,
"num_tokens": 82826175.0,
"step": 141
},
{
"entropy": 1.0458984375,
"epoch": 0.8490284005979073,
"grad_norm": 7.71875,
"learning_rate": 1.3090169943749473e-06,
"loss": 1.12841796875,
"mean_token_accuracy": 0.7128350734710693,
"num_tokens": 83415822.0,
"step": 142
},
{
"entropy": 1.03759765625,
"epoch": 0.8550074738415545,
"grad_norm": 22.875,
"learning_rate": 1.2998093706394675e-06,
"loss": 1.14453125,
"mean_token_accuracy": 0.7128356993198395,
"num_tokens": 83997557.0,
"step": 143
},
{
"entropy": 1.05078125,
"epoch": 0.8609865470852018,
"grad_norm": 13.875,
"learning_rate": 1.2905737329220392e-06,
"loss": 1.136474609375,
"mean_token_accuracy": 0.7111846879124641,
"num_tokens": 84587168.0,
"step": 144
},
{
"entropy": 1.04638671875,
"epoch": 0.866965620328849,
"grad_norm": 15.25,
"learning_rate": 1.2813109441943164e-06,
"loss": 1.138671875,
"mean_token_accuracy": 0.7117553874850273,
"num_tokens": 85176064.0,
"step": 145
},
{
"entropy": 1.0205078125,
"epoch": 0.8729446935724963,
"grad_norm": 13.625,
"learning_rate": 1.2720218699649241e-06,
"loss": 1.111572265625,
"mean_token_accuracy": 0.7199290543794632,
"num_tokens": 85765635.0,
"step": 146
},
{
"entropy": 1.02197265625,
"epoch": 0.8789237668161435,
"grad_norm": 10.5625,
"learning_rate": 1.262707378198587e-06,
"loss": 1.1162109375,
"mean_token_accuracy": 0.7189603447914124,
"num_tokens": 86355190.0,
"step": 147
},
{
"entropy": 1.04736328125,
"epoch": 0.8849028400597907,
"grad_norm": 11.75,
"learning_rate": 1.2533683392350262e-06,
"loss": 1.138427734375,
"mean_token_accuracy": 0.7134011015295982,
"num_tokens": 86938046.0,
"step": 148
},
{
"entropy": 1.02490234375,
"epoch": 0.890881913303438,
"grad_norm": 19.5,
"learning_rate": 1.2440056257076374e-06,
"loss": 1.113037109375,
"mean_token_accuracy": 0.7172698378562927,
"num_tokens": 87527598.0,
"step": 149
},
{
"entropy": 1.0146484375,
"epoch": 0.8968609865470852,
"grad_norm": 14.0,
"learning_rate": 1.23462011246195e-06,
"loss": 1.11181640625,
"mean_token_accuracy": 0.7199216857552528,
"num_tokens": 88110264.0,
"step": 150
},
{
"entropy": 1.03271484375,
"epoch": 0.9028400597907325,
"grad_norm": 8.9375,
"learning_rate": 1.2252126764738844e-06,
"loss": 1.12353515625,
"mean_token_accuracy": 0.716008372604847,
"num_tokens": 88699832.0,
"step": 151
},
{
"entropy": 1.046875,
"epoch": 0.9088191330343797,
"grad_norm": 8.125,
"learning_rate": 1.2157841967678063e-06,
"loss": 1.130126953125,
"mean_token_accuracy": 0.714422382414341,
"num_tokens": 89289495.0,
"step": 152
},
{
"entropy": 1.00439453125,
"epoch": 0.9147982062780269,
"grad_norm": 12.0,
"learning_rate": 1.2063355543343923e-06,
"loss": 1.08837890625,
"mean_token_accuracy": 0.7251131683588028,
"num_tokens": 89879089.0,
"step": 153
},
{
"entropy": 1.0244140625,
"epoch": 0.9207772795216741,
"grad_norm": 14.3125,
"learning_rate": 1.1968676320483101e-06,
"loss": 1.1243896484375,
"mean_token_accuracy": 0.7171234339475632,
"num_tokens": 90451022.0,
"step": 154
},
{
"entropy": 1.03466796875,
"epoch": 0.9267563527653214,
"grad_norm": 28.75,
"learning_rate": 1.1873813145857248e-06,
"loss": 1.1207275390625,
"mean_token_accuracy": 0.7154998481273651,
"num_tokens": 91036040.0,
"step": 155
},
{
"entropy": 1.03125,
"epoch": 0.9327354260089686,
"grad_norm": 33.25,
"learning_rate": 1.1778774883416322e-06,
"loss": 1.119873046875,
"mean_token_accuracy": 0.715819425880909,
"num_tokens": 91625659.0,
"step": 156
},
{
"entropy": 1.0400390625,
"epoch": 0.9387144992526159,
"grad_norm": 13.625,
"learning_rate": 1.1683570413470383e-06,
"loss": 1.1197509765625,
"mean_token_accuracy": 0.7150726914405823,
"num_tokens": 92215320.0,
"step": 157
},
{
"entropy": 1.01953125,
"epoch": 0.9446935724962631,
"grad_norm": 21.125,
"learning_rate": 1.1588208631859807e-06,
"loss": 1.1259765625,
"mean_token_accuracy": 0.7184253633022308,
"num_tokens": 92804840.0,
"step": 158
},
{
"entropy": 1.03076171875,
"epoch": 0.9506726457399103,
"grad_norm": 14.0,
"learning_rate": 1.149269844912404e-06,
"loss": 1.115234375,
"mean_token_accuracy": 0.7175646647810936,
"num_tokens": 93394439.0,
"step": 159
},
{
"entropy": 1.0546875,
"epoch": 0.9566517189835575,
"grad_norm": 17.5,
"learning_rate": 1.1397048789669059e-06,
"loss": 1.13916015625,
"mean_token_accuracy": 0.7107137218117714,
"num_tokens": 93979057.0,
"step": 160
},
{
"entropy": 1.05859375,
"epoch": 0.9626307922272048,
"grad_norm": 15.25,
"learning_rate": 1.1301268590933434e-06,
"loss": 1.14404296875,
"mean_token_accuracy": 0.71033675968647,
"num_tokens": 94568560.0,
"step": 161
},
{
"entropy": 1.0244140625,
"epoch": 0.968609865470852,
"grad_norm": 20.125,
"learning_rate": 1.1205366802553228e-06,
"loss": 1.1131591796875,
"mean_token_accuracy": 0.7181500568985939,
"num_tokens": 95158163.0,
"step": 162
},
{
"entropy": 1.048828125,
"epoch": 0.9745889387144993,
"grad_norm": 21.25,
"learning_rate": 1.110935238552578e-06,
"loss": 1.1319580078125,
"mean_token_accuracy": 0.7127460688352585,
"num_tokens": 95747756.0,
"step": 163
},
{
"entropy": 1.017578125,
"epoch": 0.9805680119581465,
"grad_norm": 25.125,
"learning_rate": 1.1013234311372353e-06,
"loss": 1.1143798828125,
"mean_token_accuracy": 0.7193858399987221,
"num_tokens": 96337278.0,
"step": 164
},
{
"entropy": 1.02783203125,
"epoch": 0.9865470852017937,
"grad_norm": 18.375,
"learning_rate": 1.0917021561299862e-06,
"loss": 1.1024169921875,
"mean_token_accuracy": 0.7175654470920563,
"num_tokens": 96926854.0,
"step": 165
},
{
"entropy": 1.060546875,
"epoch": 0.992526158445441,
"grad_norm": 11.375,
"learning_rate": 1.0820723125361684e-06,
"loss": 1.13623046875,
"mean_token_accuracy": 0.710972748696804,
"num_tokens": 97507731.0,
"step": 166
},
{
"entropy": 1.0107421875,
"epoch": 0.9985052316890882,
"grad_norm": 24.875,
"learning_rate": 1.0724348001617625e-06,
"loss": 1.1070556640625,
"mean_token_accuracy": 0.7217210680246353,
"num_tokens": 98097346.0,
"step": 167
},
{
"entropy": 0.9921875,
"epoch": 1.0,
"grad_norm": 23.5,
"learning_rate": 1.0627905195293135e-06,
"loss": 1.072265625,
"mean_token_accuracy": 0.726732075214386,
"num_tokens": 98244774.0,
"step": 168
},
{
"entropy": 1.037109375,
"epoch": 1.0059790732436473,
"grad_norm": 21.875,
"learning_rate": 1.0531403717937886e-06,
"loss": 1.1287841796875,
"mean_token_accuracy": 0.715124748647213,
"num_tokens": 98834357.0,
"step": 169
},
{
"entropy": 0.98046875,
"epoch": 1.0119581464872944,
"grad_norm": 20.875,
"learning_rate": 1.0434852586583737e-06,
"loss": 1.0762939453125,
"mean_token_accuracy": 0.7286977842450142,
"num_tokens": 99424044.0,
"step": 170
},
{
"entropy": 1.0498046875,
"epoch": 1.0179372197309418,
"grad_norm": 21.875,
"learning_rate": 1.0338260822902165e-06,
"loss": 1.1475830078125,
"mean_token_accuracy": 0.7115440741181374,
"num_tokens": 100013632.0,
"step": 171
},
{
"entropy": 1.03759765625,
"epoch": 1.0239162929745889,
"grad_norm": 17.25,
"learning_rate": 1.0241637452361322e-06,
"loss": 1.133056640625,
"mean_token_accuracy": 0.713756151497364,
"num_tokens": 100603269.0,
"step": 172
},
{
"entropy": 1.02197265625,
"epoch": 1.0298953662182362,
"grad_norm": 15.8125,
"learning_rate": 1.0144991503382673e-06,
"loss": 1.1068115234375,
"mean_token_accuracy": 0.7196066528558731,
"num_tokens": 101191071.0,
"step": 173
},
{
"entropy": 1.0302734375,
"epoch": 1.0358744394618835,
"grad_norm": 15.5,
"learning_rate": 1.0048332006497404e-06,
"loss": 1.111572265625,
"mean_token_accuracy": 0.7173566892743111,
"num_tokens": 101780702.0,
"step": 174
},
{
"entropy": 1.0576171875,
"epoch": 1.0418535127055306,
"grad_norm": 9.25,
"learning_rate": 9.951667993502597e-07,
"loss": 1.1553955078125,
"mean_token_accuracy": 0.7085662558674812,
"num_tokens": 102367028.0,
"step": 175
},
{
"entropy": 1.05322265625,
"epoch": 1.047832585949178,
"grad_norm": 9.0625,
"learning_rate": 9.855008496617326e-07,
"loss": 1.1552734375,
"mean_token_accuracy": 0.7092385366559029,
"num_tokens": 102956643.0,
"step": 176
},
{
"entropy": 1.029296875,
"epoch": 1.053811659192825,
"grad_norm": 29.375,
"learning_rate": 9.75836254763868e-07,
"loss": 1.136474609375,
"mean_token_accuracy": 0.7162440121173859,
"num_tokens": 103535589.0,
"step": 177
},
{
"entropy": 1.0029296875,
"epoch": 1.0597907324364724,
"grad_norm": 11.9375,
"learning_rate": 9.661739177097834e-07,
"loss": 1.0927734375,
"mean_token_accuracy": 0.7226409837603569,
"num_tokens": 104124613.0,
"step": 178
},
{
"entropy": 1.04150390625,
"epoch": 1.0657698056801195,
"grad_norm": 12.0,
"learning_rate": 9.565147413416265e-07,
"loss": 1.1234130859375,
"mean_token_accuracy": 0.7134781181812286,
"num_tokens": 104714199.0,
"step": 179
},
{
"entropy": 1.01318359375,
"epoch": 1.0717488789237668,
"grad_norm": 13.8125,
"learning_rate": 9.468596282062113e-07,
"loss": 1.1014404296875,
"mean_token_accuracy": 0.7213335856795311,
"num_tokens": 105294866.0,
"step": 180
},
{
"entropy": 1.05322265625,
"epoch": 1.0777279521674141,
"grad_norm": 14.625,
"learning_rate": 9.372094804706866e-07,
"loss": 1.152587890625,
"mean_token_accuracy": 0.7105412855744362,
"num_tokens": 105884489.0,
"step": 181
},
{
"entropy": 1.01025390625,
"epoch": 1.0837070254110612,
"grad_norm": 11.0,
"learning_rate": 9.275651998382377e-07,
"loss": 1.101318359375,
"mean_token_accuracy": 0.7201149016618729,
"num_tokens": 106474079.0,
"step": 182
},
{
"entropy": 1.02001953125,
"epoch": 1.0896860986547086,
"grad_norm": 8.3125,
"learning_rate": 9.179276874638314e-07,
"loss": 1.107666015625,
"mean_token_accuracy": 0.7216706648468971,
"num_tokens": 107063687.0,
"step": 183
},
{
"entropy": 1.02685546875,
"epoch": 1.0956651718983557,
"grad_norm": 11.625,
"learning_rate": 9.082978438700138e-07,
"loss": 1.125732421875,
"mean_token_accuracy": 0.7165105268359184,
"num_tokens": 107649683.0,
"step": 184
},
{
"entropy": 1.0322265625,
"epoch": 1.101644245142003,
"grad_norm": 10.625,
"learning_rate": 8.986765688627651e-07,
"loss": 1.10595703125,
"mean_token_accuracy": 0.7177045792341232,
"num_tokens": 108239185.0,
"step": 185
},
{
"entropy": 1.0166015625,
"epoch": 1.1076233183856503,
"grad_norm": 11.5625,
"learning_rate": 8.890647614474222e-07,
"loss": 1.1109619140625,
"mean_token_accuracy": 0.7202980294823647,
"num_tokens": 108828659.0,
"step": 186
},
{
"entropy": 1.04833984375,
"epoch": 1.1136023916292974,
"grad_norm": 23.25,
"learning_rate": 8.79463319744677e-07,
"loss": 1.14404296875,
"mean_token_accuracy": 0.7117270454764366,
"num_tokens": 109415343.0,
"step": 187
},
{
"entropy": 1.01904296875,
"epoch": 1.1195814648729447,
"grad_norm": 20.5,
"learning_rate": 8.698731409066568e-07,
"loss": 1.1033935546875,
"mean_token_accuracy": 0.7186397314071655,
"num_tokens": 110000047.0,
"step": 188
},
{
"entropy": 1.0185546875,
"epoch": 1.1255605381165918,
"grad_norm": 17.0,
"learning_rate": 8.602951210330941e-07,
"loss": 1.1114501953125,
"mean_token_accuracy": 0.7207945957779884,
"num_tokens": 110589519.0,
"step": 189
},
{
"entropy": 1.01171875,
"epoch": 1.1315396113602392,
"grad_norm": 19.625,
"learning_rate": 8.507301550875959e-07,
"loss": 1.1103515625,
"mean_token_accuracy": 0.7186660766601562,
"num_tokens": 111179017.0,
"step": 190
},
{
"entropy": 1.05859375,
"epoch": 1.1375186846038865,
"grad_norm": 19.5,
"learning_rate": 8.411791368140195e-07,
"loss": 1.1348876953125,
"mean_token_accuracy": 0.7089879661798477,
"num_tokens": 111761675.0,
"step": 191
},
{
"entropy": 1.033203125,
"epoch": 1.1434977578475336,
"grad_norm": 15.125,
"learning_rate": 8.316429586529614e-07,
"loss": 1.1116943359375,
"mean_token_accuracy": 0.7168847694993019,
"num_tokens": 112351282.0,
"step": 192
},
{
"entropy": 1.015625,
"epoch": 1.149476831091181,
"grad_norm": 10.0625,
"learning_rate": 8.221225116583676e-07,
"loss": 1.0850830078125,
"mean_token_accuracy": 0.7229639515280724,
"num_tokens": 112940935.0,
"step": 193
},
{
"entropy": 1.04150390625,
"epoch": 1.155455904334828,
"grad_norm": 9.75,
"learning_rate": 8.126186854142751e-07,
"loss": 1.1219482421875,
"mean_token_accuracy": 0.7170581594109535,
"num_tokens": 113530462.0,
"step": 194
},
{
"entropy": 1.02880859375,
"epoch": 1.1614349775784754,
"grad_norm": 17.625,
"learning_rate": 8.031323679516899e-07,
"loss": 1.130859375,
"mean_token_accuracy": 0.715842954814434,
"num_tokens": 114115879.0,
"step": 195
},
{
"entropy": 1.0419921875,
"epoch": 1.1674140508221225,
"grad_norm": 10.1875,
"learning_rate": 7.936644456656081e-07,
"loss": 1.1396484375,
"mean_token_accuracy": 0.713227279484272,
"num_tokens": 114705390.0,
"step": 196
},
{
"entropy": 1.03662109375,
"epoch": 1.1733931240657698,
"grad_norm": 23.375,
"learning_rate": 7.84215803232194e-07,
"loss": 1.1226806640625,
"mean_token_accuracy": 0.7155178636312485,
"num_tokens": 115286747.0,
"step": 197
},
{
"entropy": 1.025390625,
"epoch": 1.1793721973094171,
"grad_norm": 16.5,
"learning_rate": 7.747873235261156e-07,
"loss": 1.1126708984375,
"mean_token_accuracy": 0.7184700071811676,
"num_tokens": 115876348.0,
"step": 198
},
{
"entropy": 1.02880859375,
"epoch": 1.1853512705530642,
"grad_norm": 27.125,
"learning_rate": 7.653798875380499e-07,
"loss": 1.1217041015625,
"mean_token_accuracy": 0.7165053337812424,
"num_tokens": 116458527.0,
"step": 199
},
{
"entropy": 1.02392578125,
"epoch": 1.1913303437967115,
"grad_norm": 17.5,
"learning_rate": 7.559943742923625e-07,
"loss": 1.10888671875,
"mean_token_accuracy": 0.7185152769088745,
"num_tokens": 117048168.0,
"step": 200
},
{
"entropy": 1.0390625,
"epoch": 1.1973094170403586,
"grad_norm": 22.625,
"learning_rate": 7.466316607649736e-07,
"loss": 1.130126953125,
"mean_token_accuracy": 0.7132042795419693,
"num_tokens": 117636670.0,
"step": 201
},
{
"entropy": 1.04833984375,
"epoch": 1.203288490284006,
"grad_norm": 24.75,
"learning_rate": 7.372926218014131e-07,
"loss": 1.132568359375,
"mean_token_accuracy": 0.7136494368314743,
"num_tokens": 118226230.0,
"step": 202
},
{
"entropy": 1.0576171875,
"epoch": 1.2092675635276533,
"grad_norm": 17.875,
"learning_rate": 7.279781300350757e-07,
"loss": 1.1424560546875,
"mean_token_accuracy": 0.711665190756321,
"num_tokens": 118815835.0,
"step": 203
},
{
"entropy": 1.01953125,
"epoch": 1.2152466367713004,
"grad_norm": 19.125,
"learning_rate": 7.186890558056836e-07,
"loss": 1.1112060546875,
"mean_token_accuracy": 0.7197611033916473,
"num_tokens": 119402683.0,
"step": 204
},
{
"entropy": 1.0546875,
"epoch": 1.2212257100149477,
"grad_norm": 52.25,
"learning_rate": 7.09426267077961e-07,
"loss": 1.150146484375,
"mean_token_accuracy": 0.7076791599392891,
"num_tokens": 119987245.0,
"step": 205
},
{
"entropy": 1.03076171875,
"epoch": 1.2272047832585948,
"grad_norm": 56.5,
"learning_rate": 7.001906293605329e-07,
"loss": 1.130615234375,
"mean_token_accuracy": 0.7152413129806519,
"num_tokens": 120576831.0,
"step": 206
},
{
"entropy": 1.0244140625,
"epoch": 1.2331838565022422,
"grad_norm": 15.9375,
"learning_rate": 6.909830056250526e-07,
"loss": 1.1173095703125,
"mean_token_accuracy": 0.7177402079105377,
"num_tokens": 121166431.0,
"step": 207
},
{
"entropy": 1.03369140625,
"epoch": 1.2391629297458895,
"grad_norm": 54.25,
"learning_rate": 6.81804256225567e-07,
"loss": 1.1336669921875,
"mean_token_accuracy": 0.7143785133957863,
"num_tokens": 121756094.0,
"step": 208
},
{
"entropy": 1.0380859375,
"epoch": 1.2451420029895366,
"grad_norm": 16.25,
"learning_rate": 6.726552388181233e-07,
"loss": 1.1319580078125,
"mean_token_accuracy": 0.714967779815197,
"num_tokens": 122337877.0,
"step": 209
},
{
"entropy": 1.0263671875,
"epoch": 1.251121076233184,
"grad_norm": 14.25,
"learning_rate": 6.63536808280633e-07,
"loss": 1.109130859375,
"mean_token_accuracy": 0.7193189635872841,
"num_tokens": 122927447.0,
"step": 210
},
{
"entropy": 1.02587890625,
"epoch": 1.257100149476831,
"grad_norm": 13.3125,
"learning_rate": 6.544498166329912e-07,
"loss": 1.113525390625,
"mean_token_accuracy": 0.7177118062973022,
"num_tokens": 123509882.0,
"step": 211
},
{
"entropy": 1.0078125,
"epoch": 1.2630792227204783,
"grad_norm": 15.875,
"learning_rate": 6.453951129574643e-07,
"loss": 1.0953369140625,
"mean_token_accuracy": 0.722569465637207,
"num_tokens": 124099443.0,
"step": 212
},
{
"entropy": 1.048828125,
"epoch": 1.2690582959641254,
"grad_norm": 17.25,
"learning_rate": 6.363735433193529e-07,
"loss": 1.1336669921875,
"mean_token_accuracy": 0.7113273218274117,
"num_tokens": 124682200.0,
"step": 213
},
{
"entropy": 1.01318359375,
"epoch": 1.2750373692077728,
"grad_norm": 16.625,
"learning_rate": 6.273859506879364e-07,
"loss": 1.10498046875,
"mean_token_accuracy": 0.7205186262726784,
"num_tokens": 125265808.0,
"step": 214
},
{
"entropy": 1.013671875,
"epoch": 1.28101644245142,
"grad_norm": 10.75,
"learning_rate": 6.18433174857705e-07,
"loss": 1.112060546875,
"mean_token_accuracy": 0.7207604125142097,
"num_tokens": 125855421.0,
"step": 215
},
{
"entropy": 1.0322265625,
"epoch": 1.2869955156950672,
"grad_norm": 8.75,
"learning_rate": 6.095160523698912e-07,
"loss": 1.118408203125,
"mean_token_accuracy": 0.7177054435014725,
"num_tokens": 126445003.0,
"step": 216
},
{
"entropy": 1.021484375,
"epoch": 1.2929745889387145,
"grad_norm": 9.0625,
"learning_rate": 6.006354164343046e-07,
"loss": 1.110595703125,
"mean_token_accuracy": 0.7162402048707008,
"num_tokens": 127027787.0,
"step": 217
},
{
"entropy": 1.05029296875,
"epoch": 1.2989536621823619,
"grad_norm": 11.5625,
"learning_rate": 5.917920968514751e-07,
"loss": 1.1461181640625,
"mean_token_accuracy": 0.7097650542855263,
"num_tokens": 127617320.0,
"step": 218
},
{
"entropy": 1.03662109375,
"epoch": 1.304932735426009,
"grad_norm": 12.4375,
"learning_rate": 5.829869199351187e-07,
"loss": 1.1298828125,
"mean_token_accuracy": 0.7168288081884384,
"num_tokens": 128206868.0,
"step": 219
},
{
"entropy": 1.0361328125,
"epoch": 1.310911808669656,
"grad_norm": 10.5,
"learning_rate": 5.742207084349273e-07,
"loss": 1.1165771484375,
"mean_token_accuracy": 0.7156732380390167,
"num_tokens": 128796426.0,
"step": 220
},
{
"entropy": 1.01904296875,
"epoch": 1.3168908819133034,
"grad_norm": 13.25,
"learning_rate": 5.654942814596901e-07,
"loss": 1.10205078125,
"mean_token_accuracy": 0.7199403569102287,
"num_tokens": 129385997.0,
"step": 221
},
{
"entropy": 1.02392578125,
"epoch": 1.3228699551569507,
"grad_norm": 15.0625,
"learning_rate": 5.568084544007588e-07,
"loss": 1.11083984375,
"mean_token_accuracy": 0.7177979946136475,
"num_tokens": 129961180.0,
"step": 222
},
{
"entropy": 1.00927734375,
"epoch": 1.3288490284005978,
"grad_norm": 20.375,
"learning_rate": 5.48164038855855e-07,
"loss": 1.094482421875,
"mean_token_accuracy": 0.7219114229083061,
"num_tokens": 130549338.0,
"step": 223
},
{
"entropy": 1.01708984375,
"epoch": 1.3348281016442451,
"grad_norm": 12.25,
"learning_rate": 5.395618425532389e-07,
"loss": 1.1097412109375,
"mean_token_accuracy": 0.7211140915751457,
"num_tokens": 131134800.0,
"step": 224
},
{
"entropy": 0.99365234375,
"epoch": 1.3408071748878925,
"grad_norm": 26.25,
"learning_rate": 5.310026692762314e-07,
"loss": 1.0784912109375,
"mean_token_accuracy": 0.727905310690403,
"num_tokens": 131724429.0,
"step": 225
},
{
"entropy": 1.029296875,
"epoch": 1.3467862481315396,
"grad_norm": 14.75,
"learning_rate": 5.224873187881136e-07,
"loss": 1.1151123046875,
"mean_token_accuracy": 0.7176884040236473,
"num_tokens": 132314019.0,
"step": 226
},
{
"entropy": 1.03125,
"epoch": 1.352765321375187,
"grad_norm": 15.3125,
"learning_rate": 5.140165867573939e-07,
"loss": 1.12353515625,
"mean_token_accuracy": 0.7174642384052277,
"num_tokens": 132903580.0,
"step": 227
},
{
"entropy": 1.0390625,
"epoch": 1.358744394618834,
"grad_norm": 18.125,
"learning_rate": 5.055912646834635e-07,
"loss": 1.127197265625,
"mean_token_accuracy": 0.7134326621890068,
"num_tokens": 133493126.0,
"step": 228
},
{
"entropy": 1.01513671875,
"epoch": 1.3647234678624813,
"grad_norm": 10.75,
"learning_rate": 4.972121398226371e-07,
"loss": 1.101318359375,
"mean_token_accuracy": 0.7197434529662132,
"num_tokens": 134079329.0,
"step": 229
},
{
"entropy": 1.041015625,
"epoch": 1.3707025411061284,
"grad_norm": 13.625,
"learning_rate": 4.888799951145947e-07,
"loss": 1.1278076171875,
"mean_token_accuracy": 0.7140819206833839,
"num_tokens": 134654986.0,
"step": 230
},
{
"entropy": 1.0380859375,
"epoch": 1.3766816143497758,
"grad_norm": 10.4375,
"learning_rate": 4.805956091092227e-07,
"loss": 1.123779296875,
"mean_token_accuracy": 0.7162793427705765,
"num_tokens": 135244586.0,
"step": 231
},
{
"entropy": 1.05810546875,
"epoch": 1.382660687593423,
"grad_norm": 16.375,
"learning_rate": 4.7235975589386713e-07,
"loss": 1.1463623046875,
"mean_token_accuracy": 0.7098471596837044,
"num_tokens": 135834199.0,
"step": 232
},
{
"entropy": 1.04443359375,
"epoch": 1.3886397608370702,
"grad_norm": 17.125,
"learning_rate": 4.641732050210031e-07,
"loss": 1.1280517578125,
"mean_token_accuracy": 0.7144335135817528,
"num_tokens": 136423743.0,
"step": 233
},
{
"entropy": 1.06640625,
"epoch": 1.3946188340807175,
"grad_norm": 12.3125,
"learning_rate": 4.5603672143632945e-07,
"loss": 1.1444091796875,
"mean_token_accuracy": 0.7087363749742508,
"num_tokens": 137013243.0,
"step": 234
},
{
"entropy": 1.0302734375,
"epoch": 1.4005979073243648,
"grad_norm": 12.4375,
"learning_rate": 4.479510654072909e-07,
"loss": 1.1185302734375,
"mean_token_accuracy": 0.7174856439232826,
"num_tokens": 137599228.0,
"step": 235
},
{
"entropy": 1.02783203125,
"epoch": 1.406576980568012,
"grad_norm": 25.625,
"learning_rate": 4.399169924520403e-07,
"loss": 1.1148681640625,
"mean_token_accuracy": 0.7194091156125069,
"num_tokens": 138181557.0,
"step": 236
},
{
"entropy": 1.0234375,
"epoch": 1.4125560538116593,
"grad_norm": 16.625,
"learning_rate": 4.3193525326884426e-07,
"loss": 1.1102294921875,
"mean_token_accuracy": 0.7175892367959023,
"num_tokens": 138771048.0,
"step": 237
},
{
"entropy": 1.0361328125,
"epoch": 1.4185351270553064,
"grad_norm": 14.6875,
"learning_rate": 4.240065936659374e-07,
"loss": 1.12451171875,
"mean_token_accuracy": 0.71492750197649,
"num_tokens": 139360593.0,
"step": 238
},
{
"entropy": 1.05322265625,
"epoch": 1.4245142002989537,
"grad_norm": 25.125,
"learning_rate": 4.1613175449183446e-07,
"loss": 1.13232421875,
"mean_token_accuracy": 0.7100114226341248,
"num_tokens": 139946622.0,
"step": 239
},
{
"entropy": 0.99560546875,
"epoch": 1.4304932735426008,
"grad_norm": 36.75,
"learning_rate": 4.0831147156610676e-07,
"loss": 1.0897216796875,
"mean_token_accuracy": 0.7266808152198792,
"num_tokens": 140531396.0,
"step": 240
},
{
"entropy": 1.0009765625,
"epoch": 1.4364723467862481,
"grad_norm": 24.75,
"learning_rate": 4.0054647561062615e-07,
"loss": 1.0850830078125,
"mean_token_accuracy": 0.7258649617433548,
"num_tokens": 141120987.0,
"step": 241
},
{
"entropy": 1.0244140625,
"epoch": 1.4424514200298955,
"grad_norm": 29.75,
"learning_rate": 3.928374921812888e-07,
"loss": 1.1165771484375,
"mean_token_accuracy": 0.7189558371901512,
"num_tokens": 141703670.0,
"step": 242
},
{
"entropy": 1.046875,
"epoch": 1.4484304932735426,
"grad_norm": 10.375,
"learning_rate": 3.851852416002187e-07,
"loss": 1.1234130859375,
"mean_token_accuracy": 0.7134259343147278,
"num_tokens": 142283264.0,
"step": 243
},
{
"entropy": 1.03466796875,
"epoch": 1.45440956651719,
"grad_norm": 35.75,
"learning_rate": 3.7759043888846173e-07,
"loss": 1.12158203125,
"mean_token_accuracy": 0.715514525771141,
"num_tokens": 142870702.0,
"step": 244
},
{
"entropy": 1.0615234375,
"epoch": 1.460388639760837,
"grad_norm": 8.5625,
"learning_rate": 3.7005379369917324e-07,
"loss": 1.1358642578125,
"mean_token_accuracy": 0.7089022919535637,
"num_tokens": 143460302.0,
"step": 245
},
{
"entropy": 1.0126953125,
"epoch": 1.4663677130044843,
"grad_norm": 14.6875,
"learning_rate": 3.625760102513102e-07,
"loss": 1.1024169921875,
"mean_token_accuracy": 0.7216273471713066,
"num_tokens": 144047084.0,
"step": 246
},
{
"entropy": 1.037109375,
"epoch": 1.4723467862481314,
"grad_norm": 12.5,
"learning_rate": 3.551577872638296e-07,
"loss": 1.1268310546875,
"mean_token_accuracy": 0.7154128924012184,
"num_tokens": 144629747.0,
"step": 247
},
{
"entropy": 1.029296875,
"epoch": 1.4783258594917787,
"grad_norm": 15.875,
"learning_rate": 3.477998178903981e-07,
"loss": 1.133056640625,
"mean_token_accuracy": 0.7142782434821129,
"num_tokens": 145219266.0,
"step": 248
},
{
"entropy": 1.0146484375,
"epoch": 1.484304932735426,
"grad_norm": 10.5,
"learning_rate": 3.4050278965462763e-07,
"loss": 1.0947265625,
"mean_token_accuracy": 0.720554769039154,
"num_tokens": 145808833.0,
"step": 249
},
{
"entropy": 1.017578125,
"epoch": 1.4902840059790732,
"grad_norm": 12.0,
"learning_rate": 3.3326738438583114e-07,
"loss": 1.1031494140625,
"mean_token_accuracy": 0.7191615030169487,
"num_tokens": 146398531.0,
"step": 250
},
{
"entropy": 1.013671875,
"epoch": 1.4962630792227205,
"grad_norm": 13.0625,
"learning_rate": 3.260942781553142e-07,
"loss": 1.1036376953125,
"mean_token_accuracy": 0.7216013073921204,
"num_tokens": 146988047.0,
"step": 251
},
{
"entropy": 1.021484375,
"epoch": 1.5022421524663678,
"grad_norm": 8.1875,
"learning_rate": 3.189841412132027e-07,
"loss": 1.10498046875,
"mean_token_accuracy": 0.7199263349175453,
"num_tokens": 147577682.0,
"step": 252
},
{
"entropy": 1.05859375,
"epoch": 1.508221225710015,
"grad_norm": 9.75,
"learning_rate": 3.1193763792581594e-07,
"loss": 1.134765625,
"mean_token_accuracy": 0.7107567712664604,
"num_tokens": 148166138.0,
"step": 253
},
{
"entropy": 1.0166015625,
"epoch": 1.514200298953662,
"grad_norm": 7.59375,
"learning_rate": 3.0495542671358744e-07,
"loss": 1.1031494140625,
"mean_token_accuracy": 0.7203914448618889,
"num_tokens": 148755748.0,
"step": 254
},
{
"entropy": 1.03076171875,
"epoch": 1.5201793721973094,
"grad_norm": 19.25,
"learning_rate": 2.980381599895433e-07,
"loss": 1.1265869140625,
"mean_token_accuracy": 0.7148845717310905,
"num_tokens": 149345252.0,
"step": 255
},
{
"entropy": 1.087890625,
"epoch": 1.5261584454409567,
"grad_norm": 10.5,
"learning_rate": 2.91186484098342e-07,
"loss": 1.1712646484375,
"mean_token_accuracy": 0.7025258839130402,
"num_tokens": 149934781.0,
"step": 256
},
{
"entropy": 1.02880859375,
"epoch": 1.5321375186846038,
"grad_norm": 15.0,
"learning_rate": 2.84401039255879e-07,
"loss": 1.1123046875,
"mean_token_accuracy": 0.7171602919697762,
"num_tokens": 150524424.0,
"step": 257
},
{
"entropy": 1.04345703125,
"epoch": 1.5381165919282511,
"grad_norm": 16.5,
"learning_rate": 2.776824594894661e-07,
"loss": 1.1370849609375,
"mean_token_accuracy": 0.7134297341108322,
"num_tokens": 151113962.0,
"step": 258
},
{
"entropy": 1.02685546875,
"epoch": 1.5440956651718984,
"grad_norm": 13.875,
"learning_rate": 2.7103137257858863e-07,
"loss": 1.1080322265625,
"mean_token_accuracy": 0.7190811783075333,
"num_tokens": 151703586.0,
"step": 259
},
{
"entropy": 1.048828125,
"epoch": 1.5500747384155455,
"grad_norm": 9.25,
"learning_rate": 2.644483999962449e-07,
"loss": 1.1405029296875,
"mean_token_accuracy": 0.712283693253994,
"num_tokens": 152292444.0,
"step": 260
},
{
"entropy": 1.01025390625,
"epoch": 1.5560538116591929,
"grad_norm": 9.625,
"learning_rate": 2.579341568508779e-07,
"loss": 1.09228515625,
"mean_token_accuracy": 0.721127025783062,
"num_tokens": 152882090.0,
"step": 261
},
{
"entropy": 1.03466796875,
"epoch": 1.5620328849028402,
"grad_norm": 6.875,
"learning_rate": 2.514892518288988e-07,
"loss": 1.1090087890625,
"mean_token_accuracy": 0.7168014496564865,
"num_tokens": 153471720.0,
"step": 262
},
{
"entropy": 1.056640625,
"epoch": 1.5680119581464873,
"grad_norm": 6.25,
"learning_rate": 2.4511428713781236e-07,
"loss": 1.1324462890625,
"mean_token_accuracy": 0.7105724215507507,
"num_tokens": 154061310.0,
"step": 263
},
{
"entropy": 1.0419921875,
"epoch": 1.5739910313901344,
"grad_norm": 16.75,
"learning_rate": 2.3880985844994673e-07,
"loss": 1.1239013671875,
"mean_token_accuracy": 0.713848665356636,
"num_tokens": 154650888.0,
"step": 264
},
{
"entropy": 1.037109375,
"epoch": 1.5799701046337817,
"grad_norm": 8.9375,
"learning_rate": 2.3257655484679372e-07,
"loss": 1.12451171875,
"mean_token_accuracy": 0.7131286934018135,
"num_tokens": 155239209.0,
"step": 265
},
{
"entropy": 1.04638671875,
"epoch": 1.585949177877429,
"grad_norm": 8.5625,
"learning_rate": 2.264149587639671e-07,
"loss": 1.13037109375,
"mean_token_accuracy": 0.7152972370386124,
"num_tokens": 155828817.0,
"step": 266
},
{
"entropy": 1.005859375,
"epoch": 1.5919282511210762,
"grad_norm": 13.9375,
"learning_rate": 2.2032564593677772e-07,
"loss": 1.0977783203125,
"mean_token_accuracy": 0.7207474857568741,
"num_tokens": 156418416.0,
"step": 267
},
{
"entropy": 1.0166015625,
"epoch": 1.5979073243647235,
"grad_norm": 15.25,
"learning_rate": 2.1430918534643994e-07,
"loss": 1.1092529296875,
"mean_token_accuracy": 0.7178195714950562,
"num_tokens": 156996671.0,
"step": 268
},
{
"entropy": 1.0126953125,
"epoch": 1.6038863976083708,
"grad_norm": 10.1875,
"learning_rate": 2.0836613916690427e-07,
"loss": 1.097900390625,
"mean_token_accuracy": 0.7219494804739952,
"num_tokens": 157586304.0,
"step": 269
},
{
"entropy": 1.041015625,
"epoch": 1.609865470852018,
"grad_norm": 7.375,
"learning_rate": 2.0249706271232946e-07,
"loss": 1.13525390625,
"mean_token_accuracy": 0.714814230799675,
"num_tokens": 158175939.0,
"step": 270
},
{
"entropy": 1.01953125,
"epoch": 1.615844544095665,
"grad_norm": 14.4375,
"learning_rate": 1.9670250438519386e-07,
"loss": 1.1107177734375,
"mean_token_accuracy": 0.719508022069931,
"num_tokens": 158765530.0,
"step": 271
},
{
"entropy": 1.03076171875,
"epoch": 1.6218236173393124,
"grad_norm": 6.09375,
"learning_rate": 1.9098300562505264e-07,
"loss": 1.112060546875,
"mean_token_accuracy": 0.7187488600611687,
"num_tokens": 159355121.0,
"step": 272
},
{
"entropy": 1.05078125,
"epoch": 1.6278026905829597,
"grad_norm": 9.875,
"learning_rate": 1.8533910085794713e-07,
"loss": 1.1397705078125,
"mean_token_accuracy": 0.7100469321012497,
"num_tokens": 159937687.0,
"step": 273
},
{
"entropy": 1.01904296875,
"epoch": 1.6337817638266068,
"grad_norm": 10.8125,
"learning_rate": 1.7977131744646724e-07,
"loss": 1.1077880859375,
"mean_token_accuracy": 0.720647431910038,
"num_tokens": 160518664.0,
"step": 274
},
{
"entropy": 0.9990234375,
"epoch": 1.639760837070254,
"grad_norm": 20.25,
"learning_rate": 1.742801756404759e-07,
"loss": 1.09033203125,
"mean_token_accuracy": 0.7235589995980263,
"num_tokens": 161102238.0,
"step": 275
},
{
"entropy": 1.033203125,
"epoch": 1.6457399103139014,
"grad_norm": 12.5625,
"learning_rate": 1.688661885284972e-07,
"loss": 1.125,
"mean_token_accuracy": 0.7176312282681465,
"num_tokens": 161676535.0,
"step": 276
},
{
"entropy": 1.02783203125,
"epoch": 1.6517189835575485,
"grad_norm": 18.75,
"learning_rate": 1.6352986198977325e-07,
"loss": 1.10791015625,
"mean_token_accuracy": 0.718546986579895,
"num_tokens": 162266110.0,
"step": 277
},
{
"entropy": 1.0009765625,
"epoch": 1.6576980568011959,
"grad_norm": 10.5,
"learning_rate": 1.5827169464699575e-07,
"loss": 1.0906982421875,
"mean_token_accuracy": 0.7236178815364838,
"num_tokens": 162855683.0,
"step": 278
},
{
"entropy": 1.033203125,
"epoch": 1.6636771300448432,
"grad_norm": 8.375,
"learning_rate": 1.5309217781971416e-07,
"loss": 1.1171875,
"mean_token_accuracy": 0.7165531665086746,
"num_tokens": 163428337.0,
"step": 279
},
{
"entropy": 1.0244140625,
"epoch": 1.6696562032884903,
"grad_norm": 15.5625,
"learning_rate": 1.479917954784282e-07,
"loss": 1.10693359375,
"mean_token_accuracy": 0.7174379974603653,
"num_tokens": 164017962.0,
"step": 280
},
{
"entropy": 1.03466796875,
"epoch": 1.6756352765321374,
"grad_norm": 22.125,
"learning_rate": 1.429710241993656e-07,
"loss": 1.1173095703125,
"mean_token_accuracy": 0.7154664248228073,
"num_tokens": 164605762.0,
"step": 281
},
{
"entropy": 1.04541015625,
"epoch": 1.6816143497757847,
"grad_norm": 14.875,
"learning_rate": 1.380303331199507e-07,
"loss": 1.1348876953125,
"mean_token_accuracy": 0.7117466628551483,
"num_tokens": 165195338.0,
"step": 282
},
{
"entropy": 1.009765625,
"epoch": 1.687593423019432,
"grad_norm": 21.0,
"learning_rate": 1.3317018389496926e-07,
"loss": 1.111083984375,
"mean_token_accuracy": 0.7207304239273071,
"num_tokens": 165784893.0,
"step": 283
},
{
"entropy": 1.02734375,
"epoch": 1.6935724962630792,
"grad_norm": 8.6875,
"learning_rate": 1.283910306534308e-07,
"loss": 1.1119384765625,
"mean_token_accuracy": 0.717054933309555,
"num_tokens": 166374502.0,
"step": 284
},
{
"entropy": 1.02783203125,
"epoch": 1.6995515695067265,
"grad_norm": 12.75,
"learning_rate": 1.2369331995613663e-07,
"loss": 1.1182861328125,
"mean_token_accuracy": 0.7191892936825752,
"num_tokens": 166964071.0,
"step": 285
},
{
"entropy": 1.01904296875,
"epoch": 1.7055306427503738,
"grad_norm": 19.625,
"learning_rate": 1.1907749075395146e-07,
"loss": 1.1087646484375,
"mean_token_accuracy": 0.7190410420298576,
"num_tokens": 167553522.0,
"step": 286
},
{
"entropy": 1.03466796875,
"epoch": 1.711509715994021,
"grad_norm": 14.3125,
"learning_rate": 1.145439743467902e-07,
"loss": 1.11865234375,
"mean_token_accuracy": 0.71589395403862,
"num_tokens": 168143165.0,
"step": 287
},
{
"entropy": 1.037109375,
"epoch": 1.717488789237668,
"grad_norm": 9.0,
"learning_rate": 1.1009319434331621e-07,
"loss": 1.1199951171875,
"mean_token_accuracy": 0.7174848467111588,
"num_tokens": 168727049.0,
"step": 288
},
{
"entropy": 1.052734375,
"epoch": 1.7234678624813156,
"grad_norm": 13.3125,
"learning_rate": 1.0572556662136035e-07,
"loss": 1.1346435546875,
"mean_token_accuracy": 0.7104056030511856,
"num_tokens": 169308189.0,
"step": 289
},
{
"entropy": 1.0556640625,
"epoch": 1.7294469357249627,
"grad_norm": 10.3125,
"learning_rate": 1.014414992890611e-07,
"loss": 1.1441650390625,
"mean_token_accuracy": 0.7134399339556694,
"num_tokens": 169897805.0,
"step": 290
},
{
"entropy": 1.03564453125,
"epoch": 1.7354260089686098,
"grad_norm": 12.4375,
"learning_rate": 9.724139264673114e-08,
"loss": 1.1241455078125,
"mean_token_accuracy": 0.7146468609571457,
"num_tokens": 170487344.0,
"step": 291
},
{
"entropy": 1.0625,
"epoch": 1.741405082212257,
"grad_norm": 9.625,
"learning_rate": 9.312563914945459e-08,
"loss": 1.14111328125,
"mean_token_accuracy": 0.7087547183036804,
"num_tokens": 171076956.0,
"step": 292
},
{
"entropy": 1.0302734375,
"epoch": 1.7473841554559044,
"grad_norm": 12.3125,
"learning_rate": 8.909462337041507e-08,
"loss": 1.119384765625,
"mean_token_accuracy": 0.7157952710986137,
"num_tokens": 171666573.0,
"step": 293
},
{
"entropy": 1.0283203125,
"epoch": 1.7533632286995515,
"grad_norm": 17.5,
"learning_rate": 8.514872196496181e-08,
"loss": 1.116943359375,
"mean_token_accuracy": 0.7182503044605255,
"num_tokens": 172247089.0,
"step": 294
},
{
"entropy": 1.0390625,
"epoch": 1.7593423019431988,
"grad_norm": 6.96875,
"learning_rate": 8.128830363541572e-08,
"loss": 1.132568359375,
"mean_token_accuracy": 0.7144065871834755,
"num_tokens": 172836721.0,
"step": 295
},
{
"entropy": 1.0361328125,
"epoch": 1.7653213751868462,
"grad_norm": 14.75,
"learning_rate": 7.751372909661768e-08,
"loss": 1.1168212890625,
"mean_token_accuracy": 0.7155275791883469,
"num_tokens": 173426281.0,
"step": 296
},
{
"entropy": 1.00146484375,
"epoch": 1.7713004484304933,
"grad_norm": 16.375,
"learning_rate": 7.382535104222364e-08,
"loss": 1.0948486328125,
"mean_token_accuracy": 0.7220958769321442,
"num_tokens": 174015810.0,
"step": 297
},
{
"entropy": 1.01513671875,
"epoch": 1.7772795216741404,
"grad_norm": 12.8125,
"learning_rate": 7.022351411174865e-08,
"loss": 1.097900390625,
"mean_token_accuracy": 0.7205198705196381,
"num_tokens": 174602847.0,
"step": 298
},
{
"entropy": 1.029296875,
"epoch": 1.7832585949177877,
"grad_norm": 11.8125,
"learning_rate": 6.670855485836524e-08,
"loss": 1.1104736328125,
"mean_token_accuracy": 0.718941256403923,
"num_tokens": 175192486.0,
"step": 299
},
{
"entropy": 1.0439453125,
"epoch": 1.789237668161435,
"grad_norm": 11.75,
"learning_rate": 6.328080171745509e-08,
"loss": 1.125,
"mean_token_accuracy": 0.7150193601846695,
"num_tokens": 175782052.0,
"step": 300
},
{
"entropy": 1.03955078125,
"epoch": 1.7952167414050821,
"grad_norm": 13.6875,
"learning_rate": 5.994057497592031e-08,
"loss": 1.13037109375,
"mean_token_accuracy": 0.7155382409691811,
"num_tokens": 176366656.0,
"step": 301
},
{
"entropy": 1.03759765625,
"epoch": 1.8011958146487295,
"grad_norm": 10.3125,
"learning_rate": 5.6688186742256835e-08,
"loss": 1.11767578125,
"mean_token_accuracy": 0.7158585712313652,
"num_tokens": 176956185.0,
"step": 302
},
{
"entropy": 1.04052734375,
"epoch": 1.8071748878923768,
"grad_norm": 20.625,
"learning_rate": 5.352394091739021e-08,
"loss": 1.1318359375,
"mean_token_accuracy": 0.7144715860486031,
"num_tokens": 177545828.0,
"step": 303
},
{
"entropy": 1.01611328125,
"epoch": 1.813153961136024,
"grad_norm": 11.75,
"learning_rate": 5.0448133166279935e-08,
"loss": 1.1007080078125,
"mean_token_accuracy": 0.7212875410914421,
"num_tokens": 178126394.0,
"step": 304
},
{
"entropy": 1.04443359375,
"epoch": 1.819133034379671,
"grad_norm": 11.4375,
"learning_rate": 4.746105089029229e-08,
"loss": 1.1265869140625,
"mean_token_accuracy": 0.714270606637001,
"num_tokens": 178715841.0,
"step": 305
},
{
"entropy": 1.01171875,
"epoch": 1.8251121076233185,
"grad_norm": 12.625,
"learning_rate": 4.456297320034641e-08,
"loss": 1.0985107421875,
"mean_token_accuracy": 0.7234744802117348,
"num_tokens": 179304834.0,
"step": 306
},
{
"entropy": 1.025390625,
"epoch": 1.8310911808669657,
"grad_norm": 23.375,
"learning_rate": 4.1754170890833774e-08,
"loss": 1.1092529296875,
"mean_token_accuracy": 0.7182625830173492,
"num_tokens": 179894402.0,
"step": 307
},
{
"entropy": 1.0283203125,
"epoch": 1.8370702541106128,
"grad_norm": 13.875,
"learning_rate": 3.9034906414315725e-08,
"loss": 1.126953125,
"mean_token_accuracy": 0.7168915420770645,
"num_tokens": 180483905.0,
"step": 308
},
{
"entropy": 1.0234375,
"epoch": 1.84304932735426,
"grad_norm": 14.9375,
"learning_rate": 3.6405433856999676e-08,
"loss": 1.10693359375,
"mean_token_accuracy": 0.7204272672533989,
"num_tokens": 181073455.0,
"step": 309
},
{
"entropy": 1.01025390625,
"epoch": 1.8490284005979074,
"grad_norm": 15.5625,
"learning_rate": 3.386599891499764e-08,
"loss": 1.0946044921875,
"mean_token_accuracy": 0.7214725464582443,
"num_tokens": 181663017.0,
"step": 310
},
{
"entropy": 1.04638671875,
"epoch": 1.8550074738415545,
"grad_norm": 12.625,
"learning_rate": 3.141683887136892e-08,
"loss": 1.13232421875,
"mean_token_accuracy": 0.7134620323777199,
"num_tokens": 182245322.0,
"step": 311
},
{
"entropy": 1.01513671875,
"epoch": 1.8609865470852018,
"grad_norm": 11.0,
"learning_rate": 2.9058182573947986e-08,
"loss": 1.0958251953125,
"mean_token_accuracy": 0.7222162559628487,
"num_tokens": 182826090.0,
"step": 312
},
{
"entropy": 1.00341796875,
"epoch": 1.8669656203288492,
"grad_norm": 8.1875,
"learning_rate": 2.6790250413961546e-08,
"loss": 1.0860595703125,
"mean_token_accuracy": 0.7237614244222641,
"num_tokens": 183415723.0,
"step": 313
},
{
"entropy": 1.0322265625,
"epoch": 1.8729446935724963,
"grad_norm": 11.375,
"learning_rate": 2.4613254305434815e-08,
"loss": 1.10894775390625,
"mean_token_accuracy": 0.7184558361768723,
"num_tokens": 183996444.0,
"step": 314
},
{
"entropy": 1.03515625,
"epoch": 1.8789237668161434,
"grad_norm": 6.46875,
"learning_rate": 2.2527397665391024e-08,
"loss": 1.1177978515625,
"mean_token_accuracy": 0.716648705303669,
"num_tokens": 184586073.0,
"step": 315
},
{
"entropy": 1.05078125,
"epoch": 1.8849028400597907,
"grad_norm": 12.1875,
"learning_rate": 2.053287539484405e-08,
"loss": 1.1304931640625,
"mean_token_accuracy": 0.7115833833813667,
"num_tokens": 185175613.0,
"step": 316
},
{
"entropy": 1.0009765625,
"epoch": 1.890881913303438,
"grad_norm": 11.8125,
"learning_rate": 1.8629873860586564e-08,
"loss": 1.0841064453125,
"mean_token_accuracy": 0.7247348576784134,
"num_tokens": 185765103.0,
"step": 317
},
{
"entropy": 0.99609375,
"epoch": 1.8968609865470851,
"grad_norm": 10.75,
"learning_rate": 1.6818570877776718e-08,
"loss": 1.0699462890625,
"mean_token_accuracy": 0.7280925586819649,
"num_tokens": 186347971.0,
"step": 318
},
{
"entropy": 1.03369140625,
"epoch": 1.9028400597907325,
"grad_norm": 8.0,
"learning_rate": 1.5099135693322773e-08,
"loss": 1.1158447265625,
"mean_token_accuracy": 0.716788075864315,
"num_tokens": 186937575.0,
"step": 319
},
{
"entropy": 1.0078125,
"epoch": 1.9088191330343798,
"grad_norm": 9.4375,
"learning_rate": 1.3471728970068985e-08,
"loss": 1.0909423828125,
"mean_token_accuracy": 0.7214584723114967,
"num_tokens": 187525095.0,
"step": 320
},
{
"entropy": 1.041015625,
"epoch": 1.9147982062780269,
"grad_norm": 7.65625,
"learning_rate": 1.1936502771783486e-08,
"loss": 1.1307373046875,
"mean_token_accuracy": 0.7133340612053871,
"num_tokens": 188114646.0,
"step": 321
},
{
"entropy": 1.0556640625,
"epoch": 1.920777279521674,
"grad_norm": 12.0,
"learning_rate": 1.0493600548948877e-08,
"loss": 1.140625,
"mean_token_accuracy": 0.7104567736387253,
"num_tokens": 188700786.0,
"step": 322
},
{
"entropy": 1.04296875,
"epoch": 1.9267563527653215,
"grad_norm": 14.4375,
"learning_rate": 9.143157125359513e-09,
"loss": 1.134033203125,
"mean_token_accuracy": 0.7124024033546448,
"num_tokens": 189285842.0,
"step": 323
},
{
"entropy": 1.02294921875,
"epoch": 1.9327354260089686,
"grad_norm": 15.375,
"learning_rate": 7.885298685522235e-09,
"loss": 1.117431640625,
"mean_token_accuracy": 0.7192790359258652,
"num_tokens": 189868226.0,
"step": 324
},
{
"entropy": 1.017578125,
"epoch": 1.9387144992526157,
"grad_norm": 26.25,
"learning_rate": 6.720142762867032e-09,
"loss": 1.107666015625,
"mean_token_accuracy": 0.718171015381813,
"num_tokens": 190450814.0,
"step": 325
},
{
"entropy": 0.98291015625,
"epoch": 1.944693572496263,
"grad_norm": 29.375,
"learning_rate": 5.647798228764156e-09,
"loss": 1.0780029296875,
"mean_token_accuracy": 0.7297961264848709,
"num_tokens": 191040409.0,
"step": 326
},
{
"entropy": 1.03125,
"epoch": 1.9506726457399104,
"grad_norm": 8.9375,
"learning_rate": 4.668365282351372e-09,
"loss": 1.1124267578125,
"mean_token_accuracy": 0.7161725759506226,
"num_tokens": 191630067.0,
"step": 327
},
{
"entropy": 1.02685546875,
"epoch": 1.9566517189835575,
"grad_norm": 10.5625,
"learning_rate": 3.7819354411713355e-09,
"loss": 1.11083984375,
"mean_token_accuracy": 0.7190196141600609,
"num_tokens": 192219651.0,
"step": 328
},
{
"entropy": 1.03076171875,
"epoch": 1.9626307922272048,
"grad_norm": 12.25,
"learning_rate": 2.9885915326203216e-09,
"loss": 1.1121826171875,
"mean_token_accuracy": 0.7161883562803268,
"num_tokens": 192809216.0,
"step": 329
},
{
"entropy": 1.0224609375,
"epoch": 1.9686098654708521,
"grad_norm": 13.5625,
"learning_rate": 2.2884076862089707e-09,
"loss": 1.108642578125,
"mean_token_accuracy": 0.7192875891923904,
"num_tokens": 193394355.0,
"step": 330
},
{
"entropy": 1.041015625,
"epoch": 1.9745889387144993,
"grad_norm": 13.125,
"learning_rate": 1.6814493266357199e-09,
"loss": 1.129638671875,
"mean_token_accuracy": 0.7148761376738548,
"num_tokens": 193983861.0,
"step": 331
},
{
"entropy": 0.98583984375,
"epoch": 1.9805680119581464,
"grad_norm": 11.0625,
"learning_rate": 1.1677731676733581e-09,
"loss": 1.0601806640625,
"mean_token_accuracy": 0.7276952490210533,
"num_tokens": 194573512.0,
"step": 332
},
{
"entropy": 1.0263671875,
"epoch": 1.9865470852017937,
"grad_norm": 12.9375,
"learning_rate": 7.474272068698217e-10,
"loss": 1.1114501953125,
"mean_token_accuracy": 0.7177807167172432,
"num_tokens": 195163062.0,
"step": 333
},
{
"entropy": 1.0537109375,
"epoch": 1.992526158445441,
"grad_norm": 10.9375,
"learning_rate": 4.204507210633368e-10,
"loss": 1.135498046875,
"mean_token_accuracy": 0.7125765532255173,
"num_tokens": 195752597.0,
"step": 334
},
{
"entropy": 1.03369140625,
"epoch": 1.9985052316890881,
"grad_norm": 10.375,
"learning_rate": 1.8687426271246642e-10,
"loss": 1.1175537109375,
"mean_token_accuracy": 0.7172495499253273,
"num_tokens": 196342135.0,
"step": 335
},
{
"entropy": 0.9921875,
"epoch": 2.0,
"grad_norm": 10.0,
"learning_rate": 4.6719657041283115e-11,
"loss": 1.0634765625,
"mean_token_accuracy": 0.7287841141223907,
"num_tokens": 196489548.0,
"step": 336
}
],
"logging_steps": 1,
"max_steps": 336,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8420381399090463e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}