{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.06201171875, "epoch": 0.005979073243647235, "grad_norm": 9.5625, "learning_rate": 0.0, "loss": 1.38720703125, "mean_token_accuracy": 0.6870120912790298, "num_tokens": 589646.0, "step": 1 }, { "entropy": 1.0556640625, "epoch": 0.01195814648729447, "grad_norm": 8.6875, "learning_rate": 1.818181818181818e-07, "loss": 1.356689453125, "mean_token_accuracy": 0.6927258595824242, "num_tokens": 1179185.0, "step": 2 }, { "entropy": 1.0947265625, "epoch": 0.017937219730941704, "grad_norm": 8.875, "learning_rate": 3.636363636363636e-07, "loss": 1.38818359375, "mean_token_accuracy": 0.6830958425998688, "num_tokens": 1768801.0, "step": 3 }, { "entropy": 1.0830078125, "epoch": 0.02391629297458894, "grad_norm": 9.8125, "learning_rate": 5.454545454545454e-07, "loss": 1.408203125, "mean_token_accuracy": 0.6821927055716515, "num_tokens": 2355606.0, "step": 4 }, { "entropy": 1.0908203125, "epoch": 0.029895366218236172, "grad_norm": 9.375, "learning_rate": 7.272727272727272e-07, "loss": 1.382568359375, "mean_token_accuracy": 0.6858013942837715, "num_tokens": 2941851.0, "step": 5 }, { "entropy": 1.060546875, "epoch": 0.03587443946188341, "grad_norm": 8.25, "learning_rate": 9.09090909090909e-07, "loss": 1.3363037109375, "mean_token_accuracy": 0.6952119246125221, "num_tokens": 3531477.0, "step": 6 }, { "entropy": 1.060546875, "epoch": 0.04185351270553064, "grad_norm": 7.84375, "learning_rate": 1.0909090909090908e-06, "loss": 1.338134765625, "mean_token_accuracy": 0.6950986832380295, "num_tokens": 4121161.0, "step": 7 }, { "entropy": 1.06494140625, "epoch": 0.04783258594917788, "grad_norm": 8.9375, "learning_rate": 1.2727272727272726e-06, "loss": 1.378662109375, "mean_token_accuracy": 0.6874435991048813, "num_tokens": 4700699.0, "step": 8 }, { "entropy": 1.05224609375, "epoch": 0.053811659192825115, "grad_norm": 7.75, "learning_rate": 1.4545454545454544e-06, "loss": 1.3175048828125, "mean_token_accuracy": 0.6969988569617271, "num_tokens": 5284275.0, "step": 9 }, { "entropy": 1.07763671875, "epoch": 0.059790732436472344, "grad_norm": 7.0625, "learning_rate": 1.6363636363636365e-06, "loss": 1.3353271484375, "mean_token_accuracy": 0.6939220502972603, "num_tokens": 5873877.0, "step": 10 }, { "entropy": 1.087890625, "epoch": 0.06576980568011959, "grad_norm": 7.34375, "learning_rate": 1.818181818181818e-06, "loss": 1.3642578125, "mean_token_accuracy": 0.6876638159155846, "num_tokens": 6463496.0, "step": 11 }, { "entropy": 1.0673828125, "epoch": 0.07174887892376682, "grad_norm": 7.125, "learning_rate": 2e-06, "loss": 1.332275390625, "mean_token_accuracy": 0.6941065043210983, "num_tokens": 7053054.0, "step": 12 }, { "entropy": 1.04150390625, "epoch": 0.07772795216741404, "grad_norm": 8.6875, "learning_rate": 1.999953280342959e-06, "loss": 1.354248046875, "mean_token_accuracy": 0.6947119757533073, "num_tokens": 7633205.0, "step": 13 }, { "entropy": 1.0634765625, "epoch": 0.08370702541106129, "grad_norm": 6.75, "learning_rate": 1.9998131257372875e-06, "loss": 1.314208984375, "mean_token_accuracy": 0.6971366181969643, "num_tokens": 8219275.0, "step": 14 }, { "entropy": 1.1015625, "epoch": 0.08968609865470852, "grad_norm": 6.375, "learning_rate": 1.9995795492789365e-06, "loss": 1.34912109375, "mean_token_accuracy": 0.6870761960744858, "num_tokens": 8808882.0, "step": 15 }, { "entropy": 1.0693359375, "epoch": 0.09566517189835576, "grad_norm": 6.5625, "learning_rate": 1.99925257279313e-06, "loss": 1.318115234375, "mean_token_accuracy": 0.6940489485859871, "num_tokens": 9398391.0, "step": 16 }, { "entropy": 1.07421875, "epoch": 0.10164424514200299, "grad_norm": 6.5, "learning_rate": 1.9988322268323264e-06, "loss": 1.32470703125, "mean_token_accuracy": 0.6955610886216164, "num_tokens": 9988054.0, "step": 17 }, { "entropy": 1.0986328125, "epoch": 0.10762331838565023, "grad_norm": 6.1875, "learning_rate": 1.998318550673364e-06, "loss": 1.3236083984375, "mean_token_accuracy": 0.691640131175518, "num_tokens": 10577624.0, "step": 18 }, { "entropy": 1.06787109375, "epoch": 0.11360239162929746, "grad_norm": 6.34375, "learning_rate": 1.997711592313791e-06, "loss": 1.3114013671875, "mean_token_accuracy": 0.6971054673194885, "num_tokens": 11167248.0, "step": 19 }, { "entropy": 1.0439453125, "epoch": 0.11958146487294469, "grad_norm": 6.84375, "learning_rate": 1.9970114084673796e-06, "loss": 1.291748046875, "mean_token_accuracy": 0.7017510756850243, "num_tokens": 11748340.0, "step": 20 }, { "entropy": 1.087890625, "epoch": 0.12556053811659193, "grad_norm": 6.78125, "learning_rate": 1.9962180645588286e-06, "loss": 1.3157958984375, "mean_token_accuracy": 0.692795068025589, "num_tokens": 12331037.0, "step": 21 }, { "entropy": 1.08203125, "epoch": 0.13153961136023917, "grad_norm": 9.1875, "learning_rate": 1.9953316347176486e-06, "loss": 1.302001953125, "mean_token_accuracy": 0.6975891441106796, "num_tokens": 12918010.0, "step": 22 }, { "entropy": 1.068359375, "epoch": 0.1375186846038864, "grad_norm": 11.625, "learning_rate": 1.994352201771236e-06, "loss": 1.3125, "mean_token_accuracy": 0.695681132376194, "num_tokens": 13507561.0, "step": 23 }, { "entropy": 1.06640625, "epoch": 0.14349775784753363, "grad_norm": 12.8125, "learning_rate": 1.993279857237133e-06, "loss": 1.2779541015625, "mean_token_accuracy": 0.6991168782114983, "num_tokens": 14090266.0, "step": 24 }, { "entropy": 1.06640625, "epoch": 0.14947683109118087, "grad_norm": 13.9375, "learning_rate": 1.9921147013144777e-06, "loss": 1.283447265625, "mean_token_accuracy": 0.6982817649841309, "num_tokens": 14679851.0, "step": 25 }, { "entropy": 1.0849609375, "epoch": 0.1554559043348281, "grad_norm": 14.125, "learning_rate": 1.9908568428746405e-06, "loss": 1.28564453125, "mean_token_accuracy": 0.6965354830026627, "num_tokens": 15269433.0, "step": 26 }, { "entropy": 1.087890625, "epoch": 0.16143497757847533, "grad_norm": 16.375, "learning_rate": 1.989506399451051e-06, "loss": 1.3095703125, "mean_token_accuracy": 0.6939198896288872, "num_tokens": 15858975.0, "step": 27 }, { "entropy": 1.0615234375, "epoch": 0.16741405082212257, "grad_norm": 12.1875, "learning_rate": 1.9880634972282166e-06, "loss": 1.273681640625, "mean_token_accuracy": 0.7005915492773056, "num_tokens": 16448581.0, "step": 28 }, { "entropy": 1.1220703125, "epoch": 0.17339312406576982, "grad_norm": 17.5, "learning_rate": 1.986528271029931e-06, "loss": 1.302001953125, "mean_token_accuracy": 0.6892580538988113, "num_tokens": 17038162.0, "step": 29 }, { "entropy": 1.060546875, "epoch": 0.17937219730941703, "grad_norm": 12.5625, "learning_rate": 1.984900864306677e-06, "loss": 1.2484130859375, "mean_token_accuracy": 0.7043808251619339, "num_tokens": 17622841.0, "step": 30 }, { "entropy": 1.083984375, "epoch": 0.18535127055306427, "grad_norm": 16.125, "learning_rate": 1.9831814291222233e-06, "loss": 1.276611328125, "mean_token_accuracy": 0.6971414536237717, "num_tokens": 18212334.0, "step": 31 }, { "entropy": 1.0859375, "epoch": 0.19133034379671152, "grad_norm": 19.125, "learning_rate": 1.981370126139413e-06, "loss": 1.282470703125, "mean_token_accuracy": 0.6962975636124611, "num_tokens": 18794983.0, "step": 32 }, { "entropy": 1.072265625, "epoch": 0.19730941704035873, "grad_norm": 22.875, "learning_rate": 1.979467124605156e-06, "loss": 1.2430419921875, "mean_token_accuracy": 0.7024854198098183, "num_tokens": 19384646.0, "step": 33 }, { "entropy": 1.103515625, "epoch": 0.20328849028400597, "grad_norm": 29.375, "learning_rate": 1.977472602334609e-06, "loss": 1.293212890625, "mean_token_accuracy": 0.691613681614399, "num_tokens": 19971390.0, "step": 34 }, { "entropy": 1.0517578125, "epoch": 0.20926756352765322, "grad_norm": 34.75, "learning_rate": 1.975386745694565e-06, "loss": 1.2325439453125, "mean_token_accuracy": 0.7074485868215561, "num_tokens": 20552857.0, "step": 35 }, { "entropy": 1.0888671875, "epoch": 0.21524663677130046, "grad_norm": 46.25, "learning_rate": 1.9732097495860385e-06, "loss": 1.27880859375, "mean_token_accuracy": 0.6955694854259491, "num_tokens": 21142419.0, "step": 36 }, { "entropy": 1.0869140625, "epoch": 0.22122571001494767, "grad_norm": 52.0, "learning_rate": 1.970941817426052e-06, "loss": 1.2547607421875, "mean_token_accuracy": 0.6988364160060883, "num_tokens": 21725119.0, "step": 37 }, { "entropy": 1.0888671875, "epoch": 0.22720478325859492, "grad_norm": 50.25, "learning_rate": 1.968583161128631e-06, "loss": 1.2620849609375, "mean_token_accuracy": 0.6952823475003242, "num_tokens": 22314606.0, "step": 38 }, { "entropy": 1.111328125, "epoch": 0.23318385650224216, "grad_norm": 28.5, "learning_rate": 1.9661340010850024e-06, "loss": 1.2611083984375, "mean_token_accuracy": 0.6952020153403282, "num_tokens": 22897136.0, "step": 39 }, { "entropy": 1.0546875, "epoch": 0.23916292974588937, "grad_norm": 13.375, "learning_rate": 1.9635945661430005e-06, "loss": 1.2120361328125, "mean_token_accuracy": 0.7072760388255119, "num_tokens": 23470326.0, "step": 40 }, { "entropy": 1.05322265625, "epoch": 0.24514200298953662, "grad_norm": 11.0625, "learning_rate": 1.960965093585684e-06, "loss": 1.1966552734375, "mean_token_accuracy": 0.710840716958046, "num_tokens": 24059902.0, "step": 41 }, { "entropy": 1.0986328125, "epoch": 0.25112107623318386, "grad_norm": 10.1875, "learning_rate": 1.9582458291091663e-06, "loss": 1.2474365234375, "mean_token_accuracy": 0.6974528953433037, "num_tokens": 24641292.0, "step": 42 }, { "entropy": 1.08203125, "epoch": 0.2571001494768311, "grad_norm": 9.125, "learning_rate": 1.9554370267996535e-06, "loss": 1.2308349609375, "mean_token_accuracy": 0.7031876817345619, "num_tokens": 25230783.0, "step": 43 }, { "entropy": 1.05810546875, "epoch": 0.26307922272047835, "grad_norm": 9.125, "learning_rate": 1.952538949109708e-06, "loss": 1.195556640625, "mean_token_accuracy": 0.7081611901521683, "num_tokens": 25820348.0, "step": 44 }, { "entropy": 1.0888671875, "epoch": 0.26905829596412556, "grad_norm": 9.5625, "learning_rate": 1.94955186683372e-06, "loss": 1.242919921875, "mean_token_accuracy": 0.6991148665547371, "num_tokens": 26409842.0, "step": 45 }, { "entropy": 1.080078125, "epoch": 0.2750373692077728, "grad_norm": 9.4375, "learning_rate": 1.94647605908261e-06, "loss": 1.2176513671875, "mean_token_accuracy": 0.7035784423351288, "num_tokens": 26993040.0, "step": 46 }, { "entropy": 1.10009765625, "epoch": 0.28101644245142005, "grad_norm": 8.125, "learning_rate": 1.943311813257743e-06, "loss": 1.252685546875, "mean_token_accuracy": 0.6970779970288277, "num_tokens": 27582641.0, "step": 47 }, { "entropy": 1.11328125, "epoch": 0.28699551569506726, "grad_norm": 8.5, "learning_rate": 1.9400594250240794e-06, "loss": 1.260009765625, "mean_token_accuracy": 0.6932175979018211, "num_tokens": 28172234.0, "step": 48 }, { "entropy": 1.0869140625, "epoch": 0.2929745889387145, "grad_norm": 8.625, "learning_rate": 1.9367191982825448e-06, "loss": 1.208740234375, "mean_token_accuracy": 0.7022047564387321, "num_tokens": 28761815.0, "step": 49 }, { "entropy": 1.06982421875, "epoch": 0.29895366218236175, "grad_norm": 7.46875, "learning_rate": 1.9332914451416345e-06, "loss": 1.214599609375, "mean_token_accuracy": 0.7050945162773132, "num_tokens": 29351427.0, "step": 50 }, { "entropy": 1.078125, "epoch": 0.30493273542600896, "grad_norm": 6.78125, "learning_rate": 1.929776485888251e-06, "loss": 1.23046875, "mean_token_accuracy": 0.7009832188487053, "num_tokens": 29941039.0, "step": 51 }, { "entropy": 1.0703125, "epoch": 0.3109118086696562, "grad_norm": 11.4375, "learning_rate": 1.9261746489577764e-06, "loss": 1.2705078125, "mean_token_accuracy": 0.696273148059845, "num_tokens": 30517107.0, "step": 52 }, { "entropy": 1.091796875, "epoch": 0.31689088191330345, "grad_norm": 7.1875, "learning_rate": 1.9224862709033824e-06, "loss": 1.2236328125, "mean_token_accuracy": 0.6999221071600914, "num_tokens": 31106632.0, "step": 53 }, { "entropy": 1.03955078125, "epoch": 0.32286995515695066, "grad_norm": 6.25, "learning_rate": 1.918711696364584e-06, "loss": 1.180908203125, "mean_token_accuracy": 0.710278332233429, "num_tokens": 31688974.0, "step": 54 }, { "entropy": 1.04345703125, "epoch": 0.32884902840059793, "grad_norm": 5.5625, "learning_rate": 1.914851278035038e-06, "loss": 1.1917724609375, "mean_token_accuracy": 0.7096548527479172, "num_tokens": 32278575.0, "step": 55 }, { "entropy": 1.05859375, "epoch": 0.33482810164424515, "grad_norm": 5.9375, "learning_rate": 1.910905376629585e-06, "loss": 1.2235107421875, "mean_token_accuracy": 0.7041697576642036, "num_tokens": 32868159.0, "step": 56 }, { "entropy": 1.03369140625, "epoch": 0.34080717488789236, "grad_norm": 4.84375, "learning_rate": 1.9068743608505452e-06, "loss": 1.1871337890625, "mean_token_accuracy": 0.7095241695642471, "num_tokens": 33457746.0, "step": 57 }, { "entropy": 1.0615234375, "epoch": 0.34678624813153963, "grad_norm": 6.125, "learning_rate": 1.902758607353269e-06, "loss": 1.2313232421875, "mean_token_accuracy": 0.7004028484225273, "num_tokens": 34047328.0, "step": 58 }, { "entropy": 1.06982421875, "epoch": 0.35276532137518685, "grad_norm": 5.78125, "learning_rate": 1.8985585007109388e-06, "loss": 1.23828125, "mean_token_accuracy": 0.7001003175973892, "num_tokens": 34636812.0, "step": 59 }, { "entropy": 1.05078125, "epoch": 0.35874439461883406, "grad_norm": 5.03125, "learning_rate": 1.8942744333786395e-06, "loss": 1.184326171875, "mean_token_accuracy": 0.7088666930794716, "num_tokens": 35226406.0, "step": 60 }, { "entropy": 1.0556640625, "epoch": 0.36472346786248133, "grad_norm": 7.53125, "learning_rate": 1.8899068056566838e-06, "loss": 1.2060546875, "mean_token_accuracy": 0.7032047733664513, "num_tokens": 35809841.0, "step": 61 }, { "entropy": 1.0791015625, "epoch": 0.37070254110612855, "grad_norm": 10.375, "learning_rate": 1.8854560256532098e-06, "loss": 1.2000732421875, "mean_token_accuracy": 0.7027083933353424, "num_tokens": 36399387.0, "step": 62 }, { "entropy": 1.04931640625, "epoch": 0.37668161434977576, "grad_norm": 11.625, "learning_rate": 1.8809225092460485e-06, "loss": 1.2080078125, "mean_token_accuracy": 0.7051479294896126, "num_tokens": 36988937.0, "step": 63 }, { "entropy": 1.02294921875, "epoch": 0.38266068759342303, "grad_norm": 7.25, "learning_rate": 1.8763066800438634e-06, "loss": 1.1639404296875, "mean_token_accuracy": 0.7147629410028458, "num_tokens": 37569757.0, "step": 64 }, { "entropy": 1.02880859375, "epoch": 0.38863976083707025, "grad_norm": 6.75, "learning_rate": 1.8716089693465693e-06, "loss": 1.1640625, "mean_token_accuracy": 0.7142753675580025, "num_tokens": 38159331.0, "step": 65 }, { "entropy": 1.03564453125, "epoch": 0.39461883408071746, "grad_norm": 9.5, "learning_rate": 1.8668298161050306e-06, "loss": 1.199951171875, "mean_token_accuracy": 0.7058519497513771, "num_tokens": 38747516.0, "step": 66 }, { "entropy": 1.05126953125, "epoch": 0.40059790732436473, "grad_norm": 6.09375, "learning_rate": 1.861969666880049e-06, "loss": 1.179443359375, "mean_token_accuracy": 0.7096298113465309, "num_tokens": 39337113.0, "step": 67 }, { "entropy": 1.05126953125, "epoch": 0.40657698056801195, "grad_norm": 5.46875, "learning_rate": 1.8570289758006343e-06, "loss": 1.1827392578125, "mean_token_accuracy": 0.7079932987689972, "num_tokens": 39926721.0, "step": 68 }, { "entropy": 1.0615234375, "epoch": 0.4125560538116592, "grad_norm": 4.84375, "learning_rate": 1.8520082045215717e-06, "loss": 1.189453125, "mean_token_accuracy": 0.7062863036990166, "num_tokens": 40516290.0, "step": 69 }, { "entropy": 1.0439453125, "epoch": 0.41853512705530643, "grad_norm": 4.84375, "learning_rate": 1.846907822180286e-06, "loss": 1.16650390625, "mean_token_accuracy": 0.711765356361866, "num_tokens": 41105790.0, "step": 70 }, { "entropy": 1.0625, "epoch": 0.42451420029895365, "grad_norm": 6.0, "learning_rate": 1.8417283053530043e-06, "loss": 1.18603515625, "mean_token_accuracy": 0.7049060165882111, "num_tokens": 41695388.0, "step": 71 }, { "entropy": 1.0478515625, "epoch": 0.4304932735426009, "grad_norm": 6.875, "learning_rate": 1.8364701380102264e-06, "loss": 1.1793212890625, "mean_token_accuracy": 0.7081038281321526, "num_tokens": 42271847.0, "step": 72 }, { "entropy": 1.04833984375, "epoch": 0.43647234678624813, "grad_norm": 10.5625, "learning_rate": 1.8311338114715027e-06, "loss": 1.185791015625, "mean_token_accuracy": 0.7098284065723419, "num_tokens": 42855765.0, "step": 73 }, { "entropy": 1.05517578125, "epoch": 0.44245142002989535, "grad_norm": 6.84375, "learning_rate": 1.825719824359524e-06, "loss": 1.177734375, "mean_token_accuracy": 0.7076143845915794, "num_tokens": 43445385.0, "step": 74 }, { "entropy": 1.0751953125, "epoch": 0.4484304932735426, "grad_norm": 6.28125, "learning_rate": 1.8202286825535329e-06, "loss": 1.208251953125, "mean_token_accuracy": 0.7024360001087189, "num_tokens": 44033137.0, "step": 75 }, { "entropy": 1.0888671875, "epoch": 0.45440956651718983, "grad_norm": 5.375, "learning_rate": 1.814660899142053e-06, "loss": 1.202392578125, "mean_token_accuracy": 0.7014130279421806, "num_tokens": 44622745.0, "step": 76 }, { "entropy": 1.04248046875, "epoch": 0.46038863976083705, "grad_norm": 6.5, "learning_rate": 1.8090169943749474e-06, "loss": 1.18212890625, "mean_token_accuracy": 0.7097717002034187, "num_tokens": 45212356.0, "step": 77 }, { "entropy": 1.0576171875, "epoch": 0.4663677130044843, "grad_norm": 9.3125, "learning_rate": 1.8032974956148062e-06, "loss": 1.179443359375, "mean_token_accuracy": 0.7062700912356377, "num_tokens": 45798390.0, "step": 78 }, { "entropy": 1.03857421875, "epoch": 0.47234678624813153, "grad_norm": 9.125, "learning_rate": 1.7975029372876705e-06, "loss": 1.1568603515625, "mean_token_accuracy": 0.7130975499749184, "num_tokens": 46388008.0, "step": 79 }, { "entropy": 1.05322265625, "epoch": 0.47832585949177875, "grad_norm": 7.34375, "learning_rate": 1.7916338608330956e-06, "loss": 1.182861328125, "mean_token_accuracy": 0.7081197574734688, "num_tokens": 46974446.0, "step": 80 }, { "entropy": 1.0458984375, "epoch": 0.484304932735426, "grad_norm": 4.5625, "learning_rate": 1.78569081465356e-06, "loss": 1.1536865234375, "mean_token_accuracy": 0.7095040455460548, "num_tokens": 47564004.0, "step": 81 }, { "entropy": 1.03759765625, "epoch": 0.49028400597907323, "grad_norm": 3.875, "learning_rate": 1.7796743540632221e-06, "loss": 1.1531982421875, "mean_token_accuracy": 0.7134627625346184, "num_tokens": 48153580.0, "step": 82 }, { "entropy": 1.0625, "epoch": 0.4962630792227205, "grad_norm": 6.5, "learning_rate": 1.7735850412360328e-06, "loss": 1.177490234375, "mean_token_accuracy": 0.7081628367304802, "num_tokens": 48740404.0, "step": 83 }, { "entropy": 1.048828125, "epoch": 0.5022421524663677, "grad_norm": 8.8125, "learning_rate": 1.7674234451532063e-06, "loss": 1.1700439453125, "mean_token_accuracy": 0.7087839841842651, "num_tokens": 49329930.0, "step": 84 }, { "entropy": 1.0458984375, "epoch": 0.5082212257100149, "grad_norm": 7.25, "learning_rate": 1.7611901415500533e-06, "loss": 1.16259765625, "mean_token_accuracy": 0.7115066945552826, "num_tokens": 49914835.0, "step": 85 }, { "entropy": 1.0615234375, "epoch": 0.5142002989536621, "grad_norm": 8.0625, "learning_rate": 1.7548857128621874e-06, "loss": 1.18359375, "mean_token_accuracy": 0.706175908446312, "num_tokens": 50504393.0, "step": 86 }, { "entropy": 1.05322265625, "epoch": 0.5201793721973094, "grad_norm": 4.875, "learning_rate": 1.748510748171101e-06, "loss": 1.1778564453125, "mean_token_accuracy": 0.7066154479980469, "num_tokens": 51093995.0, "step": 87 }, { "entropy": 1.052734375, "epoch": 0.5261584454409567, "grad_norm": 4.65625, "learning_rate": 1.7420658431491222e-06, "loss": 1.167236328125, "mean_token_accuracy": 0.7099850177764893, "num_tokens": 51683545.0, "step": 88 }, { "entropy": 1.04638671875, "epoch": 0.5321375186846039, "grad_norm": 9.125, "learning_rate": 1.735551600003755e-06, "loss": 1.157470703125, "mean_token_accuracy": 0.7102955356240273, "num_tokens": 52272548.0, "step": 89 }, { "entropy": 1.0302734375, "epoch": 0.5381165919282511, "grad_norm": 10.25, "learning_rate": 1.7289686274214115e-06, "loss": 1.1446533203125, "mean_token_accuracy": 0.7140024453401566, "num_tokens": 52862195.0, "step": 90 }, { "entropy": 1.03369140625, "epoch": 0.5440956651718983, "grad_norm": 11.75, "learning_rate": 1.722317540510534e-06, "loss": 1.1495361328125, "mean_token_accuracy": 0.7119031846523285, "num_tokens": 53450055.0, "step": 91 }, { "entropy": 1.03564453125, "epoch": 0.5500747384155455, "grad_norm": 9.8125, "learning_rate": 1.715598960744121e-06, "loss": 1.149658203125, "mean_token_accuracy": 0.7118451669812202, "num_tokens": 54023463.0, "step": 92 }, { "entropy": 1.0634765625, "epoch": 0.5560538116591929, "grad_norm": 6.40625, "learning_rate": 1.7088135159016582e-06, "loss": 1.1729736328125, "mean_token_accuracy": 0.7083615809679031, "num_tokens": 54613056.0, "step": 93 }, { "entropy": 1.056640625, "epoch": 0.5620328849028401, "grad_norm": 4.1875, "learning_rate": 1.7019618400104569e-06, "loss": 1.158447265625, "mean_token_accuracy": 0.7101547122001648, "num_tokens": 55194480.0, "step": 94 }, { "entropy": 1.046875, "epoch": 0.5680119581464873, "grad_norm": 9.25, "learning_rate": 1.6950445732864126e-06, "loss": 1.162109375, "mean_token_accuracy": 0.7099513560533524, "num_tokens": 55784110.0, "step": 95 }, { "entropy": 1.0625, "epoch": 0.5739910313901345, "grad_norm": 6.65625, "learning_rate": 1.688062362074184e-06, "loss": 1.1649169921875, "mean_token_accuracy": 0.7057990580797195, "num_tokens": 56373674.0, "step": 96 }, { "entropy": 1.05078125, "epoch": 0.5799701046337817, "grad_norm": 15.125, "learning_rate": 1.681015858786797e-06, "loss": 1.166259765625, "mean_token_accuracy": 0.7095335945487022, "num_tokens": 56952628.0, "step": 97 }, { "entropy": 1.03369140625, "epoch": 0.585949177877429, "grad_norm": 8.3125, "learning_rate": 1.6739057218446857e-06, "loss": 1.156005859375, "mean_token_accuracy": 0.7151156216859818, "num_tokens": 57542149.0, "step": 98 }, { "entropy": 1.0380859375, "epoch": 0.5919282511210763, "grad_norm": 4.15625, "learning_rate": 1.666732615614169e-06, "loss": 1.143798828125, "mean_token_accuracy": 0.7129637002944946, "num_tokens": 58131730.0, "step": 99 }, { "entropy": 1.02392578125, "epoch": 0.5979073243647235, "grad_norm": 9.3125, "learning_rate": 1.6594972103453724e-06, "loss": 1.1343994140625, "mean_token_accuracy": 0.7165936082601547, "num_tokens": 58721339.0, "step": 100 }, { "entropy": 1.029296875, "epoch": 0.6038863976083707, "grad_norm": 7.5, "learning_rate": 1.6522001821096019e-06, "loss": 1.1375732421875, "mean_token_accuracy": 0.7172424420714378, "num_tokens": 59310867.0, "step": 101 }, { "entropy": 1.02978515625, "epoch": 0.6098654708520179, "grad_norm": 11.3125, "learning_rate": 1.6448422127361705e-06, "loss": 1.117919921875, "mean_token_accuracy": 0.7172070667147636, "num_tokens": 59894680.0, "step": 102 }, { "entropy": 1.0205078125, "epoch": 0.6158445440956651, "grad_norm": 11.5, "learning_rate": 1.6374239897486897e-06, "loss": 1.1181640625, "mean_token_accuracy": 0.7184400483965874, "num_tokens": 60484296.0, "step": 103 }, { "entropy": 1.05078125, "epoch": 0.6218236173393124, "grad_norm": 8.0, "learning_rate": 1.6299462063008269e-06, "loss": 1.143798828125, "mean_token_accuracy": 0.7096443995833397, "num_tokens": 61073911.0, "step": 104 }, { "entropy": 1.064453125, "epoch": 0.6278026905829597, "grad_norm": 7.75, "learning_rate": 1.6224095611115383e-06, "loss": 1.1650390625, "mean_token_accuracy": 0.7072784155607224, "num_tokens": 61663572.0, "step": 105 }, { "entropy": 1.0341796875, "epoch": 0.6337817638266069, "grad_norm": 6.9375, "learning_rate": 1.614814758399781e-06, "loss": 1.128662109375, "mean_token_accuracy": 0.71539356559515, "num_tokens": 62252067.0, "step": 106 }, { "entropy": 1.03662109375, "epoch": 0.6397608370702541, "grad_norm": 14.125, "learning_rate": 1.6071625078187112e-06, "loss": 1.146240234375, "mean_token_accuracy": 0.7122742831707001, "num_tokens": 62841656.0, "step": 107 }, { "entropy": 1.0400390625, "epoch": 0.6457399103139013, "grad_norm": 14.3125, "learning_rate": 1.599453524389374e-06, "loss": 1.146728515625, "mean_token_accuracy": 0.7149165868759155, "num_tokens": 63431221.0, "step": 108 }, { "entropy": 1.0556640625, "epoch": 0.6517189835575485, "grad_norm": 15.375, "learning_rate": 1.5916885284338935e-06, "loss": 1.155029296875, "mean_token_accuracy": 0.7111315131187439, "num_tokens": 64020069.0, "step": 109 }, { "entropy": 1.0380859375, "epoch": 0.6576980568011959, "grad_norm": 12.3125, "learning_rate": 1.5838682455081657e-06, "loss": 1.13671875, "mean_token_accuracy": 0.7157010585069656, "num_tokens": 64609550.0, "step": 110 }, { "entropy": 1.03076171875, "epoch": 0.6636771300448431, "grad_norm": 7.90625, "learning_rate": 1.5759934063340624e-06, "loss": 1.1343994140625, "mean_token_accuracy": 0.7165603339672089, "num_tokens": 65199109.0, "step": 111 }, { "entropy": 1.05615234375, "epoch": 0.6696562032884903, "grad_norm": 12.5, "learning_rate": 1.5680647467311555e-06, "loss": 1.1683349609375, "mean_token_accuracy": 0.7091170027852058, "num_tokens": 65788728.0, "step": 112 }, { "entropy": 1.03759765625, "epoch": 0.6756352765321375, "grad_norm": 8.9375, "learning_rate": 1.56008300754796e-06, "loss": 1.1407470703125, "mean_token_accuracy": 0.7141014188528061, "num_tokens": 66370343.0, "step": 113 }, { "entropy": 1.0478515625, "epoch": 0.6816143497757847, "grad_norm": 12.5, "learning_rate": 1.5520489345927094e-06, "loss": 1.1500244140625, "mean_token_accuracy": 0.7122905552387238, "num_tokens": 66955876.0, "step": 114 }, { "entropy": 1.02392578125, "epoch": 0.6875934230194319, "grad_norm": 5.65625, "learning_rate": 1.5439632785636705e-06, "loss": 1.135498046875, "mean_token_accuracy": 0.7167380154132843, "num_tokens": 67545426.0, "step": 115 }, { "entropy": 1.052734375, "epoch": 0.6935724962630793, "grad_norm": 5.0625, "learning_rate": 1.5358267949789964e-06, "loss": 1.1448974609375, "mean_token_accuracy": 0.7125077843666077, "num_tokens": 68134997.0, "step": 116 }, { "entropy": 1.041015625, "epoch": 0.6995515695067265, "grad_norm": 5.75, "learning_rate": 1.5276402441061327e-06, "loss": 1.125732421875, "mean_token_accuracy": 0.7163447961211205, "num_tokens": 68724591.0, "step": 117 }, { "entropy": 1.02587890625, "epoch": 0.7055306427503737, "grad_norm": 9.625, "learning_rate": 1.5194043908907772e-06, "loss": 1.131103515625, "mean_token_accuracy": 0.716623105108738, "num_tokens": 69314113.0, "step": 118 }, { "entropy": 1.005859375, "epoch": 0.7115097159940209, "grad_norm": 5.09375, "learning_rate": 1.5111200048854054e-06, "loss": 1.1011962890625, "mean_token_accuracy": 0.7245713621377945, "num_tokens": 69903640.0, "step": 119 }, { "entropy": 1.044921875, "epoch": 0.7174887892376681, "grad_norm": 6.59375, "learning_rate": 1.5027878601773632e-06, "loss": 1.1431884765625, "mean_token_accuracy": 0.7117293328046799, "num_tokens": 70493259.0, "step": 120 }, { "entropy": 1.0615234375, "epoch": 0.7234678624813154, "grad_norm": 8.1875, "learning_rate": 1.494408735316537e-06, "loss": 1.15380859375, "mean_token_accuracy": 0.7094393074512482, "num_tokens": 71082785.0, "step": 121 }, { "entropy": 1.07861328125, "epoch": 0.7294469357249627, "grad_norm": 16.625, "learning_rate": 1.4859834132426058e-06, "loss": 1.1781005859375, "mean_token_accuracy": 0.7051993981003761, "num_tokens": 71666506.0, "step": 122 }, { "entropy": 1.0166015625, "epoch": 0.7354260089686099, "grad_norm": 16.875, "learning_rate": 1.4775126812118863e-06, "loss": 1.1251220703125, "mean_token_accuracy": 0.7166302278637886, "num_tokens": 72250385.0, "step": 123 }, { "entropy": 1.05615234375, "epoch": 0.7414050822122571, "grad_norm": 7.4375, "learning_rate": 1.4689973307237686e-06, "loss": 1.15478515625, "mean_token_accuracy": 0.709770917892456, "num_tokens": 72829598.0, "step": 124 }, { "entropy": 1.029296875, "epoch": 0.7473841554559043, "grad_norm": 9.9375, "learning_rate": 1.4604381574467614e-06, "loss": 1.13037109375, "mean_token_accuracy": 0.7166016399860382, "num_tokens": 73419110.0, "step": 125 }, { "entropy": 1.013671875, "epoch": 0.7533632286995515, "grad_norm": 16.625, "learning_rate": 1.451835961144145e-06, "loss": 1.1103515625, "mean_token_accuracy": 0.7211827859282494, "num_tokens": 74008729.0, "step": 126 }, { "entropy": 1.0126953125, "epoch": 0.7593423019431988, "grad_norm": 13.0625, "learning_rate": 1.4431915455992414e-06, "loss": 1.1024169921875, "mean_token_accuracy": 0.7223981395363808, "num_tokens": 74598306.0, "step": 127 }, { "entropy": 1.056640625, "epoch": 0.7653213751868461, "grad_norm": 7.75, "learning_rate": 1.4345057185403098e-06, "loss": 1.15869140625, "mean_token_accuracy": 0.7109938785433769, "num_tokens": 75187853.0, "step": 128 }, { "entropy": 1.037109375, "epoch": 0.7713004484304933, "grad_norm": 12.875, "learning_rate": 1.4257792915650725e-06, "loss": 1.13232421875, "mean_token_accuracy": 0.7139059007167816, "num_tokens": 75777399.0, "step": 129 }, { "entropy": 1.0390625, "epoch": 0.7772795216741405, "grad_norm": 11.375, "learning_rate": 1.4170130800648812e-06, "loss": 1.1455078125, "mean_token_accuracy": 0.7116378918290138, "num_tokens": 76367001.0, "step": 130 }, { "entropy": 1.0419921875, "epoch": 0.7832585949177877, "grad_norm": 10.875, "learning_rate": 1.408207903148525e-06, "loss": 1.1370849609375, "mean_token_accuracy": 0.7141571119427681, "num_tokens": 76956562.0, "step": 131 }, { "entropy": 1.033203125, "epoch": 0.7892376681614349, "grad_norm": 17.625, "learning_rate": 1.3993645835656952e-06, "loss": 1.147705078125, "mean_token_accuracy": 0.7140598297119141, "num_tokens": 77544057.0, "step": 132 }, { "entropy": 0.99755859375, "epoch": 0.7952167414050823, "grad_norm": 18.875, "learning_rate": 1.3904839476301088e-06, "loss": 1.085693359375, "mean_token_accuracy": 0.7245375439524651, "num_tokens": 78133581.0, "step": 133 }, { "entropy": 1.0478515625, "epoch": 0.8011958146487295, "grad_norm": 19.125, "learning_rate": 1.3815668251422953e-06, "loss": 1.14013671875, "mean_token_accuracy": 0.7118667960166931, "num_tokens": 78723253.0, "step": 134 }, { "entropy": 1.03271484375, "epoch": 0.8071748878923767, "grad_norm": 24.125, "learning_rate": 1.3726140493120637e-06, "loss": 1.1357421875, "mean_token_accuracy": 0.7158161103725433, "num_tokens": 79306761.0, "step": 135 }, { "entropy": 1.02685546875, "epoch": 0.8131539611360239, "grad_norm": 28.0, "learning_rate": 1.363626456680647e-06, "loss": 1.125244140625, "mean_token_accuracy": 0.7170991152524948, "num_tokens": 79893309.0, "step": 136 }, { "entropy": 1.05224609375, "epoch": 0.8191330343796711, "grad_norm": 24.125, "learning_rate": 1.3546048870425354e-06, "loss": 1.148681640625, "mean_token_accuracy": 0.7112774699926376, "num_tokens": 80482935.0, "step": 137 }, { "entropy": 1.05615234375, "epoch": 0.8251121076233184, "grad_norm": 12.5625, "learning_rate": 1.3455501833670087e-06, "loss": 1.134033203125, "mean_token_accuracy": 0.7125924825668335, "num_tokens": 81072531.0, "step": 138 }, { "entropy": 1.03271484375, "epoch": 0.8310911808669657, "grad_norm": 19.375, "learning_rate": 1.336463191719367e-06, "loss": 1.12335205078125, "mean_token_accuracy": 0.7159583121538162, "num_tokens": 81654332.0, "step": 139 }, { "entropy": 1.052734375, "epoch": 0.8370702541106129, "grad_norm": 20.125, "learning_rate": 1.3273447611818766e-06, "loss": 1.1549072265625, "mean_token_accuracy": 0.7113095596432686, "num_tokens": 82243896.0, "step": 140 }, { "entropy": 1.0322265625, "epoch": 0.8430493273542601, "grad_norm": 14.6875, "learning_rate": 1.3181957437744332e-06, "loss": 1.128662109375, "mean_token_accuracy": 0.7145743370056152, "num_tokens": 82826175.0, "step": 141 }, { "entropy": 1.0458984375, "epoch": 0.8490284005979073, "grad_norm": 7.71875, "learning_rate": 1.3090169943749473e-06, "loss": 1.12841796875, "mean_token_accuracy": 0.7128350734710693, "num_tokens": 83415822.0, "step": 142 }, { "entropy": 1.03759765625, "epoch": 0.8550074738415545, "grad_norm": 22.875, "learning_rate": 1.2998093706394675e-06, "loss": 1.14453125, "mean_token_accuracy": 0.7128356993198395, "num_tokens": 83997557.0, "step": 143 }, { "entropy": 1.05078125, "epoch": 0.8609865470852018, "grad_norm": 13.875, "learning_rate": 1.2905737329220392e-06, "loss": 1.136474609375, "mean_token_accuracy": 0.7111846879124641, "num_tokens": 84587168.0, "step": 144 }, { "entropy": 1.04638671875, "epoch": 0.866965620328849, "grad_norm": 15.25, "learning_rate": 1.2813109441943164e-06, "loss": 1.138671875, "mean_token_accuracy": 0.7117553874850273, "num_tokens": 85176064.0, "step": 145 }, { "entropy": 1.0205078125, "epoch": 0.8729446935724963, "grad_norm": 13.625, "learning_rate": 1.2720218699649241e-06, "loss": 1.111572265625, "mean_token_accuracy": 0.7199290543794632, "num_tokens": 85765635.0, "step": 146 }, { "entropy": 1.02197265625, "epoch": 0.8789237668161435, "grad_norm": 10.5625, "learning_rate": 1.262707378198587e-06, "loss": 1.1162109375, "mean_token_accuracy": 0.7189603447914124, "num_tokens": 86355190.0, "step": 147 }, { "entropy": 1.04736328125, "epoch": 0.8849028400597907, "grad_norm": 11.75, "learning_rate": 1.2533683392350262e-06, "loss": 1.138427734375, "mean_token_accuracy": 0.7134011015295982, "num_tokens": 86938046.0, "step": 148 }, { "entropy": 1.02490234375, "epoch": 0.890881913303438, "grad_norm": 19.5, "learning_rate": 1.2440056257076374e-06, "loss": 1.113037109375, "mean_token_accuracy": 0.7172698378562927, "num_tokens": 87527598.0, "step": 149 }, { "entropy": 1.0146484375, "epoch": 0.8968609865470852, "grad_norm": 14.0, "learning_rate": 1.23462011246195e-06, "loss": 1.11181640625, "mean_token_accuracy": 0.7199216857552528, "num_tokens": 88110264.0, "step": 150 }, { "entropy": 1.03271484375, "epoch": 0.9028400597907325, "grad_norm": 8.9375, "learning_rate": 1.2252126764738844e-06, "loss": 1.12353515625, "mean_token_accuracy": 0.716008372604847, "num_tokens": 88699832.0, "step": 151 }, { "entropy": 1.046875, "epoch": 0.9088191330343797, "grad_norm": 8.125, "learning_rate": 1.2157841967678063e-06, "loss": 1.130126953125, "mean_token_accuracy": 0.714422382414341, "num_tokens": 89289495.0, "step": 152 }, { "entropy": 1.00439453125, "epoch": 0.9147982062780269, "grad_norm": 12.0, "learning_rate": 1.2063355543343923e-06, "loss": 1.08837890625, "mean_token_accuracy": 0.7251131683588028, "num_tokens": 89879089.0, "step": 153 }, { "entropy": 1.0244140625, "epoch": 0.9207772795216741, "grad_norm": 14.3125, "learning_rate": 1.1968676320483101e-06, "loss": 1.1243896484375, "mean_token_accuracy": 0.7171234339475632, "num_tokens": 90451022.0, "step": 154 }, { "entropy": 1.03466796875, "epoch": 0.9267563527653214, "grad_norm": 28.75, "learning_rate": 1.1873813145857248e-06, "loss": 1.1207275390625, "mean_token_accuracy": 0.7154998481273651, "num_tokens": 91036040.0, "step": 155 }, { "entropy": 1.03125, "epoch": 0.9327354260089686, "grad_norm": 33.25, "learning_rate": 1.1778774883416322e-06, "loss": 1.119873046875, "mean_token_accuracy": 0.715819425880909, "num_tokens": 91625659.0, "step": 156 }, { "entropy": 1.0400390625, "epoch": 0.9387144992526159, "grad_norm": 13.625, "learning_rate": 1.1683570413470383e-06, "loss": 1.1197509765625, "mean_token_accuracy": 0.7150726914405823, "num_tokens": 92215320.0, "step": 157 }, { "entropy": 1.01953125, "epoch": 0.9446935724962631, "grad_norm": 21.125, "learning_rate": 1.1588208631859807e-06, "loss": 1.1259765625, "mean_token_accuracy": 0.7184253633022308, "num_tokens": 92804840.0, "step": 158 }, { "entropy": 1.03076171875, "epoch": 0.9506726457399103, "grad_norm": 14.0, "learning_rate": 1.149269844912404e-06, "loss": 1.115234375, "mean_token_accuracy": 0.7175646647810936, "num_tokens": 93394439.0, "step": 159 }, { "entropy": 1.0546875, "epoch": 0.9566517189835575, "grad_norm": 17.5, "learning_rate": 1.1397048789669059e-06, "loss": 1.13916015625, "mean_token_accuracy": 0.7107137218117714, "num_tokens": 93979057.0, "step": 160 }, { "entropy": 1.05859375, "epoch": 0.9626307922272048, "grad_norm": 15.25, "learning_rate": 1.1301268590933434e-06, "loss": 1.14404296875, "mean_token_accuracy": 0.71033675968647, "num_tokens": 94568560.0, "step": 161 }, { "entropy": 1.0244140625, "epoch": 0.968609865470852, "grad_norm": 20.125, "learning_rate": 1.1205366802553228e-06, "loss": 1.1131591796875, "mean_token_accuracy": 0.7181500568985939, "num_tokens": 95158163.0, "step": 162 }, { "entropy": 1.048828125, "epoch": 0.9745889387144993, "grad_norm": 21.25, "learning_rate": 1.110935238552578e-06, "loss": 1.1319580078125, "mean_token_accuracy": 0.7127460688352585, "num_tokens": 95747756.0, "step": 163 }, { "entropy": 1.017578125, "epoch": 0.9805680119581465, "grad_norm": 25.125, "learning_rate": 1.1013234311372353e-06, "loss": 1.1143798828125, "mean_token_accuracy": 0.7193858399987221, "num_tokens": 96337278.0, "step": 164 }, { "entropy": 1.02783203125, "epoch": 0.9865470852017937, "grad_norm": 18.375, "learning_rate": 1.0917021561299862e-06, "loss": 1.1024169921875, "mean_token_accuracy": 0.7175654470920563, "num_tokens": 96926854.0, "step": 165 }, { "entropy": 1.060546875, "epoch": 0.992526158445441, "grad_norm": 11.375, "learning_rate": 1.0820723125361684e-06, "loss": 1.13623046875, "mean_token_accuracy": 0.710972748696804, "num_tokens": 97507731.0, "step": 166 }, { "entropy": 1.0107421875, "epoch": 0.9985052316890882, "grad_norm": 24.875, "learning_rate": 1.0724348001617625e-06, "loss": 1.1070556640625, "mean_token_accuracy": 0.7217210680246353, "num_tokens": 98097346.0, "step": 167 }, { "entropy": 0.9921875, "epoch": 1.0, "grad_norm": 23.5, "learning_rate": 1.0627905195293135e-06, "loss": 1.072265625, "mean_token_accuracy": 0.726732075214386, "num_tokens": 98244774.0, "step": 168 }, { "entropy": 1.037109375, "epoch": 1.0059790732436473, "grad_norm": 21.875, "learning_rate": 1.0531403717937886e-06, "loss": 1.1287841796875, "mean_token_accuracy": 0.715124748647213, "num_tokens": 98834357.0, "step": 169 }, { "entropy": 0.98046875, "epoch": 1.0119581464872944, "grad_norm": 20.875, "learning_rate": 1.0434852586583737e-06, "loss": 1.0762939453125, "mean_token_accuracy": 0.7286977842450142, "num_tokens": 99424044.0, "step": 170 }, { "entropy": 1.0498046875, "epoch": 1.0179372197309418, "grad_norm": 21.875, "learning_rate": 1.0338260822902165e-06, "loss": 1.1475830078125, "mean_token_accuracy": 0.7115440741181374, "num_tokens": 100013632.0, "step": 171 }, { "entropy": 1.03759765625, "epoch": 1.0239162929745889, "grad_norm": 17.25, "learning_rate": 1.0241637452361322e-06, "loss": 1.133056640625, "mean_token_accuracy": 0.713756151497364, "num_tokens": 100603269.0, "step": 172 }, { "entropy": 1.02197265625, "epoch": 1.0298953662182362, "grad_norm": 15.8125, "learning_rate": 1.0144991503382673e-06, "loss": 1.1068115234375, "mean_token_accuracy": 0.7196066528558731, "num_tokens": 101191071.0, "step": 173 }, { "entropy": 1.0302734375, "epoch": 1.0358744394618835, "grad_norm": 15.5, "learning_rate": 1.0048332006497404e-06, "loss": 1.111572265625, "mean_token_accuracy": 0.7173566892743111, "num_tokens": 101780702.0, "step": 174 }, { "entropy": 1.0576171875, "epoch": 1.0418535127055306, "grad_norm": 9.25, "learning_rate": 9.951667993502597e-07, "loss": 1.1553955078125, "mean_token_accuracy": 0.7085662558674812, "num_tokens": 102367028.0, "step": 175 }, { "entropy": 1.05322265625, "epoch": 1.047832585949178, "grad_norm": 9.0625, "learning_rate": 9.855008496617326e-07, "loss": 1.1552734375, "mean_token_accuracy": 0.7092385366559029, "num_tokens": 102956643.0, "step": 176 }, { "entropy": 1.029296875, "epoch": 1.053811659192825, "grad_norm": 29.375, "learning_rate": 9.75836254763868e-07, "loss": 1.136474609375, "mean_token_accuracy": 0.7162440121173859, "num_tokens": 103535589.0, "step": 177 }, { "entropy": 1.0029296875, "epoch": 1.0597907324364724, "grad_norm": 11.9375, "learning_rate": 9.661739177097834e-07, "loss": 1.0927734375, "mean_token_accuracy": 0.7226409837603569, "num_tokens": 104124613.0, "step": 178 }, { "entropy": 1.04150390625, "epoch": 1.0657698056801195, "grad_norm": 12.0, "learning_rate": 9.565147413416265e-07, "loss": 1.1234130859375, "mean_token_accuracy": 0.7134781181812286, "num_tokens": 104714199.0, "step": 179 }, { "entropy": 1.01318359375, "epoch": 1.0717488789237668, "grad_norm": 13.8125, "learning_rate": 9.468596282062113e-07, "loss": 1.1014404296875, "mean_token_accuracy": 0.7213335856795311, "num_tokens": 105294866.0, "step": 180 }, { "entropy": 1.05322265625, "epoch": 1.0777279521674141, "grad_norm": 14.625, "learning_rate": 9.372094804706866e-07, "loss": 1.152587890625, "mean_token_accuracy": 0.7105412855744362, "num_tokens": 105884489.0, "step": 181 }, { "entropy": 1.01025390625, "epoch": 1.0837070254110612, "grad_norm": 11.0, "learning_rate": 9.275651998382377e-07, "loss": 1.101318359375, "mean_token_accuracy": 0.7201149016618729, "num_tokens": 106474079.0, "step": 182 }, { "entropy": 1.02001953125, "epoch": 1.0896860986547086, "grad_norm": 8.3125, "learning_rate": 9.179276874638314e-07, "loss": 1.107666015625, "mean_token_accuracy": 0.7216706648468971, "num_tokens": 107063687.0, "step": 183 }, { "entropy": 1.02685546875, "epoch": 1.0956651718983557, "grad_norm": 11.625, "learning_rate": 9.082978438700138e-07, "loss": 1.125732421875, "mean_token_accuracy": 0.7165105268359184, "num_tokens": 107649683.0, "step": 184 }, { "entropy": 1.0322265625, "epoch": 1.101644245142003, "grad_norm": 10.625, "learning_rate": 8.986765688627651e-07, "loss": 1.10595703125, "mean_token_accuracy": 0.7177045792341232, "num_tokens": 108239185.0, "step": 185 }, { "entropy": 1.0166015625, "epoch": 1.1076233183856503, "grad_norm": 11.5625, "learning_rate": 8.890647614474222e-07, "loss": 1.1109619140625, "mean_token_accuracy": 0.7202980294823647, "num_tokens": 108828659.0, "step": 186 }, { "entropy": 1.04833984375, "epoch": 1.1136023916292974, "grad_norm": 23.25, "learning_rate": 8.79463319744677e-07, "loss": 1.14404296875, "mean_token_accuracy": 0.7117270454764366, "num_tokens": 109415343.0, "step": 187 }, { "entropy": 1.01904296875, "epoch": 1.1195814648729447, "grad_norm": 20.5, "learning_rate": 8.698731409066568e-07, "loss": 1.1033935546875, "mean_token_accuracy": 0.7186397314071655, "num_tokens": 110000047.0, "step": 188 }, { "entropy": 1.0185546875, "epoch": 1.1255605381165918, "grad_norm": 17.0, "learning_rate": 8.602951210330941e-07, "loss": 1.1114501953125, "mean_token_accuracy": 0.7207945957779884, "num_tokens": 110589519.0, "step": 189 }, { "entropy": 1.01171875, "epoch": 1.1315396113602392, "grad_norm": 19.625, "learning_rate": 8.507301550875959e-07, "loss": 1.1103515625, "mean_token_accuracy": 0.7186660766601562, "num_tokens": 111179017.0, "step": 190 }, { "entropy": 1.05859375, "epoch": 1.1375186846038865, "grad_norm": 19.5, "learning_rate": 8.411791368140195e-07, "loss": 1.1348876953125, "mean_token_accuracy": 0.7089879661798477, "num_tokens": 111761675.0, "step": 191 }, { "entropy": 1.033203125, "epoch": 1.1434977578475336, "grad_norm": 15.125, "learning_rate": 8.316429586529614e-07, "loss": 1.1116943359375, "mean_token_accuracy": 0.7168847694993019, "num_tokens": 112351282.0, "step": 192 }, { "entropy": 1.015625, "epoch": 1.149476831091181, "grad_norm": 10.0625, "learning_rate": 8.221225116583676e-07, "loss": 1.0850830078125, "mean_token_accuracy": 0.7229639515280724, "num_tokens": 112940935.0, "step": 193 }, { "entropy": 1.04150390625, "epoch": 1.155455904334828, "grad_norm": 9.75, "learning_rate": 8.126186854142751e-07, "loss": 1.1219482421875, "mean_token_accuracy": 0.7170581594109535, "num_tokens": 113530462.0, "step": 194 }, { "entropy": 1.02880859375, "epoch": 1.1614349775784754, "grad_norm": 17.625, "learning_rate": 8.031323679516899e-07, "loss": 1.130859375, "mean_token_accuracy": 0.715842954814434, "num_tokens": 114115879.0, "step": 195 }, { "entropy": 1.0419921875, "epoch": 1.1674140508221225, "grad_norm": 10.1875, "learning_rate": 7.936644456656081e-07, "loss": 1.1396484375, "mean_token_accuracy": 0.713227279484272, "num_tokens": 114705390.0, "step": 196 }, { "entropy": 1.03662109375, "epoch": 1.1733931240657698, "grad_norm": 23.375, "learning_rate": 7.84215803232194e-07, "loss": 1.1226806640625, "mean_token_accuracy": 0.7155178636312485, "num_tokens": 115286747.0, "step": 197 }, { "entropy": 1.025390625, "epoch": 1.1793721973094171, "grad_norm": 16.5, "learning_rate": 7.747873235261156e-07, "loss": 1.1126708984375, "mean_token_accuracy": 0.7184700071811676, "num_tokens": 115876348.0, "step": 198 }, { "entropy": 1.02880859375, "epoch": 1.1853512705530642, "grad_norm": 27.125, "learning_rate": 7.653798875380499e-07, "loss": 1.1217041015625, "mean_token_accuracy": 0.7165053337812424, "num_tokens": 116458527.0, "step": 199 }, { "entropy": 1.02392578125, "epoch": 1.1913303437967115, "grad_norm": 17.5, "learning_rate": 7.559943742923625e-07, "loss": 1.10888671875, "mean_token_accuracy": 0.7185152769088745, "num_tokens": 117048168.0, "step": 200 }, { "entropy": 1.0390625, "epoch": 1.1973094170403586, "grad_norm": 22.625, "learning_rate": 7.466316607649736e-07, "loss": 1.130126953125, "mean_token_accuracy": 0.7132042795419693, "num_tokens": 117636670.0, "step": 201 }, { "entropy": 1.04833984375, "epoch": 1.203288490284006, "grad_norm": 24.75, "learning_rate": 7.372926218014131e-07, "loss": 1.132568359375, "mean_token_accuracy": 0.7136494368314743, "num_tokens": 118226230.0, "step": 202 }, { "entropy": 1.0576171875, "epoch": 1.2092675635276533, "grad_norm": 17.875, "learning_rate": 7.279781300350757e-07, "loss": 1.1424560546875, "mean_token_accuracy": 0.711665190756321, "num_tokens": 118815835.0, "step": 203 }, { "entropy": 1.01953125, "epoch": 1.2152466367713004, "grad_norm": 19.125, "learning_rate": 7.186890558056836e-07, "loss": 1.1112060546875, "mean_token_accuracy": 0.7197611033916473, "num_tokens": 119402683.0, "step": 204 }, { "entropy": 1.0546875, "epoch": 1.2212257100149477, "grad_norm": 52.25, "learning_rate": 7.09426267077961e-07, "loss": 1.150146484375, "mean_token_accuracy": 0.7076791599392891, "num_tokens": 119987245.0, "step": 205 }, { "entropy": 1.03076171875, "epoch": 1.2272047832585948, "grad_norm": 56.5, "learning_rate": 7.001906293605329e-07, "loss": 1.130615234375, "mean_token_accuracy": 0.7152413129806519, "num_tokens": 120576831.0, "step": 206 }, { "entropy": 1.0244140625, "epoch": 1.2331838565022422, "grad_norm": 15.9375, "learning_rate": 6.909830056250526e-07, "loss": 1.1173095703125, "mean_token_accuracy": 0.7177402079105377, "num_tokens": 121166431.0, "step": 207 }, { "entropy": 1.03369140625, "epoch": 1.2391629297458895, "grad_norm": 54.25, "learning_rate": 6.81804256225567e-07, "loss": 1.1336669921875, "mean_token_accuracy": 0.7143785133957863, "num_tokens": 121756094.0, "step": 208 }, { "entropy": 1.0380859375, "epoch": 1.2451420029895366, "grad_norm": 16.25, "learning_rate": 6.726552388181233e-07, "loss": 1.1319580078125, "mean_token_accuracy": 0.714967779815197, "num_tokens": 122337877.0, "step": 209 }, { "entropy": 1.0263671875, "epoch": 1.251121076233184, "grad_norm": 14.25, "learning_rate": 6.63536808280633e-07, "loss": 1.109130859375, "mean_token_accuracy": 0.7193189635872841, "num_tokens": 122927447.0, "step": 210 }, { "entropy": 1.02587890625, "epoch": 1.257100149476831, "grad_norm": 13.3125, "learning_rate": 6.544498166329912e-07, "loss": 1.113525390625, "mean_token_accuracy": 0.7177118062973022, "num_tokens": 123509882.0, "step": 211 }, { "entropy": 1.0078125, "epoch": 1.2630792227204783, "grad_norm": 15.875, "learning_rate": 6.453951129574643e-07, "loss": 1.0953369140625, "mean_token_accuracy": 0.722569465637207, "num_tokens": 124099443.0, "step": 212 }, { "entropy": 1.048828125, "epoch": 1.2690582959641254, "grad_norm": 17.25, "learning_rate": 6.363735433193529e-07, "loss": 1.1336669921875, "mean_token_accuracy": 0.7113273218274117, "num_tokens": 124682200.0, "step": 213 }, { "entropy": 1.01318359375, "epoch": 1.2750373692077728, "grad_norm": 16.625, "learning_rate": 6.273859506879364e-07, "loss": 1.10498046875, "mean_token_accuracy": 0.7205186262726784, "num_tokens": 125265808.0, "step": 214 }, { "entropy": 1.013671875, "epoch": 1.28101644245142, "grad_norm": 10.75, "learning_rate": 6.18433174857705e-07, "loss": 1.112060546875, "mean_token_accuracy": 0.7207604125142097, "num_tokens": 125855421.0, "step": 215 }, { "entropy": 1.0322265625, "epoch": 1.2869955156950672, "grad_norm": 8.75, "learning_rate": 6.095160523698912e-07, "loss": 1.118408203125, "mean_token_accuracy": 0.7177054435014725, "num_tokens": 126445003.0, "step": 216 }, { "entropy": 1.021484375, "epoch": 1.2929745889387145, "grad_norm": 9.0625, "learning_rate": 6.006354164343046e-07, "loss": 1.110595703125, "mean_token_accuracy": 0.7162402048707008, "num_tokens": 127027787.0, "step": 217 }, { "entropy": 1.05029296875, "epoch": 1.2989536621823619, "grad_norm": 11.5625, "learning_rate": 5.917920968514751e-07, "loss": 1.1461181640625, "mean_token_accuracy": 0.7097650542855263, "num_tokens": 127617320.0, "step": 218 }, { "entropy": 1.03662109375, "epoch": 1.304932735426009, "grad_norm": 12.4375, "learning_rate": 5.829869199351187e-07, "loss": 1.1298828125, "mean_token_accuracy": 0.7168288081884384, "num_tokens": 128206868.0, "step": 219 }, { "entropy": 1.0361328125, "epoch": 1.310911808669656, "grad_norm": 10.5, "learning_rate": 5.742207084349273e-07, "loss": 1.1165771484375, "mean_token_accuracy": 0.7156732380390167, "num_tokens": 128796426.0, "step": 220 }, { "entropy": 1.01904296875, "epoch": 1.3168908819133034, "grad_norm": 13.25, "learning_rate": 5.654942814596901e-07, "loss": 1.10205078125, "mean_token_accuracy": 0.7199403569102287, "num_tokens": 129385997.0, "step": 221 }, { "entropy": 1.02392578125, "epoch": 1.3228699551569507, "grad_norm": 15.0625, "learning_rate": 5.568084544007588e-07, "loss": 1.11083984375, "mean_token_accuracy": 0.7177979946136475, "num_tokens": 129961180.0, "step": 222 }, { "entropy": 1.00927734375, "epoch": 1.3288490284005978, "grad_norm": 20.375, "learning_rate": 5.48164038855855e-07, "loss": 1.094482421875, "mean_token_accuracy": 0.7219114229083061, "num_tokens": 130549338.0, "step": 223 }, { "entropy": 1.01708984375, "epoch": 1.3348281016442451, "grad_norm": 12.25, "learning_rate": 5.395618425532389e-07, "loss": 1.1097412109375, "mean_token_accuracy": 0.7211140915751457, "num_tokens": 131134800.0, "step": 224 }, { "entropy": 0.99365234375, "epoch": 1.3408071748878925, "grad_norm": 26.25, "learning_rate": 5.310026692762314e-07, "loss": 1.0784912109375, "mean_token_accuracy": 0.727905310690403, "num_tokens": 131724429.0, "step": 225 }, { "entropy": 1.029296875, "epoch": 1.3467862481315396, "grad_norm": 14.75, "learning_rate": 5.224873187881136e-07, "loss": 1.1151123046875, "mean_token_accuracy": 0.7176884040236473, "num_tokens": 132314019.0, "step": 226 }, { "entropy": 1.03125, "epoch": 1.352765321375187, "grad_norm": 15.3125, "learning_rate": 5.140165867573939e-07, "loss": 1.12353515625, "mean_token_accuracy": 0.7174642384052277, "num_tokens": 132903580.0, "step": 227 }, { "entropy": 1.0390625, "epoch": 1.358744394618834, "grad_norm": 18.125, "learning_rate": 5.055912646834635e-07, "loss": 1.127197265625, "mean_token_accuracy": 0.7134326621890068, "num_tokens": 133493126.0, "step": 228 }, { "entropy": 1.01513671875, "epoch": 1.3647234678624813, "grad_norm": 10.75, "learning_rate": 4.972121398226371e-07, "loss": 1.101318359375, "mean_token_accuracy": 0.7197434529662132, "num_tokens": 134079329.0, "step": 229 }, { "entropy": 1.041015625, "epoch": 1.3707025411061284, "grad_norm": 13.625, "learning_rate": 4.888799951145947e-07, "loss": 1.1278076171875, "mean_token_accuracy": 0.7140819206833839, "num_tokens": 134654986.0, "step": 230 }, { "entropy": 1.0380859375, "epoch": 1.3766816143497758, "grad_norm": 10.4375, "learning_rate": 4.805956091092227e-07, "loss": 1.123779296875, "mean_token_accuracy": 0.7162793427705765, "num_tokens": 135244586.0, "step": 231 }, { "entropy": 1.05810546875, "epoch": 1.382660687593423, "grad_norm": 16.375, "learning_rate": 4.7235975589386713e-07, "loss": 1.1463623046875, "mean_token_accuracy": 0.7098471596837044, "num_tokens": 135834199.0, "step": 232 }, { "entropy": 1.04443359375, "epoch": 1.3886397608370702, "grad_norm": 17.125, "learning_rate": 4.641732050210031e-07, "loss": 1.1280517578125, "mean_token_accuracy": 0.7144335135817528, "num_tokens": 136423743.0, "step": 233 }, { "entropy": 1.06640625, "epoch": 1.3946188340807175, "grad_norm": 12.3125, "learning_rate": 4.5603672143632945e-07, "loss": 1.1444091796875, "mean_token_accuracy": 0.7087363749742508, "num_tokens": 137013243.0, "step": 234 }, { "entropy": 1.0302734375, "epoch": 1.4005979073243648, "grad_norm": 12.4375, "learning_rate": 4.479510654072909e-07, "loss": 1.1185302734375, "mean_token_accuracy": 0.7174856439232826, "num_tokens": 137599228.0, "step": 235 }, { "entropy": 1.02783203125, "epoch": 1.406576980568012, "grad_norm": 25.625, "learning_rate": 4.399169924520403e-07, "loss": 1.1148681640625, "mean_token_accuracy": 0.7194091156125069, "num_tokens": 138181557.0, "step": 236 }, { "entropy": 1.0234375, "epoch": 1.4125560538116593, "grad_norm": 16.625, "learning_rate": 4.3193525326884426e-07, "loss": 1.1102294921875, "mean_token_accuracy": 0.7175892367959023, "num_tokens": 138771048.0, "step": 237 }, { "entropy": 1.0361328125, "epoch": 1.4185351270553064, "grad_norm": 14.6875, "learning_rate": 4.240065936659374e-07, "loss": 1.12451171875, "mean_token_accuracy": 0.71492750197649, "num_tokens": 139360593.0, "step": 238 }, { "entropy": 1.05322265625, "epoch": 1.4245142002989537, "grad_norm": 25.125, "learning_rate": 4.1613175449183446e-07, "loss": 1.13232421875, "mean_token_accuracy": 0.7100114226341248, "num_tokens": 139946622.0, "step": 239 }, { "entropy": 0.99560546875, "epoch": 1.4304932735426008, "grad_norm": 36.75, "learning_rate": 4.0831147156610676e-07, "loss": 1.0897216796875, "mean_token_accuracy": 0.7266808152198792, "num_tokens": 140531396.0, "step": 240 }, { "entropy": 1.0009765625, "epoch": 1.4364723467862481, "grad_norm": 24.75, "learning_rate": 4.0054647561062615e-07, "loss": 1.0850830078125, "mean_token_accuracy": 0.7258649617433548, "num_tokens": 141120987.0, "step": 241 }, { "entropy": 1.0244140625, "epoch": 1.4424514200298955, "grad_norm": 29.75, "learning_rate": 3.928374921812888e-07, "loss": 1.1165771484375, "mean_token_accuracy": 0.7189558371901512, "num_tokens": 141703670.0, "step": 242 }, { "entropy": 1.046875, "epoch": 1.4484304932735426, "grad_norm": 10.375, "learning_rate": 3.851852416002187e-07, "loss": 1.1234130859375, "mean_token_accuracy": 0.7134259343147278, "num_tokens": 142283264.0, "step": 243 }, { "entropy": 1.03466796875, "epoch": 1.45440956651719, "grad_norm": 35.75, "learning_rate": 3.7759043888846173e-07, "loss": 1.12158203125, "mean_token_accuracy": 0.715514525771141, "num_tokens": 142870702.0, "step": 244 }, { "entropy": 1.0615234375, "epoch": 1.460388639760837, "grad_norm": 8.5625, "learning_rate": 3.7005379369917324e-07, "loss": 1.1358642578125, "mean_token_accuracy": 0.7089022919535637, "num_tokens": 143460302.0, "step": 245 }, { "entropy": 1.0126953125, "epoch": 1.4663677130044843, "grad_norm": 14.6875, "learning_rate": 3.625760102513102e-07, "loss": 1.1024169921875, "mean_token_accuracy": 0.7216273471713066, "num_tokens": 144047084.0, "step": 246 }, { "entropy": 1.037109375, "epoch": 1.4723467862481314, "grad_norm": 12.5, "learning_rate": 3.551577872638296e-07, "loss": 1.1268310546875, "mean_token_accuracy": 0.7154128924012184, "num_tokens": 144629747.0, "step": 247 }, { "entropy": 1.029296875, "epoch": 1.4783258594917787, "grad_norm": 15.875, "learning_rate": 3.477998178903981e-07, "loss": 1.133056640625, "mean_token_accuracy": 0.7142782434821129, "num_tokens": 145219266.0, "step": 248 }, { "entropy": 1.0146484375, "epoch": 1.484304932735426, "grad_norm": 10.5, "learning_rate": 3.4050278965462763e-07, "loss": 1.0947265625, "mean_token_accuracy": 0.720554769039154, "num_tokens": 145808833.0, "step": 249 }, { "entropy": 1.017578125, "epoch": 1.4902840059790732, "grad_norm": 12.0, "learning_rate": 3.3326738438583114e-07, "loss": 1.1031494140625, "mean_token_accuracy": 0.7191615030169487, "num_tokens": 146398531.0, "step": 250 }, { "entropy": 1.013671875, "epoch": 1.4962630792227205, "grad_norm": 13.0625, "learning_rate": 3.260942781553142e-07, "loss": 1.1036376953125, "mean_token_accuracy": 0.7216013073921204, "num_tokens": 146988047.0, "step": 251 }, { "entropy": 1.021484375, "epoch": 1.5022421524663678, "grad_norm": 8.1875, "learning_rate": 3.189841412132027e-07, "loss": 1.10498046875, "mean_token_accuracy": 0.7199263349175453, "num_tokens": 147577682.0, "step": 252 }, { "entropy": 1.05859375, "epoch": 1.508221225710015, "grad_norm": 9.75, "learning_rate": 3.1193763792581594e-07, "loss": 1.134765625, "mean_token_accuracy": 0.7107567712664604, "num_tokens": 148166138.0, "step": 253 }, { "entropy": 1.0166015625, "epoch": 1.514200298953662, "grad_norm": 7.59375, "learning_rate": 3.0495542671358744e-07, "loss": 1.1031494140625, "mean_token_accuracy": 0.7203914448618889, "num_tokens": 148755748.0, "step": 254 }, { "entropy": 1.03076171875, "epoch": 1.5201793721973094, "grad_norm": 19.25, "learning_rate": 2.980381599895433e-07, "loss": 1.1265869140625, "mean_token_accuracy": 0.7148845717310905, "num_tokens": 149345252.0, "step": 255 }, { "entropy": 1.087890625, "epoch": 1.5261584454409567, "grad_norm": 10.5, "learning_rate": 2.91186484098342e-07, "loss": 1.1712646484375, "mean_token_accuracy": 0.7025258839130402, "num_tokens": 149934781.0, "step": 256 }, { "entropy": 1.02880859375, "epoch": 1.5321375186846038, "grad_norm": 15.0, "learning_rate": 2.84401039255879e-07, "loss": 1.1123046875, "mean_token_accuracy": 0.7171602919697762, "num_tokens": 150524424.0, "step": 257 }, { "entropy": 1.04345703125, "epoch": 1.5381165919282511, "grad_norm": 16.5, "learning_rate": 2.776824594894661e-07, "loss": 1.1370849609375, "mean_token_accuracy": 0.7134297341108322, "num_tokens": 151113962.0, "step": 258 }, { "entropy": 1.02685546875, "epoch": 1.5440956651718984, "grad_norm": 13.875, "learning_rate": 2.7103137257858863e-07, "loss": 1.1080322265625, "mean_token_accuracy": 0.7190811783075333, "num_tokens": 151703586.0, "step": 259 }, { "entropy": 1.048828125, "epoch": 1.5500747384155455, "grad_norm": 9.25, "learning_rate": 2.644483999962449e-07, "loss": 1.1405029296875, "mean_token_accuracy": 0.712283693253994, "num_tokens": 152292444.0, "step": 260 }, { "entropy": 1.01025390625, "epoch": 1.5560538116591929, "grad_norm": 9.625, "learning_rate": 2.579341568508779e-07, "loss": 1.09228515625, "mean_token_accuracy": 0.721127025783062, "num_tokens": 152882090.0, "step": 261 }, { "entropy": 1.03466796875, "epoch": 1.5620328849028402, "grad_norm": 6.875, "learning_rate": 2.514892518288988e-07, "loss": 1.1090087890625, "mean_token_accuracy": 0.7168014496564865, "num_tokens": 153471720.0, "step": 262 }, { "entropy": 1.056640625, "epoch": 1.5680119581464873, "grad_norm": 6.25, "learning_rate": 2.4511428713781236e-07, "loss": 1.1324462890625, "mean_token_accuracy": 0.7105724215507507, "num_tokens": 154061310.0, "step": 263 }, { "entropy": 1.0419921875, "epoch": 1.5739910313901344, "grad_norm": 16.75, "learning_rate": 2.3880985844994673e-07, "loss": 1.1239013671875, "mean_token_accuracy": 0.713848665356636, "num_tokens": 154650888.0, "step": 264 }, { "entropy": 1.037109375, "epoch": 1.5799701046337817, "grad_norm": 8.9375, "learning_rate": 2.3257655484679372e-07, "loss": 1.12451171875, "mean_token_accuracy": 0.7131286934018135, "num_tokens": 155239209.0, "step": 265 }, { "entropy": 1.04638671875, "epoch": 1.585949177877429, "grad_norm": 8.5625, "learning_rate": 2.264149587639671e-07, "loss": 1.13037109375, "mean_token_accuracy": 0.7152972370386124, "num_tokens": 155828817.0, "step": 266 }, { "entropy": 1.005859375, "epoch": 1.5919282511210762, "grad_norm": 13.9375, "learning_rate": 2.2032564593677772e-07, "loss": 1.0977783203125, "mean_token_accuracy": 0.7207474857568741, "num_tokens": 156418416.0, "step": 267 }, { "entropy": 1.0166015625, "epoch": 1.5979073243647235, "grad_norm": 15.25, "learning_rate": 2.1430918534643994e-07, "loss": 1.1092529296875, "mean_token_accuracy": 0.7178195714950562, "num_tokens": 156996671.0, "step": 268 }, { "entropy": 1.0126953125, "epoch": 1.6038863976083708, "grad_norm": 10.1875, "learning_rate": 2.0836613916690427e-07, "loss": 1.097900390625, "mean_token_accuracy": 0.7219494804739952, "num_tokens": 157586304.0, "step": 269 }, { "entropy": 1.041015625, "epoch": 1.609865470852018, "grad_norm": 7.375, "learning_rate": 2.0249706271232946e-07, "loss": 1.13525390625, "mean_token_accuracy": 0.714814230799675, "num_tokens": 158175939.0, "step": 270 }, { "entropy": 1.01953125, "epoch": 1.615844544095665, "grad_norm": 14.4375, "learning_rate": 1.9670250438519386e-07, "loss": 1.1107177734375, "mean_token_accuracy": 0.719508022069931, "num_tokens": 158765530.0, "step": 271 }, { "entropy": 1.03076171875, "epoch": 1.6218236173393124, "grad_norm": 6.09375, "learning_rate": 1.9098300562505264e-07, "loss": 1.112060546875, "mean_token_accuracy": 0.7187488600611687, "num_tokens": 159355121.0, "step": 272 }, { "entropy": 1.05078125, "epoch": 1.6278026905829597, "grad_norm": 9.875, "learning_rate": 1.8533910085794713e-07, "loss": 1.1397705078125, "mean_token_accuracy": 0.7100469321012497, "num_tokens": 159937687.0, "step": 273 }, { "entropy": 1.01904296875, "epoch": 1.6337817638266068, "grad_norm": 10.8125, "learning_rate": 1.7977131744646724e-07, "loss": 1.1077880859375, "mean_token_accuracy": 0.720647431910038, "num_tokens": 160518664.0, "step": 274 }, { "entropy": 0.9990234375, "epoch": 1.639760837070254, "grad_norm": 20.25, "learning_rate": 1.742801756404759e-07, "loss": 1.09033203125, "mean_token_accuracy": 0.7235589995980263, "num_tokens": 161102238.0, "step": 275 }, { "entropy": 1.033203125, "epoch": 1.6457399103139014, "grad_norm": 12.5625, "learning_rate": 1.688661885284972e-07, "loss": 1.125, "mean_token_accuracy": 0.7176312282681465, "num_tokens": 161676535.0, "step": 276 }, { "entropy": 1.02783203125, "epoch": 1.6517189835575485, "grad_norm": 18.75, "learning_rate": 1.6352986198977325e-07, "loss": 1.10791015625, "mean_token_accuracy": 0.718546986579895, "num_tokens": 162266110.0, "step": 277 }, { "entropy": 1.0009765625, "epoch": 1.6576980568011959, "grad_norm": 10.5, "learning_rate": 1.5827169464699575e-07, "loss": 1.0906982421875, "mean_token_accuracy": 0.7236178815364838, "num_tokens": 162855683.0, "step": 278 }, { "entropy": 1.033203125, "epoch": 1.6636771300448432, "grad_norm": 8.375, "learning_rate": 1.5309217781971416e-07, "loss": 1.1171875, "mean_token_accuracy": 0.7165531665086746, "num_tokens": 163428337.0, "step": 279 }, { "entropy": 1.0244140625, "epoch": 1.6696562032884903, "grad_norm": 15.5625, "learning_rate": 1.479917954784282e-07, "loss": 1.10693359375, "mean_token_accuracy": 0.7174379974603653, "num_tokens": 164017962.0, "step": 280 }, { "entropy": 1.03466796875, "epoch": 1.6756352765321374, "grad_norm": 22.125, "learning_rate": 1.429710241993656e-07, "loss": 1.1173095703125, "mean_token_accuracy": 0.7154664248228073, "num_tokens": 164605762.0, "step": 281 }, { "entropy": 1.04541015625, "epoch": 1.6816143497757847, "grad_norm": 14.875, "learning_rate": 1.380303331199507e-07, "loss": 1.1348876953125, "mean_token_accuracy": 0.7117466628551483, "num_tokens": 165195338.0, "step": 282 }, { "entropy": 1.009765625, "epoch": 1.687593423019432, "grad_norm": 21.0, "learning_rate": 1.3317018389496926e-07, "loss": 1.111083984375, "mean_token_accuracy": 0.7207304239273071, "num_tokens": 165784893.0, "step": 283 }, { "entropy": 1.02734375, "epoch": 1.6935724962630792, "grad_norm": 8.6875, "learning_rate": 1.283910306534308e-07, "loss": 1.1119384765625, "mean_token_accuracy": 0.717054933309555, "num_tokens": 166374502.0, "step": 284 }, { "entropy": 1.02783203125, "epoch": 1.6995515695067265, "grad_norm": 12.75, "learning_rate": 1.2369331995613663e-07, "loss": 1.1182861328125, "mean_token_accuracy": 0.7191892936825752, "num_tokens": 166964071.0, "step": 285 }, { "entropy": 1.01904296875, "epoch": 1.7055306427503738, "grad_norm": 19.625, "learning_rate": 1.1907749075395146e-07, "loss": 1.1087646484375, "mean_token_accuracy": 0.7190410420298576, "num_tokens": 167553522.0, "step": 286 }, { "entropy": 1.03466796875, "epoch": 1.711509715994021, "grad_norm": 14.3125, "learning_rate": 1.145439743467902e-07, "loss": 1.11865234375, "mean_token_accuracy": 0.71589395403862, "num_tokens": 168143165.0, "step": 287 }, { "entropy": 1.037109375, "epoch": 1.717488789237668, "grad_norm": 9.0, "learning_rate": 1.1009319434331621e-07, "loss": 1.1199951171875, "mean_token_accuracy": 0.7174848467111588, "num_tokens": 168727049.0, "step": 288 }, { "entropy": 1.052734375, "epoch": 1.7234678624813156, "grad_norm": 13.3125, "learning_rate": 1.0572556662136035e-07, "loss": 1.1346435546875, "mean_token_accuracy": 0.7104056030511856, "num_tokens": 169308189.0, "step": 289 }, { "entropy": 1.0556640625, "epoch": 1.7294469357249627, "grad_norm": 10.3125, "learning_rate": 1.014414992890611e-07, "loss": 1.1441650390625, "mean_token_accuracy": 0.7134399339556694, "num_tokens": 169897805.0, "step": 290 }, { "entropy": 1.03564453125, "epoch": 1.7354260089686098, "grad_norm": 12.4375, "learning_rate": 9.724139264673114e-08, "loss": 1.1241455078125, "mean_token_accuracy": 0.7146468609571457, "num_tokens": 170487344.0, "step": 291 }, { "entropy": 1.0625, "epoch": 1.741405082212257, "grad_norm": 9.625, "learning_rate": 9.312563914945459e-08, "loss": 1.14111328125, "mean_token_accuracy": 0.7087547183036804, "num_tokens": 171076956.0, "step": 292 }, { "entropy": 1.0302734375, "epoch": 1.7473841554559044, "grad_norm": 12.3125, "learning_rate": 8.909462337041507e-08, "loss": 1.119384765625, "mean_token_accuracy": 0.7157952710986137, "num_tokens": 171666573.0, "step": 293 }, { "entropy": 1.0283203125, "epoch": 1.7533632286995515, "grad_norm": 17.5, "learning_rate": 8.514872196496181e-08, "loss": 1.116943359375, "mean_token_accuracy": 0.7182503044605255, "num_tokens": 172247089.0, "step": 294 }, { "entropy": 1.0390625, "epoch": 1.7593423019431988, "grad_norm": 6.96875, "learning_rate": 8.128830363541572e-08, "loss": 1.132568359375, "mean_token_accuracy": 0.7144065871834755, "num_tokens": 172836721.0, "step": 295 }, { "entropy": 1.0361328125, "epoch": 1.7653213751868462, "grad_norm": 14.75, "learning_rate": 7.751372909661768e-08, "loss": 1.1168212890625, "mean_token_accuracy": 0.7155275791883469, "num_tokens": 173426281.0, "step": 296 }, { "entropy": 1.00146484375, "epoch": 1.7713004484304933, "grad_norm": 16.375, "learning_rate": 7.382535104222364e-08, "loss": 1.0948486328125, "mean_token_accuracy": 0.7220958769321442, "num_tokens": 174015810.0, "step": 297 }, { "entropy": 1.01513671875, "epoch": 1.7772795216741404, "grad_norm": 12.8125, "learning_rate": 7.022351411174865e-08, "loss": 1.097900390625, "mean_token_accuracy": 0.7205198705196381, "num_tokens": 174602847.0, "step": 298 }, { "entropy": 1.029296875, "epoch": 1.7832585949177877, "grad_norm": 11.8125, "learning_rate": 6.670855485836524e-08, "loss": 1.1104736328125, "mean_token_accuracy": 0.718941256403923, "num_tokens": 175192486.0, "step": 299 }, { "entropy": 1.0439453125, "epoch": 1.789237668161435, "grad_norm": 11.75, "learning_rate": 6.328080171745509e-08, "loss": 1.125, "mean_token_accuracy": 0.7150193601846695, "num_tokens": 175782052.0, "step": 300 }, { "entropy": 1.03955078125, "epoch": 1.7952167414050821, "grad_norm": 13.6875, "learning_rate": 5.994057497592031e-08, "loss": 1.13037109375, "mean_token_accuracy": 0.7155382409691811, "num_tokens": 176366656.0, "step": 301 }, { "entropy": 1.03759765625, "epoch": 1.8011958146487295, "grad_norm": 10.3125, "learning_rate": 5.6688186742256835e-08, "loss": 1.11767578125, "mean_token_accuracy": 0.7158585712313652, "num_tokens": 176956185.0, "step": 302 }, { "entropy": 1.04052734375, "epoch": 1.8071748878923768, "grad_norm": 20.625, "learning_rate": 5.352394091739021e-08, "loss": 1.1318359375, "mean_token_accuracy": 0.7144715860486031, "num_tokens": 177545828.0, "step": 303 }, { "entropy": 1.01611328125, "epoch": 1.813153961136024, "grad_norm": 11.75, "learning_rate": 5.0448133166279935e-08, "loss": 1.1007080078125, "mean_token_accuracy": 0.7212875410914421, "num_tokens": 178126394.0, "step": 304 }, { "entropy": 1.04443359375, "epoch": 1.819133034379671, "grad_norm": 11.4375, "learning_rate": 4.746105089029229e-08, "loss": 1.1265869140625, "mean_token_accuracy": 0.714270606637001, "num_tokens": 178715841.0, "step": 305 }, { "entropy": 1.01171875, "epoch": 1.8251121076233185, "grad_norm": 12.625, "learning_rate": 4.456297320034641e-08, "loss": 1.0985107421875, "mean_token_accuracy": 0.7234744802117348, "num_tokens": 179304834.0, "step": 306 }, { "entropy": 1.025390625, "epoch": 1.8310911808669657, "grad_norm": 23.375, "learning_rate": 4.1754170890833774e-08, "loss": 1.1092529296875, "mean_token_accuracy": 0.7182625830173492, "num_tokens": 179894402.0, "step": 307 }, { "entropy": 1.0283203125, "epoch": 1.8370702541106128, "grad_norm": 13.875, "learning_rate": 3.9034906414315725e-08, "loss": 1.126953125, "mean_token_accuracy": 0.7168915420770645, "num_tokens": 180483905.0, "step": 308 }, { "entropy": 1.0234375, "epoch": 1.84304932735426, "grad_norm": 14.9375, "learning_rate": 3.6405433856999676e-08, "loss": 1.10693359375, "mean_token_accuracy": 0.7204272672533989, "num_tokens": 181073455.0, "step": 309 }, { "entropy": 1.01025390625, "epoch": 1.8490284005979074, "grad_norm": 15.5625, "learning_rate": 3.386599891499764e-08, "loss": 1.0946044921875, "mean_token_accuracy": 0.7214725464582443, "num_tokens": 181663017.0, "step": 310 }, { "entropy": 1.04638671875, "epoch": 1.8550074738415545, "grad_norm": 12.625, "learning_rate": 3.141683887136892e-08, "loss": 1.13232421875, "mean_token_accuracy": 0.7134620323777199, "num_tokens": 182245322.0, "step": 311 }, { "entropy": 1.01513671875, "epoch": 1.8609865470852018, "grad_norm": 11.0, "learning_rate": 2.9058182573947986e-08, "loss": 1.0958251953125, "mean_token_accuracy": 0.7222162559628487, "num_tokens": 182826090.0, "step": 312 }, { "entropy": 1.00341796875, "epoch": 1.8669656203288492, "grad_norm": 8.1875, "learning_rate": 2.6790250413961546e-08, "loss": 1.0860595703125, "mean_token_accuracy": 0.7237614244222641, "num_tokens": 183415723.0, "step": 313 }, { "entropy": 1.0322265625, "epoch": 1.8729446935724963, "grad_norm": 11.375, "learning_rate": 2.4613254305434815e-08, "loss": 1.10894775390625, "mean_token_accuracy": 0.7184558361768723, "num_tokens": 183996444.0, "step": 314 }, { "entropy": 1.03515625, "epoch": 1.8789237668161434, "grad_norm": 6.46875, "learning_rate": 2.2527397665391024e-08, "loss": 1.1177978515625, "mean_token_accuracy": 0.716648705303669, "num_tokens": 184586073.0, "step": 315 }, { "entropy": 1.05078125, "epoch": 1.8849028400597907, "grad_norm": 12.1875, "learning_rate": 2.053287539484405e-08, "loss": 1.1304931640625, "mean_token_accuracy": 0.7115833833813667, "num_tokens": 185175613.0, "step": 316 }, { "entropy": 1.0009765625, "epoch": 1.890881913303438, "grad_norm": 11.8125, "learning_rate": 1.8629873860586564e-08, "loss": 1.0841064453125, "mean_token_accuracy": 0.7247348576784134, "num_tokens": 185765103.0, "step": 317 }, { "entropy": 0.99609375, "epoch": 1.8968609865470851, "grad_norm": 10.75, "learning_rate": 1.6818570877776718e-08, "loss": 1.0699462890625, "mean_token_accuracy": 0.7280925586819649, "num_tokens": 186347971.0, "step": 318 }, { "entropy": 1.03369140625, "epoch": 1.9028400597907325, "grad_norm": 8.0, "learning_rate": 1.5099135693322773e-08, "loss": 1.1158447265625, "mean_token_accuracy": 0.716788075864315, "num_tokens": 186937575.0, "step": 319 }, { "entropy": 1.0078125, "epoch": 1.9088191330343798, "grad_norm": 9.4375, "learning_rate": 1.3471728970068985e-08, "loss": 1.0909423828125, "mean_token_accuracy": 0.7214584723114967, "num_tokens": 187525095.0, "step": 320 }, { "entropy": 1.041015625, "epoch": 1.9147982062780269, "grad_norm": 7.65625, "learning_rate": 1.1936502771783486e-08, "loss": 1.1307373046875, "mean_token_accuracy": 0.7133340612053871, "num_tokens": 188114646.0, "step": 321 }, { "entropy": 1.0556640625, "epoch": 1.920777279521674, "grad_norm": 12.0, "learning_rate": 1.0493600548948877e-08, "loss": 1.140625, "mean_token_accuracy": 0.7104567736387253, "num_tokens": 188700786.0, "step": 322 }, { "entropy": 1.04296875, "epoch": 1.9267563527653215, "grad_norm": 14.4375, "learning_rate": 9.143157125359513e-09, "loss": 1.134033203125, "mean_token_accuracy": 0.7124024033546448, "num_tokens": 189285842.0, "step": 323 }, { "entropy": 1.02294921875, "epoch": 1.9327354260089686, "grad_norm": 15.375, "learning_rate": 7.885298685522235e-09, "loss": 1.117431640625, "mean_token_accuracy": 0.7192790359258652, "num_tokens": 189868226.0, "step": 324 }, { "entropy": 1.017578125, "epoch": 1.9387144992526157, "grad_norm": 26.25, "learning_rate": 6.720142762867032e-09, "loss": 1.107666015625, "mean_token_accuracy": 0.718171015381813, "num_tokens": 190450814.0, "step": 325 }, { "entropy": 0.98291015625, "epoch": 1.944693572496263, "grad_norm": 29.375, "learning_rate": 5.647798228764156e-09, "loss": 1.0780029296875, "mean_token_accuracy": 0.7297961264848709, "num_tokens": 191040409.0, "step": 326 }, { "entropy": 1.03125, "epoch": 1.9506726457399104, "grad_norm": 8.9375, "learning_rate": 4.668365282351372e-09, "loss": 1.1124267578125, "mean_token_accuracy": 0.7161725759506226, "num_tokens": 191630067.0, "step": 327 }, { "entropy": 1.02685546875, "epoch": 1.9566517189835575, "grad_norm": 10.5625, "learning_rate": 3.7819354411713355e-09, "loss": 1.11083984375, "mean_token_accuracy": 0.7190196141600609, "num_tokens": 192219651.0, "step": 328 }, { "entropy": 1.03076171875, "epoch": 1.9626307922272048, "grad_norm": 12.25, "learning_rate": 2.9885915326203216e-09, "loss": 1.1121826171875, "mean_token_accuracy": 0.7161883562803268, "num_tokens": 192809216.0, "step": 329 }, { "entropy": 1.0224609375, "epoch": 1.9686098654708521, "grad_norm": 13.5625, "learning_rate": 2.2884076862089707e-09, "loss": 1.108642578125, "mean_token_accuracy": 0.7192875891923904, "num_tokens": 193394355.0, "step": 330 }, { "entropy": 1.041015625, "epoch": 1.9745889387144993, "grad_norm": 13.125, "learning_rate": 1.6814493266357199e-09, "loss": 1.129638671875, "mean_token_accuracy": 0.7148761376738548, "num_tokens": 193983861.0, "step": 331 }, { "entropy": 0.98583984375, "epoch": 1.9805680119581464, "grad_norm": 11.0625, "learning_rate": 1.1677731676733581e-09, "loss": 1.0601806640625, "mean_token_accuracy": 0.7276952490210533, "num_tokens": 194573512.0, "step": 332 }, { "entropy": 1.0263671875, "epoch": 1.9865470852017937, "grad_norm": 12.9375, "learning_rate": 7.474272068698217e-10, "loss": 1.1114501953125, "mean_token_accuracy": 0.7177807167172432, "num_tokens": 195163062.0, "step": 333 }, { "entropy": 1.0537109375, "epoch": 1.992526158445441, "grad_norm": 10.9375, "learning_rate": 4.204507210633368e-10, "loss": 1.135498046875, "mean_token_accuracy": 0.7125765532255173, "num_tokens": 195752597.0, "step": 334 }, { "entropy": 1.03369140625, "epoch": 1.9985052316890881, "grad_norm": 10.375, "learning_rate": 1.8687426271246642e-10, "loss": 1.1175537109375, "mean_token_accuracy": 0.7172495499253273, "num_tokens": 196342135.0, "step": 335 }, { "entropy": 0.9921875, "epoch": 2.0, "grad_norm": 10.0, "learning_rate": 4.6719657041283115e-11, "loss": 1.0634765625, "mean_token_accuracy": 0.7287841141223907, "num_tokens": 196489548.0, "step": 336 } ], "logging_steps": 1, "max_steps": 336, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8420381399090463e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }