| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 336, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.06201171875, |
| "epoch": 0.005979073243647235, |
| "grad_norm": 9.5625, |
| "learning_rate": 0.0, |
| "loss": 1.38720703125, |
| "mean_token_accuracy": 0.6870120912790298, |
| "num_tokens": 589646.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.0556640625, |
| "epoch": 0.01195814648729447, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.818181818181818e-07, |
| "loss": 1.356689453125, |
| "mean_token_accuracy": 0.6927258595824242, |
| "num_tokens": 1179185.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.0947265625, |
| "epoch": 0.017937219730941704, |
| "grad_norm": 8.875, |
| "learning_rate": 3.636363636363636e-07, |
| "loss": 1.38818359375, |
| "mean_token_accuracy": 0.6830958425998688, |
| "num_tokens": 1768801.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 1.0830078125, |
| "epoch": 0.02391629297458894, |
| "grad_norm": 9.8125, |
| "learning_rate": 5.454545454545454e-07, |
| "loss": 1.408203125, |
| "mean_token_accuracy": 0.6821927055716515, |
| "num_tokens": 2355606.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.0908203125, |
| "epoch": 0.029895366218236172, |
| "grad_norm": 9.375, |
| "learning_rate": 7.272727272727272e-07, |
| "loss": 1.382568359375, |
| "mean_token_accuracy": 0.6858013942837715, |
| "num_tokens": 2941851.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.060546875, |
| "epoch": 0.03587443946188341, |
| "grad_norm": 8.25, |
| "learning_rate": 9.09090909090909e-07, |
| "loss": 1.3363037109375, |
| "mean_token_accuracy": 0.6952119246125221, |
| "num_tokens": 3531477.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.060546875, |
| "epoch": 0.04185351270553064, |
| "grad_norm": 7.84375, |
| "learning_rate": 1.0909090909090908e-06, |
| "loss": 1.338134765625, |
| "mean_token_accuracy": 0.6950986832380295, |
| "num_tokens": 4121161.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.06494140625, |
| "epoch": 0.04783258594917788, |
| "grad_norm": 8.9375, |
| "learning_rate": 1.2727272727272726e-06, |
| "loss": 1.378662109375, |
| "mean_token_accuracy": 0.6874435991048813, |
| "num_tokens": 4700699.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.05224609375, |
| "epoch": 0.053811659192825115, |
| "grad_norm": 7.75, |
| "learning_rate": 1.4545454545454544e-06, |
| "loss": 1.3175048828125, |
| "mean_token_accuracy": 0.6969988569617271, |
| "num_tokens": 5284275.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.07763671875, |
| "epoch": 0.059790732436472344, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.6363636363636365e-06, |
| "loss": 1.3353271484375, |
| "mean_token_accuracy": 0.6939220502972603, |
| "num_tokens": 5873877.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.087890625, |
| "epoch": 0.06576980568011959, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.818181818181818e-06, |
| "loss": 1.3642578125, |
| "mean_token_accuracy": 0.6876638159155846, |
| "num_tokens": 6463496.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.0673828125, |
| "epoch": 0.07174887892376682, |
| "grad_norm": 7.125, |
| "learning_rate": 2e-06, |
| "loss": 1.332275390625, |
| "mean_token_accuracy": 0.6941065043210983, |
| "num_tokens": 7053054.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 1.04150390625, |
| "epoch": 0.07772795216741404, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.999953280342959e-06, |
| "loss": 1.354248046875, |
| "mean_token_accuracy": 0.6947119757533073, |
| "num_tokens": 7633205.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 1.0634765625, |
| "epoch": 0.08370702541106129, |
| "grad_norm": 6.75, |
| "learning_rate": 1.9998131257372875e-06, |
| "loss": 1.314208984375, |
| "mean_token_accuracy": 0.6971366181969643, |
| "num_tokens": 8219275.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 1.1015625, |
| "epoch": 0.08968609865470852, |
| "grad_norm": 6.375, |
| "learning_rate": 1.9995795492789365e-06, |
| "loss": 1.34912109375, |
| "mean_token_accuracy": 0.6870761960744858, |
| "num_tokens": 8808882.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.0693359375, |
| "epoch": 0.09566517189835576, |
| "grad_norm": 6.5625, |
| "learning_rate": 1.99925257279313e-06, |
| "loss": 1.318115234375, |
| "mean_token_accuracy": 0.6940489485859871, |
| "num_tokens": 9398391.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 1.07421875, |
| "epoch": 0.10164424514200299, |
| "grad_norm": 6.5, |
| "learning_rate": 1.9988322268323264e-06, |
| "loss": 1.32470703125, |
| "mean_token_accuracy": 0.6955610886216164, |
| "num_tokens": 9988054.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 1.0986328125, |
| "epoch": 0.10762331838565023, |
| "grad_norm": 6.1875, |
| "learning_rate": 1.998318550673364e-06, |
| "loss": 1.3236083984375, |
| "mean_token_accuracy": 0.691640131175518, |
| "num_tokens": 10577624.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 1.06787109375, |
| "epoch": 0.11360239162929746, |
| "grad_norm": 6.34375, |
| "learning_rate": 1.997711592313791e-06, |
| "loss": 1.3114013671875, |
| "mean_token_accuracy": 0.6971054673194885, |
| "num_tokens": 11167248.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 1.0439453125, |
| "epoch": 0.11958146487294469, |
| "grad_norm": 6.84375, |
| "learning_rate": 1.9970114084673796e-06, |
| "loss": 1.291748046875, |
| "mean_token_accuracy": 0.7017510756850243, |
| "num_tokens": 11748340.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.087890625, |
| "epoch": 0.12556053811659193, |
| "grad_norm": 6.78125, |
| "learning_rate": 1.9962180645588286e-06, |
| "loss": 1.3157958984375, |
| "mean_token_accuracy": 0.692795068025589, |
| "num_tokens": 12331037.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 1.08203125, |
| "epoch": 0.13153961136023917, |
| "grad_norm": 9.1875, |
| "learning_rate": 1.9953316347176486e-06, |
| "loss": 1.302001953125, |
| "mean_token_accuracy": 0.6975891441106796, |
| "num_tokens": 12918010.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 1.068359375, |
| "epoch": 0.1375186846038864, |
| "grad_norm": 11.625, |
| "learning_rate": 1.994352201771236e-06, |
| "loss": 1.3125, |
| "mean_token_accuracy": 0.695681132376194, |
| "num_tokens": 13507561.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 1.06640625, |
| "epoch": 0.14349775784753363, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.993279857237133e-06, |
| "loss": 1.2779541015625, |
| "mean_token_accuracy": 0.6991168782114983, |
| "num_tokens": 14090266.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 1.06640625, |
| "epoch": 0.14947683109118087, |
| "grad_norm": 13.9375, |
| "learning_rate": 1.9921147013144777e-06, |
| "loss": 1.283447265625, |
| "mean_token_accuracy": 0.6982817649841309, |
| "num_tokens": 14679851.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.0849609375, |
| "epoch": 0.1554559043348281, |
| "grad_norm": 14.125, |
| "learning_rate": 1.9908568428746405e-06, |
| "loss": 1.28564453125, |
| "mean_token_accuracy": 0.6965354830026627, |
| "num_tokens": 15269433.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 1.087890625, |
| "epoch": 0.16143497757847533, |
| "grad_norm": 16.375, |
| "learning_rate": 1.989506399451051e-06, |
| "loss": 1.3095703125, |
| "mean_token_accuracy": 0.6939198896288872, |
| "num_tokens": 15858975.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 1.0615234375, |
| "epoch": 0.16741405082212257, |
| "grad_norm": 12.1875, |
| "learning_rate": 1.9880634972282166e-06, |
| "loss": 1.273681640625, |
| "mean_token_accuracy": 0.7005915492773056, |
| "num_tokens": 16448581.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 1.1220703125, |
| "epoch": 0.17339312406576982, |
| "grad_norm": 17.5, |
| "learning_rate": 1.986528271029931e-06, |
| "loss": 1.302001953125, |
| "mean_token_accuracy": 0.6892580538988113, |
| "num_tokens": 17038162.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 1.060546875, |
| "epoch": 0.17937219730941703, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.984900864306677e-06, |
| "loss": 1.2484130859375, |
| "mean_token_accuracy": 0.7043808251619339, |
| "num_tokens": 17622841.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.083984375, |
| "epoch": 0.18535127055306427, |
| "grad_norm": 16.125, |
| "learning_rate": 1.9831814291222233e-06, |
| "loss": 1.276611328125, |
| "mean_token_accuracy": 0.6971414536237717, |
| "num_tokens": 18212334.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 1.0859375, |
| "epoch": 0.19133034379671152, |
| "grad_norm": 19.125, |
| "learning_rate": 1.981370126139413e-06, |
| "loss": 1.282470703125, |
| "mean_token_accuracy": 0.6962975636124611, |
| "num_tokens": 18794983.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 1.072265625, |
| "epoch": 0.19730941704035873, |
| "grad_norm": 22.875, |
| "learning_rate": 1.979467124605156e-06, |
| "loss": 1.2430419921875, |
| "mean_token_accuracy": 0.7024854198098183, |
| "num_tokens": 19384646.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 1.103515625, |
| "epoch": 0.20328849028400597, |
| "grad_norm": 29.375, |
| "learning_rate": 1.977472602334609e-06, |
| "loss": 1.293212890625, |
| "mean_token_accuracy": 0.691613681614399, |
| "num_tokens": 19971390.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 1.0517578125, |
| "epoch": 0.20926756352765322, |
| "grad_norm": 34.75, |
| "learning_rate": 1.975386745694565e-06, |
| "loss": 1.2325439453125, |
| "mean_token_accuracy": 0.7074485868215561, |
| "num_tokens": 20552857.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 1.0888671875, |
| "epoch": 0.21524663677130046, |
| "grad_norm": 46.25, |
| "learning_rate": 1.9732097495860385e-06, |
| "loss": 1.27880859375, |
| "mean_token_accuracy": 0.6955694854259491, |
| "num_tokens": 21142419.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 1.0869140625, |
| "epoch": 0.22122571001494767, |
| "grad_norm": 52.0, |
| "learning_rate": 1.970941817426052e-06, |
| "loss": 1.2547607421875, |
| "mean_token_accuracy": 0.6988364160060883, |
| "num_tokens": 21725119.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 1.0888671875, |
| "epoch": 0.22720478325859492, |
| "grad_norm": 50.25, |
| "learning_rate": 1.968583161128631e-06, |
| "loss": 1.2620849609375, |
| "mean_token_accuracy": 0.6952823475003242, |
| "num_tokens": 22314606.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 1.111328125, |
| "epoch": 0.23318385650224216, |
| "grad_norm": 28.5, |
| "learning_rate": 1.9661340010850024e-06, |
| "loss": 1.2611083984375, |
| "mean_token_accuracy": 0.6952020153403282, |
| "num_tokens": 22897136.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 1.0546875, |
| "epoch": 0.23916292974588937, |
| "grad_norm": 13.375, |
| "learning_rate": 1.9635945661430005e-06, |
| "loss": 1.2120361328125, |
| "mean_token_accuracy": 0.7072760388255119, |
| "num_tokens": 23470326.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.05322265625, |
| "epoch": 0.24514200298953662, |
| "grad_norm": 11.0625, |
| "learning_rate": 1.960965093585684e-06, |
| "loss": 1.1966552734375, |
| "mean_token_accuracy": 0.710840716958046, |
| "num_tokens": 24059902.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 1.0986328125, |
| "epoch": 0.25112107623318386, |
| "grad_norm": 10.1875, |
| "learning_rate": 1.9582458291091663e-06, |
| "loss": 1.2474365234375, |
| "mean_token_accuracy": 0.6974528953433037, |
| "num_tokens": 24641292.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 1.08203125, |
| "epoch": 0.2571001494768311, |
| "grad_norm": 9.125, |
| "learning_rate": 1.9554370267996535e-06, |
| "loss": 1.2308349609375, |
| "mean_token_accuracy": 0.7031876817345619, |
| "num_tokens": 25230783.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 1.05810546875, |
| "epoch": 0.26307922272047835, |
| "grad_norm": 9.125, |
| "learning_rate": 1.952538949109708e-06, |
| "loss": 1.195556640625, |
| "mean_token_accuracy": 0.7081611901521683, |
| "num_tokens": 25820348.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 1.0888671875, |
| "epoch": 0.26905829596412556, |
| "grad_norm": 9.5625, |
| "learning_rate": 1.94955186683372e-06, |
| "loss": 1.242919921875, |
| "mean_token_accuracy": 0.6991148665547371, |
| "num_tokens": 26409842.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 1.080078125, |
| "epoch": 0.2750373692077728, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.94647605908261e-06, |
| "loss": 1.2176513671875, |
| "mean_token_accuracy": 0.7035784423351288, |
| "num_tokens": 26993040.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 1.10009765625, |
| "epoch": 0.28101644245142005, |
| "grad_norm": 8.125, |
| "learning_rate": 1.943311813257743e-06, |
| "loss": 1.252685546875, |
| "mean_token_accuracy": 0.6970779970288277, |
| "num_tokens": 27582641.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 1.11328125, |
| "epoch": 0.28699551569506726, |
| "grad_norm": 8.5, |
| "learning_rate": 1.9400594250240794e-06, |
| "loss": 1.260009765625, |
| "mean_token_accuracy": 0.6932175979018211, |
| "num_tokens": 28172234.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 1.0869140625, |
| "epoch": 0.2929745889387145, |
| "grad_norm": 8.625, |
| "learning_rate": 1.9367191982825448e-06, |
| "loss": 1.208740234375, |
| "mean_token_accuracy": 0.7022047564387321, |
| "num_tokens": 28761815.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 1.06982421875, |
| "epoch": 0.29895366218236175, |
| "grad_norm": 7.46875, |
| "learning_rate": 1.9332914451416345e-06, |
| "loss": 1.214599609375, |
| "mean_token_accuracy": 0.7050945162773132, |
| "num_tokens": 29351427.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.078125, |
| "epoch": 0.30493273542600896, |
| "grad_norm": 6.78125, |
| "learning_rate": 1.929776485888251e-06, |
| "loss": 1.23046875, |
| "mean_token_accuracy": 0.7009832188487053, |
| "num_tokens": 29941039.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 1.0703125, |
| "epoch": 0.3109118086696562, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.9261746489577764e-06, |
| "loss": 1.2705078125, |
| "mean_token_accuracy": 0.696273148059845, |
| "num_tokens": 30517107.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 1.091796875, |
| "epoch": 0.31689088191330345, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.9224862709033824e-06, |
| "loss": 1.2236328125, |
| "mean_token_accuracy": 0.6999221071600914, |
| "num_tokens": 31106632.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 1.03955078125, |
| "epoch": 0.32286995515695066, |
| "grad_norm": 6.25, |
| "learning_rate": 1.918711696364584e-06, |
| "loss": 1.180908203125, |
| "mean_token_accuracy": 0.710278332233429, |
| "num_tokens": 31688974.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 1.04345703125, |
| "epoch": 0.32884902840059793, |
| "grad_norm": 5.5625, |
| "learning_rate": 1.914851278035038e-06, |
| "loss": 1.1917724609375, |
| "mean_token_accuracy": 0.7096548527479172, |
| "num_tokens": 32278575.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 1.05859375, |
| "epoch": 0.33482810164424515, |
| "grad_norm": 5.9375, |
| "learning_rate": 1.910905376629585e-06, |
| "loss": 1.2235107421875, |
| "mean_token_accuracy": 0.7041697576642036, |
| "num_tokens": 32868159.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 1.03369140625, |
| "epoch": 0.34080717488789236, |
| "grad_norm": 4.84375, |
| "learning_rate": 1.9068743608505452e-06, |
| "loss": 1.1871337890625, |
| "mean_token_accuracy": 0.7095241695642471, |
| "num_tokens": 33457746.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 1.0615234375, |
| "epoch": 0.34678624813153963, |
| "grad_norm": 6.125, |
| "learning_rate": 1.902758607353269e-06, |
| "loss": 1.2313232421875, |
| "mean_token_accuracy": 0.7004028484225273, |
| "num_tokens": 34047328.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 1.06982421875, |
| "epoch": 0.35276532137518685, |
| "grad_norm": 5.78125, |
| "learning_rate": 1.8985585007109388e-06, |
| "loss": 1.23828125, |
| "mean_token_accuracy": 0.7001003175973892, |
| "num_tokens": 34636812.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 1.05078125, |
| "epoch": 0.35874439461883406, |
| "grad_norm": 5.03125, |
| "learning_rate": 1.8942744333786395e-06, |
| "loss": 1.184326171875, |
| "mean_token_accuracy": 0.7088666930794716, |
| "num_tokens": 35226406.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.0556640625, |
| "epoch": 0.36472346786248133, |
| "grad_norm": 7.53125, |
| "learning_rate": 1.8899068056566838e-06, |
| "loss": 1.2060546875, |
| "mean_token_accuracy": 0.7032047733664513, |
| "num_tokens": 35809841.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 1.0791015625, |
| "epoch": 0.37070254110612855, |
| "grad_norm": 10.375, |
| "learning_rate": 1.8854560256532098e-06, |
| "loss": 1.2000732421875, |
| "mean_token_accuracy": 0.7027083933353424, |
| "num_tokens": 36399387.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 1.04931640625, |
| "epoch": 0.37668161434977576, |
| "grad_norm": 11.625, |
| "learning_rate": 1.8809225092460485e-06, |
| "loss": 1.2080078125, |
| "mean_token_accuracy": 0.7051479294896126, |
| "num_tokens": 36988937.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 1.02294921875, |
| "epoch": 0.38266068759342303, |
| "grad_norm": 7.25, |
| "learning_rate": 1.8763066800438634e-06, |
| "loss": 1.1639404296875, |
| "mean_token_accuracy": 0.7147629410028458, |
| "num_tokens": 37569757.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 1.02880859375, |
| "epoch": 0.38863976083707025, |
| "grad_norm": 6.75, |
| "learning_rate": 1.8716089693465693e-06, |
| "loss": 1.1640625, |
| "mean_token_accuracy": 0.7142753675580025, |
| "num_tokens": 38159331.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 1.03564453125, |
| "epoch": 0.39461883408071746, |
| "grad_norm": 9.5, |
| "learning_rate": 1.8668298161050306e-06, |
| "loss": 1.199951171875, |
| "mean_token_accuracy": 0.7058519497513771, |
| "num_tokens": 38747516.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 1.05126953125, |
| "epoch": 0.40059790732436473, |
| "grad_norm": 6.09375, |
| "learning_rate": 1.861969666880049e-06, |
| "loss": 1.179443359375, |
| "mean_token_accuracy": 0.7096298113465309, |
| "num_tokens": 39337113.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 1.05126953125, |
| "epoch": 0.40657698056801195, |
| "grad_norm": 5.46875, |
| "learning_rate": 1.8570289758006343e-06, |
| "loss": 1.1827392578125, |
| "mean_token_accuracy": 0.7079932987689972, |
| "num_tokens": 39926721.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 1.0615234375, |
| "epoch": 0.4125560538116592, |
| "grad_norm": 4.84375, |
| "learning_rate": 1.8520082045215717e-06, |
| "loss": 1.189453125, |
| "mean_token_accuracy": 0.7062863036990166, |
| "num_tokens": 40516290.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 1.0439453125, |
| "epoch": 0.41853512705530643, |
| "grad_norm": 4.84375, |
| "learning_rate": 1.846907822180286e-06, |
| "loss": 1.16650390625, |
| "mean_token_accuracy": 0.711765356361866, |
| "num_tokens": 41105790.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.0625, |
| "epoch": 0.42451420029895365, |
| "grad_norm": 6.0, |
| "learning_rate": 1.8417283053530043e-06, |
| "loss": 1.18603515625, |
| "mean_token_accuracy": 0.7049060165882111, |
| "num_tokens": 41695388.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 1.0478515625, |
| "epoch": 0.4304932735426009, |
| "grad_norm": 6.875, |
| "learning_rate": 1.8364701380102264e-06, |
| "loss": 1.1793212890625, |
| "mean_token_accuracy": 0.7081038281321526, |
| "num_tokens": 42271847.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 1.04833984375, |
| "epoch": 0.43647234678624813, |
| "grad_norm": 10.5625, |
| "learning_rate": 1.8311338114715027e-06, |
| "loss": 1.185791015625, |
| "mean_token_accuracy": 0.7098284065723419, |
| "num_tokens": 42855765.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 1.05517578125, |
| "epoch": 0.44245142002989535, |
| "grad_norm": 6.84375, |
| "learning_rate": 1.825719824359524e-06, |
| "loss": 1.177734375, |
| "mean_token_accuracy": 0.7076143845915794, |
| "num_tokens": 43445385.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 1.0751953125, |
| "epoch": 0.4484304932735426, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.8202286825535329e-06, |
| "loss": 1.208251953125, |
| "mean_token_accuracy": 0.7024360001087189, |
| "num_tokens": 44033137.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.0888671875, |
| "epoch": 0.45440956651718983, |
| "grad_norm": 5.375, |
| "learning_rate": 1.814660899142053e-06, |
| "loss": 1.202392578125, |
| "mean_token_accuracy": 0.7014130279421806, |
| "num_tokens": 44622745.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 1.04248046875, |
| "epoch": 0.46038863976083705, |
| "grad_norm": 6.5, |
| "learning_rate": 1.8090169943749474e-06, |
| "loss": 1.18212890625, |
| "mean_token_accuracy": 0.7097717002034187, |
| "num_tokens": 45212356.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 1.0576171875, |
| "epoch": 0.4663677130044843, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.8032974956148062e-06, |
| "loss": 1.179443359375, |
| "mean_token_accuracy": 0.7062700912356377, |
| "num_tokens": 45798390.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 1.03857421875, |
| "epoch": 0.47234678624813153, |
| "grad_norm": 9.125, |
| "learning_rate": 1.7975029372876705e-06, |
| "loss": 1.1568603515625, |
| "mean_token_accuracy": 0.7130975499749184, |
| "num_tokens": 46388008.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 1.05322265625, |
| "epoch": 0.47832585949177875, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.7916338608330956e-06, |
| "loss": 1.182861328125, |
| "mean_token_accuracy": 0.7081197574734688, |
| "num_tokens": 46974446.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.0458984375, |
| "epoch": 0.484304932735426, |
| "grad_norm": 4.5625, |
| "learning_rate": 1.78569081465356e-06, |
| "loss": 1.1536865234375, |
| "mean_token_accuracy": 0.7095040455460548, |
| "num_tokens": 47564004.0, |
| "step": 81 |
| }, |
| { |
| "entropy": 1.03759765625, |
| "epoch": 0.49028400597907323, |
| "grad_norm": 3.875, |
| "learning_rate": 1.7796743540632221e-06, |
| "loss": 1.1531982421875, |
| "mean_token_accuracy": 0.7134627625346184, |
| "num_tokens": 48153580.0, |
| "step": 82 |
| }, |
| { |
| "entropy": 1.0625, |
| "epoch": 0.4962630792227205, |
| "grad_norm": 6.5, |
| "learning_rate": 1.7735850412360328e-06, |
| "loss": 1.177490234375, |
| "mean_token_accuracy": 0.7081628367304802, |
| "num_tokens": 48740404.0, |
| "step": 83 |
| }, |
| { |
| "entropy": 1.048828125, |
| "epoch": 0.5022421524663677, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.7674234451532063e-06, |
| "loss": 1.1700439453125, |
| "mean_token_accuracy": 0.7087839841842651, |
| "num_tokens": 49329930.0, |
| "step": 84 |
| }, |
| { |
| "entropy": 1.0458984375, |
| "epoch": 0.5082212257100149, |
| "grad_norm": 7.25, |
| "learning_rate": 1.7611901415500533e-06, |
| "loss": 1.16259765625, |
| "mean_token_accuracy": 0.7115066945552826, |
| "num_tokens": 49914835.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 1.0615234375, |
| "epoch": 0.5142002989536621, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.7548857128621874e-06, |
| "loss": 1.18359375, |
| "mean_token_accuracy": 0.706175908446312, |
| "num_tokens": 50504393.0, |
| "step": 86 |
| }, |
| { |
| "entropy": 1.05322265625, |
| "epoch": 0.5201793721973094, |
| "grad_norm": 4.875, |
| "learning_rate": 1.748510748171101e-06, |
| "loss": 1.1778564453125, |
| "mean_token_accuracy": 0.7066154479980469, |
| "num_tokens": 51093995.0, |
| "step": 87 |
| }, |
| { |
| "entropy": 1.052734375, |
| "epoch": 0.5261584454409567, |
| "grad_norm": 4.65625, |
| "learning_rate": 1.7420658431491222e-06, |
| "loss": 1.167236328125, |
| "mean_token_accuracy": 0.7099850177764893, |
| "num_tokens": 51683545.0, |
| "step": 88 |
| }, |
| { |
| "entropy": 1.04638671875, |
| "epoch": 0.5321375186846039, |
| "grad_norm": 9.125, |
| "learning_rate": 1.735551600003755e-06, |
| "loss": 1.157470703125, |
| "mean_token_accuracy": 0.7102955356240273, |
| "num_tokens": 52272548.0, |
| "step": 89 |
| }, |
| { |
| "entropy": 1.0302734375, |
| "epoch": 0.5381165919282511, |
| "grad_norm": 10.25, |
| "learning_rate": 1.7289686274214115e-06, |
| "loss": 1.1446533203125, |
| "mean_token_accuracy": 0.7140024453401566, |
| "num_tokens": 52862195.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.03369140625, |
| "epoch": 0.5440956651718983, |
| "grad_norm": 11.75, |
| "learning_rate": 1.722317540510534e-06, |
| "loss": 1.1495361328125, |
| "mean_token_accuracy": 0.7119031846523285, |
| "num_tokens": 53450055.0, |
| "step": 91 |
| }, |
| { |
| "entropy": 1.03564453125, |
| "epoch": 0.5500747384155455, |
| "grad_norm": 9.8125, |
| "learning_rate": 1.715598960744121e-06, |
| "loss": 1.149658203125, |
| "mean_token_accuracy": 0.7118451669812202, |
| "num_tokens": 54023463.0, |
| "step": 92 |
| }, |
| { |
| "entropy": 1.0634765625, |
| "epoch": 0.5560538116591929, |
| "grad_norm": 6.40625, |
| "learning_rate": 1.7088135159016582e-06, |
| "loss": 1.1729736328125, |
| "mean_token_accuracy": 0.7083615809679031, |
| "num_tokens": 54613056.0, |
| "step": 93 |
| }, |
| { |
| "entropy": 1.056640625, |
| "epoch": 0.5620328849028401, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.7019618400104569e-06, |
| "loss": 1.158447265625, |
| "mean_token_accuracy": 0.7101547122001648, |
| "num_tokens": 55194480.0, |
| "step": 94 |
| }, |
| { |
| "entropy": 1.046875, |
| "epoch": 0.5680119581464873, |
| "grad_norm": 9.25, |
| "learning_rate": 1.6950445732864126e-06, |
| "loss": 1.162109375, |
| "mean_token_accuracy": 0.7099513560533524, |
| "num_tokens": 55784110.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 1.0625, |
| "epoch": 0.5739910313901345, |
| "grad_norm": 6.65625, |
| "learning_rate": 1.688062362074184e-06, |
| "loss": 1.1649169921875, |
| "mean_token_accuracy": 0.7057990580797195, |
| "num_tokens": 56373674.0, |
| "step": 96 |
| }, |
| { |
| "entropy": 1.05078125, |
| "epoch": 0.5799701046337817, |
| "grad_norm": 15.125, |
| "learning_rate": 1.681015858786797e-06, |
| "loss": 1.166259765625, |
| "mean_token_accuracy": 0.7095335945487022, |
| "num_tokens": 56952628.0, |
| "step": 97 |
| }, |
| { |
| "entropy": 1.03369140625, |
| "epoch": 0.585949177877429, |
| "grad_norm": 8.3125, |
| "learning_rate": 1.6739057218446857e-06, |
| "loss": 1.156005859375, |
| "mean_token_accuracy": 0.7151156216859818, |
| "num_tokens": 57542149.0, |
| "step": 98 |
| }, |
| { |
| "entropy": 1.0380859375, |
| "epoch": 0.5919282511210763, |
| "grad_norm": 4.15625, |
| "learning_rate": 1.666732615614169e-06, |
| "loss": 1.143798828125, |
| "mean_token_accuracy": 0.7129637002944946, |
| "num_tokens": 58131730.0, |
| "step": 99 |
| }, |
| { |
| "entropy": 1.02392578125, |
| "epoch": 0.5979073243647235, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.6594972103453724e-06, |
| "loss": 1.1343994140625, |
| "mean_token_accuracy": 0.7165936082601547, |
| "num_tokens": 58721339.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 0.6038863976083707, |
| "grad_norm": 7.5, |
| "learning_rate": 1.6522001821096019e-06, |
| "loss": 1.1375732421875, |
| "mean_token_accuracy": 0.7172424420714378, |
| "num_tokens": 59310867.0, |
| "step": 101 |
| }, |
| { |
| "entropy": 1.02978515625, |
| "epoch": 0.6098654708520179, |
| "grad_norm": 11.3125, |
| "learning_rate": 1.6448422127361705e-06, |
| "loss": 1.117919921875, |
| "mean_token_accuracy": 0.7172070667147636, |
| "num_tokens": 59894680.0, |
| "step": 102 |
| }, |
| { |
| "entropy": 1.0205078125, |
| "epoch": 0.6158445440956651, |
| "grad_norm": 11.5, |
| "learning_rate": 1.6374239897486897e-06, |
| "loss": 1.1181640625, |
| "mean_token_accuracy": 0.7184400483965874, |
| "num_tokens": 60484296.0, |
| "step": 103 |
| }, |
| { |
| "entropy": 1.05078125, |
| "epoch": 0.6218236173393124, |
| "grad_norm": 8.0, |
| "learning_rate": 1.6299462063008269e-06, |
| "loss": 1.143798828125, |
| "mean_token_accuracy": 0.7096443995833397, |
| "num_tokens": 61073911.0, |
| "step": 104 |
| }, |
| { |
| "entropy": 1.064453125, |
| "epoch": 0.6278026905829597, |
| "grad_norm": 7.75, |
| "learning_rate": 1.6224095611115383e-06, |
| "loss": 1.1650390625, |
| "mean_token_accuracy": 0.7072784155607224, |
| "num_tokens": 61663572.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 1.0341796875, |
| "epoch": 0.6337817638266069, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.614814758399781e-06, |
| "loss": 1.128662109375, |
| "mean_token_accuracy": 0.71539356559515, |
| "num_tokens": 62252067.0, |
| "step": 106 |
| }, |
| { |
| "entropy": 1.03662109375, |
| "epoch": 0.6397608370702541, |
| "grad_norm": 14.125, |
| "learning_rate": 1.6071625078187112e-06, |
| "loss": 1.146240234375, |
| "mean_token_accuracy": 0.7122742831707001, |
| "num_tokens": 62841656.0, |
| "step": 107 |
| }, |
| { |
| "entropy": 1.0400390625, |
| "epoch": 0.6457399103139013, |
| "grad_norm": 14.3125, |
| "learning_rate": 1.599453524389374e-06, |
| "loss": 1.146728515625, |
| "mean_token_accuracy": 0.7149165868759155, |
| "num_tokens": 63431221.0, |
| "step": 108 |
| }, |
| { |
| "entropy": 1.0556640625, |
| "epoch": 0.6517189835575485, |
| "grad_norm": 15.375, |
| "learning_rate": 1.5916885284338935e-06, |
| "loss": 1.155029296875, |
| "mean_token_accuracy": 0.7111315131187439, |
| "num_tokens": 64020069.0, |
| "step": 109 |
| }, |
| { |
| "entropy": 1.0380859375, |
| "epoch": 0.6576980568011959, |
| "grad_norm": 12.3125, |
| "learning_rate": 1.5838682455081657e-06, |
| "loss": 1.13671875, |
| "mean_token_accuracy": 0.7157010585069656, |
| "num_tokens": 64609550.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.03076171875, |
| "epoch": 0.6636771300448431, |
| "grad_norm": 7.90625, |
| "learning_rate": 1.5759934063340624e-06, |
| "loss": 1.1343994140625, |
| "mean_token_accuracy": 0.7165603339672089, |
| "num_tokens": 65199109.0, |
| "step": 111 |
| }, |
| { |
| "entropy": 1.05615234375, |
| "epoch": 0.6696562032884903, |
| "grad_norm": 12.5, |
| "learning_rate": 1.5680647467311555e-06, |
| "loss": 1.1683349609375, |
| "mean_token_accuracy": 0.7091170027852058, |
| "num_tokens": 65788728.0, |
| "step": 112 |
| }, |
| { |
| "entropy": 1.03759765625, |
| "epoch": 0.6756352765321375, |
| "grad_norm": 8.9375, |
| "learning_rate": 1.56008300754796e-06, |
| "loss": 1.1407470703125, |
| "mean_token_accuracy": 0.7141014188528061, |
| "num_tokens": 66370343.0, |
| "step": 113 |
| }, |
| { |
| "entropy": 1.0478515625, |
| "epoch": 0.6816143497757847, |
| "grad_norm": 12.5, |
| "learning_rate": 1.5520489345927094e-06, |
| "loss": 1.1500244140625, |
| "mean_token_accuracy": 0.7122905552387238, |
| "num_tokens": 66955876.0, |
| "step": 114 |
| }, |
| { |
| "entropy": 1.02392578125, |
| "epoch": 0.6875934230194319, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.5439632785636705e-06, |
| "loss": 1.135498046875, |
| "mean_token_accuracy": 0.7167380154132843, |
| "num_tokens": 67545426.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 1.052734375, |
| "epoch": 0.6935724962630793, |
| "grad_norm": 5.0625, |
| "learning_rate": 1.5358267949789964e-06, |
| "loss": 1.1448974609375, |
| "mean_token_accuracy": 0.7125077843666077, |
| "num_tokens": 68134997.0, |
| "step": 116 |
| }, |
| { |
| "entropy": 1.041015625, |
| "epoch": 0.6995515695067265, |
| "grad_norm": 5.75, |
| "learning_rate": 1.5276402441061327e-06, |
| "loss": 1.125732421875, |
| "mean_token_accuracy": 0.7163447961211205, |
| "num_tokens": 68724591.0, |
| "step": 117 |
| }, |
| { |
| "entropy": 1.02587890625, |
| "epoch": 0.7055306427503737, |
| "grad_norm": 9.625, |
| "learning_rate": 1.5194043908907772e-06, |
| "loss": 1.131103515625, |
| "mean_token_accuracy": 0.716623105108738, |
| "num_tokens": 69314113.0, |
| "step": 118 |
| }, |
| { |
| "entropy": 1.005859375, |
| "epoch": 0.7115097159940209, |
| "grad_norm": 5.09375, |
| "learning_rate": 1.5111200048854054e-06, |
| "loss": 1.1011962890625, |
| "mean_token_accuracy": 0.7245713621377945, |
| "num_tokens": 69903640.0, |
| "step": 119 |
| }, |
| { |
| "entropy": 1.044921875, |
| "epoch": 0.7174887892376681, |
| "grad_norm": 6.59375, |
| "learning_rate": 1.5027878601773632e-06, |
| "loss": 1.1431884765625, |
| "mean_token_accuracy": 0.7117293328046799, |
| "num_tokens": 70493259.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.0615234375, |
| "epoch": 0.7234678624813154, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.494408735316537e-06, |
| "loss": 1.15380859375, |
| "mean_token_accuracy": 0.7094393074512482, |
| "num_tokens": 71082785.0, |
| "step": 121 |
| }, |
| { |
| "entropy": 1.07861328125, |
| "epoch": 0.7294469357249627, |
| "grad_norm": 16.625, |
| "learning_rate": 1.4859834132426058e-06, |
| "loss": 1.1781005859375, |
| "mean_token_accuracy": 0.7051993981003761, |
| "num_tokens": 71666506.0, |
| "step": 122 |
| }, |
| { |
| "entropy": 1.0166015625, |
| "epoch": 0.7354260089686099, |
| "grad_norm": 16.875, |
| "learning_rate": 1.4775126812118863e-06, |
| "loss": 1.1251220703125, |
| "mean_token_accuracy": 0.7166302278637886, |
| "num_tokens": 72250385.0, |
| "step": 123 |
| }, |
| { |
| "entropy": 1.05615234375, |
| "epoch": 0.7414050822122571, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.4689973307237686e-06, |
| "loss": 1.15478515625, |
| "mean_token_accuracy": 0.709770917892456, |
| "num_tokens": 72829598.0, |
| "step": 124 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 0.7473841554559043, |
| "grad_norm": 9.9375, |
| "learning_rate": 1.4604381574467614e-06, |
| "loss": 1.13037109375, |
| "mean_token_accuracy": 0.7166016399860382, |
| "num_tokens": 73419110.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.013671875, |
| "epoch": 0.7533632286995515, |
| "grad_norm": 16.625, |
| "learning_rate": 1.451835961144145e-06, |
| "loss": 1.1103515625, |
| "mean_token_accuracy": 0.7211827859282494, |
| "num_tokens": 74008729.0, |
| "step": 126 |
| }, |
| { |
| "entropy": 1.0126953125, |
| "epoch": 0.7593423019431988, |
| "grad_norm": 13.0625, |
| "learning_rate": 1.4431915455992414e-06, |
| "loss": 1.1024169921875, |
| "mean_token_accuracy": 0.7223981395363808, |
| "num_tokens": 74598306.0, |
| "step": 127 |
| }, |
| { |
| "entropy": 1.056640625, |
| "epoch": 0.7653213751868461, |
| "grad_norm": 7.75, |
| "learning_rate": 1.4345057185403098e-06, |
| "loss": 1.15869140625, |
| "mean_token_accuracy": 0.7109938785433769, |
| "num_tokens": 75187853.0, |
| "step": 128 |
| }, |
| { |
| "entropy": 1.037109375, |
| "epoch": 0.7713004484304933, |
| "grad_norm": 12.875, |
| "learning_rate": 1.4257792915650725e-06, |
| "loss": 1.13232421875, |
| "mean_token_accuracy": 0.7139059007167816, |
| "num_tokens": 75777399.0, |
| "step": 129 |
| }, |
| { |
| "entropy": 1.0390625, |
| "epoch": 0.7772795216741405, |
| "grad_norm": 11.375, |
| "learning_rate": 1.4170130800648812e-06, |
| "loss": 1.1455078125, |
| "mean_token_accuracy": 0.7116378918290138, |
| "num_tokens": 76367001.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.0419921875, |
| "epoch": 0.7832585949177877, |
| "grad_norm": 10.875, |
| "learning_rate": 1.408207903148525e-06, |
| "loss": 1.1370849609375, |
| "mean_token_accuracy": 0.7141571119427681, |
| "num_tokens": 76956562.0, |
| "step": 131 |
| }, |
| { |
| "entropy": 1.033203125, |
| "epoch": 0.7892376681614349, |
| "grad_norm": 17.625, |
| "learning_rate": 1.3993645835656952e-06, |
| "loss": 1.147705078125, |
| "mean_token_accuracy": 0.7140598297119141, |
| "num_tokens": 77544057.0, |
| "step": 132 |
| }, |
| { |
| "entropy": 0.99755859375, |
| "epoch": 0.7952167414050823, |
| "grad_norm": 18.875, |
| "learning_rate": 1.3904839476301088e-06, |
| "loss": 1.085693359375, |
| "mean_token_accuracy": 0.7245375439524651, |
| "num_tokens": 78133581.0, |
| "step": 133 |
| }, |
| { |
| "entropy": 1.0478515625, |
| "epoch": 0.8011958146487295, |
| "grad_norm": 19.125, |
| "learning_rate": 1.3815668251422953e-06, |
| "loss": 1.14013671875, |
| "mean_token_accuracy": 0.7118667960166931, |
| "num_tokens": 78723253.0, |
| "step": 134 |
| }, |
| { |
| "entropy": 1.03271484375, |
| "epoch": 0.8071748878923767, |
| "grad_norm": 24.125, |
| "learning_rate": 1.3726140493120637e-06, |
| "loss": 1.1357421875, |
| "mean_token_accuracy": 0.7158161103725433, |
| "num_tokens": 79306761.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 1.02685546875, |
| "epoch": 0.8131539611360239, |
| "grad_norm": 28.0, |
| "learning_rate": 1.363626456680647e-06, |
| "loss": 1.125244140625, |
| "mean_token_accuracy": 0.7170991152524948, |
| "num_tokens": 79893309.0, |
| "step": 136 |
| }, |
| { |
| "entropy": 1.05224609375, |
| "epoch": 0.8191330343796711, |
| "grad_norm": 24.125, |
| "learning_rate": 1.3546048870425354e-06, |
| "loss": 1.148681640625, |
| "mean_token_accuracy": 0.7112774699926376, |
| "num_tokens": 80482935.0, |
| "step": 137 |
| }, |
| { |
| "entropy": 1.05615234375, |
| "epoch": 0.8251121076233184, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.3455501833670087e-06, |
| "loss": 1.134033203125, |
| "mean_token_accuracy": 0.7125924825668335, |
| "num_tokens": 81072531.0, |
| "step": 138 |
| }, |
| { |
| "entropy": 1.03271484375, |
| "epoch": 0.8310911808669657, |
| "grad_norm": 19.375, |
| "learning_rate": 1.336463191719367e-06, |
| "loss": 1.12335205078125, |
| "mean_token_accuracy": 0.7159583121538162, |
| "num_tokens": 81654332.0, |
| "step": 139 |
| }, |
| { |
| "entropy": 1.052734375, |
| "epoch": 0.8370702541106129, |
| "grad_norm": 20.125, |
| "learning_rate": 1.3273447611818766e-06, |
| "loss": 1.1549072265625, |
| "mean_token_accuracy": 0.7113095596432686, |
| "num_tokens": 82243896.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.0322265625, |
| "epoch": 0.8430493273542601, |
| "grad_norm": 14.6875, |
| "learning_rate": 1.3181957437744332e-06, |
| "loss": 1.128662109375, |
| "mean_token_accuracy": 0.7145743370056152, |
| "num_tokens": 82826175.0, |
| "step": 141 |
| }, |
| { |
| "entropy": 1.0458984375, |
| "epoch": 0.8490284005979073, |
| "grad_norm": 7.71875, |
| "learning_rate": 1.3090169943749473e-06, |
| "loss": 1.12841796875, |
| "mean_token_accuracy": 0.7128350734710693, |
| "num_tokens": 83415822.0, |
| "step": 142 |
| }, |
| { |
| "entropy": 1.03759765625, |
| "epoch": 0.8550074738415545, |
| "grad_norm": 22.875, |
| "learning_rate": 1.2998093706394675e-06, |
| "loss": 1.14453125, |
| "mean_token_accuracy": 0.7128356993198395, |
| "num_tokens": 83997557.0, |
| "step": 143 |
| }, |
| { |
| "entropy": 1.05078125, |
| "epoch": 0.8609865470852018, |
| "grad_norm": 13.875, |
| "learning_rate": 1.2905737329220392e-06, |
| "loss": 1.136474609375, |
| "mean_token_accuracy": 0.7111846879124641, |
| "num_tokens": 84587168.0, |
| "step": 144 |
| }, |
| { |
| "entropy": 1.04638671875, |
| "epoch": 0.866965620328849, |
| "grad_norm": 15.25, |
| "learning_rate": 1.2813109441943164e-06, |
| "loss": 1.138671875, |
| "mean_token_accuracy": 0.7117553874850273, |
| "num_tokens": 85176064.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 1.0205078125, |
| "epoch": 0.8729446935724963, |
| "grad_norm": 13.625, |
| "learning_rate": 1.2720218699649241e-06, |
| "loss": 1.111572265625, |
| "mean_token_accuracy": 0.7199290543794632, |
| "num_tokens": 85765635.0, |
| "step": 146 |
| }, |
| { |
| "entropy": 1.02197265625, |
| "epoch": 0.8789237668161435, |
| "grad_norm": 10.5625, |
| "learning_rate": 1.262707378198587e-06, |
| "loss": 1.1162109375, |
| "mean_token_accuracy": 0.7189603447914124, |
| "num_tokens": 86355190.0, |
| "step": 147 |
| }, |
| { |
| "entropy": 1.04736328125, |
| "epoch": 0.8849028400597907, |
| "grad_norm": 11.75, |
| "learning_rate": 1.2533683392350262e-06, |
| "loss": 1.138427734375, |
| "mean_token_accuracy": 0.7134011015295982, |
| "num_tokens": 86938046.0, |
| "step": 148 |
| }, |
| { |
| "entropy": 1.02490234375, |
| "epoch": 0.890881913303438, |
| "grad_norm": 19.5, |
| "learning_rate": 1.2440056257076374e-06, |
| "loss": 1.113037109375, |
| "mean_token_accuracy": 0.7172698378562927, |
| "num_tokens": 87527598.0, |
| "step": 149 |
| }, |
| { |
| "entropy": 1.0146484375, |
| "epoch": 0.8968609865470852, |
| "grad_norm": 14.0, |
| "learning_rate": 1.23462011246195e-06, |
| "loss": 1.11181640625, |
| "mean_token_accuracy": 0.7199216857552528, |
| "num_tokens": 88110264.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.03271484375, |
| "epoch": 0.9028400597907325, |
| "grad_norm": 8.9375, |
| "learning_rate": 1.2252126764738844e-06, |
| "loss": 1.12353515625, |
| "mean_token_accuracy": 0.716008372604847, |
| "num_tokens": 88699832.0, |
| "step": 151 |
| }, |
| { |
| "entropy": 1.046875, |
| "epoch": 0.9088191330343797, |
| "grad_norm": 8.125, |
| "learning_rate": 1.2157841967678063e-06, |
| "loss": 1.130126953125, |
| "mean_token_accuracy": 0.714422382414341, |
| "num_tokens": 89289495.0, |
| "step": 152 |
| }, |
| { |
| "entropy": 1.00439453125, |
| "epoch": 0.9147982062780269, |
| "grad_norm": 12.0, |
| "learning_rate": 1.2063355543343923e-06, |
| "loss": 1.08837890625, |
| "mean_token_accuracy": 0.7251131683588028, |
| "num_tokens": 89879089.0, |
| "step": 153 |
| }, |
| { |
| "entropy": 1.0244140625, |
| "epoch": 0.9207772795216741, |
| "grad_norm": 14.3125, |
| "learning_rate": 1.1968676320483101e-06, |
| "loss": 1.1243896484375, |
| "mean_token_accuracy": 0.7171234339475632, |
| "num_tokens": 90451022.0, |
| "step": 154 |
| }, |
| { |
| "entropy": 1.03466796875, |
| "epoch": 0.9267563527653214, |
| "grad_norm": 28.75, |
| "learning_rate": 1.1873813145857248e-06, |
| "loss": 1.1207275390625, |
| "mean_token_accuracy": 0.7154998481273651, |
| "num_tokens": 91036040.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 1.03125, |
| "epoch": 0.9327354260089686, |
| "grad_norm": 33.25, |
| "learning_rate": 1.1778774883416322e-06, |
| "loss": 1.119873046875, |
| "mean_token_accuracy": 0.715819425880909, |
| "num_tokens": 91625659.0, |
| "step": 156 |
| }, |
| { |
| "entropy": 1.0400390625, |
| "epoch": 0.9387144992526159, |
| "grad_norm": 13.625, |
| "learning_rate": 1.1683570413470383e-06, |
| "loss": 1.1197509765625, |
| "mean_token_accuracy": 0.7150726914405823, |
| "num_tokens": 92215320.0, |
| "step": 157 |
| }, |
| { |
| "entropy": 1.01953125, |
| "epoch": 0.9446935724962631, |
| "grad_norm": 21.125, |
| "learning_rate": 1.1588208631859807e-06, |
| "loss": 1.1259765625, |
| "mean_token_accuracy": 0.7184253633022308, |
| "num_tokens": 92804840.0, |
| "step": 158 |
| }, |
| { |
| "entropy": 1.03076171875, |
| "epoch": 0.9506726457399103, |
| "grad_norm": 14.0, |
| "learning_rate": 1.149269844912404e-06, |
| "loss": 1.115234375, |
| "mean_token_accuracy": 0.7175646647810936, |
| "num_tokens": 93394439.0, |
| "step": 159 |
| }, |
| { |
| "entropy": 1.0546875, |
| "epoch": 0.9566517189835575, |
| "grad_norm": 17.5, |
| "learning_rate": 1.1397048789669059e-06, |
| "loss": 1.13916015625, |
| "mean_token_accuracy": 0.7107137218117714, |
| "num_tokens": 93979057.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.05859375, |
| "epoch": 0.9626307922272048, |
| "grad_norm": 15.25, |
| "learning_rate": 1.1301268590933434e-06, |
| "loss": 1.14404296875, |
| "mean_token_accuracy": 0.71033675968647, |
| "num_tokens": 94568560.0, |
| "step": 161 |
| }, |
| { |
| "entropy": 1.0244140625, |
| "epoch": 0.968609865470852, |
| "grad_norm": 20.125, |
| "learning_rate": 1.1205366802553228e-06, |
| "loss": 1.1131591796875, |
| "mean_token_accuracy": 0.7181500568985939, |
| "num_tokens": 95158163.0, |
| "step": 162 |
| }, |
| { |
| "entropy": 1.048828125, |
| "epoch": 0.9745889387144993, |
| "grad_norm": 21.25, |
| "learning_rate": 1.110935238552578e-06, |
| "loss": 1.1319580078125, |
| "mean_token_accuracy": 0.7127460688352585, |
| "num_tokens": 95747756.0, |
| "step": 163 |
| }, |
| { |
| "entropy": 1.017578125, |
| "epoch": 0.9805680119581465, |
| "grad_norm": 25.125, |
| "learning_rate": 1.1013234311372353e-06, |
| "loss": 1.1143798828125, |
| "mean_token_accuracy": 0.7193858399987221, |
| "num_tokens": 96337278.0, |
| "step": 164 |
| }, |
| { |
| "entropy": 1.02783203125, |
| "epoch": 0.9865470852017937, |
| "grad_norm": 18.375, |
| "learning_rate": 1.0917021561299862e-06, |
| "loss": 1.1024169921875, |
| "mean_token_accuracy": 0.7175654470920563, |
| "num_tokens": 96926854.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 1.060546875, |
| "epoch": 0.992526158445441, |
| "grad_norm": 11.375, |
| "learning_rate": 1.0820723125361684e-06, |
| "loss": 1.13623046875, |
| "mean_token_accuracy": 0.710972748696804, |
| "num_tokens": 97507731.0, |
| "step": 166 |
| }, |
| { |
| "entropy": 1.0107421875, |
| "epoch": 0.9985052316890882, |
| "grad_norm": 24.875, |
| "learning_rate": 1.0724348001617625e-06, |
| "loss": 1.1070556640625, |
| "mean_token_accuracy": 0.7217210680246353, |
| "num_tokens": 98097346.0, |
| "step": 167 |
| }, |
| { |
| "entropy": 0.9921875, |
| "epoch": 1.0, |
| "grad_norm": 23.5, |
| "learning_rate": 1.0627905195293135e-06, |
| "loss": 1.072265625, |
| "mean_token_accuracy": 0.726732075214386, |
| "num_tokens": 98244774.0, |
| "step": 168 |
| }, |
| { |
| "entropy": 1.037109375, |
| "epoch": 1.0059790732436473, |
| "grad_norm": 21.875, |
| "learning_rate": 1.0531403717937886e-06, |
| "loss": 1.1287841796875, |
| "mean_token_accuracy": 0.715124748647213, |
| "num_tokens": 98834357.0, |
| "step": 169 |
| }, |
| { |
| "entropy": 0.98046875, |
| "epoch": 1.0119581464872944, |
| "grad_norm": 20.875, |
| "learning_rate": 1.0434852586583737e-06, |
| "loss": 1.0762939453125, |
| "mean_token_accuracy": 0.7286977842450142, |
| "num_tokens": 99424044.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.0498046875, |
| "epoch": 1.0179372197309418, |
| "grad_norm": 21.875, |
| "learning_rate": 1.0338260822902165e-06, |
| "loss": 1.1475830078125, |
| "mean_token_accuracy": 0.7115440741181374, |
| "num_tokens": 100013632.0, |
| "step": 171 |
| }, |
| { |
| "entropy": 1.03759765625, |
| "epoch": 1.0239162929745889, |
| "grad_norm": 17.25, |
| "learning_rate": 1.0241637452361322e-06, |
| "loss": 1.133056640625, |
| "mean_token_accuracy": 0.713756151497364, |
| "num_tokens": 100603269.0, |
| "step": 172 |
| }, |
| { |
| "entropy": 1.02197265625, |
| "epoch": 1.0298953662182362, |
| "grad_norm": 15.8125, |
| "learning_rate": 1.0144991503382673e-06, |
| "loss": 1.1068115234375, |
| "mean_token_accuracy": 0.7196066528558731, |
| "num_tokens": 101191071.0, |
| "step": 173 |
| }, |
| { |
| "entropy": 1.0302734375, |
| "epoch": 1.0358744394618835, |
| "grad_norm": 15.5, |
| "learning_rate": 1.0048332006497404e-06, |
| "loss": 1.111572265625, |
| "mean_token_accuracy": 0.7173566892743111, |
| "num_tokens": 101780702.0, |
| "step": 174 |
| }, |
| { |
| "entropy": 1.0576171875, |
| "epoch": 1.0418535127055306, |
| "grad_norm": 9.25, |
| "learning_rate": 9.951667993502597e-07, |
| "loss": 1.1553955078125, |
| "mean_token_accuracy": 0.7085662558674812, |
| "num_tokens": 102367028.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 1.05322265625, |
| "epoch": 1.047832585949178, |
| "grad_norm": 9.0625, |
| "learning_rate": 9.855008496617326e-07, |
| "loss": 1.1552734375, |
| "mean_token_accuracy": 0.7092385366559029, |
| "num_tokens": 102956643.0, |
| "step": 176 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 1.053811659192825, |
| "grad_norm": 29.375, |
| "learning_rate": 9.75836254763868e-07, |
| "loss": 1.136474609375, |
| "mean_token_accuracy": 0.7162440121173859, |
| "num_tokens": 103535589.0, |
| "step": 177 |
| }, |
| { |
| "entropy": 1.0029296875, |
| "epoch": 1.0597907324364724, |
| "grad_norm": 11.9375, |
| "learning_rate": 9.661739177097834e-07, |
| "loss": 1.0927734375, |
| "mean_token_accuracy": 0.7226409837603569, |
| "num_tokens": 104124613.0, |
| "step": 178 |
| }, |
| { |
| "entropy": 1.04150390625, |
| "epoch": 1.0657698056801195, |
| "grad_norm": 12.0, |
| "learning_rate": 9.565147413416265e-07, |
| "loss": 1.1234130859375, |
| "mean_token_accuracy": 0.7134781181812286, |
| "num_tokens": 104714199.0, |
| "step": 179 |
| }, |
| { |
| "entropy": 1.01318359375, |
| "epoch": 1.0717488789237668, |
| "grad_norm": 13.8125, |
| "learning_rate": 9.468596282062113e-07, |
| "loss": 1.1014404296875, |
| "mean_token_accuracy": 0.7213335856795311, |
| "num_tokens": 105294866.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.05322265625, |
| "epoch": 1.0777279521674141, |
| "grad_norm": 14.625, |
| "learning_rate": 9.372094804706866e-07, |
| "loss": 1.152587890625, |
| "mean_token_accuracy": 0.7105412855744362, |
| "num_tokens": 105884489.0, |
| "step": 181 |
| }, |
| { |
| "entropy": 1.01025390625, |
| "epoch": 1.0837070254110612, |
| "grad_norm": 11.0, |
| "learning_rate": 9.275651998382377e-07, |
| "loss": 1.101318359375, |
| "mean_token_accuracy": 0.7201149016618729, |
| "num_tokens": 106474079.0, |
| "step": 182 |
| }, |
| { |
| "entropy": 1.02001953125, |
| "epoch": 1.0896860986547086, |
| "grad_norm": 8.3125, |
| "learning_rate": 9.179276874638314e-07, |
| "loss": 1.107666015625, |
| "mean_token_accuracy": 0.7216706648468971, |
| "num_tokens": 107063687.0, |
| "step": 183 |
| }, |
| { |
| "entropy": 1.02685546875, |
| "epoch": 1.0956651718983557, |
| "grad_norm": 11.625, |
| "learning_rate": 9.082978438700138e-07, |
| "loss": 1.125732421875, |
| "mean_token_accuracy": 0.7165105268359184, |
| "num_tokens": 107649683.0, |
| "step": 184 |
| }, |
| { |
| "entropy": 1.0322265625, |
| "epoch": 1.101644245142003, |
| "grad_norm": 10.625, |
| "learning_rate": 8.986765688627651e-07, |
| "loss": 1.10595703125, |
| "mean_token_accuracy": 0.7177045792341232, |
| "num_tokens": 108239185.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 1.0166015625, |
| "epoch": 1.1076233183856503, |
| "grad_norm": 11.5625, |
| "learning_rate": 8.890647614474222e-07, |
| "loss": 1.1109619140625, |
| "mean_token_accuracy": 0.7202980294823647, |
| "num_tokens": 108828659.0, |
| "step": 186 |
| }, |
| { |
| "entropy": 1.04833984375, |
| "epoch": 1.1136023916292974, |
| "grad_norm": 23.25, |
| "learning_rate": 8.79463319744677e-07, |
| "loss": 1.14404296875, |
| "mean_token_accuracy": 0.7117270454764366, |
| "num_tokens": 109415343.0, |
| "step": 187 |
| }, |
| { |
| "entropy": 1.01904296875, |
| "epoch": 1.1195814648729447, |
| "grad_norm": 20.5, |
| "learning_rate": 8.698731409066568e-07, |
| "loss": 1.1033935546875, |
| "mean_token_accuracy": 0.7186397314071655, |
| "num_tokens": 110000047.0, |
| "step": 188 |
| }, |
| { |
| "entropy": 1.0185546875, |
| "epoch": 1.1255605381165918, |
| "grad_norm": 17.0, |
| "learning_rate": 8.602951210330941e-07, |
| "loss": 1.1114501953125, |
| "mean_token_accuracy": 0.7207945957779884, |
| "num_tokens": 110589519.0, |
| "step": 189 |
| }, |
| { |
| "entropy": 1.01171875, |
| "epoch": 1.1315396113602392, |
| "grad_norm": 19.625, |
| "learning_rate": 8.507301550875959e-07, |
| "loss": 1.1103515625, |
| "mean_token_accuracy": 0.7186660766601562, |
| "num_tokens": 111179017.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.05859375, |
| "epoch": 1.1375186846038865, |
| "grad_norm": 19.5, |
| "learning_rate": 8.411791368140195e-07, |
| "loss": 1.1348876953125, |
| "mean_token_accuracy": 0.7089879661798477, |
| "num_tokens": 111761675.0, |
| "step": 191 |
| }, |
| { |
| "entropy": 1.033203125, |
| "epoch": 1.1434977578475336, |
| "grad_norm": 15.125, |
| "learning_rate": 8.316429586529614e-07, |
| "loss": 1.1116943359375, |
| "mean_token_accuracy": 0.7168847694993019, |
| "num_tokens": 112351282.0, |
| "step": 192 |
| }, |
| { |
| "entropy": 1.015625, |
| "epoch": 1.149476831091181, |
| "grad_norm": 10.0625, |
| "learning_rate": 8.221225116583676e-07, |
| "loss": 1.0850830078125, |
| "mean_token_accuracy": 0.7229639515280724, |
| "num_tokens": 112940935.0, |
| "step": 193 |
| }, |
| { |
| "entropy": 1.04150390625, |
| "epoch": 1.155455904334828, |
| "grad_norm": 9.75, |
| "learning_rate": 8.126186854142751e-07, |
| "loss": 1.1219482421875, |
| "mean_token_accuracy": 0.7170581594109535, |
| "num_tokens": 113530462.0, |
| "step": 194 |
| }, |
| { |
| "entropy": 1.02880859375, |
| "epoch": 1.1614349775784754, |
| "grad_norm": 17.625, |
| "learning_rate": 8.031323679516899e-07, |
| "loss": 1.130859375, |
| "mean_token_accuracy": 0.715842954814434, |
| "num_tokens": 114115879.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 1.0419921875, |
| "epoch": 1.1674140508221225, |
| "grad_norm": 10.1875, |
| "learning_rate": 7.936644456656081e-07, |
| "loss": 1.1396484375, |
| "mean_token_accuracy": 0.713227279484272, |
| "num_tokens": 114705390.0, |
| "step": 196 |
| }, |
| { |
| "entropy": 1.03662109375, |
| "epoch": 1.1733931240657698, |
| "grad_norm": 23.375, |
| "learning_rate": 7.84215803232194e-07, |
| "loss": 1.1226806640625, |
| "mean_token_accuracy": 0.7155178636312485, |
| "num_tokens": 115286747.0, |
| "step": 197 |
| }, |
| { |
| "entropy": 1.025390625, |
| "epoch": 1.1793721973094171, |
| "grad_norm": 16.5, |
| "learning_rate": 7.747873235261156e-07, |
| "loss": 1.1126708984375, |
| "mean_token_accuracy": 0.7184700071811676, |
| "num_tokens": 115876348.0, |
| "step": 198 |
| }, |
| { |
| "entropy": 1.02880859375, |
| "epoch": 1.1853512705530642, |
| "grad_norm": 27.125, |
| "learning_rate": 7.653798875380499e-07, |
| "loss": 1.1217041015625, |
| "mean_token_accuracy": 0.7165053337812424, |
| "num_tokens": 116458527.0, |
| "step": 199 |
| }, |
| { |
| "entropy": 1.02392578125, |
| "epoch": 1.1913303437967115, |
| "grad_norm": 17.5, |
| "learning_rate": 7.559943742923625e-07, |
| "loss": 1.10888671875, |
| "mean_token_accuracy": 0.7185152769088745, |
| "num_tokens": 117048168.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.0390625, |
| "epoch": 1.1973094170403586, |
| "grad_norm": 22.625, |
| "learning_rate": 7.466316607649736e-07, |
| "loss": 1.130126953125, |
| "mean_token_accuracy": 0.7132042795419693, |
| "num_tokens": 117636670.0, |
| "step": 201 |
| }, |
| { |
| "entropy": 1.04833984375, |
| "epoch": 1.203288490284006, |
| "grad_norm": 24.75, |
| "learning_rate": 7.372926218014131e-07, |
| "loss": 1.132568359375, |
| "mean_token_accuracy": 0.7136494368314743, |
| "num_tokens": 118226230.0, |
| "step": 202 |
| }, |
| { |
| "entropy": 1.0576171875, |
| "epoch": 1.2092675635276533, |
| "grad_norm": 17.875, |
| "learning_rate": 7.279781300350757e-07, |
| "loss": 1.1424560546875, |
| "mean_token_accuracy": 0.711665190756321, |
| "num_tokens": 118815835.0, |
| "step": 203 |
| }, |
| { |
| "entropy": 1.01953125, |
| "epoch": 1.2152466367713004, |
| "grad_norm": 19.125, |
| "learning_rate": 7.186890558056836e-07, |
| "loss": 1.1112060546875, |
| "mean_token_accuracy": 0.7197611033916473, |
| "num_tokens": 119402683.0, |
| "step": 204 |
| }, |
| { |
| "entropy": 1.0546875, |
| "epoch": 1.2212257100149477, |
| "grad_norm": 52.25, |
| "learning_rate": 7.09426267077961e-07, |
| "loss": 1.150146484375, |
| "mean_token_accuracy": 0.7076791599392891, |
| "num_tokens": 119987245.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 1.03076171875, |
| "epoch": 1.2272047832585948, |
| "grad_norm": 56.5, |
| "learning_rate": 7.001906293605329e-07, |
| "loss": 1.130615234375, |
| "mean_token_accuracy": 0.7152413129806519, |
| "num_tokens": 120576831.0, |
| "step": 206 |
| }, |
| { |
| "entropy": 1.0244140625, |
| "epoch": 1.2331838565022422, |
| "grad_norm": 15.9375, |
| "learning_rate": 6.909830056250526e-07, |
| "loss": 1.1173095703125, |
| "mean_token_accuracy": 0.7177402079105377, |
| "num_tokens": 121166431.0, |
| "step": 207 |
| }, |
| { |
| "entropy": 1.03369140625, |
| "epoch": 1.2391629297458895, |
| "grad_norm": 54.25, |
| "learning_rate": 6.81804256225567e-07, |
| "loss": 1.1336669921875, |
| "mean_token_accuracy": 0.7143785133957863, |
| "num_tokens": 121756094.0, |
| "step": 208 |
| }, |
| { |
| "entropy": 1.0380859375, |
| "epoch": 1.2451420029895366, |
| "grad_norm": 16.25, |
| "learning_rate": 6.726552388181233e-07, |
| "loss": 1.1319580078125, |
| "mean_token_accuracy": 0.714967779815197, |
| "num_tokens": 122337877.0, |
| "step": 209 |
| }, |
| { |
| "entropy": 1.0263671875, |
| "epoch": 1.251121076233184, |
| "grad_norm": 14.25, |
| "learning_rate": 6.63536808280633e-07, |
| "loss": 1.109130859375, |
| "mean_token_accuracy": 0.7193189635872841, |
| "num_tokens": 122927447.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.02587890625, |
| "epoch": 1.257100149476831, |
| "grad_norm": 13.3125, |
| "learning_rate": 6.544498166329912e-07, |
| "loss": 1.113525390625, |
| "mean_token_accuracy": 0.7177118062973022, |
| "num_tokens": 123509882.0, |
| "step": 211 |
| }, |
| { |
| "entropy": 1.0078125, |
| "epoch": 1.2630792227204783, |
| "grad_norm": 15.875, |
| "learning_rate": 6.453951129574643e-07, |
| "loss": 1.0953369140625, |
| "mean_token_accuracy": 0.722569465637207, |
| "num_tokens": 124099443.0, |
| "step": 212 |
| }, |
| { |
| "entropy": 1.048828125, |
| "epoch": 1.2690582959641254, |
| "grad_norm": 17.25, |
| "learning_rate": 6.363735433193529e-07, |
| "loss": 1.1336669921875, |
| "mean_token_accuracy": 0.7113273218274117, |
| "num_tokens": 124682200.0, |
| "step": 213 |
| }, |
| { |
| "entropy": 1.01318359375, |
| "epoch": 1.2750373692077728, |
| "grad_norm": 16.625, |
| "learning_rate": 6.273859506879364e-07, |
| "loss": 1.10498046875, |
| "mean_token_accuracy": 0.7205186262726784, |
| "num_tokens": 125265808.0, |
| "step": 214 |
| }, |
| { |
| "entropy": 1.013671875, |
| "epoch": 1.28101644245142, |
| "grad_norm": 10.75, |
| "learning_rate": 6.18433174857705e-07, |
| "loss": 1.112060546875, |
| "mean_token_accuracy": 0.7207604125142097, |
| "num_tokens": 125855421.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 1.0322265625, |
| "epoch": 1.2869955156950672, |
| "grad_norm": 8.75, |
| "learning_rate": 6.095160523698912e-07, |
| "loss": 1.118408203125, |
| "mean_token_accuracy": 0.7177054435014725, |
| "num_tokens": 126445003.0, |
| "step": 216 |
| }, |
| { |
| "entropy": 1.021484375, |
| "epoch": 1.2929745889387145, |
| "grad_norm": 9.0625, |
| "learning_rate": 6.006354164343046e-07, |
| "loss": 1.110595703125, |
| "mean_token_accuracy": 0.7162402048707008, |
| "num_tokens": 127027787.0, |
| "step": 217 |
| }, |
| { |
| "entropy": 1.05029296875, |
| "epoch": 1.2989536621823619, |
| "grad_norm": 11.5625, |
| "learning_rate": 5.917920968514751e-07, |
| "loss": 1.1461181640625, |
| "mean_token_accuracy": 0.7097650542855263, |
| "num_tokens": 127617320.0, |
| "step": 218 |
| }, |
| { |
| "entropy": 1.03662109375, |
| "epoch": 1.304932735426009, |
| "grad_norm": 12.4375, |
| "learning_rate": 5.829869199351187e-07, |
| "loss": 1.1298828125, |
| "mean_token_accuracy": 0.7168288081884384, |
| "num_tokens": 128206868.0, |
| "step": 219 |
| }, |
| { |
| "entropy": 1.0361328125, |
| "epoch": 1.310911808669656, |
| "grad_norm": 10.5, |
| "learning_rate": 5.742207084349273e-07, |
| "loss": 1.1165771484375, |
| "mean_token_accuracy": 0.7156732380390167, |
| "num_tokens": 128796426.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.01904296875, |
| "epoch": 1.3168908819133034, |
| "grad_norm": 13.25, |
| "learning_rate": 5.654942814596901e-07, |
| "loss": 1.10205078125, |
| "mean_token_accuracy": 0.7199403569102287, |
| "num_tokens": 129385997.0, |
| "step": 221 |
| }, |
| { |
| "entropy": 1.02392578125, |
| "epoch": 1.3228699551569507, |
| "grad_norm": 15.0625, |
| "learning_rate": 5.568084544007588e-07, |
| "loss": 1.11083984375, |
| "mean_token_accuracy": 0.7177979946136475, |
| "num_tokens": 129961180.0, |
| "step": 222 |
| }, |
| { |
| "entropy": 1.00927734375, |
| "epoch": 1.3288490284005978, |
| "grad_norm": 20.375, |
| "learning_rate": 5.48164038855855e-07, |
| "loss": 1.094482421875, |
| "mean_token_accuracy": 0.7219114229083061, |
| "num_tokens": 130549338.0, |
| "step": 223 |
| }, |
| { |
| "entropy": 1.01708984375, |
| "epoch": 1.3348281016442451, |
| "grad_norm": 12.25, |
| "learning_rate": 5.395618425532389e-07, |
| "loss": 1.1097412109375, |
| "mean_token_accuracy": 0.7211140915751457, |
| "num_tokens": 131134800.0, |
| "step": 224 |
| }, |
| { |
| "entropy": 0.99365234375, |
| "epoch": 1.3408071748878925, |
| "grad_norm": 26.25, |
| "learning_rate": 5.310026692762314e-07, |
| "loss": 1.0784912109375, |
| "mean_token_accuracy": 0.727905310690403, |
| "num_tokens": 131724429.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 1.3467862481315396, |
| "grad_norm": 14.75, |
| "learning_rate": 5.224873187881136e-07, |
| "loss": 1.1151123046875, |
| "mean_token_accuracy": 0.7176884040236473, |
| "num_tokens": 132314019.0, |
| "step": 226 |
| }, |
| { |
| "entropy": 1.03125, |
| "epoch": 1.352765321375187, |
| "grad_norm": 15.3125, |
| "learning_rate": 5.140165867573939e-07, |
| "loss": 1.12353515625, |
| "mean_token_accuracy": 0.7174642384052277, |
| "num_tokens": 132903580.0, |
| "step": 227 |
| }, |
| { |
| "entropy": 1.0390625, |
| "epoch": 1.358744394618834, |
| "grad_norm": 18.125, |
| "learning_rate": 5.055912646834635e-07, |
| "loss": 1.127197265625, |
| "mean_token_accuracy": 0.7134326621890068, |
| "num_tokens": 133493126.0, |
| "step": 228 |
| }, |
| { |
| "entropy": 1.01513671875, |
| "epoch": 1.3647234678624813, |
| "grad_norm": 10.75, |
| "learning_rate": 4.972121398226371e-07, |
| "loss": 1.101318359375, |
| "mean_token_accuracy": 0.7197434529662132, |
| "num_tokens": 134079329.0, |
| "step": 229 |
| }, |
| { |
| "entropy": 1.041015625, |
| "epoch": 1.3707025411061284, |
| "grad_norm": 13.625, |
| "learning_rate": 4.888799951145947e-07, |
| "loss": 1.1278076171875, |
| "mean_token_accuracy": 0.7140819206833839, |
| "num_tokens": 134654986.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.0380859375, |
| "epoch": 1.3766816143497758, |
| "grad_norm": 10.4375, |
| "learning_rate": 4.805956091092227e-07, |
| "loss": 1.123779296875, |
| "mean_token_accuracy": 0.7162793427705765, |
| "num_tokens": 135244586.0, |
| "step": 231 |
| }, |
| { |
| "entropy": 1.05810546875, |
| "epoch": 1.382660687593423, |
| "grad_norm": 16.375, |
| "learning_rate": 4.7235975589386713e-07, |
| "loss": 1.1463623046875, |
| "mean_token_accuracy": 0.7098471596837044, |
| "num_tokens": 135834199.0, |
| "step": 232 |
| }, |
| { |
| "entropy": 1.04443359375, |
| "epoch": 1.3886397608370702, |
| "grad_norm": 17.125, |
| "learning_rate": 4.641732050210031e-07, |
| "loss": 1.1280517578125, |
| "mean_token_accuracy": 0.7144335135817528, |
| "num_tokens": 136423743.0, |
| "step": 233 |
| }, |
| { |
| "entropy": 1.06640625, |
| "epoch": 1.3946188340807175, |
| "grad_norm": 12.3125, |
| "learning_rate": 4.5603672143632945e-07, |
| "loss": 1.1444091796875, |
| "mean_token_accuracy": 0.7087363749742508, |
| "num_tokens": 137013243.0, |
| "step": 234 |
| }, |
| { |
| "entropy": 1.0302734375, |
| "epoch": 1.4005979073243648, |
| "grad_norm": 12.4375, |
| "learning_rate": 4.479510654072909e-07, |
| "loss": 1.1185302734375, |
| "mean_token_accuracy": 0.7174856439232826, |
| "num_tokens": 137599228.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 1.02783203125, |
| "epoch": 1.406576980568012, |
| "grad_norm": 25.625, |
| "learning_rate": 4.399169924520403e-07, |
| "loss": 1.1148681640625, |
| "mean_token_accuracy": 0.7194091156125069, |
| "num_tokens": 138181557.0, |
| "step": 236 |
| }, |
| { |
| "entropy": 1.0234375, |
| "epoch": 1.4125560538116593, |
| "grad_norm": 16.625, |
| "learning_rate": 4.3193525326884426e-07, |
| "loss": 1.1102294921875, |
| "mean_token_accuracy": 0.7175892367959023, |
| "num_tokens": 138771048.0, |
| "step": 237 |
| }, |
| { |
| "entropy": 1.0361328125, |
| "epoch": 1.4185351270553064, |
| "grad_norm": 14.6875, |
| "learning_rate": 4.240065936659374e-07, |
| "loss": 1.12451171875, |
| "mean_token_accuracy": 0.71492750197649, |
| "num_tokens": 139360593.0, |
| "step": 238 |
| }, |
| { |
| "entropy": 1.05322265625, |
| "epoch": 1.4245142002989537, |
| "grad_norm": 25.125, |
| "learning_rate": 4.1613175449183446e-07, |
| "loss": 1.13232421875, |
| "mean_token_accuracy": 0.7100114226341248, |
| "num_tokens": 139946622.0, |
| "step": 239 |
| }, |
| { |
| "entropy": 0.99560546875, |
| "epoch": 1.4304932735426008, |
| "grad_norm": 36.75, |
| "learning_rate": 4.0831147156610676e-07, |
| "loss": 1.0897216796875, |
| "mean_token_accuracy": 0.7266808152198792, |
| "num_tokens": 140531396.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.0009765625, |
| "epoch": 1.4364723467862481, |
| "grad_norm": 24.75, |
| "learning_rate": 4.0054647561062615e-07, |
| "loss": 1.0850830078125, |
| "mean_token_accuracy": 0.7258649617433548, |
| "num_tokens": 141120987.0, |
| "step": 241 |
| }, |
| { |
| "entropy": 1.0244140625, |
| "epoch": 1.4424514200298955, |
| "grad_norm": 29.75, |
| "learning_rate": 3.928374921812888e-07, |
| "loss": 1.1165771484375, |
| "mean_token_accuracy": 0.7189558371901512, |
| "num_tokens": 141703670.0, |
| "step": 242 |
| }, |
| { |
| "entropy": 1.046875, |
| "epoch": 1.4484304932735426, |
| "grad_norm": 10.375, |
| "learning_rate": 3.851852416002187e-07, |
| "loss": 1.1234130859375, |
| "mean_token_accuracy": 0.7134259343147278, |
| "num_tokens": 142283264.0, |
| "step": 243 |
| }, |
| { |
| "entropy": 1.03466796875, |
| "epoch": 1.45440956651719, |
| "grad_norm": 35.75, |
| "learning_rate": 3.7759043888846173e-07, |
| "loss": 1.12158203125, |
| "mean_token_accuracy": 0.715514525771141, |
| "num_tokens": 142870702.0, |
| "step": 244 |
| }, |
| { |
| "entropy": 1.0615234375, |
| "epoch": 1.460388639760837, |
| "grad_norm": 8.5625, |
| "learning_rate": 3.7005379369917324e-07, |
| "loss": 1.1358642578125, |
| "mean_token_accuracy": 0.7089022919535637, |
| "num_tokens": 143460302.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 1.0126953125, |
| "epoch": 1.4663677130044843, |
| "grad_norm": 14.6875, |
| "learning_rate": 3.625760102513102e-07, |
| "loss": 1.1024169921875, |
| "mean_token_accuracy": 0.7216273471713066, |
| "num_tokens": 144047084.0, |
| "step": 246 |
| }, |
| { |
| "entropy": 1.037109375, |
| "epoch": 1.4723467862481314, |
| "grad_norm": 12.5, |
| "learning_rate": 3.551577872638296e-07, |
| "loss": 1.1268310546875, |
| "mean_token_accuracy": 0.7154128924012184, |
| "num_tokens": 144629747.0, |
| "step": 247 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 1.4783258594917787, |
| "grad_norm": 15.875, |
| "learning_rate": 3.477998178903981e-07, |
| "loss": 1.133056640625, |
| "mean_token_accuracy": 0.7142782434821129, |
| "num_tokens": 145219266.0, |
| "step": 248 |
| }, |
| { |
| "entropy": 1.0146484375, |
| "epoch": 1.484304932735426, |
| "grad_norm": 10.5, |
| "learning_rate": 3.4050278965462763e-07, |
| "loss": 1.0947265625, |
| "mean_token_accuracy": 0.720554769039154, |
| "num_tokens": 145808833.0, |
| "step": 249 |
| }, |
| { |
| "entropy": 1.017578125, |
| "epoch": 1.4902840059790732, |
| "grad_norm": 12.0, |
| "learning_rate": 3.3326738438583114e-07, |
| "loss": 1.1031494140625, |
| "mean_token_accuracy": 0.7191615030169487, |
| "num_tokens": 146398531.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.013671875, |
| "epoch": 1.4962630792227205, |
| "grad_norm": 13.0625, |
| "learning_rate": 3.260942781553142e-07, |
| "loss": 1.1036376953125, |
| "mean_token_accuracy": 0.7216013073921204, |
| "num_tokens": 146988047.0, |
| "step": 251 |
| }, |
| { |
| "entropy": 1.021484375, |
| "epoch": 1.5022421524663678, |
| "grad_norm": 8.1875, |
| "learning_rate": 3.189841412132027e-07, |
| "loss": 1.10498046875, |
| "mean_token_accuracy": 0.7199263349175453, |
| "num_tokens": 147577682.0, |
| "step": 252 |
| }, |
| { |
| "entropy": 1.05859375, |
| "epoch": 1.508221225710015, |
| "grad_norm": 9.75, |
| "learning_rate": 3.1193763792581594e-07, |
| "loss": 1.134765625, |
| "mean_token_accuracy": 0.7107567712664604, |
| "num_tokens": 148166138.0, |
| "step": 253 |
| }, |
| { |
| "entropy": 1.0166015625, |
| "epoch": 1.514200298953662, |
| "grad_norm": 7.59375, |
| "learning_rate": 3.0495542671358744e-07, |
| "loss": 1.1031494140625, |
| "mean_token_accuracy": 0.7203914448618889, |
| "num_tokens": 148755748.0, |
| "step": 254 |
| }, |
| { |
| "entropy": 1.03076171875, |
| "epoch": 1.5201793721973094, |
| "grad_norm": 19.25, |
| "learning_rate": 2.980381599895433e-07, |
| "loss": 1.1265869140625, |
| "mean_token_accuracy": 0.7148845717310905, |
| "num_tokens": 149345252.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 1.087890625, |
| "epoch": 1.5261584454409567, |
| "grad_norm": 10.5, |
| "learning_rate": 2.91186484098342e-07, |
| "loss": 1.1712646484375, |
| "mean_token_accuracy": 0.7025258839130402, |
| "num_tokens": 149934781.0, |
| "step": 256 |
| }, |
| { |
| "entropy": 1.02880859375, |
| "epoch": 1.5321375186846038, |
| "grad_norm": 15.0, |
| "learning_rate": 2.84401039255879e-07, |
| "loss": 1.1123046875, |
| "mean_token_accuracy": 0.7171602919697762, |
| "num_tokens": 150524424.0, |
| "step": 257 |
| }, |
| { |
| "entropy": 1.04345703125, |
| "epoch": 1.5381165919282511, |
| "grad_norm": 16.5, |
| "learning_rate": 2.776824594894661e-07, |
| "loss": 1.1370849609375, |
| "mean_token_accuracy": 0.7134297341108322, |
| "num_tokens": 151113962.0, |
| "step": 258 |
| }, |
| { |
| "entropy": 1.02685546875, |
| "epoch": 1.5440956651718984, |
| "grad_norm": 13.875, |
| "learning_rate": 2.7103137257858863e-07, |
| "loss": 1.1080322265625, |
| "mean_token_accuracy": 0.7190811783075333, |
| "num_tokens": 151703586.0, |
| "step": 259 |
| }, |
| { |
| "entropy": 1.048828125, |
| "epoch": 1.5500747384155455, |
| "grad_norm": 9.25, |
| "learning_rate": 2.644483999962449e-07, |
| "loss": 1.1405029296875, |
| "mean_token_accuracy": 0.712283693253994, |
| "num_tokens": 152292444.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.01025390625, |
| "epoch": 1.5560538116591929, |
| "grad_norm": 9.625, |
| "learning_rate": 2.579341568508779e-07, |
| "loss": 1.09228515625, |
| "mean_token_accuracy": 0.721127025783062, |
| "num_tokens": 152882090.0, |
| "step": 261 |
| }, |
| { |
| "entropy": 1.03466796875, |
| "epoch": 1.5620328849028402, |
| "grad_norm": 6.875, |
| "learning_rate": 2.514892518288988e-07, |
| "loss": 1.1090087890625, |
| "mean_token_accuracy": 0.7168014496564865, |
| "num_tokens": 153471720.0, |
| "step": 262 |
| }, |
| { |
| "entropy": 1.056640625, |
| "epoch": 1.5680119581464873, |
| "grad_norm": 6.25, |
| "learning_rate": 2.4511428713781236e-07, |
| "loss": 1.1324462890625, |
| "mean_token_accuracy": 0.7105724215507507, |
| "num_tokens": 154061310.0, |
| "step": 263 |
| }, |
| { |
| "entropy": 1.0419921875, |
| "epoch": 1.5739910313901344, |
| "grad_norm": 16.75, |
| "learning_rate": 2.3880985844994673e-07, |
| "loss": 1.1239013671875, |
| "mean_token_accuracy": 0.713848665356636, |
| "num_tokens": 154650888.0, |
| "step": 264 |
| }, |
| { |
| "entropy": 1.037109375, |
| "epoch": 1.5799701046337817, |
| "grad_norm": 8.9375, |
| "learning_rate": 2.3257655484679372e-07, |
| "loss": 1.12451171875, |
| "mean_token_accuracy": 0.7131286934018135, |
| "num_tokens": 155239209.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 1.04638671875, |
| "epoch": 1.585949177877429, |
| "grad_norm": 8.5625, |
| "learning_rate": 2.264149587639671e-07, |
| "loss": 1.13037109375, |
| "mean_token_accuracy": 0.7152972370386124, |
| "num_tokens": 155828817.0, |
| "step": 266 |
| }, |
| { |
| "entropy": 1.005859375, |
| "epoch": 1.5919282511210762, |
| "grad_norm": 13.9375, |
| "learning_rate": 2.2032564593677772e-07, |
| "loss": 1.0977783203125, |
| "mean_token_accuracy": 0.7207474857568741, |
| "num_tokens": 156418416.0, |
| "step": 267 |
| }, |
| { |
| "entropy": 1.0166015625, |
| "epoch": 1.5979073243647235, |
| "grad_norm": 15.25, |
| "learning_rate": 2.1430918534643994e-07, |
| "loss": 1.1092529296875, |
| "mean_token_accuracy": 0.7178195714950562, |
| "num_tokens": 156996671.0, |
| "step": 268 |
| }, |
| { |
| "entropy": 1.0126953125, |
| "epoch": 1.6038863976083708, |
| "grad_norm": 10.1875, |
| "learning_rate": 2.0836613916690427e-07, |
| "loss": 1.097900390625, |
| "mean_token_accuracy": 0.7219494804739952, |
| "num_tokens": 157586304.0, |
| "step": 269 |
| }, |
| { |
| "entropy": 1.041015625, |
| "epoch": 1.609865470852018, |
| "grad_norm": 7.375, |
| "learning_rate": 2.0249706271232946e-07, |
| "loss": 1.13525390625, |
| "mean_token_accuracy": 0.714814230799675, |
| "num_tokens": 158175939.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.01953125, |
| "epoch": 1.615844544095665, |
| "grad_norm": 14.4375, |
| "learning_rate": 1.9670250438519386e-07, |
| "loss": 1.1107177734375, |
| "mean_token_accuracy": 0.719508022069931, |
| "num_tokens": 158765530.0, |
| "step": 271 |
| }, |
| { |
| "entropy": 1.03076171875, |
| "epoch": 1.6218236173393124, |
| "grad_norm": 6.09375, |
| "learning_rate": 1.9098300562505264e-07, |
| "loss": 1.112060546875, |
| "mean_token_accuracy": 0.7187488600611687, |
| "num_tokens": 159355121.0, |
| "step": 272 |
| }, |
| { |
| "entropy": 1.05078125, |
| "epoch": 1.6278026905829597, |
| "grad_norm": 9.875, |
| "learning_rate": 1.8533910085794713e-07, |
| "loss": 1.1397705078125, |
| "mean_token_accuracy": 0.7100469321012497, |
| "num_tokens": 159937687.0, |
| "step": 273 |
| }, |
| { |
| "entropy": 1.01904296875, |
| "epoch": 1.6337817638266068, |
| "grad_norm": 10.8125, |
| "learning_rate": 1.7977131744646724e-07, |
| "loss": 1.1077880859375, |
| "mean_token_accuracy": 0.720647431910038, |
| "num_tokens": 160518664.0, |
| "step": 274 |
| }, |
| { |
| "entropy": 0.9990234375, |
| "epoch": 1.639760837070254, |
| "grad_norm": 20.25, |
| "learning_rate": 1.742801756404759e-07, |
| "loss": 1.09033203125, |
| "mean_token_accuracy": 0.7235589995980263, |
| "num_tokens": 161102238.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 1.033203125, |
| "epoch": 1.6457399103139014, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.688661885284972e-07, |
| "loss": 1.125, |
| "mean_token_accuracy": 0.7176312282681465, |
| "num_tokens": 161676535.0, |
| "step": 276 |
| }, |
| { |
| "entropy": 1.02783203125, |
| "epoch": 1.6517189835575485, |
| "grad_norm": 18.75, |
| "learning_rate": 1.6352986198977325e-07, |
| "loss": 1.10791015625, |
| "mean_token_accuracy": 0.718546986579895, |
| "num_tokens": 162266110.0, |
| "step": 277 |
| }, |
| { |
| "entropy": 1.0009765625, |
| "epoch": 1.6576980568011959, |
| "grad_norm": 10.5, |
| "learning_rate": 1.5827169464699575e-07, |
| "loss": 1.0906982421875, |
| "mean_token_accuracy": 0.7236178815364838, |
| "num_tokens": 162855683.0, |
| "step": 278 |
| }, |
| { |
| "entropy": 1.033203125, |
| "epoch": 1.6636771300448432, |
| "grad_norm": 8.375, |
| "learning_rate": 1.5309217781971416e-07, |
| "loss": 1.1171875, |
| "mean_token_accuracy": 0.7165531665086746, |
| "num_tokens": 163428337.0, |
| "step": 279 |
| }, |
| { |
| "entropy": 1.0244140625, |
| "epoch": 1.6696562032884903, |
| "grad_norm": 15.5625, |
| "learning_rate": 1.479917954784282e-07, |
| "loss": 1.10693359375, |
| "mean_token_accuracy": 0.7174379974603653, |
| "num_tokens": 164017962.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.03466796875, |
| "epoch": 1.6756352765321374, |
| "grad_norm": 22.125, |
| "learning_rate": 1.429710241993656e-07, |
| "loss": 1.1173095703125, |
| "mean_token_accuracy": 0.7154664248228073, |
| "num_tokens": 164605762.0, |
| "step": 281 |
| }, |
| { |
| "entropy": 1.04541015625, |
| "epoch": 1.6816143497757847, |
| "grad_norm": 14.875, |
| "learning_rate": 1.380303331199507e-07, |
| "loss": 1.1348876953125, |
| "mean_token_accuracy": 0.7117466628551483, |
| "num_tokens": 165195338.0, |
| "step": 282 |
| }, |
| { |
| "entropy": 1.009765625, |
| "epoch": 1.687593423019432, |
| "grad_norm": 21.0, |
| "learning_rate": 1.3317018389496926e-07, |
| "loss": 1.111083984375, |
| "mean_token_accuracy": 0.7207304239273071, |
| "num_tokens": 165784893.0, |
| "step": 283 |
| }, |
| { |
| "entropy": 1.02734375, |
| "epoch": 1.6935724962630792, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.283910306534308e-07, |
| "loss": 1.1119384765625, |
| "mean_token_accuracy": 0.717054933309555, |
| "num_tokens": 166374502.0, |
| "step": 284 |
| }, |
| { |
| "entropy": 1.02783203125, |
| "epoch": 1.6995515695067265, |
| "grad_norm": 12.75, |
| "learning_rate": 1.2369331995613663e-07, |
| "loss": 1.1182861328125, |
| "mean_token_accuracy": 0.7191892936825752, |
| "num_tokens": 166964071.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 1.01904296875, |
| "epoch": 1.7055306427503738, |
| "grad_norm": 19.625, |
| "learning_rate": 1.1907749075395146e-07, |
| "loss": 1.1087646484375, |
| "mean_token_accuracy": 0.7190410420298576, |
| "num_tokens": 167553522.0, |
| "step": 286 |
| }, |
| { |
| "entropy": 1.03466796875, |
| "epoch": 1.711509715994021, |
| "grad_norm": 14.3125, |
| "learning_rate": 1.145439743467902e-07, |
| "loss": 1.11865234375, |
| "mean_token_accuracy": 0.71589395403862, |
| "num_tokens": 168143165.0, |
| "step": 287 |
| }, |
| { |
| "entropy": 1.037109375, |
| "epoch": 1.717488789237668, |
| "grad_norm": 9.0, |
| "learning_rate": 1.1009319434331621e-07, |
| "loss": 1.1199951171875, |
| "mean_token_accuracy": 0.7174848467111588, |
| "num_tokens": 168727049.0, |
| "step": 288 |
| }, |
| { |
| "entropy": 1.052734375, |
| "epoch": 1.7234678624813156, |
| "grad_norm": 13.3125, |
| "learning_rate": 1.0572556662136035e-07, |
| "loss": 1.1346435546875, |
| "mean_token_accuracy": 0.7104056030511856, |
| "num_tokens": 169308189.0, |
| "step": 289 |
| }, |
| { |
| "entropy": 1.0556640625, |
| "epoch": 1.7294469357249627, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.014414992890611e-07, |
| "loss": 1.1441650390625, |
| "mean_token_accuracy": 0.7134399339556694, |
| "num_tokens": 169897805.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.03564453125, |
| "epoch": 1.7354260089686098, |
| "grad_norm": 12.4375, |
| "learning_rate": 9.724139264673114e-08, |
| "loss": 1.1241455078125, |
| "mean_token_accuracy": 0.7146468609571457, |
| "num_tokens": 170487344.0, |
| "step": 291 |
| }, |
| { |
| "entropy": 1.0625, |
| "epoch": 1.741405082212257, |
| "grad_norm": 9.625, |
| "learning_rate": 9.312563914945459e-08, |
| "loss": 1.14111328125, |
| "mean_token_accuracy": 0.7087547183036804, |
| "num_tokens": 171076956.0, |
| "step": 292 |
| }, |
| { |
| "entropy": 1.0302734375, |
| "epoch": 1.7473841554559044, |
| "grad_norm": 12.3125, |
| "learning_rate": 8.909462337041507e-08, |
| "loss": 1.119384765625, |
| "mean_token_accuracy": 0.7157952710986137, |
| "num_tokens": 171666573.0, |
| "step": 293 |
| }, |
| { |
| "entropy": 1.0283203125, |
| "epoch": 1.7533632286995515, |
| "grad_norm": 17.5, |
| "learning_rate": 8.514872196496181e-08, |
| "loss": 1.116943359375, |
| "mean_token_accuracy": 0.7182503044605255, |
| "num_tokens": 172247089.0, |
| "step": 294 |
| }, |
| { |
| "entropy": 1.0390625, |
| "epoch": 1.7593423019431988, |
| "grad_norm": 6.96875, |
| "learning_rate": 8.128830363541572e-08, |
| "loss": 1.132568359375, |
| "mean_token_accuracy": 0.7144065871834755, |
| "num_tokens": 172836721.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 1.0361328125, |
| "epoch": 1.7653213751868462, |
| "grad_norm": 14.75, |
| "learning_rate": 7.751372909661768e-08, |
| "loss": 1.1168212890625, |
| "mean_token_accuracy": 0.7155275791883469, |
| "num_tokens": 173426281.0, |
| "step": 296 |
| }, |
| { |
| "entropy": 1.00146484375, |
| "epoch": 1.7713004484304933, |
| "grad_norm": 16.375, |
| "learning_rate": 7.382535104222364e-08, |
| "loss": 1.0948486328125, |
| "mean_token_accuracy": 0.7220958769321442, |
| "num_tokens": 174015810.0, |
| "step": 297 |
| }, |
| { |
| "entropy": 1.01513671875, |
| "epoch": 1.7772795216741404, |
| "grad_norm": 12.8125, |
| "learning_rate": 7.022351411174865e-08, |
| "loss": 1.097900390625, |
| "mean_token_accuracy": 0.7205198705196381, |
| "num_tokens": 174602847.0, |
| "step": 298 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 1.7832585949177877, |
| "grad_norm": 11.8125, |
| "learning_rate": 6.670855485836524e-08, |
| "loss": 1.1104736328125, |
| "mean_token_accuracy": 0.718941256403923, |
| "num_tokens": 175192486.0, |
| "step": 299 |
| }, |
| { |
| "entropy": 1.0439453125, |
| "epoch": 1.789237668161435, |
| "grad_norm": 11.75, |
| "learning_rate": 6.328080171745509e-08, |
| "loss": 1.125, |
| "mean_token_accuracy": 0.7150193601846695, |
| "num_tokens": 175782052.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.03955078125, |
| "epoch": 1.7952167414050821, |
| "grad_norm": 13.6875, |
| "learning_rate": 5.994057497592031e-08, |
| "loss": 1.13037109375, |
| "mean_token_accuracy": 0.7155382409691811, |
| "num_tokens": 176366656.0, |
| "step": 301 |
| }, |
| { |
| "entropy": 1.03759765625, |
| "epoch": 1.8011958146487295, |
| "grad_norm": 10.3125, |
| "learning_rate": 5.6688186742256835e-08, |
| "loss": 1.11767578125, |
| "mean_token_accuracy": 0.7158585712313652, |
| "num_tokens": 176956185.0, |
| "step": 302 |
| }, |
| { |
| "entropy": 1.04052734375, |
| "epoch": 1.8071748878923768, |
| "grad_norm": 20.625, |
| "learning_rate": 5.352394091739021e-08, |
| "loss": 1.1318359375, |
| "mean_token_accuracy": 0.7144715860486031, |
| "num_tokens": 177545828.0, |
| "step": 303 |
| }, |
| { |
| "entropy": 1.01611328125, |
| "epoch": 1.813153961136024, |
| "grad_norm": 11.75, |
| "learning_rate": 5.0448133166279935e-08, |
| "loss": 1.1007080078125, |
| "mean_token_accuracy": 0.7212875410914421, |
| "num_tokens": 178126394.0, |
| "step": 304 |
| }, |
| { |
| "entropy": 1.04443359375, |
| "epoch": 1.819133034379671, |
| "grad_norm": 11.4375, |
| "learning_rate": 4.746105089029229e-08, |
| "loss": 1.1265869140625, |
| "mean_token_accuracy": 0.714270606637001, |
| "num_tokens": 178715841.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 1.01171875, |
| "epoch": 1.8251121076233185, |
| "grad_norm": 12.625, |
| "learning_rate": 4.456297320034641e-08, |
| "loss": 1.0985107421875, |
| "mean_token_accuracy": 0.7234744802117348, |
| "num_tokens": 179304834.0, |
| "step": 306 |
| }, |
| { |
| "entropy": 1.025390625, |
| "epoch": 1.8310911808669657, |
| "grad_norm": 23.375, |
| "learning_rate": 4.1754170890833774e-08, |
| "loss": 1.1092529296875, |
| "mean_token_accuracy": 0.7182625830173492, |
| "num_tokens": 179894402.0, |
| "step": 307 |
| }, |
| { |
| "entropy": 1.0283203125, |
| "epoch": 1.8370702541106128, |
| "grad_norm": 13.875, |
| "learning_rate": 3.9034906414315725e-08, |
| "loss": 1.126953125, |
| "mean_token_accuracy": 0.7168915420770645, |
| "num_tokens": 180483905.0, |
| "step": 308 |
| }, |
| { |
| "entropy": 1.0234375, |
| "epoch": 1.84304932735426, |
| "grad_norm": 14.9375, |
| "learning_rate": 3.6405433856999676e-08, |
| "loss": 1.10693359375, |
| "mean_token_accuracy": 0.7204272672533989, |
| "num_tokens": 181073455.0, |
| "step": 309 |
| }, |
| { |
| "entropy": 1.01025390625, |
| "epoch": 1.8490284005979074, |
| "grad_norm": 15.5625, |
| "learning_rate": 3.386599891499764e-08, |
| "loss": 1.0946044921875, |
| "mean_token_accuracy": 0.7214725464582443, |
| "num_tokens": 181663017.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.04638671875, |
| "epoch": 1.8550074738415545, |
| "grad_norm": 12.625, |
| "learning_rate": 3.141683887136892e-08, |
| "loss": 1.13232421875, |
| "mean_token_accuracy": 0.7134620323777199, |
| "num_tokens": 182245322.0, |
| "step": 311 |
| }, |
| { |
| "entropy": 1.01513671875, |
| "epoch": 1.8609865470852018, |
| "grad_norm": 11.0, |
| "learning_rate": 2.9058182573947986e-08, |
| "loss": 1.0958251953125, |
| "mean_token_accuracy": 0.7222162559628487, |
| "num_tokens": 182826090.0, |
| "step": 312 |
| }, |
| { |
| "entropy": 1.00341796875, |
| "epoch": 1.8669656203288492, |
| "grad_norm": 8.1875, |
| "learning_rate": 2.6790250413961546e-08, |
| "loss": 1.0860595703125, |
| "mean_token_accuracy": 0.7237614244222641, |
| "num_tokens": 183415723.0, |
| "step": 313 |
| }, |
| { |
| "entropy": 1.0322265625, |
| "epoch": 1.8729446935724963, |
| "grad_norm": 11.375, |
| "learning_rate": 2.4613254305434815e-08, |
| "loss": 1.10894775390625, |
| "mean_token_accuracy": 0.7184558361768723, |
| "num_tokens": 183996444.0, |
| "step": 314 |
| }, |
| { |
| "entropy": 1.03515625, |
| "epoch": 1.8789237668161434, |
| "grad_norm": 6.46875, |
| "learning_rate": 2.2527397665391024e-08, |
| "loss": 1.1177978515625, |
| "mean_token_accuracy": 0.716648705303669, |
| "num_tokens": 184586073.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 1.05078125, |
| "epoch": 1.8849028400597907, |
| "grad_norm": 12.1875, |
| "learning_rate": 2.053287539484405e-08, |
| "loss": 1.1304931640625, |
| "mean_token_accuracy": 0.7115833833813667, |
| "num_tokens": 185175613.0, |
| "step": 316 |
| }, |
| { |
| "entropy": 1.0009765625, |
| "epoch": 1.890881913303438, |
| "grad_norm": 11.8125, |
| "learning_rate": 1.8629873860586564e-08, |
| "loss": 1.0841064453125, |
| "mean_token_accuracy": 0.7247348576784134, |
| "num_tokens": 185765103.0, |
| "step": 317 |
| }, |
| { |
| "entropy": 0.99609375, |
| "epoch": 1.8968609865470851, |
| "grad_norm": 10.75, |
| "learning_rate": 1.6818570877776718e-08, |
| "loss": 1.0699462890625, |
| "mean_token_accuracy": 0.7280925586819649, |
| "num_tokens": 186347971.0, |
| "step": 318 |
| }, |
| { |
| "entropy": 1.03369140625, |
| "epoch": 1.9028400597907325, |
| "grad_norm": 8.0, |
| "learning_rate": 1.5099135693322773e-08, |
| "loss": 1.1158447265625, |
| "mean_token_accuracy": 0.716788075864315, |
| "num_tokens": 186937575.0, |
| "step": 319 |
| }, |
| { |
| "entropy": 1.0078125, |
| "epoch": 1.9088191330343798, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.3471728970068985e-08, |
| "loss": 1.0909423828125, |
| "mean_token_accuracy": 0.7214584723114967, |
| "num_tokens": 187525095.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.041015625, |
| "epoch": 1.9147982062780269, |
| "grad_norm": 7.65625, |
| "learning_rate": 1.1936502771783486e-08, |
| "loss": 1.1307373046875, |
| "mean_token_accuracy": 0.7133340612053871, |
| "num_tokens": 188114646.0, |
| "step": 321 |
| }, |
| { |
| "entropy": 1.0556640625, |
| "epoch": 1.920777279521674, |
| "grad_norm": 12.0, |
| "learning_rate": 1.0493600548948877e-08, |
| "loss": 1.140625, |
| "mean_token_accuracy": 0.7104567736387253, |
| "num_tokens": 188700786.0, |
| "step": 322 |
| }, |
| { |
| "entropy": 1.04296875, |
| "epoch": 1.9267563527653215, |
| "grad_norm": 14.4375, |
| "learning_rate": 9.143157125359513e-09, |
| "loss": 1.134033203125, |
| "mean_token_accuracy": 0.7124024033546448, |
| "num_tokens": 189285842.0, |
| "step": 323 |
| }, |
| { |
| "entropy": 1.02294921875, |
| "epoch": 1.9327354260089686, |
| "grad_norm": 15.375, |
| "learning_rate": 7.885298685522235e-09, |
| "loss": 1.117431640625, |
| "mean_token_accuracy": 0.7192790359258652, |
| "num_tokens": 189868226.0, |
| "step": 324 |
| }, |
| { |
| "entropy": 1.017578125, |
| "epoch": 1.9387144992526157, |
| "grad_norm": 26.25, |
| "learning_rate": 6.720142762867032e-09, |
| "loss": 1.107666015625, |
| "mean_token_accuracy": 0.718171015381813, |
| "num_tokens": 190450814.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.98291015625, |
| "epoch": 1.944693572496263, |
| "grad_norm": 29.375, |
| "learning_rate": 5.647798228764156e-09, |
| "loss": 1.0780029296875, |
| "mean_token_accuracy": 0.7297961264848709, |
| "num_tokens": 191040409.0, |
| "step": 326 |
| }, |
| { |
| "entropy": 1.03125, |
| "epoch": 1.9506726457399104, |
| "grad_norm": 8.9375, |
| "learning_rate": 4.668365282351372e-09, |
| "loss": 1.1124267578125, |
| "mean_token_accuracy": 0.7161725759506226, |
| "num_tokens": 191630067.0, |
| "step": 327 |
| }, |
| { |
| "entropy": 1.02685546875, |
| "epoch": 1.9566517189835575, |
| "grad_norm": 10.5625, |
| "learning_rate": 3.7819354411713355e-09, |
| "loss": 1.11083984375, |
| "mean_token_accuracy": 0.7190196141600609, |
| "num_tokens": 192219651.0, |
| "step": 328 |
| }, |
| { |
| "entropy": 1.03076171875, |
| "epoch": 1.9626307922272048, |
| "grad_norm": 12.25, |
| "learning_rate": 2.9885915326203216e-09, |
| "loss": 1.1121826171875, |
| "mean_token_accuracy": 0.7161883562803268, |
| "num_tokens": 192809216.0, |
| "step": 329 |
| }, |
| { |
| "entropy": 1.0224609375, |
| "epoch": 1.9686098654708521, |
| "grad_norm": 13.5625, |
| "learning_rate": 2.2884076862089707e-09, |
| "loss": 1.108642578125, |
| "mean_token_accuracy": 0.7192875891923904, |
| "num_tokens": 193394355.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.041015625, |
| "epoch": 1.9745889387144993, |
| "grad_norm": 13.125, |
| "learning_rate": 1.6814493266357199e-09, |
| "loss": 1.129638671875, |
| "mean_token_accuracy": 0.7148761376738548, |
| "num_tokens": 193983861.0, |
| "step": 331 |
| }, |
| { |
| "entropy": 0.98583984375, |
| "epoch": 1.9805680119581464, |
| "grad_norm": 11.0625, |
| "learning_rate": 1.1677731676733581e-09, |
| "loss": 1.0601806640625, |
| "mean_token_accuracy": 0.7276952490210533, |
| "num_tokens": 194573512.0, |
| "step": 332 |
| }, |
| { |
| "entropy": 1.0263671875, |
| "epoch": 1.9865470852017937, |
| "grad_norm": 12.9375, |
| "learning_rate": 7.474272068698217e-10, |
| "loss": 1.1114501953125, |
| "mean_token_accuracy": 0.7177807167172432, |
| "num_tokens": 195163062.0, |
| "step": 333 |
| }, |
| { |
| "entropy": 1.0537109375, |
| "epoch": 1.992526158445441, |
| "grad_norm": 10.9375, |
| "learning_rate": 4.204507210633368e-10, |
| "loss": 1.135498046875, |
| "mean_token_accuracy": 0.7125765532255173, |
| "num_tokens": 195752597.0, |
| "step": 334 |
| }, |
| { |
| "entropy": 1.03369140625, |
| "epoch": 1.9985052316890881, |
| "grad_norm": 10.375, |
| "learning_rate": 1.8687426271246642e-10, |
| "loss": 1.1175537109375, |
| "mean_token_accuracy": 0.7172495499253273, |
| "num_tokens": 196342135.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.9921875, |
| "epoch": 2.0, |
| "grad_norm": 10.0, |
| "learning_rate": 4.6719657041283115e-11, |
| "loss": 1.0634765625, |
| "mean_token_accuracy": 0.7287841141223907, |
| "num_tokens": 196489548.0, |
| "step": 336 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 336, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8420381399090463e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|