Instructions to use codingmonster1234/chess-sft-model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use codingmonster1234/chess-sft-model with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("codingmonster1234/chess-sft-model", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 200, | |
| "global_step": 252, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.0494567453861237, | |
| "epoch": 0.011904761904761904, | |
| "grad_norm": 3.25, | |
| "learning_rate": 2e-05, | |
| "loss": 2.3006844520568848, | |
| "mean_token_accuracy": 0.5718516036868095, | |
| "num_tokens": 27309.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.4309201687574387, | |
| "epoch": 0.023809523809523808, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 1.999922292480975e-05, | |
| "loss": 1.5732698440551758, | |
| "mean_token_accuracy": 0.6426078528165817, | |
| "num_tokens": 53222.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.6588206142187119, | |
| "epoch": 0.03571428571428571, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.9996891820008165e-05, | |
| "loss": 1.5225862264633179, | |
| "mean_token_accuracy": 0.6484016999602318, | |
| "num_tokens": 80191.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.5771028399467468, | |
| "epoch": 0.047619047619047616, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 1.9993007047883988e-05, | |
| "loss": 1.362404227256775, | |
| "mean_token_accuracy": 0.6749871224164963, | |
| "num_tokens": 106227.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.4989110380411148, | |
| "epoch": 0.05952380952380952, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 1.9987569212189224e-05, | |
| "loss": 1.2965717315673828, | |
| "mean_token_accuracy": 0.6907966956496239, | |
| "num_tokens": 133050.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.433018073439598, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 1.9980579158045322e-05, | |
| "loss": 1.2726430892944336, | |
| "mean_token_accuracy": 0.6866589412093163, | |
| "num_tokens": 159164.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.393497884273529, | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.9972037971811802e-05, | |
| "loss": 1.2401988506317139, | |
| "mean_token_accuracy": 0.6934479027986526, | |
| "num_tokens": 185172.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.3325403779745102, | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.9961946980917457e-05, | |
| "loss": 1.1832386255264282, | |
| "mean_token_accuracy": 0.7075331285595894, | |
| "num_tokens": 211963.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.3557329028844833, | |
| "epoch": 0.10714285714285714, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.9950307753654016e-05, | |
| "loss": 1.2336500883102417, | |
| "mean_token_accuracy": 0.6940329149365425, | |
| "num_tokens": 237773.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.3284195959568024, | |
| "epoch": 0.11904761904761904, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 1.9937122098932428e-05, | |
| "loss": 1.1534968614578247, | |
| "mean_token_accuracy": 0.7091642618179321, | |
| "num_tokens": 264529.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.311100259423256, | |
| "epoch": 0.13095238095238096, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.9922392066001724e-05, | |
| "loss": 1.1124509572982788, | |
| "mean_token_accuracy": 0.71821578592062, | |
| "num_tokens": 291892.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 1.3356539607048035, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.9906119944130527e-05, | |
| "loss": 1.1072099208831787, | |
| "mean_token_accuracy": 0.7180067598819733, | |
| "num_tokens": 318784.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 1.3447266072034836, | |
| "epoch": 0.15476190476190477, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 1.9888308262251286e-05, | |
| "loss": 1.1118253469467163, | |
| "mean_token_accuracy": 0.7157415449619293, | |
| "num_tokens": 345251.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 1.3487544059753418, | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1.9868959788567213e-05, | |
| "loss": 1.101747751235962, | |
| "mean_token_accuracy": 0.7200475409626961, | |
| "num_tokens": 370047.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 1.340834841132164, | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.9848077530122083e-05, | |
| "loss": 1.0545036792755127, | |
| "mean_token_accuracy": 0.7277030423283577, | |
| "num_tokens": 395849.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.376106932759285, | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.9825664732332886e-05, | |
| "loss": 1.1372387409210205, | |
| "mean_token_accuracy": 0.7086482048034668, | |
| "num_tokens": 422584.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 1.346299633383751, | |
| "epoch": 0.20238095238095238, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 1.9801724878485438e-05, | |
| "loss": 1.0804857015609741, | |
| "mean_token_accuracy": 0.7215344980359077, | |
| "num_tokens": 448091.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 1.2913489788770676, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1.977626168919305e-05, | |
| "loss": 1.0192596912384033, | |
| "mean_token_accuracy": 0.740901842713356, | |
| "num_tokens": 474916.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.3171072006225586, | |
| "epoch": 0.2261904761904762, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.9749279121818235e-05, | |
| "loss": 1.0609946250915527, | |
| "mean_token_accuracy": 0.7298643589019775, | |
| "num_tokens": 501552.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 1.311616376042366, | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 1.9720781369857747e-05, | |
| "loss": 1.0525301694869995, | |
| "mean_token_accuracy": 0.7259486094117165, | |
| "num_tokens": 527463.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.3294030278921127, | |
| "epoch": 0.25, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.969077286229078e-05, | |
| "loss": 1.0771116018295288, | |
| "mean_token_accuracy": 0.7219423204660416, | |
| "num_tokens": 554225.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 1.2698159664869308, | |
| "epoch": 0.2619047619047619, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.9659258262890683e-05, | |
| "loss": 0.9951949119567871, | |
| "mean_token_accuracy": 0.7395628690719604, | |
| "num_tokens": 580513.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.285600259900093, | |
| "epoch": 0.27380952380952384, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1.962624246950012e-05, | |
| "loss": 1.017578363418579, | |
| "mean_token_accuracy": 0.7323780730366707, | |
| "num_tokens": 606397.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 1.287465438246727, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.9591730613269878e-05, | |
| "loss": 1.0441396236419678, | |
| "mean_token_accuracy": 0.7301788926124573, | |
| "num_tokens": 633019.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 1.260251984000206, | |
| "epoch": 0.2976190476190476, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.955572805786141e-05, | |
| "loss": 0.9977378845214844, | |
| "mean_token_accuracy": 0.7360019683837891, | |
| "num_tokens": 659559.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.2459818571805954, | |
| "epoch": 0.30952380952380953, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1.9518240398613226e-05, | |
| "loss": 0.9966230392456055, | |
| "mean_token_accuracy": 0.7403357177972794, | |
| "num_tokens": 686301.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.2732659578323364, | |
| "epoch": 0.32142857142857145, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1.947927346167132e-05, | |
| "loss": 1.0405187606811523, | |
| "mean_token_accuracy": 0.7255821749567986, | |
| "num_tokens": 712849.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 1.2356024384498596, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.9438833303083677e-05, | |
| "loss": 0.9517513513565063, | |
| "mean_token_accuracy": 0.7443812191486359, | |
| "num_tokens": 738885.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.2620837837457657, | |
| "epoch": 0.34523809523809523, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.9396926207859085e-05, | |
| "loss": 1.0259062051773071, | |
| "mean_token_accuracy": 0.7276915162801743, | |
| "num_tokens": 765741.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 1.2437313944101334, | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.935355868899034e-05, | |
| "loss": 0.9744393825531006, | |
| "mean_token_accuracy": 0.7423476874828339, | |
| "num_tokens": 791180.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2606688439846039, | |
| "epoch": 0.36904761904761907, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.9308737486442045e-05, | |
| "loss": 1.0080578327178955, | |
| "mean_token_accuracy": 0.7328718677163124, | |
| "num_tokens": 817215.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 1.2107220739126205, | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.926246956610309e-05, | |
| "loss": 0.9275428056716919, | |
| "mean_token_accuracy": 0.7561154067516327, | |
| "num_tokens": 842758.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.2505638897418976, | |
| "epoch": 0.39285714285714285, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.921476211870408e-05, | |
| "loss": 0.988660991191864, | |
| "mean_token_accuracy": 0.7380376160144806, | |
| "num_tokens": 869333.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 1.2321503013372421, | |
| "epoch": 0.40476190476190477, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.9165622558699763e-05, | |
| "loss": 0.9799079298973083, | |
| "mean_token_accuracy": 0.7438922673463821, | |
| "num_tokens": 895603.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.2264672666788101, | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.9115058523116734e-05, | |
| "loss": 0.942933201789856, | |
| "mean_token_accuracy": 0.7487473487854004, | |
| "num_tokens": 922166.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.234758123755455, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.9063077870366504e-05, | |
| "loss": 0.9918478727340698, | |
| "mean_token_accuracy": 0.7380805388092995, | |
| "num_tokens": 949316.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.2392372637987137, | |
| "epoch": 0.44047619047619047, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.900968867902419e-05, | |
| "loss": 0.9514465928077698, | |
| "mean_token_accuracy": 0.7454220503568649, | |
| "num_tokens": 974399.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 1.202688753604889, | |
| "epoch": 0.4523809523809524, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.895489924657301e-05, | |
| "loss": 0.9055959582328796, | |
| "mean_token_accuracy": 0.7554031237959862, | |
| "num_tokens": 1000802.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.230818286538124, | |
| "epoch": 0.4642857142857143, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.8898718088114688e-05, | |
| "loss": 0.9365515112876892, | |
| "mean_token_accuracy": 0.7501723319292068, | |
| "num_tokens": 1026890.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 1.2267395704984665, | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.8841153935046098e-05, | |
| "loss": 0.9159681797027588, | |
| "mean_token_accuracy": 0.7491171285510063, | |
| "num_tokens": 1053091.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.1970189958810806, | |
| "epoch": 0.4880952380952381, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.8782215733702286e-05, | |
| "loss": 0.8979389071464539, | |
| "mean_token_accuracy": 0.7574636936187744, | |
| "num_tokens": 1079663.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 1.2507815808057785, | |
| "epoch": 0.5, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.8721912643966055e-05, | |
| "loss": 0.9719507098197937, | |
| "mean_token_accuracy": 0.742218367755413, | |
| "num_tokens": 1105048.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.2222629934549332, | |
| "epoch": 0.5119047619047619, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.866025403784439e-05, | |
| "loss": 0.9402767419815063, | |
| "mean_token_accuracy": 0.748046763241291, | |
| "num_tokens": 1132222.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 1.2003469914197922, | |
| "epoch": 0.5238095238095238, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.8597249498011906e-05, | |
| "loss": 0.8898881077766418, | |
| "mean_token_accuracy": 0.7626642361283302, | |
| "num_tokens": 1157814.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.2209035754203796, | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.8532908816321557e-05, | |
| "loss": 0.9445702433586121, | |
| "mean_token_accuracy": 0.7497504875063896, | |
| "num_tokens": 1184124.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.2391884326934814, | |
| "epoch": 0.5476190476190477, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.8467241992282842e-05, | |
| "loss": 0.9409447312355042, | |
| "mean_token_accuracy": 0.7404728680849075, | |
| "num_tokens": 1210393.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.2089068293571472, | |
| "epoch": 0.5595238095238095, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.8400259231507716e-05, | |
| "loss": 0.8960930705070496, | |
| "mean_token_accuracy": 0.7594130486249924, | |
| "num_tokens": 1236048.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 1.2001971304416656, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.833197094412449e-05, | |
| "loss": 0.900606632232666, | |
| "mean_token_accuracy": 0.7573259472846985, | |
| "num_tokens": 1261791.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.2007492184638977, | |
| "epoch": 0.5833333333333334, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.826238774315995e-05, | |
| "loss": 0.892056405544281, | |
| "mean_token_accuracy": 0.7559861242771149, | |
| "num_tokens": 1287305.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 1.211816444993019, | |
| "epoch": 0.5952380952380952, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1.819152044288992e-05, | |
| "loss": 0.9308467507362366, | |
| "mean_token_accuracy": 0.74553432315588, | |
| "num_tokens": 1312769.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.191390410065651, | |
| "epoch": 0.6071428571428571, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.811938005715857e-05, | |
| "loss": 0.896440327167511, | |
| "mean_token_accuracy": 0.7532382532954216, | |
| "num_tokens": 1338512.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 1.2459852695465088, | |
| "epoch": 0.6190476190476191, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1.8045977797666685e-05, | |
| "loss": 0.992501974105835, | |
| "mean_token_accuracy": 0.7368015274405479, | |
| "num_tokens": 1363546.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.1912168115377426, | |
| "epoch": 0.6309523809523809, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1.7971325072229227e-05, | |
| "loss": 0.9404975771903992, | |
| "mean_token_accuracy": 0.7475577220320702, | |
| "num_tokens": 1390501.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 1.2136415243148804, | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.7895433483002356e-05, | |
| "loss": 0.9598132371902466, | |
| "mean_token_accuracy": 0.7476249039173126, | |
| "num_tokens": 1416027.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.1992420703172684, | |
| "epoch": 0.6547619047619048, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1.78183148246803e-05, | |
| "loss": 0.9130250811576843, | |
| "mean_token_accuracy": 0.748944066464901, | |
| "num_tokens": 1442485.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.199160858988762, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1.7739981082662275e-05, | |
| "loss": 0.9233703017234802, | |
| "mean_token_accuracy": 0.7508212402462959, | |
| "num_tokens": 1469875.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.2234519720077515, | |
| "epoch": 0.6785714285714286, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.766044443118978e-05, | |
| "loss": 0.931207537651062, | |
| "mean_token_accuracy": 0.7512670606374741, | |
| "num_tokens": 1495641.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 1.1871410608291626, | |
| "epoch": 0.6904761904761905, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.757971723145453e-05, | |
| "loss": 0.8872414231300354, | |
| "mean_token_accuracy": 0.7571859434247017, | |
| "num_tokens": 1521893.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.2121420055627823, | |
| "epoch": 0.7023809523809523, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1.7497812029677344e-05, | |
| "loss": 0.8940609693527222, | |
| "mean_token_accuracy": 0.7537769973278046, | |
| "num_tokens": 1546611.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 1.2078164517879486, | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.741474155515827e-05, | |
| "loss": 0.9187013506889343, | |
| "mean_token_accuracy": 0.7539865076541901, | |
| "num_tokens": 1573642.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.2090130001306534, | |
| "epoch": 0.7261904761904762, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.7330518718298263e-05, | |
| "loss": 0.9223328828811646, | |
| "mean_token_accuracy": 0.7508932426571846, | |
| "num_tokens": 1600715.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 1.2016019374132156, | |
| "epoch": 0.7380952380952381, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.7245156608592727e-05, | |
| "loss": 0.8950364589691162, | |
| "mean_token_accuracy": 0.7588292881846428, | |
| "num_tokens": 1627520.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.234247699379921, | |
| "epoch": 0.75, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1.7158668492597186e-05, | |
| "loss": 0.950704038143158, | |
| "mean_token_accuracy": 0.7440979778766632, | |
| "num_tokens": 1652694.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 1.1987243592739105, | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.7071067811865477e-05, | |
| "loss": 0.9169787764549255, | |
| "mean_token_accuracy": 0.749868243932724, | |
| "num_tokens": 1678752.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 1.2067360877990723, | |
| "epoch": 0.7738095238095238, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.698236818086073e-05, | |
| "loss": 0.9410067200660706, | |
| "mean_token_accuracy": 0.7454248741269112, | |
| "num_tokens": 1705052.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.175739899277687, | |
| "epoch": 0.7857142857142857, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.689258338483947e-05, | |
| "loss": 0.8811770677566528, | |
| "mean_token_accuracy": 0.7612746432423592, | |
| "num_tokens": 1731320.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 1.2032074332237244, | |
| "epoch": 0.7976190476190477, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.6801727377709195e-05, | |
| "loss": 0.8974794149398804, | |
| "mean_token_accuracy": 0.7525004893541336, | |
| "num_tokens": 1757343.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 1.1771471053361893, | |
| "epoch": 0.8095238095238095, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.67098142798597e-05, | |
| "loss": 0.8747112154960632, | |
| "mean_token_accuracy": 0.7630033940076828, | |
| "num_tokens": 1783385.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 1.1831130683422089, | |
| "epoch": 0.8214285714285714, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.6616858375968596e-05, | |
| "loss": 0.8969507217407227, | |
| "mean_token_accuracy": 0.7549743354320526, | |
| "num_tokens": 1809358.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 1.1840351969003677, | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.6522874112781213e-05, | |
| "loss": 0.8991947174072266, | |
| "mean_token_accuracy": 0.7609456926584244, | |
| "num_tokens": 1835687.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.1689308136701584, | |
| "epoch": 0.8452380952380952, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.6427876096865394e-05, | |
| "loss": 0.8942866921424866, | |
| "mean_token_accuracy": 0.7570540234446526, | |
| "num_tokens": 1863074.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 1.176902562379837, | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1.6331879092341402e-05, | |
| "loss": 0.8916264772415161, | |
| "mean_token_accuracy": 0.7569359466433525, | |
| "num_tokens": 1889483.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 1.1485393792390823, | |
| "epoch": 0.8690476190476191, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 1.6234898018587336e-05, | |
| "loss": 0.8300937414169312, | |
| "mean_token_accuracy": 0.7689579054713249, | |
| "num_tokens": 1916554.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 1.188908874988556, | |
| "epoch": 0.8809523809523809, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.6136947947920477e-05, | |
| "loss": 0.8971793055534363, | |
| "mean_token_accuracy": 0.7554975599050522, | |
| "num_tokens": 1942761.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 1.1443840563297272, | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.6038044103254775e-05, | |
| "loss": 0.8417671918869019, | |
| "mean_token_accuracy": 0.7658839598298073, | |
| "num_tokens": 1970108.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.1925117820501328, | |
| "epoch": 0.9047619047619048, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.5938201855735017e-05, | |
| "loss": 0.9137139320373535, | |
| "mean_token_accuracy": 0.7506694570183754, | |
| "num_tokens": 1996053.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 1.146860048174858, | |
| "epoch": 0.9166666666666666, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.5837436722347902e-05, | |
| "loss": 0.8144647479057312, | |
| "mean_token_accuracy": 0.7788625955581665, | |
| "num_tokens": 2022462.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 1.1756789684295654, | |
| "epoch": 0.9285714285714286, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.573576436351046e-05, | |
| "loss": 0.8733536601066589, | |
| "mean_token_accuracy": 0.7620046883821487, | |
| "num_tokens": 2049019.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 1.1542808413505554, | |
| "epoch": 0.9404761904761905, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.563320058063622e-05, | |
| "loss": 0.842410147190094, | |
| "mean_token_accuracy": 0.7684632763266563, | |
| "num_tokens": 2075647.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 1.1835035681724548, | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.5529761313679396e-05, | |
| "loss": 0.8729836344718933, | |
| "mean_token_accuracy": 0.7571216598153114, | |
| "num_tokens": 2101511.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.1786216050386429, | |
| "epoch": 0.9642857142857143, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1.5425462638657597e-05, | |
| "loss": 0.9298287034034729, | |
| "mean_token_accuracy": 0.750826895236969, | |
| "num_tokens": 2128981.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 1.1665120720863342, | |
| "epoch": 0.9761904761904762, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.5320320765153367e-05, | |
| "loss": 0.8446856141090393, | |
| "mean_token_accuracy": 0.7674100771546364, | |
| "num_tokens": 2155665.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 1.180627018213272, | |
| "epoch": 0.9880952380952381, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.5214352033794981e-05, | |
| "loss": 0.8966464996337891, | |
| "mean_token_accuracy": 0.7552101761102676, | |
| "num_tokens": 2182098.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 1.174801453948021, | |
| "epoch": 1.0, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.5107572913716859e-05, | |
| "loss": 0.9110517501831055, | |
| "mean_token_accuracy": 0.7524702101945877, | |
| "num_tokens": 2206852.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 1.1386308819055557, | |
| "epoch": 1.0119047619047619, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.8197644352912903, | |
| "mean_token_accuracy": 0.7711560949683189, | |
| "num_tokens": 2233877.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.1694304198026657, | |
| "epoch": 1.0238095238095237, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.4891650011092896e-05, | |
| "loss": 0.8406116366386414, | |
| "mean_token_accuracy": 0.7672885581851006, | |
| "num_tokens": 2259853.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 1.163503646850586, | |
| "epoch": 1.0357142857142858, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.4782539786213184e-05, | |
| "loss": 0.8342965841293335, | |
| "mean_token_accuracy": 0.7698492407798767, | |
| "num_tokens": 2286573.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 1.140608012676239, | |
| "epoch": 1.0476190476190477, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.4672686282730622e-05, | |
| "loss": 0.8190659284591675, | |
| "mean_token_accuracy": 0.7743507474660873, | |
| "num_tokens": 2312393.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 1.1497679501771927, | |
| "epoch": 1.0595238095238095, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.4562106573531632e-05, | |
| "loss": 0.8387782573699951, | |
| "mean_token_accuracy": 0.7691004797816277, | |
| "num_tokens": 2338407.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 1.122629627585411, | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.4450817844365924e-05, | |
| "loss": 0.821854293346405, | |
| "mean_token_accuracy": 0.7695046216249466, | |
| "num_tokens": 2365270.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.1272503286600113, | |
| "epoch": 1.0833333333333333, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.4338837391175582e-05, | |
| "loss": 0.8212571144104004, | |
| "mean_token_accuracy": 0.7709975987672806, | |
| "num_tokens": 2391281.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 1.127255380153656, | |
| "epoch": 1.0952380952380953, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.4226182617406996e-05, | |
| "loss": 0.8506991267204285, | |
| "mean_token_accuracy": 0.7657980620861053, | |
| "num_tokens": 2417731.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 1.1089943200349808, | |
| "epoch": 1.1071428571428572, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.4112871031306118e-05, | |
| "loss": 0.8343423008918762, | |
| "mean_token_accuracy": 0.7691437527537346, | |
| "num_tokens": 2444368.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 1.1271415501832962, | |
| "epoch": 1.119047619047619, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.3998920243197408e-05, | |
| "loss": 0.8489895462989807, | |
| "mean_token_accuracy": 0.7649731263518333, | |
| "num_tokens": 2470388.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 1.1003229320049286, | |
| "epoch": 1.130952380952381, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.3884347962746949e-05, | |
| "loss": 0.8006178736686707, | |
| "mean_token_accuracy": 0.7761545479297638, | |
| "num_tokens": 2496905.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.1041414737701416, | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.3769171996210053e-05, | |
| "loss": 0.8553689122200012, | |
| "mean_token_accuracy": 0.764044426381588, | |
| "num_tokens": 2523652.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 1.110143706202507, | |
| "epoch": 1.1547619047619047, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.3653410243663953e-05, | |
| "loss": 0.8074391484260559, | |
| "mean_token_accuracy": 0.7706331759691238, | |
| "num_tokens": 2548790.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 1.1172381490468979, | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.3537080696225815e-05, | |
| "loss": 0.8375645875930786, | |
| "mean_token_accuracy": 0.7649935036897659, | |
| "num_tokens": 2575096.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 1.1418239027261734, | |
| "epoch": 1.1785714285714286, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1.342020143325669e-05, | |
| "loss": 0.8995247483253479, | |
| "mean_token_accuracy": 0.7510313019156456, | |
| "num_tokens": 2602353.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 1.0998874455690384, | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.3302790619551673e-05, | |
| "loss": 0.8080697059631348, | |
| "mean_token_accuracy": 0.7733101099729538, | |
| "num_tokens": 2628852.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.1344088912010193, | |
| "epoch": 1.2023809523809523, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.3184866502516846e-05, | |
| "loss": 0.8783486485481262, | |
| "mean_token_accuracy": 0.7589947134256363, | |
| "num_tokens": 2655409.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 1.1111710369586945, | |
| "epoch": 1.2142857142857142, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.3066447409333345e-05, | |
| "loss": 0.8078486919403076, | |
| "mean_token_accuracy": 0.7743134796619415, | |
| "num_tokens": 2681389.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 1.130819410085678, | |
| "epoch": 1.2261904761904763, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.2947551744109044e-05, | |
| "loss": 0.8322258591651917, | |
| "mean_token_accuracy": 0.769818089902401, | |
| "num_tokens": 2707806.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 1.1183798164129257, | |
| "epoch": 1.2380952380952381, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.2828197985018276e-05, | |
| "loss": 0.8078725337982178, | |
| "mean_token_accuracy": 0.7757357433438301, | |
| "num_tokens": 2733442.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 1.1384148299694061, | |
| "epoch": 1.25, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1.2708404681430054e-05, | |
| "loss": 0.856360137462616, | |
| "mean_token_accuracy": 0.7635187655687332, | |
| "num_tokens": 2759424.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.1452041417360306, | |
| "epoch": 1.2619047619047619, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1.2588190451025209e-05, | |
| "loss": 0.9050104022026062, | |
| "mean_token_accuracy": 0.750573143362999, | |
| "num_tokens": 2786367.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 1.1285814046859741, | |
| "epoch": 1.2738095238095237, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.2467573976902936e-05, | |
| "loss": 0.8008386492729187, | |
| "mean_token_accuracy": 0.7742740660905838, | |
| "num_tokens": 2812132.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 1.1281604319810867, | |
| "epoch": 1.2857142857142856, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.2346574004677154e-05, | |
| "loss": 0.8196746706962585, | |
| "mean_token_accuracy": 0.7726086974143982, | |
| "num_tokens": 2838298.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 1.1383277475833893, | |
| "epoch": 1.2976190476190477, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.2225209339563144e-05, | |
| "loss": 0.8370059132575989, | |
| "mean_token_accuracy": 0.7635474875569344, | |
| "num_tokens": 2865310.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 1.1403304636478424, | |
| "epoch": 1.3095238095238095, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.210349884345496e-05, | |
| "loss": 0.8350245952606201, | |
| "mean_token_accuracy": 0.7653406709432602, | |
| "num_tokens": 2891493.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.126132920384407, | |
| "epoch": 1.3214285714285714, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.1981461431993978e-05, | |
| "loss": 0.8264559507369995, | |
| "mean_token_accuracy": 0.7698510065674782, | |
| "num_tokens": 2917435.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 1.1356945484876633, | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.1859116071629148e-05, | |
| "loss": 0.8438840508460999, | |
| "mean_token_accuracy": 0.7598991841077805, | |
| "num_tokens": 2943915.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 1.127932071685791, | |
| "epoch": 1.3452380952380953, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.1736481776669307e-05, | |
| "loss": 0.8465971350669861, | |
| "mean_token_accuracy": 0.7641785144805908, | |
| "num_tokens": 2970567.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 1.1035424768924713, | |
| "epoch": 1.3571428571428572, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.1613577606328068e-05, | |
| "loss": 0.796665370464325, | |
| "mean_token_accuracy": 0.7764606773853302, | |
| "num_tokens": 2996884.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 1.1041745096445084, | |
| "epoch": 1.369047619047619, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1.1490422661761744e-05, | |
| "loss": 0.8078674674034119, | |
| "mean_token_accuracy": 0.7755235582590103, | |
| "num_tokens": 3022892.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.1297542303800583, | |
| "epoch": 1.380952380952381, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.1367036083100735e-05, | |
| "loss": 0.845131516456604, | |
| "mean_token_accuracy": 0.7647897973656654, | |
| "num_tokens": 3048844.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 1.115392044186592, | |
| "epoch": 1.3928571428571428, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.1243437046474854e-05, | |
| "loss": 0.8097545504570007, | |
| "mean_token_accuracy": 0.7723855003714561, | |
| "num_tokens": 3074806.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 1.126968502998352, | |
| "epoch": 1.4047619047619047, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.1119644761033079e-05, | |
| "loss": 0.8312132358551025, | |
| "mean_token_accuracy": 0.7707365080714226, | |
| "num_tokens": 3100660.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 1.1252721548080444, | |
| "epoch": 1.4166666666666667, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.0995678465958168e-05, | |
| "loss": 0.8202887177467346, | |
| "mean_token_accuracy": 0.7648059278726578, | |
| "num_tokens": 3126427.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 1.144352748990059, | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.0871557427476585e-05, | |
| "loss": 0.8726206421852112, | |
| "mean_token_accuracy": 0.7587710469961166, | |
| "num_tokens": 3153355.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.1297250092029572, | |
| "epoch": 1.4404761904761905, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.0747300935864245e-05, | |
| "loss": 0.8322736024856567, | |
| "mean_token_accuracy": 0.7670553028583527, | |
| "num_tokens": 3179491.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 1.099809244275093, | |
| "epoch": 1.4523809523809523, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.0622928302448523e-05, | |
| "loss": 0.8082002997398376, | |
| "mean_token_accuracy": 0.7719271555542946, | |
| "num_tokens": 3207329.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 1.135951966047287, | |
| "epoch": 1.4642857142857144, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.0498458856606972e-05, | |
| "loss": 0.8370299935340881, | |
| "mean_token_accuracy": 0.7657874599099159, | |
| "num_tokens": 3233160.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 1.134585291147232, | |
| "epoch": 1.4761904761904763, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.037391194276326e-05, | |
| "loss": 0.8596113324165344, | |
| "mean_token_accuracy": 0.7584040239453316, | |
| "num_tokens": 3259695.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 1.1345889419317245, | |
| "epoch": 1.4880952380952381, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.0249306917380731e-05, | |
| "loss": 0.8585492372512817, | |
| "mean_token_accuracy": 0.7621477544307709, | |
| "num_tokens": 3286287.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.1395476311445236, | |
| "epoch": 1.5, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.0124663145954152e-05, | |
| "loss": 0.853608250617981, | |
| "mean_token_accuracy": 0.7603440955281258, | |
| "num_tokens": 3312811.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 1.1343800723552704, | |
| "epoch": 1.5119047619047619, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.855441689491272, | |
| "mean_token_accuracy": 0.7621574178338051, | |
| "num_tokens": 3338485.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 1.147290289402008, | |
| "epoch": 1.5238095238095237, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 9.87533685404585e-06, | |
| "loss": 0.8562023639678955, | |
| "mean_token_accuracy": 0.7617470622062683, | |
| "num_tokens": 3363352.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 1.0908429026603699, | |
| "epoch": 1.5357142857142856, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 9.750693082619274e-06, | |
| "loss": 0.7935777902603149, | |
| "mean_token_accuracy": 0.7803073972463608, | |
| "num_tokens": 3389709.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 1.1274128407239914, | |
| "epoch": 1.5476190476190477, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 9.626088057236745e-06, | |
| "loss": 0.8381516337394714, | |
| "mean_token_accuracy": 0.7682216316461563, | |
| "num_tokens": 3415496.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.1263544410467148, | |
| "epoch": 1.5595238095238095, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 9.501541143393028e-06, | |
| "loss": 0.8319387435913086, | |
| "mean_token_accuracy": 0.7662338837981224, | |
| "num_tokens": 3441710.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 1.1065820306539536, | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 9.377071697551479e-06, | |
| "loss": 0.8057957887649536, | |
| "mean_token_accuracy": 0.7757579833269119, | |
| "num_tokens": 3468647.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 1.1226835697889328, | |
| "epoch": 1.5833333333333335, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 9.252699064135759e-06, | |
| "loss": 0.8140422701835632, | |
| "mean_token_accuracy": 0.7766503915190697, | |
| "num_tokens": 3494627.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 1.1279116421937943, | |
| "epoch": 1.5952380952380953, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 9.128442572523418e-06, | |
| "loss": 0.8333442807197571, | |
| "mean_token_accuracy": 0.7701004445552826, | |
| "num_tokens": 3520726.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 1.118562862277031, | |
| "epoch": 1.6071428571428572, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 9.004321534041836e-06, | |
| "loss": 0.8082501292228699, | |
| "mean_token_accuracy": 0.7714490741491318, | |
| "num_tokens": 3546604.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.1184325218200684, | |
| "epoch": 1.619047619047619, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 8.880355238966923e-06, | |
| "loss": 0.8013947606086731, | |
| "mean_token_accuracy": 0.7703685536980629, | |
| "num_tokens": 3572560.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 1.1083856672048569, | |
| "epoch": 1.630952380952381, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 8.756562953525151e-06, | |
| "loss": 0.8402643799781799, | |
| "mean_token_accuracy": 0.7718946933746338, | |
| "num_tokens": 3600409.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 1.1246822774410248, | |
| "epoch": 1.6428571428571428, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 8.632963916899268e-06, | |
| "loss": 0.8138586282730103, | |
| "mean_token_accuracy": 0.7705142721533775, | |
| "num_tokens": 3626126.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 1.1232711374759674, | |
| "epoch": 1.6547619047619047, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 8.509577338238255e-06, | |
| "loss": 0.8325715065002441, | |
| "mean_token_accuracy": 0.7666014134883881, | |
| "num_tokens": 3652299.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 1.1074089109897614, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 8.386422393671934e-06, | |
| "loss": 0.7925585508346558, | |
| "mean_token_accuracy": 0.7779242396354675, | |
| "num_tokens": 3678732.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.127850890159607, | |
| "epoch": 1.6785714285714286, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 8.263518223330698e-06, | |
| "loss": 0.8398563861846924, | |
| "mean_token_accuracy": 0.7660870477557182, | |
| "num_tokens": 3705434.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 1.1316388994455338, | |
| "epoch": 1.6904761904761905, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 8.140883928370855e-06, | |
| "loss": 0.8630028963088989, | |
| "mean_token_accuracy": 0.7604737058281898, | |
| "num_tokens": 3732503.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 1.1168327778577805, | |
| "epoch": 1.7023809523809523, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 8.018538568006027e-06, | |
| "loss": 0.8182620406150818, | |
| "mean_token_accuracy": 0.7686956897377968, | |
| "num_tokens": 3759558.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 1.1009707152843475, | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 7.896501156545044e-06, | |
| "loss": 0.7965549230575562, | |
| "mean_token_accuracy": 0.7743760347366333, | |
| "num_tokens": 3786872.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 1.0930486172437668, | |
| "epoch": 1.7261904761904763, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 7.774790660436857e-06, | |
| "loss": 0.7760134935379028, | |
| "mean_token_accuracy": 0.784682035446167, | |
| "num_tokens": 3812661.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.1450048387050629, | |
| "epoch": 1.7380952380952381, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 7.653425995322852e-06, | |
| "loss": 0.8734183311462402, | |
| "mean_token_accuracy": 0.7527253702282906, | |
| "num_tokens": 3839385.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 1.1145509034395218, | |
| "epoch": 1.75, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 7.532426023097063e-06, | |
| "loss": 0.7770429849624634, | |
| "mean_token_accuracy": 0.7795330286026001, | |
| "num_tokens": 3864694.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 1.1127979755401611, | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 7.411809548974792e-06, | |
| "loss": 0.8262644410133362, | |
| "mean_token_accuracy": 0.7699695378541946, | |
| "num_tokens": 3890836.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 1.1189321875572205, | |
| "epoch": 1.7738095238095237, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 7.291595318569951e-06, | |
| "loss": 0.8311002850532532, | |
| "mean_token_accuracy": 0.7660280093550682, | |
| "num_tokens": 3917364.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 1.0803979188203812, | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 7.171802014981726e-06, | |
| "loss": 0.7628324627876282, | |
| "mean_token_accuracy": 0.7882028222084045, | |
| "num_tokens": 3943383.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.1122339814901352, | |
| "epoch": 1.7976190476190477, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 7.052448255890958e-06, | |
| "loss": 0.8079776167869568, | |
| "mean_token_accuracy": 0.7733784541487694, | |
| "num_tokens": 3968880.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 1.1262486726045609, | |
| "epoch": 1.8095238095238095, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 6.933552590666659e-06, | |
| "loss": 0.8450883626937866, | |
| "mean_token_accuracy": 0.76470497995615, | |
| "num_tokens": 3994252.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 1.1084678769111633, | |
| "epoch": 1.8214285714285714, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 6.815133497483157e-06, | |
| "loss": 0.8062177896499634, | |
| "mean_token_accuracy": 0.7726119607686996, | |
| "num_tokens": 4021377.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 1.097622349858284, | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 6.697209380448333e-06, | |
| "loss": 0.7983017563819885, | |
| "mean_token_accuracy": 0.774423360824585, | |
| "num_tokens": 4048074.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 1.111462488770485, | |
| "epoch": 1.8452380952380953, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 6.579798566743314e-06, | |
| "loss": 0.8211446404457092, | |
| "mean_token_accuracy": 0.7687254920601845, | |
| "num_tokens": 4074401.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.091482549905777, | |
| "epoch": 1.8571428571428572, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 6.462919303774186e-06, | |
| "loss": 0.7832185626029968, | |
| "mean_token_accuracy": 0.7785474807024002, | |
| "num_tokens": 4101171.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 1.1196855455636978, | |
| "epoch": 1.869047619047619, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 6.34658975633605e-06, | |
| "loss": 0.8435440063476562, | |
| "mean_token_accuracy": 0.7660573944449425, | |
| "num_tokens": 4127519.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 1.090627670288086, | |
| "epoch": 1.880952380952381, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 6.230828003789949e-06, | |
| "loss": 0.7934207320213318, | |
| "mean_token_accuracy": 0.7779519408941269, | |
| "num_tokens": 4154910.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 1.130862444639206, | |
| "epoch": 1.8928571428571428, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 6.115652037253054e-06, | |
| "loss": 0.8495190739631653, | |
| "mean_token_accuracy": 0.7622320130467415, | |
| "num_tokens": 4180994.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 1.10136216878891, | |
| "epoch": 1.9047619047619047, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 6.001079756802592e-06, | |
| "loss": 0.7951112389564514, | |
| "mean_token_accuracy": 0.7760354280471802, | |
| "num_tokens": 4206873.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.0941227227449417, | |
| "epoch": 1.9166666666666665, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 5.887128968693887e-06, | |
| "loss": 0.7749046087265015, | |
| "mean_token_accuracy": 0.7782920673489571, | |
| "num_tokens": 4232763.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 1.0888655632734299, | |
| "epoch": 1.9285714285714286, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 5.773817382593008e-06, | |
| "loss": 0.8022912740707397, | |
| "mean_token_accuracy": 0.772666685283184, | |
| "num_tokens": 4258969.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 1.1020135283470154, | |
| "epoch": 1.9404761904761905, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 5.66116260882442e-06, | |
| "loss": 0.8075801134109497, | |
| "mean_token_accuracy": 0.7722523957490921, | |
| "num_tokens": 4285853.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 1.1252782344818115, | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 5.549182155634076e-06, | |
| "loss": 0.8001527190208435, | |
| "mean_token_accuracy": 0.7735483199357986, | |
| "num_tokens": 4310282.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 1.110137328505516, | |
| "epoch": 1.9642857142857144, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 5.43789342646837e-06, | |
| "loss": 0.8033909797668457, | |
| "mean_token_accuracy": 0.7731057927012444, | |
| "num_tokens": 4336076.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.1079717725515366, | |
| "epoch": 1.9761904761904763, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 5.32731371726938e-06, | |
| "loss": 0.7891072034835815, | |
| "mean_token_accuracy": 0.7758950665593147, | |
| "num_tokens": 4361690.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 1.111808881163597, | |
| "epoch": 1.9880952380952381, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 5.217460213786822e-06, | |
| "loss": 0.8317010402679443, | |
| "mean_token_accuracy": 0.7686774134635925, | |
| "num_tokens": 4388453.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 1.0940789729356766, | |
| "epoch": 2.0, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 5.108349988907111e-06, | |
| "loss": 0.7986106872558594, | |
| "mean_token_accuracy": 0.7781749367713928, | |
| "num_tokens": 4413704.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 1.1143835335969925, | |
| "epoch": 2.011904761904762, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.800915539264679, | |
| "mean_token_accuracy": 0.7711915224790573, | |
| "num_tokens": 4440194.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 1.0910003781318665, | |
| "epoch": 2.0238095238095237, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 4.892427086283147e-06, | |
| "loss": 0.7806589603424072, | |
| "mean_token_accuracy": 0.7765552401542664, | |
| "num_tokens": 4466934.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.1028515249490738, | |
| "epoch": 2.0357142857142856, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 4.78564796620502e-06, | |
| "loss": 0.7799363136291504, | |
| "mean_token_accuracy": 0.7761322408914566, | |
| "num_tokens": 4493010.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 1.1085332185029984, | |
| "epoch": 2.0476190476190474, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 4.679679234846636e-06, | |
| "loss": 0.7984216213226318, | |
| "mean_token_accuracy": 0.7793152034282684, | |
| "num_tokens": 4518942.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 1.124870389699936, | |
| "epoch": 2.0595238095238093, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 4.5745373613424075e-06, | |
| "loss": 0.7885993719100952, | |
| "mean_token_accuracy": 0.7786193862557411, | |
| "num_tokens": 4543683.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 1.1041023209691048, | |
| "epoch": 2.0714285714285716, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 4.470238686320606e-06, | |
| "loss": 0.8044976592063904, | |
| "mean_token_accuracy": 0.7772313058376312, | |
| "num_tokens": 4569451.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 1.109834685921669, | |
| "epoch": 2.0833333333333335, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 4.3667994193637794e-06, | |
| "loss": 0.802185595035553, | |
| "mean_token_accuracy": 0.7703160792589188, | |
| "num_tokens": 4595980.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.1169404536485672, | |
| "epoch": 2.0952380952380953, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 4.264235636489542e-06, | |
| "loss": 0.8406084775924683, | |
| "mean_token_accuracy": 0.7628436088562012, | |
| "num_tokens": 4623082.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 1.0920062735676765, | |
| "epoch": 2.107142857142857, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 4.162563277652104e-06, | |
| "loss": 0.807350754737854, | |
| "mean_token_accuracy": 0.7676851376891136, | |
| "num_tokens": 4651364.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 1.0805025100708008, | |
| "epoch": 2.119047619047619, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.061798144264986e-06, | |
| "loss": 0.7802121043205261, | |
| "mean_token_accuracy": 0.7830931693315506, | |
| "num_tokens": 4678114.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 1.1019368171691895, | |
| "epoch": 2.130952380952381, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 3.961955896745224e-06, | |
| "loss": 0.8179029226303101, | |
| "mean_token_accuracy": 0.7715626880526543, | |
| "num_tokens": 4704903.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 1.1142058372497559, | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 3.8630520520795275e-06, | |
| "loss": 0.820464015007019, | |
| "mean_token_accuracy": 0.767073430120945, | |
| "num_tokens": 4730882.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.0781855285167694, | |
| "epoch": 2.1547619047619047, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 3.7651019814126656e-06, | |
| "loss": 0.7623270153999329, | |
| "mean_token_accuracy": 0.7844816967844963, | |
| "num_tokens": 4756693.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 1.0938004553318024, | |
| "epoch": 2.1666666666666665, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 3.6681209076586035e-06, | |
| "loss": 0.780868649482727, | |
| "mean_token_accuracy": 0.7797767668962479, | |
| "num_tokens": 4783150.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 1.1012120842933655, | |
| "epoch": 2.1785714285714284, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 3.5721239031346067e-06, | |
| "loss": 0.8160427212715149, | |
| "mean_token_accuracy": 0.7717493698000908, | |
| "num_tokens": 4809980.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 1.1437917947769165, | |
| "epoch": 2.1904761904761907, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 3.4771258872187917e-06, | |
| "loss": 0.8643677830696106, | |
| "mean_token_accuracy": 0.7510505765676498, | |
| "num_tokens": 4834990.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 1.1224986612796783, | |
| "epoch": 2.2023809523809526, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 3.3831416240314085e-06, | |
| "loss": 0.8391753435134888, | |
| "mean_token_accuracy": 0.7646627351641655, | |
| "num_tokens": 4860271.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.1111707538366318, | |
| "epoch": 2.2142857142857144, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.290185720140301e-06, | |
| "loss": 0.8266609311103821, | |
| "mean_token_accuracy": 0.7683383226394653, | |
| "num_tokens": 4887403.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 1.0867665261030197, | |
| "epoch": 2.2261904761904763, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.1982726222908046e-06, | |
| "loss": 0.7631702423095703, | |
| "mean_token_accuracy": 0.7865067571401596, | |
| "num_tokens": 4913478.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 1.107857659459114, | |
| "epoch": 2.238095238095238, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 3.10741661516053e-06, | |
| "loss": 0.8279802799224854, | |
| "mean_token_accuracy": 0.7636834606528282, | |
| "num_tokens": 4940367.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 1.0950567573308945, | |
| "epoch": 2.25, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 3.017631819139273e-06, | |
| "loss": 0.7990702986717224, | |
| "mean_token_accuracy": 0.775329478085041, | |
| "num_tokens": 4966500.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 1.1100285053253174, | |
| "epoch": 2.261904761904762, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 2.9289321881345257e-06, | |
| "loss": 0.7843649387359619, | |
| "mean_token_accuracy": 0.7755291163921356, | |
| "num_tokens": 4992143.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.090055227279663, | |
| "epoch": 2.2738095238095237, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 2.8413315074028157e-06, | |
| "loss": 0.7791961431503296, | |
| "mean_token_accuracy": 0.7800277844071388, | |
| "num_tokens": 5018477.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 1.0741526409983635, | |
| "epoch": 2.2857142857142856, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.7548433914072736e-06, | |
| "loss": 0.7515400052070618, | |
| "mean_token_accuracy": 0.785953663289547, | |
| "num_tokens": 5044293.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 1.08182904869318, | |
| "epoch": 2.2976190476190474, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 2.669481281701739e-06, | |
| "loss": 0.7701444625854492, | |
| "mean_token_accuracy": 0.7801418378949165, | |
| "num_tokens": 5071488.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 1.1468330323696136, | |
| "epoch": 2.3095238095238093, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 2.5852584448417327e-06, | |
| "loss": 0.8802750110626221, | |
| "mean_token_accuracy": 0.756720781326294, | |
| "num_tokens": 5097447.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 1.0781276375055313, | |
| "epoch": 2.3214285714285716, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.502187970322657e-06, | |
| "loss": 0.768257200717926, | |
| "mean_token_accuracy": 0.7819816321134567, | |
| "num_tokens": 5124041.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.0870239287614822, | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 2.420282768545469e-06, | |
| "loss": 0.8156233429908752, | |
| "mean_token_accuracy": 0.7710027247667313, | |
| "num_tokens": 5151020.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 1.1158137768507004, | |
| "epoch": 2.3452380952380953, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 2.339555568810221e-06, | |
| "loss": 0.8283431529998779, | |
| "mean_token_accuracy": 0.7646451517939568, | |
| "num_tokens": 5177398.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 1.104174643754959, | |
| "epoch": 2.357142857142857, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 2.2600189173377263e-06, | |
| "loss": 0.8111021518707275, | |
| "mean_token_accuracy": 0.7714014053344727, | |
| "num_tokens": 5203526.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 1.0948980003595352, | |
| "epoch": 2.369047619047619, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 2.1816851753197023e-06, | |
| "loss": 0.7999440431594849, | |
| "mean_token_accuracy": 0.7765351012349129, | |
| "num_tokens": 5229649.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 1.1040179580450058, | |
| "epoch": 2.380952380952381, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 2.104566516997647e-06, | |
| "loss": 0.7910847067832947, | |
| "mean_token_accuracy": 0.7759528011083603, | |
| "num_tokens": 5255150.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.380952380952381, | |
| "eval_entropy": 1.1016432423340647, | |
| "eval_loss": 0.850182056427002, | |
| "eval_mean_token_accuracy": 0.7646622657775879, | |
| "eval_num_tokens": 5255150.0, | |
| "eval_runtime": 20.7435, | |
| "eval_samples_per_second": 7.231, | |
| "eval_steps_per_second": 0.916, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.1015710830688477, | |
| "epoch": 2.392857142857143, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 2.0286749277707783e-06, | |
| "loss": 0.7965137958526611, | |
| "mean_token_accuracy": 0.775518424808979, | |
| "num_tokens": 5280462.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 1.073985680937767, | |
| "epoch": 2.4047619047619047, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.9540222023333165e-06, | |
| "loss": 0.7600584030151367, | |
| "mean_token_accuracy": 0.7819822654128075, | |
| "num_tokens": 5307159.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 1.1074542552232742, | |
| "epoch": 2.4166666666666665, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.880619942841435e-06, | |
| "loss": 0.8110378980636597, | |
| "mean_token_accuracy": 0.7697334587574005, | |
| "num_tokens": 5333284.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 1.0784734338521957, | |
| "epoch": 2.4285714285714284, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 1.808479557110081e-06, | |
| "loss": 0.7567155361175537, | |
| "mean_token_accuracy": 0.7863621711730957, | |
| "num_tokens": 5359494.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 1.1178395748138428, | |
| "epoch": 2.4404761904761907, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.7376122568400533e-06, | |
| "loss": 0.8341485857963562, | |
| "mean_token_accuracy": 0.7668850645422935, | |
| "num_tokens": 5385894.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.0967416018247604, | |
| "epoch": 2.4523809523809526, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.6680290558755119e-06, | |
| "loss": 0.7977155447006226, | |
| "mean_token_accuracy": 0.7702686116099358, | |
| "num_tokens": 5411830.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 1.1018935441970825, | |
| "epoch": 2.4642857142857144, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.599740768492286e-06, | |
| "loss": 0.8191432952880859, | |
| "mean_token_accuracy": 0.7678048387169838, | |
| "num_tokens": 5438295.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 1.0825640261173248, | |
| "epoch": 2.4761904761904763, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.5327580077171589e-06, | |
| "loss": 0.789887011051178, | |
| "mean_token_accuracy": 0.7770953327417374, | |
| "num_tokens": 5464705.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 1.1018773317337036, | |
| "epoch": 2.488095238095238, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.467091183678444e-06, | |
| "loss": 0.7919491529464722, | |
| "mean_token_accuracy": 0.7767357230186462, | |
| "num_tokens": 5490516.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 1.1184441447257996, | |
| "epoch": 2.5, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.4027505019880972e-06, | |
| "loss": 0.8499428033828735, | |
| "mean_token_accuracy": 0.7632577866315842, | |
| "num_tokens": 5516680.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.0862347781658173, | |
| "epoch": 2.511904761904762, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 1.339745962155613e-06, | |
| "loss": 0.7583701610565186, | |
| "mean_token_accuracy": 0.7843123078346252, | |
| "num_tokens": 5542288.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 1.097862184047699, | |
| "epoch": 2.5238095238095237, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.278087356033947e-06, | |
| "loss": 0.783574104309082, | |
| "mean_token_accuracy": 0.7759824991226196, | |
| "num_tokens": 5568234.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 1.0988486558198929, | |
| "epoch": 2.5357142857142856, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.2177842662977136e-06, | |
| "loss": 0.7944028973579407, | |
| "mean_token_accuracy": 0.7767082825303078, | |
| "num_tokens": 5594208.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 1.0893902629613876, | |
| "epoch": 2.5476190476190474, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.1588460649539036e-06, | |
| "loss": 0.7720337510108948, | |
| "mean_token_accuracy": 0.7832249104976654, | |
| "num_tokens": 5620184.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 1.1016841530799866, | |
| "epoch": 2.5595238095238093, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.1012819118853147e-06, | |
| "loss": 0.7969903945922852, | |
| "mean_token_accuracy": 0.7736426144838333, | |
| "num_tokens": 5646695.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.0920927673578262, | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1.0451007534269908e-06, | |
| "loss": 0.7718170285224915, | |
| "mean_token_accuracy": 0.7819743752479553, | |
| "num_tokens": 5672524.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 1.107266440987587, | |
| "epoch": 2.5833333333333335, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 9.903113209758098e-07, | |
| "loss": 0.8304149508476257, | |
| "mean_token_accuracy": 0.7672121822834015, | |
| "num_tokens": 5699060.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 1.0764499753713608, | |
| "epoch": 2.5952380952380953, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 9.369221296335007e-07, | |
| "loss": 0.7635530829429626, | |
| "mean_token_accuracy": 0.7877878844738007, | |
| "num_tokens": 5725051.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.0995107293128967, | |
| "epoch": 2.607142857142857, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 8.849414768832687e-07, | |
| "loss": 0.8118206858634949, | |
| "mean_token_accuracy": 0.7716490253806114, | |
| "num_tokens": 5752373.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.0860945731401443, | |
| "epoch": 2.619047619047619, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 8.343774413002382e-07, | |
| "loss": 0.7682137489318848, | |
| "mean_token_accuracy": 0.7820399552583694, | |
| "num_tokens": 5778613.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.1067247688770294, | |
| "epoch": 2.630952380952381, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 7.852378812959227e-07, | |
| "loss": 0.8284558653831482, | |
| "mean_token_accuracy": 0.7689725607633591, | |
| "num_tokens": 5805434.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.0849140882492065, | |
| "epoch": 2.642857142857143, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 7.375304338969135e-07, | |
| "loss": 0.7875180840492249, | |
| "mean_token_accuracy": 0.7756547182798386, | |
| "num_tokens": 5832316.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.115451768040657, | |
| "epoch": 2.6547619047619047, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 6.912625135579587e-07, | |
| "loss": 0.8203433752059937, | |
| "mean_token_accuracy": 0.7700524106621742, | |
| "num_tokens": 5858167.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.0904614925384521, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 6.464413110096601e-07, | |
| "loss": 0.7759373188018799, | |
| "mean_token_accuracy": 0.7771004885435104, | |
| "num_tokens": 5884359.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.0829641073942184, | |
| "epoch": 2.678571428571429, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 6.030737921409169e-07, | |
| "loss": 0.789995014667511, | |
| "mean_token_accuracy": 0.7763750106096268, | |
| "num_tokens": 5910953.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.100945070385933, | |
| "epoch": 2.6904761904761907, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 5.611666969163243e-07, | |
| "loss": 0.7801922559738159, | |
| "mean_token_accuracy": 0.7809778973460197, | |
| "num_tokens": 5936246.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.094831421971321, | |
| "epoch": 2.7023809523809526, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 5.207265383286831e-07, | |
| "loss": 0.802288830280304, | |
| "mean_token_accuracy": 0.7725913003087044, | |
| "num_tokens": 5963034.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.1124836802482605, | |
| "epoch": 2.7142857142857144, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 4.817596013867765e-07, | |
| "loss": 0.8340546488761902, | |
| "mean_token_accuracy": 0.765786811709404, | |
| "num_tokens": 5989514.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.1165993362665176, | |
| "epoch": 2.7261904761904763, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 4.4427194213859216e-07, | |
| "loss": 0.8222470283508301, | |
| "mean_token_accuracy": 0.7660126239061356, | |
| "num_tokens": 6015850.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.1128047853708267, | |
| "epoch": 2.738095238095238, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 4.082693867301224e-07, | |
| "loss": 0.8243576288223267, | |
| "mean_token_accuracy": 0.7661043703556061, | |
| "num_tokens": 6042209.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.0889495462179184, | |
| "epoch": 2.75, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 3.7375753049987974e-07, | |
| "loss": 0.7938658595085144, | |
| "mean_token_accuracy": 0.7809677720069885, | |
| "num_tokens": 6069318.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.1063835173845291, | |
| "epoch": 2.761904761904762, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 3.4074173710931804e-07, | |
| "loss": 0.8031449317932129, | |
| "mean_token_accuracy": 0.7716307789087296, | |
| "num_tokens": 6095421.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.0925931185483932, | |
| "epoch": 2.7738095238095237, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 3.0922713770922155e-07, | |
| "loss": 0.7671008110046387, | |
| "mean_token_accuracy": 0.7852345705032349, | |
| "num_tokens": 6120669.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.0808859765529633, | |
| "epoch": 2.7857142857142856, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.7921863014225504e-07, | |
| "loss": 0.7566604614257812, | |
| "mean_token_accuracy": 0.7864194139838219, | |
| "num_tokens": 6146379.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.1180297583341599, | |
| "epoch": 2.7976190476190474, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 2.507208781817638e-07, | |
| "loss": 0.8459954857826233, | |
| "mean_token_accuracy": 0.7657622396945953, | |
| "num_tokens": 6172360.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.09176704287529, | |
| "epoch": 2.8095238095238093, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.2373831080695463e-07, | |
| "loss": 0.7911088466644287, | |
| "mean_token_accuracy": 0.7778226584196091, | |
| "num_tokens": 6199270.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.1062865853309631, | |
| "epoch": 2.821428571428571, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.9827512151456175e-07, | |
| "loss": 0.8155214190483093, | |
| "mean_token_accuracy": 0.771714448928833, | |
| "num_tokens": 6225009.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.115729421377182, | |
| "epoch": 2.8333333333333335, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.7433526766711727e-07, | |
| "loss": 0.8296111226081848, | |
| "mean_token_accuracy": 0.7672879844903946, | |
| "num_tokens": 6250868.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.0913526266813278, | |
| "epoch": 2.8452380952380953, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.519224698779198e-07, | |
| "loss": 0.775915801525116, | |
| "mean_token_accuracy": 0.7825674042105675, | |
| "num_tokens": 6276930.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.093573346734047, | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.3104021143278911e-07, | |
| "loss": 0.7870185971260071, | |
| "mean_token_accuracy": 0.7790666744112968, | |
| "num_tokens": 6303822.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.0675100684165955, | |
| "epoch": 2.869047619047619, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.1169173774871478e-07, | |
| "loss": 0.7764073014259338, | |
| "mean_token_accuracy": 0.7834136635065079, | |
| "num_tokens": 6330367.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.1061383187770844, | |
| "epoch": 2.880952380952381, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 9.388005586947191e-08, | |
| "loss": 0.8234024047851562, | |
| "mean_token_accuracy": 0.7703361287713051, | |
| "num_tokens": 6356783.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.0825785994529724, | |
| "epoch": 2.892857142857143, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 7.760793399827937e-08, | |
| "loss": 0.7715331315994263, | |
| "mean_token_accuracy": 0.7832503244280815, | |
| "num_tokens": 6382945.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.112140253186226, | |
| "epoch": 2.9047619047619047, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 6.287790106757396e-08, | |
| "loss": 0.8297468423843384, | |
| "mean_token_accuracy": 0.7650200128555298, | |
| "num_tokens": 6409925.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.0952338576316833, | |
| "epoch": 2.9166666666666665, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.9692246345985905e-08, | |
| "loss": 0.802385687828064, | |
| "mean_token_accuracy": 0.7693765163421631, | |
| "num_tokens": 6436469.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.1076784878969193, | |
| "epoch": 2.928571428571429, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 3.805301908254455e-08, | |
| "loss": 0.8204672336578369, | |
| "mean_token_accuracy": 0.7711711004376411, | |
| "num_tokens": 6463005.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.090786725282669, | |
| "epoch": 2.9404761904761907, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 2.796202818819871e-08, | |
| "loss": 0.8098682761192322, | |
| "mean_token_accuracy": 0.773357167840004, | |
| "num_tokens": 6490495.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.0970018655061722, | |
| "epoch": 2.9523809523809526, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.9420841954681525e-08, | |
| "loss": 0.8029658198356628, | |
| "mean_token_accuracy": 0.7747255638241768, | |
| "num_tokens": 6516957.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.0752539932727814, | |
| "epoch": 2.9642857142857144, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.2430787810776556e-08, | |
| "loss": 0.801226019859314, | |
| "mean_token_accuracy": 0.7782684937119484, | |
| "num_tokens": 6544240.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.0858174860477448, | |
| "epoch": 2.9761904761904763, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 6.992952116013918e-09, | |
| "loss": 0.7791233062744141, | |
| "mean_token_accuracy": 0.7781018689274788, | |
| "num_tokens": 6570925.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.0985930263996124, | |
| "epoch": 2.988095238095238, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 3.1081799918375454e-09, | |
| "loss": 0.7869044542312622, | |
| "mean_token_accuracy": 0.7782788202166557, | |
| "num_tokens": 6596366.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.1129964143037796, | |
| "epoch": 3.0, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 7.770751902513862e-10, | |
| "loss": 0.8193890452384949, | |
| "mean_token_accuracy": 0.7702639102935791, | |
| "num_tokens": 6620556.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_entropy": 1.1019742865311473, | |
| "eval_loss": 0.8498820066452026, | |
| "eval_mean_token_accuracy": 0.7648934659204984, | |
| "eval_num_tokens": 6620556.0, | |
| "eval_runtime": 20.9234, | |
| "eval_samples_per_second": 7.169, | |
| "eval_steps_per_second": 0.908, | |
| "step": 252 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 252, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5249393952995226e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |